## PDF Scraper
#### The purpose of this project is to scrape the Stealth Health cookbook to get a list of ingredients for each recipe, and then run text clustering to see which recipes are most similar in ingredients.

In [1]:
#import packages
from pypdf import PdfReader
import pandas as pd

#### Scrape PDF Information

In [2]:
# Open the cookbook PDF and report how many pages it contains
reader = PdfReader('StealthHealth.pdf')
page_count = len(reader.pages)
print(page_count)

84


In [3]:
def _section_between(text, start_word, end_word):
    """Return the stripped text between the two markers, or None if either is absent."""
    marker_pos = text.find(start_word)
    if marker_pos == -1:
        return None
    # Only add the marker length once we know the marker was found; adding it
    # to a -1 result would produce a valid-looking but meaningless offset.
    start_index = marker_pos + len(start_word)
    end_index = text.find(end_word, start_index)
    if end_index == -1:
        return None
    return text[start_index:end_index].strip()


def extract_text_between(pdf_path, start_word, end_word, first_page=6, last_page=84):
    """Collect the lines found between two marker words on each page of a PDF.

    For every page in ``reader.pages[first_page:last_page]`` the page text is
    extracted, the first line of that text is taken as the title, and the text
    between the first occurrence of ``start_word`` and the next occurrence of
    ``end_word`` is split into lines. Pages missing either marker are skipped.

    Args:
        pdf_path: Path of the PDF file to read.
        start_word: Marker that precedes the wanted section (excluded).
        end_word: Marker that follows the wanted section (excluded).
        first_page: Zero-based index of the first page to scan (slice start).
        last_page: Zero-based index one past the last page to scan (slice stop).

    Returns:
        A DataFrame with columns ``Title`` and ``Content``, one row per line of
        the section; empty if no page contains both markers.
    """
    reader = PdfReader(pdf_path)
    data = []

    for page in reader.pages[first_page:last_page]:
        text = page.extract_text()
        content = _section_between(text, start_word, end_word)
        if content is None:
            continue
        title = text.split('\n')[0]
        data.extend([title, line.strip()] for line in content.split('\n'))

    return pd.DataFrame(data, columns=["Title", "Content"])

# Cookbook file and the marker words that bracket the section to extract
pdf_file = "StealthHealth.pdf"
start_word = "INGREDIENTS"
end_word = "INSTRUCTIONS"

result_df = extract_text_between(pdf_file, start_word, end_word)

# A bare `result_df` expression nested inside an if-branch is never rendered
# (only a cell's final top-level expression is), so print it explicitly.
if result_df.empty:
    print("No matching text found between the specified words.")
else:
    print(result_df)

In [4]:
# Remove pandas' row-display limit so the whole frame is rendered, then show it
pd.set_option('display.max_rows', None) 
result_df

Unnamed: 0,Title,Content
0,Chicken Avocado Sandwiches,2 brioche buns
1,Chicken Avocado Sandwiches,"2, 5oz boneless skinless chicken thighs"
2,Chicken Avocado Sandwiches,"Season with: salt, garlic, onion, smoked"
3,Chicken Avocado Sandwiches,"paprika, chipotle chili powder"
4,Chicken Avocado Sandwiches,1 medium avocado (~120g)
5,Chicken Avocado Sandwiches,50g green enchilada sauce/salsa verde
6,Chicken Avocado Sandwiches,20g shredded provolone (10g each)
7,Chicken Avocado Sandwiches,Hot sauce of choice
8,"""Lazy"" Hot Chicken Sandwiches","2, 5oz boneless skinless chicken thighs,"
9,"""Lazy"" Hot Chicken Sandwiches","seasoned with: salt, garlic, onion,"


In [5]:
# remove numbers in front of strings in the column, and remove anything in parenthesis
import re
# Patterns are compiled once so repeated calls reuse them.
_PARENTHETICAL = re.compile(r'\(.*?\)')
_NUMERIC_TOKEN = re.compile(r'\S*\d+\S*')
_UNIT_TOKEN = re.compile(
    r'\b(?:tsp|cup|tbspn|tblspn|lb|lbs|tspn|tablespoon)\b\S*',
    flags=re.IGNORECASE,
)
_PUNCTUATION = re.compile(r'[,+*]')
_WHITESPACE = re.compile(r'\s+')


def clean_text(text):
    """Normalize one ingredient line to a lowercase, quantity-free string.

    Steps, in order: drop parenthesized notes, drop every whitespace-delimited
    token that contains a digit, drop the listed unit words (case-insensitive,
    plus any characters glued to them), drop ``,``, ``+`` and ``*``, collapse
    whitespace, and lowercase.

    Args:
        text: Raw ingredient line.

    Returns:
        The cleaned string, with no leading, trailing, or repeated whitespace.
    """
    text = _PARENTHETICAL.sub('', text)
    text = _NUMERIC_TOKEN.sub('', text)
    text = _UNIT_TOKEN.sub('', text)
    # Punctuation must go before whitespace is normalized; otherwise removing a
    # standalone "+", "*" or "," leaves a leading or doubled space behind, and
    # identical ingredients end up as different strings.
    text = _PUNCTUATION.sub('', text)
    text = _WHITESPACE.sub(' ', text).strip()
    return text.lower()

# Derive the normalized ingredient text for every scraped line
result_df['Clean'] = result_df['Content'].map(clean_text)


In [6]:
# Display the frame, including the 'Clean' column alongside the raw 'Content'
result_df

Unnamed: 0,Title,Content,Clean
0,Chicken Avocado Sandwiches,2 brioche buns,brioche buns
1,Chicken Avocado Sandwiches,"2, 5oz boneless skinless chicken thighs",boneless skinless chicken thighs
2,Chicken Avocado Sandwiches,"Season with: salt, garlic, onion, smoked",season with: salt garlic onion smoked
3,Chicken Avocado Sandwiches,"paprika, chipotle chili powder",paprika chipotle chili powder
4,Chicken Avocado Sandwiches,1 medium avocado (~120g),medium avocado
5,Chicken Avocado Sandwiches,50g green enchilada sauce/salsa verde,green enchilada sauce/salsa verde
6,Chicken Avocado Sandwiches,20g shredded provolone (10g each),shredded provolone
7,Chicken Avocado Sandwiches,Hot sauce of choice,hot sauce of choice
8,"""Lazy"" Hot Chicken Sandwiches","2, 5oz boneless skinless chicken thighs,",boneless skinless chicken thighs
9,"""Lazy"" Hot Chicken Sandwiches","seasoned with: salt, garlic, onion,",seasoned with: salt garlic onion


In [7]:
# Group recipe titles by cleaned ingredient text.
# NOTE: this only creates the groupby object; nothing is displayed by this cell.
grouped = result_df.groupby('Clean')['Title']

In [8]:
# Collect, for each cleaned ingredient, the list of recipe titles that use it
titles_by_ingredient = result_df.groupby('Clean')['Title'].agg(list)
grouped = titles_by_ingredient.reset_index()

In [9]:
# Recipe titles per cleaned ingredient, one row per distinct ingredient
grouped_titles = result_df.groupby('Clean')['Title'].agg(list).reset_index()

# Number of occurrences of each cleaned ingredient, as a two-column frame
value_counts = (
    result_df['Clean']
    .value_counts()
    .rename_axis('Clean')
    .reset_index(name='Count')
)

# Attach the counts to the title lists and show the most frequent ingredients first
final_result = grouped_titles.merge(value_counts, on='Clean')
final_result.sort_values(by='Count', ascending=False)

Unnamed: 0,Clean,Title,Count
38,boneless skinless chicken thighs,"[Chicken Avocado Sandwiches, ""Lazy"" Hot Chicke...",22
266,parmigiano reggiano,"[""Lazy"" Chicken Parmesan Sandwiches, Meatball ...",14
90,cottage cheese,"[Flatbread Pepperoni Pizza, ""Lazy"" Personal Pi...",14
224,milk,"[Meatball Sub ""Bread Boat"", Buffalo Chicken & ...",12
59,cheddar,"[Air Fryer Creamy Chicken Quesadillas, Beef & ...",11
174,honey,"[Buffalo Chicken Melt, Buffalo Ranch Chicken S...",10
26,blended cottage cheese,"[Chicken Parmesan Wraps, Meatball Sub ""Bread B...",10
45,buffalo sauce,"[Buffalo Chicken Melt, Buffalo Ranch Chicken S...",9
154,greek yogurt,"[Chicken Bacon Ranch Quesadilla, Beef & Cheese...",9
61,cheddar powder,"[Classic Mac n Cheese, Buffalo Chicken Mac n C...",8
