In [None]:
# Mount Google Drive
# Exposes the user's Drive under /content/drive; later cells read and write
# files below /content/drive/MyDrive. Relies on the `google.colab` package,
# so this cell is expected to run inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Fetch the NLTK data packages and the spaCy English model
import nltk

# 'punkt' is tokenizer data, 'stopwords' is a corpus of stop-word lists
for nltk_resource in ('punkt', 'stopwords'):
    nltk.download(nltk_resource)

import spacy

# Small English pipeline; the installer may ask for a runtime restart before
# the package can be loaded.
spacy.cli.download("en_core_web_sm")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
# Import libraries
!pip install nltk spacy datasketch unidecode simhash

import pandas as pd
import re
import numpy as np
from unidecode import unidecode
from simhash import Simhash
from datasketch import MinHash, MinHashLSH

# Load spaCy
# Loads the "en_core_web_sm" pipeline into the module-level name `nlp`.
# NOTE(review): `nlp` is not referenced by any later cell visible here —
# confirm it is still needed before paying the load cost.
import spacy
nlp = spacy.load("en_core_web_sm")


Collecting datasketch
  Downloading datasketch-1.6.5-py3-none-any.whl.metadata (5.8 kB)
Collecting unidecode
  Downloading Unidecode-1.4.0-py3-none-any.whl.metadata (13 kB)
Collecting simhash
  Downloading simhash-2.1.2-py3-none-any.whl.metadata (382 bytes)
Downloading datasketch-1.6.5-py3-none-any.whl (89 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.2/89.2 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading Unidecode-1.4.0-py3-none-any.whl (235 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.8/235.8 kB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading simhash-2.1.2-py3-none-any.whl (4.7 kB)
Installing collected packages: unidecode, simhash, datasketch
Successfully installed datasketch-1.6.5 simhash-2.1.2 unidecode-1.4.0


In [None]:
# Load Data from Gdrive
# Location of the raw recipe CSV inside the mounted Drive.
file_path = "/content/drive/MyDrive/NLP - Group Project/recipe_dataset.csv"

# Read the whole file into a DataFrame and report its (rows, columns) shape
df = pd.read_csv(filepath_or_buffer=file_path)
print("Original shape:", df.shape)

FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/NLP - Group Project/recipe_dataset.csv'

In [None]:
# Sample data
# Work on a reproducible random subset of at most SAMPLE_SIZE rows.
SAMPLE_SIZE = 1_000_000

# DataFrame.sample draws without replacement by default and raises ValueError
# when asked for more rows than the frame holds, so cap the request at len(df).
# For frames with at least SAMPLE_SIZE rows the result is unchanged.
df_sample = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=42).copy()
print("Sample shape:", df_sample.shape)

Sample shape: (1000000, 7)


In [None]:
# Fill missing value
# Both free-text columns get the same treatment: missing entries become ""
# and every value is cast to str, so the cleaning step only ever sees strings.
for text_column in ('ingredients', 'directions'):
    df_sample[text_column] = df_sample[text_column].fillna("").astype(str)

In [None]:
# Text cleaning
def simple_clean(text):
    """Normalise raw recipe text to lowercase ASCII letters separated by single spaces.

    Steps: lowercase, transliterate to ASCII with ``unidecode``, drop
    apostrophes, turn every other non-letter character into a space, then
    collapse runs of whitespace.

    Args:
        text: Raw string to clean.

    Returns:
        The cleaned string; ``""`` when no letters remain.
    """
    text = unidecode(text.lower())
    # Apostrophes are removed outright so possessives stay a single token
    # ("hellmann's" -> "hellmanns").
    text = text.replace("'", "")
    # Every other non-letter becomes a space instead of vanishing: deleting
    # punctuation glued neighbouring words together
    # ("1/4-inch-thick" -> "inchthick", '"a","b"' -> "ab").
    text = re.sub(r'[^a-z\s]', ' ', text)
    return re.sub(r'\s+', ' ', text).strip()

# Clean each text column once, then reuse the cleaned Series for the combined field
cleaned_ingredients = df_sample['ingredients'].apply(simple_clean)
cleaned_directions = df_sample['directions'].apply(simple_clean)

df_sample['clean_ingredients'] = cleaned_ingredients
df_sample['clean_directions'] = cleaned_directions
# Ingredients first, then directions, separated by one space
df_sample['clean_text'] = cleaned_ingredients + ' ' + cleaned_directions

SimHash Deduplication

In [None]:
def simhash_text(text, max_words=100):
    """Return the ``Simhash`` value computed from the leading words of ``text``.

    Only the first ``max_words`` whitespace-separated tokens are used; they are
    re-joined with single spaces before being passed to ``Simhash``.
    """
    truncated = ' '.join(text.split()[:max_words])
    fingerprint = Simhash(truncated)
    return fingerprint.value

# Fingerprint every recipe, then keep only the first row seen for each
# distinct fingerprint and renumber the index from 0.
df_sample = (
    df_sample
    .assign(simhash=df_sample['clean_text'].apply(simhash_text))
    .drop_duplicates(subset='simhash')
    .reset_index(drop=True)
)
print("After SimHash Deduplication Deduplication:", df_sample.shape)


After SimHash Deduplication Deduplication: (999826, 11)


MinHash Deduplication

In [None]:
def minhash_signature(text, num_perm=128, max_words=100):
    """Build a MinHash sketch from the leading words of ``text``.

    Args:
        text: String that is split on whitespace into word tokens.
        num_perm: Number of permutations passed to ``MinHash``.
        max_words: Only the first ``max_words`` tokens are hashed. Defaults to
            100, the same cap ``simhash_text`` uses; ``None`` hashes every token.

    Returns:
        The ``MinHash`` object after it has been updated with each selected
        token, encoded as UTF-8 bytes.
    """
    m = MinHash(num_perm=num_perm)
    for word in text.split()[:max_words]:
        m.update(word.encode('utf8'))
    return m

# LSH index over the MinHash sketches; rows whose sketches collide at the
# 0.9 threshold are treated as near-duplicates.
lsh = MinHashLSH(threshold=0.9, num_perm=128)
minhashes = {}

# Pass 1: sketch and index every recipe. Iterating the single text column
# avoids materialising a full row Series per record as iterrows() does.
for i, text in df_sample['clean_text'].items():
    m = minhash_signature(text)
    minhashes[i] = m
    lsh.insert(i, m)

# Pass 2: keep the first-seen row of each near-duplicate group, drop the rest.
to_drop = set()
for i, m in minhashes.items():
    # A row already marked as a duplicate must not act as a representative.
    # Without this guard, for a pair (a, b) the query on a drops b and the
    # query on b drops a, so the whole group disappears instead of one copy.
    if i in to_drop:
        continue
    for r in lsh.query(m):
        if r != i:
            to_drop.add(r)

df_sample = df_sample.drop(index=sorted(to_drop)).reset_index(drop=True)
print("After MinHash Deduplication:", df_sample.shape)

After MinHash Deduplication: (996675, 11)


Save to CSV

In [None]:
# Persist the frame left after both the SimHash and MinHash passes to Drive
output_path = "/content/drive/MyDrive/NLP - Group Project/preprocessed_recipes.csv"

# index=False keeps the row index out of the written file
df_sample.to_csv(path_or_buf=output_path, index=False)
print("✅ Saved cleaned dataset to:", output_path)

✅ Saved cleaned dataset to: /content/drive/MyDrive/NLP - Group Project/preprocessed_recipes.csv


In [None]:
# Preview the first 10 rows of the deduplicated frame as the cell's output
df_sample.head(10)

Unnamed: 0.1,Unnamed: 0,title,ingredients,directions,link,source,NER,clean_ingredients,clean_directions,clean_text,simhash
0,2015528,Marinated Flank Steak Recipe,"[""1 1/2 pound flank steak"", ""1/2 c. finely min...","[""Remove tenderloin from steak."", ""Score meat....",cookeatshare.com/recipes/marinated-flank-steak...,Recipes1M,"[""flank steak"", ""green onions"", ""red wine"", ""s...",pound flank steak c finely minced green onions...,remove tenderloin from steak score meat combin...,pound flank steak c finely minced green onions...,2984177662168074987
1,1608734,French Chicken Stew,"[""1 tablespoon rosemary"", ""1 teaspoon thyme"", ...","[""combine all ingredients in slow cooker (6 qu...",www.yummly.com/recipe/French-Chicken-Stew-1433580,Gathered,"[""rosemary"", ""thyme"", ""bay leaves"", ""paprika"",...",tablespoon rosemary teaspoon thyme bay leaves ...,combine all ingredients in slow cooker quarts ...,tablespoon rosemary teaspoon thyme bay leaves ...,16962350100183742163
2,778500,Glazed Carrots,"[""3 to 4 carrots"", ""1 1/2 Tbsp. butter"", ""1/3 ...","[""Cook 3 to 4 carrots; cut crosswise in 1-inch...",www.cookbooks.com/Recipe-Details.aspx?id=1011892,Gathered,"[""carrots"", ""butter"", ""brown sugar"", ""lemon ri...",to carrots tbsp butter c brown sugar grated le...,cook to carrots cut crosswise in inch pieces a...,to carrots tbsp butter c brown sugar grated le...,13631088686507211408
3,1334975,Moms Pie Dough,"[""4.5 Cups Flour"", ""1.5 Tsp Salt"", ""Pinch Baki...","[""Mix all dry ingredients in a bowl."", """", ""Ad...",www.epicurious.com/recipes/member/views/moms-p...,Gathered,"[""Flour"", ""Salt"", ""Baking Powder"", ""Sugar"", ""C...",cups flour tsp salt pinch baking powder tbls s...,mix all dry ingredients in a bowl add crisco a...,cups flour tsp salt pinch baking powder tbls s...,9758474554084259533
4,116562,Pretzel Salad Or Dessert,"[""2 c. crushed small thin pretzels (sticks)"", ...","[""Mix and press in baking pan, approximately 1...",www.cookbooks.com/Recipe-Details.aspx?id=106723,Gathered,"[""thin pretzels"", ""margarine""]",c crushed small thin pretzels sticks c margarine,mix and press in baking pan approximately x in...,c crushed small thin pretzels sticks c margari...,12570825253960956135
5,1712896,Citrus Syrup,"[""3/4 cup sugar"", ""1/2 cup fresh orange juice""...","[""In a 1 1/2-quart saucepan stir together suga...",www.epicurious.com/recipes/food/views/citrus-s...,Recipes1M,"[""sugar"", ""orange juice"", ""lemon juice""]",cup sugar cup fresh orange juice cup fresh lem...,in a quart saucepan stir together sugar and ju...,cup sugar cup fresh orange juice cup fresh lem...,3144162169882007132
6,1306450,Cranberry And Candied Orange Chutney,"[""1 large navel orange with skin"", ""7 cups wat...","[""Cut orange into 1/4-inch-thick rounds; cut r...",www.epicurious.com/recipes/food/views/cranberr...,Gathered,"[""orange with skin"", ""water"", ""sugar"", ""cinnam...",large navel orange with skin cups water divide...,cut orange into inchthick rounds cut rounds in...,large navel orange with skin cups water divide...,16925774012631188822
7,1345812,Tau Kua He Ci Medan'S Favourite Food,"[""1 slices Gravy ingredients (A) - onion"", ""3 ...","[""The condiments:"", ""- Large prawns, fried in ...",www.epicurious.com/recipes/member/views/tau-ku...,Gathered,"[""Gravy ingredients"", ""garlic"", ""Gravy ingredi...",slices gravy ingredients a onion cloves gravy ...,the condiments large prawns fried in batter sm...,slices gravy ingredients a onion cloves gravy ...,3885540940456866890
8,692271,Jamaica Barbecue Sauce,"[""1 1/2 c. cider vinegar"", ""4 tsp. lemon juice...","[""Mix ingredients well."", ""Pour into jar."", ""K...",www.cookbooks.com/Recipe-Details.aspx?id=470060,Gathered,"[""cider vinegar"", ""lemon juice"", ""Worcestershi...",c cider vinegar tsp lemon juice tsp worcesters...,mix ingredients well pour into jar keep in ref...,c cider vinegar tsp lemon juice tsp worcesters...,1064850389558571562
9,633422,Dill Dip,"[""2/3 c. sour cream"", ""2/3 c. Hellmann's mayo""...","[""Mix all ingredients together and chill overn...",www.cookbooks.com/Recipe-Details.aspx?id=109385,Gathered,"[""sour cream"", ""mayo"", ""parsley flakes"", ""onio...",c sour cream c hellmanns mayo tbsp parsley fla...,mix all ingredients together and chill overnig...,c sour cream c hellmanns mayo tbsp parsley fla...,13158171514425549016
