In [1]:
import pandas as pd
import re
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
import numpy as np
# Load the raw recipe CSV into a pandas DataFrame (path is relative to the working directory)
df = pd.read_csv('../datasets/IndianFoodDatasetCSV.csv')


# NOTE(review): nothing is displayed here; end the cell with `df.head()` to preview the first rows


In [2]:
# Registry of low-frequency words: low_freq_filter() stores each dropped word as a key (value 1),
# and the per-recipe filtering loop later in the notebook looks words up here to exclude them.
removed = {}
# Measurement units and filler words that carry no information about the ingredient itself.
_NOISE_WORDS = (
    "teaspoons", "cups", "powder", "tablespoons", "tablespoon", "grams", "pieces", "few",
    "g", "as", "make", "use", "grind", "per", "taste", "to", "or", "of", "cup", "inch",
    "teaspoon", "gram", "tsp", "tbsp", "ml", "kg", "ssp", "gm", "pinch", "handful",
)

# Everything that is neither an ASCII letter nor whitespace (digits, punctuation, fractions...).
_NON_ALPHA_RE = re.compile(r"[^a-zA-Z\s]")
# One alternation bounded by \b so only whole words are removed ("g" never eats the g in "ginger").
_NOISE_WORDS_RE = re.compile(r"\b(?:" + "|".join(_NOISE_WORDS) + r")\b")
_WHITESPACE_RE = re.compile(r"\s+")


def string_cleaner(string: str):
    """Normalise a raw ingredient fragment to a lowercase ingredient name.

    Steps:
        1. drop every character that is not an ASCII letter or whitespace,
        2. lowercase the text,
        3. remove unit / filler words (whole-word matches only),
        4. collapse runs of whitespace to a single space and trim the ends.

    Step 4 matters because removing a word leaves its surrounding spaces behind;
    without it "a pinch of salt" would become "a   salt" and count as a different
    ingredient than "a salt".

    Args:
        string: raw ingredient text, e.g. "2 tablespoons Oil".

    Returns:
        The cleaned name; an empty string when nothing but units/noise was present.
    """
    letters_only = _NON_ALPHA_RE.sub("", string).lower()
    without_noise = _NOISE_WORDS_RE.sub("", letters_only)
    return _WHITESPACE_RE.sub(" ", without_noise).strip()

def low_freq_filter(word_map, min_count=15, removed_words=None):
    """Drop rarely used words from ``word_map`` and record them as removed.

    Args:
        word_map: dict mapping word -> occurrence count. It is modified in place.
        min_count: words whose count is strictly below this value are dropped.
        removed_words: dict collecting dropped words as keys (value 1). When omitted,
            the notebook-level ``removed`` registry is used.

    Returns:
        The same ``word_map`` object, without any word present in the registry.
    """
    if removed_words is None:
        removed_words = removed

    for word, frequency in word_map.items():
        if frequency < min_count:
            removed_words[word] = 1

    # The registry can already hold words from an earlier call that are absent from
    # this map; pop with a default so calling the function again does not raise KeyError.
    for word in removed_words:
        word_map.pop(word, None)
    return word_map

def filter_ingredients(dataframe):
    """Count cleaned ingredient names over the whole frame and drop the rare ones.

    Every ``TranslatedIngredients`` value is split on commas. Each piece is split on
    ' - '; its first segment is cleaned with string_cleaner, and the second segment
    as well when the piece has more than two segments. Names that clean to an empty
    string are ignored. The resulting word -> count dict is passed through
    low_freq_filter and returned.
    """
    frequencies = {}

    def tally(raw_text):
        # Clean one segment and count it unless nothing is left after cleaning.
        name = string_cleaner(raw_text)
        if name != '':
            frequencies[name] = frequencies.get(name, 0) + 1

    for entry in dataframe.TranslatedIngredients.str.split(','):
        # Rows where the split yields a float instead of a list have no ingredients.
        if isinstance(entry, float):
            continue

        for piece in entry:
            segments = piece.strip().split(' - ')
            tally(segments[0])
            if len(segments) > 2:
                tally(segments[1])

    return low_freq_filter(frequencies)

# Build the frequency table for the whole dataset, then rank (name, count) pairs
# from the most to the least frequent ingredient.
filtered_ingredients = filter_ingredients(df)
sorted_ingredients = sorted(
    filtered_ingredients.items(),
    key=lambda pair: pair[1],
    reverse=True,
)

# print(sorted_ingredients)
# l = []
# for name, freq in sorted_ingredients:
#     l.append({"name": name, "frequency": freq})
# print(removed)
# for item in l:
#     print(item, end=",\n")



In [3]:
# One list of cleaned ingredient names per row of df, in row order.
ingredients = []

for ings in df.TranslatedIngredients.str.split(','):

    # Ingredients missing - add an empty list so `ingredients` stays aligned with df's rows
    if isinstance(ings, float):
        ingredients.append([])
        continue

    words = []
    for ing in ings:
        full_ing = ing.strip().split(' - ')

        # Same rule as filter_ingredients: always the first segment, plus the second
        # one when the piece has more than two ' - ' separated segments.
        candidates = full_ing[:2] if len(full_ing) > 2 else full_ing[:1]
        for candidate in candidates:
            cleaned = string_cleaner(candidate)
            if cleaned:
                words.append(cleaned)

    # Ingredients recorded in `removed` (low frequency) are dropped from the recipe;
    # they are NOT replaced by an "other" category.
    ingredients.append([word for word in words if word not in removed])

In [4]:
# Attach the per-recipe ingredient lists as a new column; `ingredients` holds one list
# per row of df (an empty list where the ingredients were missing).
df['FilteredIngredients'] = ingredients

In [5]:
# remove unnecessary columns (df itself is left untouched; drop returns a new frame)
new_df = df.drop(columns=['Srno', 'RecipeName', 'Instructions', 'Ingredients'])


# keep only the rows whose filtered ingredient list is not empty
new_df = new_df[new_df['FilteredIngredients'].apply(len) > 0]

# Problematic Row
new_df = new_df[new_df['TranslatedRecipeName'] != 'Shahi Vegetable Pulao Recipe - Shahi Vegetable Pulao']


# rename columns: the translated fields take over the names freed by the drop above
new_df = new_df.rename(columns={
    'TranslatedRecipeName': 'RecipeName',
    'TranslatedIngredients': 'FullIngredients',
    'TranslatedInstructions': 'Instructions',
    'FilteredIngredients': 'Ingredients',
})

# reposition columns
new_df = new_df.reindex(columns=[
    'RecipeName', 'Ingredients', 'FullIngredients', 'PrepTimeInMins',
    'CookTimeInMins', 'TotalTimeInMins', 'Servings', 'Cuisine', 'Course',
    'Diet', 'Instructions', 'URL',
])

In [6]:
# Inspect the recipe page URLs of the rows that survived the filtering above
new_df['URL']

0       https://www.archanaskitchen.com/masala-karela-...
1       http://www.archanaskitchen.com/spicy-tomato-ri...
2       http://www.archanaskitchen.com/ragi-vermicelli...
3       http://www.archanaskitchen.com/gongura-chicken...
4       https://www.archanaskitchen.com/andhra-style-a...
                              ...                        
6863    https://www.archanaskitchen.com/saffron-paneer...
6864    http://www.archanaskitchen.com/italian-arancin...
6865    https://www.archanaskitchen.com/quinoa-phirnee...
6868    https://www.archanaskitchen.com/ullikadala-pul...
6869    http://www.archanaskitchen.com/kashmiri-kokur-...
Name: URL, Length: 6304, dtype: object

In [14]:
"""16 Minute runtime - fetches image urls"""


# import requests
# from bs4 import BeautifulSoup

# urls = []
# count = 0
# for url in new_df['URL']:
#     if count % 500 == 0: 
#         print(count)
#     count+=1
#     try:
#         # Send a GET request to the webpage URL
#         response = requests.get(url)

#         # Parse the HTML content using BeautifulSoup
#         soup = BeautifulSoup(response.content, 'html.parser')

#         # Find the image tag and extract the source URL
#         image_tag = soup.find('img', {'class': 'img-fluid img-thumbnail'})
#         if image_tag is not None:
#             image_url = image_tag['src']
#             image_url = "http://www.archanaskitchen.com/" + image_url
#             urls.append(image_url)
#         else:
#             urls.append(None)
#     except:
#         urls.append(None)

In [10]:
# NOTE(review): `urls` is only created by the image-scraping cell, which is currently
# commented out, so this line raises NameError on a fresh kernel (Restart & Run All).
# It expects one entry per row of new_df, in row order.
new_df['Image_URL'] = urls

In [12]:
# Dumps every image URL as one Python list.
# NOTE(review): this prints the whole column; `new_df['Image_URL'].head()` would be enough for a sanity check.
print(new_df['Image_URL'].tolist())

['http://www.archanaskitchen.com//images/archanaskitchen/1-Author/Pooja_Thakur/Karela_Masala_Recipe-4_1600.jpg', 'http://www.archanaskitchen.com//images/archanaskitchen/1-Author/b.yojana-gmail.com/Spicy_Thakkali_Rice_Tomato_Pulihora-1_edited.jpg', 'http://www.archanaskitchen.com//images/archanaskitchen/1-Author/Monika_Manchanda/Ragi_vermicilli.jpg', 'http://www.archanaskitchen.com//images/archanaskitchen/Ghongura_Chicken_Curry_Recipe-2_1600.jpg', 'http://www.archanaskitchen.com//images/archanaskitchen/1-Author/sibyl_sunitha/Andhra_Style_Allam_Pachadi_Ginger_Chutney_Recipe_.jpg', 'http://www.archanaskitchen.com//images/archanaskitchen/1-Author/nithya.anantham/Mint_Khara_Pongal_Recipe.jpg', 'http://www.archanaskitchen.com//images/archanaskitchen/1-Author/sibyl_sunitha/Udupi_Style_Ash_Gourd_Coconut_Curry_Recipe_.jpg', 'http://www.archanaskitchen.com//images/archanaskitchen/1-Author/Madhuri_Aggarwal/Black_Bean_Burrito.jpg', 'http://www.archanaskitchen.com//images/archanaskitchen/Guest_Writ

In [13]:
# new_df.to_csv('../datasets/data.csv', index=False)

In [7]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder


# One-hot encode each categorical column and replace it with its indicator columns.
for col in ['Diet', 'Course', 'Cuisine']:

    # initialize OneHotEncoder
    encoder = OneHotEncoder()

    # fit and transform the current categorical column into a dense indicator matrix
    onehot = encoder.fit_transform(new_df[[col]]).toarray()

    # Label the indicator columns with the encoder's own category order, so the names
    # always line up with the matrix columns (rebuilding the order with sorted(set(...))
    # breaks as soon as the column holds values that cannot be compared, e.g. NaN).
    cols = list(encoder.categories_[0])
    new_df[cols] = onehot

    # drop the original categorical column now that it is encoded
    new_df = new_df.drop(col, axis=1)

# Frame with the remaining original columns plus the Diet/Course/Cuisine indicators
part1_df = new_df



In [8]:
# One-hot encoding the ingredients

from sklearn.preprocessing import MultiLabelBinarizer

# Encode the per-recipe ingredient lists as an indicator matrix: one column per
# distinct ingredient, rows aligned with new_df through its index.
mlb = MultiLabelBinarizer()
ingredient_matrix = mlb.fit_transform(new_df['Ingredients'])
part2_df = pd.DataFrame(ingredient_matrix, index=new_df.index, columns=mlb.classes_)


In [9]:
# Put the categorical/numeric frame and the ingredient indicator frame side by side;
# rows are matched on the shared index.
final_df = pd.concat(objs=[part1_df, part2_df], axis='columns', join='outer')
final_df.shape


(6304, 527)

In [10]:
# Drop the columns that are not used as numeric features (free text, identifiers and the
# prep/cook times; only the total time is kept).
final_df = final_df.drop(columns=[
    'RecipeName', 'Ingredients', 'FullIngredients', 'PrepTimeInMins',
    'CookTimeInMins', 'Instructions', 'URL',
])

# 'Image_URL' is only present when the image-URL cell ran before the encoding cells;
# drop it when it is there so the encoded matrix does not depend on execution order.
final_df = final_df.drop(columns=['Image_URL'], errors='ignore')


# Scale serving size and total time to the [0, 1] range (each column fitted independently)
for col in ['Servings', 'TotalTimeInMins']:

    final_df[col] = MinMaxScaler().fit_transform(np.array(final_df[col]).reshape(-1,1))

print(final_df.shape)



(6304, 520)


In [11]:
# Sanity check of the final (rows, columns) shape; repeats the print at the end of the previous cell
print(final_df.shape)

(6304, 520)


In [12]:
# Persist the encoded feature matrix; index=False leaves the DataFrame index out of the file.
# NOTE(review): RecipeName was dropped earlier and the index is not written, so rows in
# encoded.csv can only be matched back to recipes by position - confirm that is intended.
final_df.to_csv('../datasets/encoded.csv', index=False)