## Import libraries 

In [39]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import Normalizer as norm, StandardScaler as std, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

In [40]:
df = pd.read_csv("data/indian_food.csv")
df.head(10)

Unnamed: 0,name,ingredients,diet,prep_time,cook_time,flavor_profile,course,state,region
0,Balu shahi,"Maida flour, yogurt, oil, sugar",vegetarian,45,25,sweet,dessert,West Bengal,East
1,Boondi,"Gram flour, ghee, sugar",vegetarian,80,30,sweet,dessert,Rajasthan,West
2,Gajar ka halwa,"Carrots, milk, sugar, ghee, cashews, raisins",vegetarian,15,60,sweet,dessert,Punjab,North
3,Ghevar,"Flour, ghee, kewra, milk, clarified butter, su...",vegetarian,15,30,sweet,dessert,Rajasthan,West
4,Gulab jamun,"Milk powder, plain flour, baking powder, ghee,...",vegetarian,15,40,sweet,dessert,West Bengal,East
5,Imarti,"Sugar syrup, lentil flour",vegetarian,10,50,sweet,dessert,West Bengal,East
6,Jalebi,"Maida, corn flour, baking soda, vinegar, curd,...",vegetarian,10,50,sweet,dessert,Uttar Pradesh,North
7,Kaju katli,"Cashews, ghee, cardamom, sugar",vegetarian,10,20,sweet,dessert,-1,-1
8,Kalakand,"Milk, cottage cheese, sugar",vegetarian,20,30,sweet,dessert,West Bengal,East
9,Kheer,"Milk, rice, sugar, dried fruits",vegetarian,10,40,sweet,dessert,-1,-1


## check null values

In [41]:
df.isna().sum()

name              0
ingredients       0
diet              0
prep_time         0
cook_time         0
flavor_profile    0
course            0
state             0
region            1
dtype: int64

Removing the row where null is present in region because it wont make sense if we fill it with mode or median or mean 

In [42]:
df = df.dropna()
df.isna().sum()

name              0
ingredients       0
diet              0
prep_time         0
cook_time         0
flavor_profile    0
course            0
state             0
region            0
dtype: int64

We have -1 in many places instead of null so we should clean it

In [43]:
#find all columns with value -1
df.columns[df.isin([-1]).any()]

Index(['prep_time', 'cook_time'], dtype='object')

In [44]:
#replace all -1 with mean of the column
df["prep_time"] = df["prep_time"].replace(-1, df["prep_time"].mean())
df["cook_time"] = df["cook_time"].replace(-1, df["cook_time"].mean())

In [45]:
df.columns[df.isin([-1]).any()]

Index([], dtype='object')

In [46]:
df.columns[df.isin(["-1"]).any()]

Index(['flavor_profile', 'state', 'region'], dtype='object')

In [47]:
df["state"] = df["state"].replace("-1","unknown")
df["region"] = df["region"].replace("-1","unknown")

In [48]:
df.columns[df.isin(["-1"]).any()]

Index(['flavor_profile'], dtype='object')

In [49]:
items = df.loc[df["flavor_profile"]=="-1"]["name"].unique()
items

array(['Chapati', 'Naan', 'Rongi', 'Kanji', 'Pachadi', 'Paniyaram',
       'Paruppu sadam', 'Puli sadam', 'Puttu', 'Sandige', 'Sevai',
       'Thayir sadam', 'Theeyal', 'Bhakri', 'Copra paak', 'Dahi vada',
       'Dalithoy', 'Kansar', 'Farsi Puri', 'Khar', 'Luchi',
       'Bengena Pitika', 'Bilahi Maas', 'Black rice', 'Brown Rice',
       'Chingri Bhape', 'Pakhala', 'Pani Pitha', 'Red Rice'], dtype=object)

In [50]:
df["flavor_profile"].unique()

array(['sweet', 'spicy', 'bitter', '-1', 'sour'], dtype=object)

### Generating code for replacing flavour_profile with

In [51]:
s=""
template = "df.loc[df['name']=='{}','flavor_profile'] = df.loc[df['name']=='{}','flavor_profile'].replace('-1','')"
for i in items:
    s+=template.format(i,i)+"\n"
print(s)


df.loc[df['name']=='Chapati','flavor_profile'] = df.loc[df['name']=='Chapati','flavor_profile'].replace('-1','')
df.loc[df['name']=='Naan','flavor_profile'] = df.loc[df['name']=='Naan','flavor_profile'].replace('-1','')
df.loc[df['name']=='Rongi','flavor_profile'] = df.loc[df['name']=='Rongi','flavor_profile'].replace('-1','')
df.loc[df['name']=='Kanji','flavor_profile'] = df.loc[df['name']=='Kanji','flavor_profile'].replace('-1','')
df.loc[df['name']=='Pachadi','flavor_profile'] = df.loc[df['name']=='Pachadi','flavor_profile'].replace('-1','')
df.loc[df['name']=='Paniyaram','flavor_profile'] = df.loc[df['name']=='Paniyaram','flavor_profile'].replace('-1','')
df.loc[df['name']=='Paruppu sadam','flavor_profile'] = df.loc[df['name']=='Paruppu sadam','flavor_profile'].replace('-1','')
df.loc[df['name']=='Puli sadam','flavor_profile'] = df.loc[df['name']=='Puli sadam','flavor_profile'].replace('-1','')
df.loc[df['name']=='Puttu','flavor_profile'] = df.loc[df['name']=='Puttu','flavor_profil

In [52]:
df.loc[df['name']=='Chapati', 'flavor_profile'] = df.loc[df['name']=='Chapati', 'flavor_profile'].replace('-1','')
df.loc[df['name']=='Naan', 'flavor_profile'] = df.loc[df['name']=='Naan', 'flavor_profile'].replace('-1','')
df.loc[df['name']=='Rongi', 'flavor_profile'] = df.loc[df['name']=='Rongi', 'flavor_profile'].replace('-1','spicy')
df.loc[df['name']=='Kanji', 'flavor_profile'] = df.loc[df['name']=='Kanji', 'flavor_profile'].replace('-1','sour')
df.loc[df['name']=='Pachadi', 'flavor_profile'] = df.loc[df['name']=='Pachadi', 'flavor_profile'].replace('-1','sweet')
df.loc[df['name']=='Paniyaram', 'flavor_profile'] = df.loc[df['name']=='Paniyaram', 'flavor_profile'].replace('-1','')
df.loc[df['name']=='Paruppu sadam', 'flavor_profile'] = df.loc[df['name']=='Paruppu sadam', 'flavor_profile'].replace('-1','spicy')
df.loc[df['name']=='Puli sadam', 'flavor_profile'] = df.loc[df['name']=='Puli sadam', 'flavor_profile'].replace('-1','sour')
df.loc[df['name']=='Puttu', 'flavor_profile'] = df.loc[df['name']=='Puttu', 'flavor_profile'].replace('-1','spicy')
df.loc[df['name']=='Sandige', 'flavor_profile'] = df.loc[df['name']=='Sandige', 'flavor_profile'].replace('-1','')
df.loc[df['name']=='Sevai', 'flavor_profile'] = df.loc[df['name']=='Sevai', 'flavor_profile'].replace('-1','spicy')
df.loc[df['name']=='Thayir sadam', 'flavor_profile'] = df.loc[df['name']=='Thayir sadam', 'flavor_profile'].replace('-1','')
df.loc[df['name']=='Theeyal', 'flavor_profile'] = df.loc[df['name']=='Theeyal', 'flavor_profile'].replace('-1','spicy')
df.loc[df['name']=='Bhakri', 'flavor_profile'] = df.loc[df['name']=='Bhakri', 'flavor_profile'].replace('-1','')
df.loc[df['name']=='Copra paak', 'flavor_profile'] = df.loc[df['name']=='Copra paak', 'flavor_profile'].replace('-1','sweet')
df.loc[df['name']=='Dahi vada', 'flavor_profile'] = df.loc[df['name']=='Dahi vada', 'flavor_profile'].replace('-1','')
df.loc[df['name']=='Dalithoy', 'flavor_profile'] = df.loc[df['name']=='Dalithoy', 'flavor_profile'].replace('-1','spicy')
df.loc[df['name']=='Kansar', 'flavor_profile'] = df.loc[df['name']=='Kansar', 'flavor_profile'].replace('-1','sweet')
df.loc[df['name']=='Farsi Puri', 'flavor_profile'] = df.loc[df['name']=='Farsi Puri', 'flavor_profile'].replace('-1','')
df.loc[df['name']=='Khar', 'flavor_profile'] = df.loc[df['name']=='Khar', 'flavor_profile'].replace('-1','sweet')
df.loc[df['name']=='Luchi', 'flavor_profile'] = df.loc[df['name']=='Luchi', 'flavor_profile'].replace('-1','')
df.loc[df['name']=='Bengena Pitika', 'flavor_profile'] = df.loc[df['name']=='Bengena Pitika', 'flavor_profile'].replace('-1','spicy')
df.loc[df['name']=='Bilahi Maas', 'flavor_profile'] = df.loc[df['name']=='Bilahi Maas', 'flavor_profile'].replace('-1','sour')
df.loc[df['name']=='Black rice', 'flavor_profile'] = df.loc[df['name']=='Black rice', 'flavor_profile'].replace('-1','')
df.loc[df['name']=='Brown Rice', 'flavor_profile'] = df.loc[df['name']=='Brown Rice', 'flavor_profile'].replace('-1','')
df.loc[df['name']=='Chingri Bhape', 'flavor_profile'] = df.loc[df['name']=='Chingri Bhape', 'flavor_profile'].replace('-1','spicy')
df.loc[df['name']=='Pakhala', 'flavor_profile'] = df.loc[df['name']=='Pakhala', 'flavor_profile'].replace('-1','')
df.loc[df['name']=='Pani Pitha', 'flavor_profile'] = df.loc[df['name']=='Pani Pitha', 'flavor_profile'].replace('-1','sweet')
df.loc[df['name']=='Red Rice', 'flavor_profile'] = df.loc[df['name']=='Red Rice', 'flavor_profile'].replace('-1','')



In [53]:
df.isin(["-1"]).any()

name              False
ingredients       False
diet              False
prep_time         False
cook_time         False
flavor_profile    False
course            False
state             False
region            False
dtype: bool

In [54]:
df.dtypes

name               object
ingredients        object
diet               object
prep_time         float64
cook_time         float64
flavor_profile     object
course             object
state              object
region             object
dtype: object

In [55]:
Y = df["flavor_profile"]
X = df.drop(["flavor_profile"], axis=1)

In [57]:
from sklearn.compose import ColumnTransformer

le = LabelEncoder()
Y = le.fit_transform(Y)

col = [0,1,2,5,6,7]
ct = ColumnTransformer(transformers = [("enc", OneHotEncoder(), [0,1,2,5,6,7])], remainder = "passthrough")
X_enc = ct.fit_transform(X)

X_enc = pd.DataFrame(X_enc.toarray())

enc_col_names = ct.named_transformers_["enc"].get_feature_names_out()
col_names = list(enc_col_names) + list(X.columns.drop(X.columns[col]))
X_enc.columns = col_names


In [58]:
X_enc

Unnamed: 0,name_Adhirasam,name_Aloo gobi,name_Aloo matar,name_Aloo methi,name_Aloo shimla mirch,name_Aloo tikki,name_Alu Pitika,name_Amti,name_Anarsa,name_Ariselu,...,state_unknown,region_Central,region_East,region_North,region_North East,region_South,region_West,region_unknown,prep_time,cook_time
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,45.000000,25.000000
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,80.000000,30.000000
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,15.000000,60.000000
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,15.000000,30.000000
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,15.000000,40.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
249,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,5.000000,30.000000
250,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,20.000000,60.000000
251,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,31.188976,34.566929
252,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,20.000000,45.000000


In [25]:
Y

array([4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
       4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
       4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3,
       4, 3, 3, 3, 3, 1, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 3, 4, 3, 3,
       3, 3, 3, 3, 3, 1, 3, 3, 4, 3, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 0, 3, 3, 4, 3, 3, 3, 2, 3,
       3, 3, 0, 3, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 0, 3, 3, 3, 3, 4,
       3, 3, 0, 3, 3, 4, 3, 4, 3, 3, 4, 3, 4, 2, 3, 3, 3, 3, 3, 1, 4, 1,
       3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 3, 3, 4, 4,
       4, 4, 3, 0, 3, 3, 3, 2, 0, 3, 0, 3, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3,
       4, 0, 4, 4, 3, 0, 3, 4, 4, 4, 4, 4], dtype=int64)

In [26]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_enc, Y, test_size=0.2, random_state=0)

In [27]:
X_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,535,536,537,538,539,540,541,542,543,544
162,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,10.0,35.0
159,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,15.0,20.0
76,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,10.0,35.0
59,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,10.0,25.0
214,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,15.0,30.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,5.0,15.0
192,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,10.0,25.0
117,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,10.0,20.0
47,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,10.0,25.0


In [28]:
y_train

array([3, 3, 3, 4, 3, 3, 4, 3, 4, 2, 3, 4, 3, 3, 0, 4, 3, 0, 2, 4, 3, 3,
       4, 4, 3, 3, 3, 4, 3, 4, 3, 4, 4, 4, 4, 3, 3, 4, 4, 4, 3, 4, 4, 3,
       4, 4, 3, 3, 4, 3, 3, 4, 3, 3, 4, 4, 4, 0, 3, 3, 4, 4, 4, 3, 4, 4,
       4, 4, 3, 4, 3, 4, 3, 4, 3, 4, 0, 4, 3, 3, 2, 3, 3, 4, 3, 3, 3, 3,
       4, 3, 3, 1, 3, 0, 3, 4, 3, 3, 3, 4, 4, 4, 4, 3, 4, 0, 3, 0, 3, 3,
       3, 4, 4, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 4, 4, 3, 4, 4, 4, 3, 4,
       3, 3, 4, 3, 3, 4, 4, 4, 3, 4, 3, 3, 3, 4, 4, 4, 3, 3, 3, 4, 4, 3,
       3, 3, 3, 4, 3, 3, 3, 3, 4, 3, 3, 3, 4, 4, 3, 1, 3, 3, 4, 3, 3, 3,
       4, 3, 3, 3, 3, 3, 4, 0, 3, 4, 3, 3, 3, 3, 3, 4, 4, 4, 3, 4, 3, 1,
       3, 3, 3, 4, 3], dtype=int64)

In [59]:
X_enc["flavour_profile"] = Y

In [60]:
X_enc.to_csv("data/indian_foods_processed.csv", index=False)