In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
df = pd.read_csv('dataset/preprocessed_data.csv')
df.head()

Unnamed: 0,user_id,recipe_id,date,rating,review,minutes,contributor_id,submitted,tags,n_steps,...,sugar,sodium,protein,saturated_fat,carbohydrates,food_types,neg,neu,pos,compound
0,492,20636,2002-12-01,4,this worked very well and is easy. i used not ...,20,56824,2002-10-27,"30-minutes-or-less, time-to-make, course, main...",5,...,39.0,5.0,4.0,11.0,5.0,Healthy,0.0,0.598,0.402,0.8553
1,8204,64566,2005-09-02,4,very good,40,166019,2005-08-24,"60-minutes-or-less, time-to-make, main-ingredi...",10,...,40.0,37.0,78.0,4.0,10.0,Non-veg,0.0,0.238,0.762,0.4927
2,28657,64566,2005-12-22,5,better than the real,40,166019,2005-08-24,"60-minutes-or-less, time-to-make, main-ingredi...",10,...,40.0,37.0,78.0,4.0,10.0,Non-veg,0.0,0.508,0.492,0.4404
3,36365,64566,2006-09-26,5,absolutely awesome i was speechless when i tri...,40,166019,2005-08-24,"60-minutes-or-less, time-to-make, main-ingredi...",10,...,40.0,37.0,78.0,4.0,10.0,Non-veg,0.0,0.883,0.117,0.659
4,20197,64566,2007-03-09,5,these taste absolutely wonderful my son in law...,40,166019,2005-08-24,"60-minutes-or-less, time-to-make, main-ingredi...",10,...,40.0,37.0,78.0,4.0,10.0,Non-veg,0.0,0.675,0.325,0.8908


In [4]:
df = df[['minutes', 'n_steps', 'ingredients', 'n_ingredients', 'calories', 'total_fat', 'sugar', 'sodium', 'protein', 'saturated_fat', 'carbohydrates', 'food_types', 'recipe_id']]

In [5]:
df.duplicated().sum()

763871

In [6]:
df = df.drop_duplicates()

In [7]:
numerical_columns = ['minutes', 'n_steps', 'n_ingredients', 'calories', 'total_fat', 'sugar', 'sodium', 'protein', 'saturated_fat', 'carbohydrates']
category_columns = ['food_types']
text_columns = ['ingredients']

In [8]:
def scale_data(df, numerical_columns):
    scaler = MinMaxScaler()
    df[numerical_columns] = scaler.fit_transform(df[numerical_columns])
    return df

def encode_data(df, category_columns):
    encoder = LabelEncoder()
    df[category_columns] = df[category_columns].apply(encoder.fit_transform)
    return df

def scale_and_encode_data(df, numerical_columns, category_columns):
    df = scale_data(df, numerical_columns)
    df = encode_data(df, category_columns)
    return df

In [10]:
def vectorize_data(df, text_columns):
    vectorizer = TfidfVectorizer(max_features=10)
    ingredients = vectorizer.fit_transform(df['ingredients'])
    return ingredients
# vectorizer = TfidfVectorizer(max_features=10)
# ingredients = vectorizer.fit_transform(df['ingredients'])

In [11]:
ingredients = vectorize_data(df, text_columns)
ingredients

<198249x10 sparse matrix of type '<class 'numpy.float64'>'
	with 633321 stored elements in Compressed Sparse Row format>

In [13]:
df = scale_data(df, numerical_columns)
df = encode_data(df, category_columns)
df

Unnamed: 0,minutes,n_steps,ingredients,n_ingredients,calories,total_fat,sugar,sodium,protein,saturated_fat,carbohydrates,food_types,recipe_id
0,0.085106,0.051546,"devils food cake mix, vegetable oil, eggs, ree...",0.071429,0.033007,0.068323,0.082979,0.017007,0.030075,0.074830,0.113636,0,20636
1,0.170213,0.103093,"chicken tenders, flour, garlic powder, salt, g...",0.261905,0.078838,0.024845,0.085106,0.125850,0.586466,0.027211,0.227273,2,64566
8,0.085106,0.020619,"hamburger, breadcrumbs, egg, salt, black peppe...",0.214286,0.076842,0.142857,0.023404,0.068027,0.406015,0.183673,0.090909,2,92891
9,0.638298,0.144330,"lamb shoulder, salt, ground black pepper, vege...",0.357143,0.151315,0.403727,0.025532,0.115646,0.488722,0.564626,0.159091,2,94130
13,0.191489,0.175258,"cooking spray, onion, green peppers, garlic cl...",0.357143,0.061873,0.024845,0.059574,0.071429,0.330827,0.020408,0.250000,0,105129
...,...,...,...,...,...,...,...,...,...,...,...,...,...
962097,0.055319,0.134021,"water, coffee, ground cardamom, whole cloves",0.071429,0.004067,0.000000,0.000000,0.000000,0.000000,0.000000,0.022727,3,193314
962099,0.255319,0.185567,"olive oil, onion, ground cumin, chili powder, ...",0.452381,0.066638,0.037267,0.051064,0.085034,0.097744,0.020408,0.409091,0,124429
962102,0.297872,0.092784,"whole chicken, cider vinegar, crushed red pepp...",0.095238,0.126216,0.273292,0.040426,0.061224,0.473684,0.265306,0.181818,2,115134
962107,0.034043,0.257732,"shrimp, garlic cloves, olive oil",0.047619,0.099047,0.385093,0.000000,0.027211,0.075188,0.190476,0.022727,3,188792


In [108]:
df = df.drop('ingredients', axis=1)

In [88]:
# drop recipe_id column and food_types column
temp = df.drop(['food_types'], axis=1)
# save to a file
temp.to_csv('../dataset/temp.csv', index=False)

In [87]:
temp.head()

Unnamed: 0,minutes,n_steps,n_ingredients,calories,total_fat,sugar,sodium,protein,saturated_fat,carbohydrates
0,0.085106,0.051546,0.071429,0.033007,0.068323,0.082979,0.017007,0.030075,0.07483,0.113636
1,0.170213,0.103093,0.261905,0.078838,0.024845,0.085106,0.12585,0.586466,0.027211,0.227273
8,0.085106,0.020619,0.214286,0.076842,0.142857,0.023404,0.068027,0.406015,0.183673,0.090909
9,0.638298,0.14433,0.357143,0.151315,0.403727,0.025532,0.115646,0.488722,0.564626,0.159091
13,0.191489,0.175258,0.357143,0.061873,0.024845,0.059574,0.071429,0.330827,0.020408,0.25


In [75]:
ingredients

<198249x4071 sparse matrix of type '<class 'numpy.float64'>'
	with 3082309 stored elements in Compressed Sparse Row format>

In [109]:
# merge the ingredients and the dataframe into a numpy array
from scipy.sparse import hstack
X = hstack((df.values, ingredients))

In [110]:
X.shape

(198249, 22)

In [84]:
X.toarray()

# drop the recipe_id column from X which is the 11th column
X = np.delete(X.toarray(), 11, axis=1)

array([[0.08510638, 0.05154639, 0.07142857, ..., 0.        , 0.        ,
        0.        ],
       [0.17021277, 0.10309278, 0.26190476, ..., 0.        , 0.        ,
        0.        ],
       [0.08510638, 0.02061856, 0.21428571, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.29787234, 0.09278351, 0.0952381 , ..., 0.        , 0.        ,
        0.        ],
       [0.03404255, 0.25773196, 0.04761905, ..., 0.        , 0.        ,
        0.        ],
       [0.0893617 , 0.09278351, 0.16666667, ..., 0.        , 0.        ,
        0.        ]])

In [83]:
df

Unnamed: 0,minutes,n_steps,n_ingredients,calories,total_fat,sugar,sodium,protein,saturated_fat,carbohydrates,food_types,recipe_id
0,0.085106,0.051546,0.071429,0.033007,0.068323,0.082979,0.017007,0.030075,0.074830,0.113636,0,20636
1,0.170213,0.103093,0.261905,0.078838,0.024845,0.085106,0.125850,0.586466,0.027211,0.227273,2,64566
8,0.085106,0.020619,0.214286,0.076842,0.142857,0.023404,0.068027,0.406015,0.183673,0.090909,2,92891
9,0.638298,0.144330,0.357143,0.151315,0.403727,0.025532,0.115646,0.488722,0.564626,0.159091,2,94130
13,0.191489,0.175258,0.357143,0.061873,0.024845,0.059574,0.071429,0.330827,0.020408,0.250000,0,105129
...,...,...,...,...,...,...,...,...,...,...,...,...
962097,0.055319,0.134021,0.071429,0.004067,0.000000,0.000000,0.000000,0.000000,0.000000,0.022727,3,193314
962099,0.255319,0.185567,0.452381,0.066638,0.037267,0.051064,0.085034,0.097744,0.020408,0.409091,0,124429
962102,0.297872,0.092784,0.095238,0.126216,0.273292,0.040426,0.061224,0.473684,0.265306,0.181818,2,115134
962107,0.034043,0.257732,0.047619,0.099047,0.385093,0.000000,0.027211,0.075188,0.190476,0.022727,3,188792


In [81]:
# Save X to a file
from scipy.sparse import save_npz
save_npz('../dataset/X.npz', X)

In [80]:
# NMF
from sklearn.decomposition import NMF
nmf = NMF(n_components=10, init='random', random_state=0, max_iter=100, solver='mu', l1_ratio=0.5, verbose=3)
nmf.fit(X)

Epoch 10 reached after 1.190 seconds, error: 832.952580
Epoch 20 reached after 2.278 seconds, error: 830.336077


array([[5.45293529e-01, 1.61059598e-02, 1.03740316e-01, ...,
        4.39226529e-01, 2.19028475e+00, 3.74169385e-01],
       [2.25579544e+00, 1.64887975e+00, 1.47846514e-01, ...,
        3.16066477e+00, 2.57140414e+00, 7.81000233e-02],
       [3.81587212e+00, 2.30527029e+00, 8.55829351e-01, ...,
        2.50566999e+00, 1.13939248e+01, 1.18780225e+00],
       ...,
       [3.03137388e+00, 4.23270042e+00, 4.58001520e-01, ...,
        4.99773255e+00, 3.32886290e+00, 3.73006803e-01],
       [8.44307137e+00, 3.47026011e+00, 4.02414928e-01, ...,
        2.30482003e-01, 2.45017831e+01, 2.81374513e+00],
       [8.30009995e+00, 2.90378511e+00, 8.02449707e-01, ...,
        7.73255968e+00, 1.39341443e+01, 2.42486204e+00]])

In [119]:
# cosine similarity
from sklearn.metrics.pairwise import cosine_similarity

# remove 11th column from X and create a new array
d = np.delete(X, 11, axis=1)
d

AxisError: axis 1 is out of bounds for array of dimension 0

In [117]:
recipes = X.toarray()[:, 11]
recipes

array([ 20636.,  64566.,  92891., ..., 115134., 188792., 165993.])

In [114]:
d

array([[0.08510638, 0.05154639, 0.07142857, ..., 0.        , 0.        ,
        0.        ],
       [0.17021277, 0.10309278, 0.26190476, ..., 0.40339532, 0.35194097,
        0.        ],
       [0.08510638, 0.02061856, 0.21428571, ..., 0.83049982, 0.24152231,
        0.        ],
       ...,
       [0.29787234, 0.09278351, 0.0952381 , ..., 1.        , 0.        ,
        0.        ],
       [0.03404255, 0.25773196, 0.04761905, ..., 0.        , 0.        ,
        0.        ],
       [0.0893617 , 0.09278351, 0.16666667, ..., 0.54220061, 0.4730412 ,
        0.        ]])

In [112]:
cosine_similarities

array([1., 1., 1., ..., 1., 1., 1.])

In [92]:
# top 10 similar recipes
related_docs_indices = cosine_similarities.argsort()[:-11:-1]
related_docs_indices

array([     0,  41919,  32690, 157960,  16766,  73882, 183458,  48214,
        16827,  55076])

In [94]:
d

array([[0.08510638, 0.05154639, 0.07142857, ..., 0.        , 0.        ,
        0.        ],
       [0.17021277, 0.10309278, 0.26190476, ..., 0.        , 0.        ,
        0.        ],
       [0.08510638, 0.02061856, 0.21428571, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.29787234, 0.09278351, 0.0952381 , ..., 0.        , 0.        ,
        0.        ],
       [0.03404255, 0.25773196, 0.04761905, ..., 0.        , 0.        ,
        0.        ],
       [0.0893617 , 0.09278351, 0.16666667, ..., 0.        , 0.        ,
        0.        ]])