In [27]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

In [28]:
df = pd.read_csv('dataset/preprocessed_data.csv')
df.head()

Unnamed: 0,user_id,recipe_id,date,rating,review,name,minutes,contributor_id,submitted,tags,...,sugar,sodium,protein,saturated_fat,carbohydrates,food_types,negative,neutral,positive,compound
0,7708,60599,2005-09-02,4,very good,kfc honey bbq strips,40,166019,2005-08-24,"60-minutes-or-less, time-to-make, main-ingredi...",...,40.0,37.0,78.0,4.0,10.0,Non-veg,0.0,0.238,0.762,0.4927
1,27707,60599,2005-12-22,5,better than the real,kfc honey bbq strips,40,166019,2005-08-24,"60-minutes-or-less, time-to-make, main-ingredi...",...,40.0,37.0,78.0,4.0,10.0,Non-veg,0.0,0.508,0.492,0.4404
2,35308,60599,2006-09-26,5,absolutely awesome i was speechless when i tri...,kfc honey bbq strips,40,166019,2005-08-24,"60-minutes-or-less, time-to-make, main-ingredi...",...,40.0,37.0,78.0,4.0,10.0,Non-veg,0.0,0.883,0.117,0.659
3,19399,60599,2007-03-09,5,these taste absolutely wonderful my son in law...,kfc honey bbq strips,40,166019,2005-08-24,"60-minutes-or-less, time-to-make, main-ingredi...",...,40.0,37.0,78.0,4.0,10.0,Non-veg,0.0,0.675,0.325,0.8908
4,43887,60599,2008-02-20,0,made my own buttermilk w vinegar and milk. use...,kfc honey bbq strips,40,166019,2005-08-24,"60-minutes-or-less, time-to-make, main-ingredi...",...,40.0,37.0,78.0,4.0,10.0,Non-veg,0.0,0.929,0.071,0.4588


In [29]:
df = df[['minutes', 'n_steps', 'ingredients', 'n_ingredients', 'calories', 'total_fat', 'sugar', 'sodium', 'protein', 'saturated_fat', 'carbohydrates', 'food_types', 'name', 'tags', 'steps', 'description', 'compound', 'recipe_id']]

In [30]:
df.duplicated().sum()

43623

In [31]:
df = df.drop_duplicates()

In [32]:
numerical_columns = ['minutes', 'n_steps', 'n_ingredients', 'calories', 'total_fat', 'sugar', 'sodium', 'protein', 'saturated_fat', 'carbohydrates', 'compound']
category_columns = ['food_types']
text_columns = ['ingredients', 'name', 'tags', 'steps', 'description']

In [33]:
label_encoder = LabelEncoder()
df['food_types'] = label_encoder.fit_transform(df['food_types'])

In [34]:
# vectorize the text columns
ingredient_vectorizer = TfidfVectorizer(max_features=20)
ingredients = ingredient_vectorizer.fit_transform(df['ingredients'])

In [35]:
ingredients

<898745x20 sparse matrix of type '<class 'numpy.float64'>'
	with 4643489 stored elements in Compressed Sparse Row format>

In [36]:
name_vectorizer = TfidfVectorizer(max_features=20)
names = name_vectorizer.fit_transform(df['name'])
names

<898745x20 sparse matrix of type '<class 'numpy.float64'>'
	with 800442 stored elements in Compressed Sparse Row format>

In [37]:
tags_vectorizer = TfidfVectorizer(max_features=20)
df['tags'] = df['tags'].astype(str)
tags = tags_vectorizer.fit_transform(df['tags'])
tags

<898745x20 sparse matrix of type '<class 'numpy.float64'>'
	with 12422082 stored elements in Compressed Sparse Row format>

In [38]:
steps_vectorizer = TfidfVectorizer(max_features=30)
df['steps'] = df['steps'].astype(str)
steps = steps_vectorizer.fit_transform(df['steps'])
steps

<898745x30 sparse matrix of type '<class 'numpy.float64'>'
	with 13410042 stored elements in Compressed Sparse Row format>

In [39]:
df['description']

0         these are so yummy and they do taste just like...
1         these are so yummy and they do taste just like...
2         these are so yummy and they do taste just like...
3         these are so yummy and they do taste just like...
4         these are so yummy and they do taste just like...
                                ...                        
942363    garlic and chicken what could be better, the c...
942364    my friend stacey significantly modified a reci...
942365    this is a recipe from the frugal gourmet cooki...
942366    this is a recipe from the frugal gourmet cooki...
942367    this is a recipe from the frugal gourmet cooki...
Name: description, Length: 898745, dtype: object

In [40]:
description_vectorizer = TfidfVectorizer(max_features=10)
df['description'] = df['description'].astype(str)
description = description_vectorizer.fit_transform(df['description'])
description

<898745x10 sparse matrix of type '<class 'numpy.float64'>'
	with 4435394 stored elements in Compressed Sparse Row format>

In [41]:
type(description)

scipy.sparse._csr.csr_matrix

In [42]:
# scale the numerical columns
scaler = MinMaxScaler()
all_columns = numerical_columns + category_columns
df[all_columns] = scaler.fit_transform(df[all_columns])

In [43]:
df.head()

Unnamed: 0,minutes,n_steps,ingredients,n_ingredients,calories,total_fat,sugar,sodium,protein,saturated_fat,carbohydrates,food_types,name,tags,steps,description,compound,recipe_id
0,0.170213,0.103093,"chicken tenders, flour, garlic powder, salt, g...",0.261905,0.078838,0.024845,0.08658,0.127148,0.590909,0.027211,0.227273,0.5,kfc honey bbq strips,"60-minutes-or-less, time-to-make, main-ingredi...","mix flour, salt and pepper in bowl\nset aside\...",these are so yummy and they do taste just like...,0.745452,60599
1,0.170213,0.103093,"chicken tenders, flour, garlic powder, salt, g...",0.261905,0.078838,0.024845,0.08658,0.127148,0.590909,0.027211,0.227273,0.5,kfc honey bbq strips,"60-minutes-or-less, time-to-make, main-ingredi...","mix flour, salt and pepper in bowl\nset aside\...",these are so yummy and they do taste just like...,0.719168,60599
2,0.170213,0.103093,"chicken tenders, flour, garlic powder, salt, g...",0.261905,0.078838,0.024845,0.08658,0.127148,0.590909,0.027211,0.227273,0.5,kfc honey bbq strips,"60-minutes-or-less, time-to-make, main-ingredi...","mix flour, salt and pepper in bowl\nset aside\...",these are so yummy and they do taste just like...,0.829028,60599
3,0.170213,0.103093,"chicken tenders, flour, garlic powder, salt, g...",0.261905,0.078838,0.024845,0.08658,0.127148,0.590909,0.027211,0.227273,0.5,kfc honey bbq strips,"60-minutes-or-less, time-to-make, main-ingredi...","mix flour, salt and pepper in bowl\nset aside\...",these are so yummy and they do taste just like...,0.945522,60599
4,0.170213,0.103093,"chicken tenders, flour, garlic powder, salt, g...",0.261905,0.078838,0.024845,0.08658,0.127148,0.590909,0.027211,0.227273,0.5,kfc honey bbq strips,"60-minutes-or-less, time-to-make, main-ingredi...","mix flour, salt and pepper in bowl\nset aside\...",these are so yummy and they do taste just like...,0.728415,60599


In [47]:
# drop the text columns
df = df.drop(text_columns, axis=1)

In [48]:
df.head()

Unnamed: 0,minutes,n_steps,n_ingredients,calories,total_fat,sugar,sodium,protein,saturated_fat,carbohydrates,food_types,compound,recipe_id
0,0.170213,0.103093,0.261905,0.078838,0.024845,0.08658,0.127148,0.590909,0.027211,0.227273,0.5,0.745452,60599
1,0.170213,0.103093,0.261905,0.078838,0.024845,0.08658,0.127148,0.590909,0.027211,0.227273,0.5,0.719168,60599
2,0.170213,0.103093,0.261905,0.078838,0.024845,0.08658,0.127148,0.590909,0.027211,0.227273,0.5,0.829028,60599
3,0.170213,0.103093,0.261905,0.078838,0.024845,0.08658,0.127148,0.590909,0.027211,0.227273,0.5,0.945522,60599
4,0.170213,0.103093,0.261905,0.078838,0.024845,0.08658,0.127148,0.590909,0.027211,0.227273,0.5,0.728415,60599


In [49]:
df.to_csv('dataset/scaled_data.csv', index=False)

In [50]:
# merge the text columns and the numerical columns into a numpy array
from scipy.sparse import hstack
X = hstack((ingredients, names, tags, steps, description, df.values))

In [51]:
X.shape

(898745, 113)

In [52]:
X.toarray()[0]

array([0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 3.73442795e-01,
       0.00000000e+00, 3.08137755e-01, 0.00000000e+00, 2.88280333e-01,
       3.35748837e-01, 0.00000000e+00, 0.00000000e+00, 2.79801995e-01,
       0.00000000e+00, 0.00000000e+00, 2.50950438e-01, 3.40473067e-01,
       2.10191594e-01, 3.75090955e-01, 0.00000000e+00, 3.57348363e-01,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.43637533e-01,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.40206401e-01,
       1.47695147e-01, 1.16770788e-01, 6.78079437e-01, 1.45069522e-01,
       1.16770529e-01, 1.45563758e-01, 0.00000000e+00, 1.16770659e-01,
      

In [53]:
# Save X to a file
from scipy.sparse import save_npz
save_npz('dataset/sparse_data.npz', X)

In [104]:
# Load X from a file
from scipy.sparse import load_npz
X = load_npz('dataset/sparse_data.npz')

In [105]:
X = X.toarray()
y = X[:, -1]
X = X[:, :-1]

In [106]:
print("X shape: ", X.shape)
print("y shape: ", y.shape)

X shape:  (898745, 112)
y shape:  (898745,)


In [107]:
X[0]

array([0.        , 0.        , 0.        , 0.37344279, 0.        ,
       0.30813776, 0.        , 0.28828033, 0.33574884, 0.        ,
       0.        , 0.279802  , 0.        , 0.        , 0.25095044,
       0.34047307, 0.21019159, 0.37509095, 0.        , 0.35734836,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.14363753, 0.        ,
       0.        , 0.        , 0.4402064 , 0.14769515, 0.11677079,
       0.67807944, 0.14506952, 0.11677053, 0.14556376, 0.        ,
       0.11677066, 0.11673818, 0.42419738, 0.11677066, 0.11676585,
       0.        , 0.        , 0.41975825, 0.09518213, 0.19111021,
       0.        , 0.        , 0.20921156, 0.09204736, 0.5501562 ,
       0.        , 0.        , 0.        , 0.13481178, 0.09532

In [108]:
y[0]

60599.0

In [76]:
# convert y to a int array
y = y.astype(int)

In [77]:
y[0]

60599

In [78]:
import faiss

In [79]:
# build the index
index = faiss.IndexFlatL2(X.shape[1])

In [80]:
index.is_trained

True

In [81]:
index.add(X)

In [82]:
index.ntotal

898745

In [83]:
# search for the nearest neighbors
recipe_id = 60599
# find the index of the recipe_id from the y array
recipe_index = np.where(y == recipe_id)[0][0]

In [84]:
recipe_index

0

In [85]:
# get the vector of the recipe
recipe_vector = X[recipe_index]

In [86]:
recipe_vector

array([0.        , 0.        , 0.        , 0.37344279, 0.        ,
       0.30813776, 0.        , 0.28828033, 0.33574884, 0.        ,
       0.        , 0.279802  , 0.        , 0.        , 0.25095044,
       0.34047307, 0.21019159, 0.37509095, 0.        , 0.35734836,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.14363753, 0.        ,
       0.        , 0.        , 0.4402064 , 0.14769515, 0.11677079,
       0.67807944, 0.14506952, 0.11677053, 0.14556376, 0.        ,
       0.11677066, 0.11673818, 0.42419738, 0.11677066, 0.11676585,
       0.        , 0.        , 0.41975825, 0.09518213, 0.19111021,
       0.        , 0.        , 0.20921156, 0.09204736, 0.5501562 ,
       0.        , 0.        , 0.        , 0.13481178, 0.09532

In [92]:
# search for the nearest neighbors
k = 100
D, I = index.search(np.array([recipe_vector]), k)

In [93]:
D

array([[0.0000000e+00, 2.9025646e-04, 6.9085072e-04, 6.9849915e-03,
        1.0885185e-02, 4.0028159e-02, 4.8853047e-02, 1.8840958e+00,
        1.9026941e+00, 1.9123696e+00, 1.9588933e+00, 1.9727566e+00,
        2.0061276e+00, 2.0770907e+00, 2.1062837e+00, 2.1114635e+00,
        2.1400244e+00, 2.1481516e+00, 2.1632843e+00, 2.1749322e+00,
        2.1801038e+00, 2.1880136e+00, 2.1928024e+00, 2.2117958e+00,
        2.2205505e+00, 2.2259893e+00, 2.2267394e+00, 2.2484381e+00,
        2.2489760e+00, 2.2494195e+00, 2.2508240e+00, 2.2515721e+00,
        2.2557635e+00, 2.2577789e+00, 2.2701278e+00, 2.2701890e+00,
        2.2709625e+00, 2.2755778e+00, 2.2801380e+00, 2.2804685e+00,
        2.2808008e+00, 2.2811487e+00, 2.2820523e+00, 2.2826390e+00,
        2.2854476e+00, 2.2859623e+00, 2.2860670e+00, 2.2908549e+00,
        2.2941165e+00, 2.2957468e+00, 2.2997351e+00, 2.3018348e+00,
        2.3020234e+00, 2.3026862e+00, 2.3031383e+00, 2.3060596e+00,
        2.3091927e+00, 2.3093123e+00, 2.3094072e

In [94]:
I

array([[     0,      4,      1,      2,      5,      3,      6, 292989,
        292988, 575387, 575386, 458064, 458065, 805298, 805299, 805300,
        560360, 220467, 220464, 220468, 482610, 220466, 220465, 220463,
        624575, 711064, 521604,  14645,  14662,  14647, 482611,  14650,
         14642,  14661,  14663,  14665,  14667,  14644,  14651,  14653,
         14656, 796858,  14658,  14660,  14666,  14668, 585577,  14655,
         14652,  14646,  14649,  14643,  14659,  14657,  14654,  14648,
        611934, 612510, 612442, 611938, 612577, 612526, 612491, 612474,
        585576, 612538, 612496, 612436, 612569, 612535, 612563, 612513,
        612425, 612530, 612450, 612516, 612485, 612453, 612427, 612583,
        612446, 612422, 612503, 612437, 612463, 612554, 612571, 612459,
        612465, 612511, 612452, 612423, 612509, 612467, 612548, 612523,
        612559, 612581, 612514, 612536]])

In [95]:
# get the recipe ids of the nearest neighbors
recipe_ids = y[I[0]]

In [96]:
recipe_ids

array([ 60599,  60599,  60599,  60599,  60599,  60599,  60599, 121160,
       121160, 180929, 180929, 127852, 127852, 149885, 149885, 149885,
        91350,  45455,  45455,  45455, 168485,  45455,  45455,  45455,
        30797,  74466, 154524,   6431,   6431,   6431, 168485,   6431,
         6431,   6431,   6431,   6431,   6431,   6431,   6431,   6431,
         6431, 178266,   6431,   6431,   6431,   6431,  70899,   6431,
         6431,   6431,   6431,   6431,   6431,   6431,   6431,   6431,
         3563,   3563,   3563,   3563,   3563,   3563,   3563,   3563,
        70899,   3563,   3563,   3563,   3563,   3563,   3563,   3563,
         3563,   3563,   3563,   3563,   3563,   3563,   3563,   3563,
         3563,   3563,   3563,   3563,   3563,   3563,   3563,   3563,
         3563,   3563,   3563,   3563,   3563,   3563,   3563,   3563,
         3563,   3563,   3563,   3563])

In [97]:
# find the unique recipe ids excluding the recipe_id
unique_recipe_ids = np.unique(recipe_ids)

In [98]:
unique_recipe_ids

array([  3563,   6431,  30797,  45455,  60599,  70899,  74466,  91350,
       121160, 127852, 149885, 154524, 168485, 178266, 180929])