In [1]:
# data visualisation and manipulation
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns


# to parse the train.json file
import json

In [2]:
# Load the raw recipes. Each record carries the keys `id`, `cuisine` and
# `ingredients` (a list of ingredient names).
# The encoding is stated explicitly: JSON text is UTF-8, while the default used
# by open() depends on the platform locale, so any non-ASCII ingredient name
# could otherwise be decoded incorrectly (or raise) on some machines.
with open('train.json', 'r', encoding='utf-8') as f:
    datas = json.load(f)

csv = []            # one dict per (recipe, ingredient) pair
occurence = dict()  # occurence[cuisine][ingredient] -> number of such pairs
for data in datas:
    for ingredient in data['ingredients']:
        cuisine = data['cuisine']

        # Add the id / cuisine / ingredient row (id is stored as a string).
        csv.append({
            'id': str(data['id']),
            'cuisine': cuisine,
            'ingredient': ingredient,
        })

        # Count how many times this ingredient appears for this cuisine.
        par_cuisine = occurence.setdefault(cuisine, dict())
        par_cuisine[ingredient] = par_cuisine.get(ingredient, 0) + 1

# Second pass: the counts are only final once every record has been read, so
# they are attached to the rows afterwards. `val` is a constant marker (1)
# meaning "this ingredient is present in this recipe".
for ligne in csv:
    ligne['occurence'] = occurence[ligne['cuisine']][ligne['ingredient']]
    ligne['val'] = 1

# Long-format table with the columns id, cuisine, ingredient, occurence, val.
train = pd.DataFrame(csv)

In [3]:
# Work on a copy so that `train` itself is not modified by the cells below.
df = train.copy()

In [4]:
# Preview the first rows of the long-format table.
df.head()

Unnamed: 0,id,cuisine,ingredient,occurence,val
0,10259,greek,romaine lettuce,39,1
1,10259,greek,black olives,31,1
2,10259,greek,grape tomatoes,26,1
3,10259,greek,garlic,216,1
4,10259,greek,pepper,203,1


In [5]:
# Value encoding (currently disabled: every line below is commented out)

#le_cuisine = LabelEncoder()
#le_ingredient = LabelEncoder()

#df['cuisine'] = le_cuisine.fit_transform(df['cuisine'])
#df['ingredient'] = le_ingredient.fit_transform(df['ingredient'])

# To reverse the encoding, use these two lines:
#df['cuisine'] = le_cuisine.inverse_transform(df['cuisine'])
#df['ingredient'] = le_ingredient.inverse_transform(df['ingredient'])

#df.head()

In [6]:
# Distinct cuisine labels found in the data.
df['cuisine'].unique()

array(['greek', 'southern_us', 'filipino', 'indian', 'jamaican',
       'spanish', 'italian', 'mexican', 'chinese', 'british', 'thai',
       'vietnamese', 'cajun_creole', 'brazilian', 'french', 'japanese',
       'irish', 'korean', 'moroccan', 'russian'], dtype=object)

In [7]:
# Number of distinct cuisine labels (missing values, if any, are counted too).
df['cuisine'].nunique(dropna=False)

20

In [8]:
# Distinct ingredient names found in the data.
df['ingredient'].unique()

array(['romaine lettuce', 'black olives', 'grape tomatoes', ...,
       'lop chong', 'tomato garlic pasta sauce',
       'crushed cheese crackers'], dtype=object)

In [9]:
# Number of distinct ingredient names (missing values, if any, are counted too).
df['ingredient'].nunique(dropna=False)

6714

In [10]:
# Sanity check: count the rows whose cuisine is missing (0 means none).
df['cuisine'].isna().sum()

0

In [11]:
# Sanity check: count the rows whose ingredient is missing (0 means none).
df['ingredient'].isna().sum()

0

In [12]:
# Build the recipe x ingredient matrix.

# Sorted label lists; `unique()` removes the duplicates first.
index = sorted(df['id'].unique())
columns = sorted(df['ingredient'].unique())

# One row per `id`, one column per `ingredient`, cells taken from `val`
# (aggregated with pivot_table's default aggfunc). Pairs that never occur in
# `df` are left as NaN.
util_df = df.pivot_table(values='val', index='id', columns='ingredient')

In [13]:
util_df

ingredient,( oz.) tomato sauce,( oz.) tomato paste,(10 oz.) frozen chopped spinach,"(10 oz.) frozen chopped spinach, thawed and squeezed dry",(14 oz.) sweetened condensed milk,(14.5 oz.) diced tomatoes,(15 oz.) refried beans,1% low-fat buttermilk,1% low-fat chocolate milk,1% low-fat cottage cheese,...,yukon gold potatoes,yuzu,yuzu juice,za'atar,zest,zesty italian dressing,zinfandel,ziti,zucchini,zucchini blossoms
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
10,,,,,,,,,,,...,,,,,,,,,,
100,,,,,,,,,,,...,,,,,,,,,,
1000,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9994,,,,,,,,,,,...,,,,,,,,,,
9996,,,,,,,,,,,...,,,,,,,,,,
9997,,,,,,,,,,,...,,,,,,,,,,
9998,,,,,,,,,,,...,,,,,,,,,,


In [14]:
# Replace the NaN cells left by the pivot with 0.
util_df = util_df.fillna(0)

In [15]:

from sklearn.decomposition import NMF

# Factorise the recipe x ingredient matrix into two non-negative factors.
X = util_df
n_component = 6
model = NMF(n_components=n_component, init='random', random_state=0)
W = model.fit_transform(X)  # one row per row of X, one column per component
H = model.components_       # one row per component, one column per column of X

# Wrap both factors in labelled DataFrames: rows of W are labelled with
# `index`, columns of H with `columns`, and the component axis gets
# "type_ 1", "type_ 2", ... names.
noms_types_W = ["type_ " + str(k) for k in range(1, W.shape[1] + 1)]
noms_types_H = ["type_ " + str(k) for k in range(1, H.shape[0] + 1)]
W2 = pd.DataFrame(W, index=index, columns=noms_types_W)
H2 = pd.DataFrame(H, index=noms_types_H, columns=columns)

# Show the raw W factor.
print(W)

# Either factor can later be used for distance-based predictions.
W_df = pd.DataFrame(csv)

[[0.         0.         0.12876519 0.00161134 0.         0.        ]
 [0.11214663 0.         0.01343381 0.         0.07464309 0.02471402]
 [0.01455789 0.         0.         0.07010848 0.00127487 0.07695549]
 ...
 [0.         0.03656691 0.00041074 0.         0.         0.        ]
 [0.00018086 0.         0.06515019 0.0090832  0.07218028 0.00614986]
 [0.00298885 0.00145084 0.01462306 0.         0.00203041 0.0029329 ]]


In [16]:
# Plain NumPy arrays holding the values of the two labelled factor tables.
W3 = W2.to_numpy()
H3 = H2.to_numpy()



In [17]:
def avg_recette(tab):
    """Return the element-wise mean of the entries of `tab`.

    `tab` is a sequence of equally sized vectors; the mean is taken along
    axis 0, i.e. across the entries, so the result has the size of one entry.
    """
    moyenne = np.mean(tab, axis=0)
    return moyenne

def get_tab_recette(id):
    """Collect the `H2` column of every ingredient listed for recipe `id`.

    Prints `id`, selects the rows of the global `df` whose 'id' equals `id`,
    and returns a list with one `H2[ingredient]` entry per selected row.
    """
    print(id)
    lignes_recette = df['id'] == id
    noms = df.loc[lignes_recette, 'ingredient']
    return [H2[nom] for nom in noms]

In [18]:
# List of the distinct recipe ids.
id_plats = df['id'].unique()

# H2 columns of the ingredients of the first recipe.
recette = get_tab_recette(id_plats[0])

# The mean is stored under its own name. Assigning it to `avg_recette` would
# replace the function with an array, and this cell (or any later call to
# avg_recette) would then fail with "ndarray is not callable" when re-run.
recette_moyenne = avg_recette(recette)

# Single-row 2-D array (1 x number of components) holding that mean vector.
tab_ingr = [recette_moyenne]
W_ingr = np.asarray(tab_ingr)
print(W_ingr)


10259
[[1.11496807e+00 5.42979891e-04 1.75476010e-01 4.46537865e-03
  4.19343435e-01 7.65193290e-02]]


In [19]:
from matplotlib import pyplot as plt
from matplotlib.collections import LineCollection

from sklearn import manifold
from sklearn.metrics import euclidean_distances
from sklearn.decomposition import PCA
import bokeh as bkh

In [None]:
# Input data: one row of W3 per entry of `index`, one column per NMF component.
n_samples = len(W3)
seed = np.random.RandomState(seed=3)

# Make the expected (n_samples, n_component) shape explicit.
X_cuisine = W3.reshape((n_samples, n_component))

# Embed the rows of X_cuisine and the row(s) of W_ingr with ONE MDS fit.
# Two separate fits give two unrelated coordinate systems, so points from one
# fit cannot be compared with points from the other on the same plot; a fit on
# W_ingr alone (a 1 x 1 distance matrix) also carries no information at all.
# Euclidean distance is used, as required by the assignment.
# NOTE(review): the distance matrix is (n_samples + 1) squared, which is costly
# in time and memory when W3 has many rows.
X_tous = np.vstack([X_cuisine, W_ingr])
similarities = euclidean_distances(X_tous)
similarities_cuisine = similarities[:n_samples, :n_samples]
similarities_ingr = similarities[n_samples:, n_samples:]

mds = manifold.MDS(n_components=2, max_iter=3000, eps=1e-9, random_state=seed,
                   dissimilarity="precomputed", n_jobs=1)
pos = mds.fit(similarities).embedding_

# Split the joint embedding back into the two groups of points.
pos_cuisine = pos[:n_samples]
pos_ingr = pos[n_samples:]

print(pos_cuisine)
fig = plt.figure(1)
ax = plt.axes([0., 0., 1., 1.])
s = 50

ax.scatter(pos_cuisine[:, 0], pos_cuisine[:, 1], color='turquoise', s=s, lw=0, label='cuisines')
ax.scatter(pos_ingr[:, 0], pos_ingr[:, 1], color='red', s=s, lw=0, label='ingr')

ax.legend(scatterpoints=1, loc='best', shadow=False)

# Label every point of the first group with its entry of `index`.
for i, txt in enumerate(index):
    ax.annotate(txt, (pos_cuisine[i][0], pos_cuisine[i][1]))

In [None]:
# Show the matrix that was fed to the distance computation.
print(X_cuisine)

In [None]:
from bokeh.plotting import figure, show, output_file
from bokeh.models.tools import HoverTool
from bokeh.layouts import layout, column
from bokeh.models import ColumnDataSource, Div
from bokeh.models.widgets import Slider, Select, TextInput
from bokeh.io import curdoc


# Interactive figure: toolbar tools plus a hover tooltip that shows the
# `cuisine` field of the data source.
TOOLS="hover,crosshair,pan,wheel_zoom,zoom_in,zoom_out,box_zoom,undo,redo,reset,tap,save,box_select,poly_select,lasso_select,"
p = figure(tools=TOOLS, tooltips=[('cuisine', '@cuisine')])

# One 'blue' entry per element of `index`.
colors = ['blue'] * len(index)

# Columns backing the first set of glyphs: label, 2-D position and colour.
donnees = dict(
    cuisine=tuple(index),
    x=pos_cuisine[:, 0],
    y=pos_cuisine[:, 1],
    colors=colors,
)
source = ColumnDataSource(data=donnees)

# Points read from `source`, then the `pos_ingr` point(s) drawn in red.
p.circle('x', 'y', source=source, fill_color='colors', fill_alpha=1, line_color=None)
p.circle(pos_ingr[:, 0], pos_ingr[:, 1], fill_color='red', fill_alpha=1, line_color=None)




In [None]:
# Render the bokeh figure built in the previous cell.
show(p)