### Using t-SNE to visualize the ingredient space for recipes

In [1]:
import pandas as pd
import numpy as np
import math
import re
from ingredient_parser.en import parse 

%matplotlib inline
import matplotlib
import seaborn as sns
matplotlib.rcParams['savefig.dpi'] = 144

#### Process data

In [6]:
# Load the pickled object `df`; later cells use it as a DataFrame with
# `recipe_id` and `Cat1` columns.
# NOTE(review): pickle.load can execute arbitrary code -- only open trusted files.
import pickle
with open('df.p', 'rb') as f:
    df = pickle.load(f)

In [7]:
import pickle
# Read the pickled ingredient table, then discard the three columns that are
# not wanted in the feature matrix, and preview the first row.
with open('df_ingre.p', 'rb') as f:
    df_ingre = pickle.load(f)
df_ingre = df_ingre.drop(['title', 'ingre1', 'parsed'], axis=1)
df_ingre.head(1)

Unnamed: 0,recipe_id,fava beans,ale,mortadella,skate,parmesan,passion fruit,milk,blueberries,bell peppers,...,jerusalem artichoke,wonton,caraway,potato,food color,daikon,soy sauce,sesame seeds,beverage,nutritional yeast flakes
0,70404,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [54]:
# Keep only ingredient rows whose recipe_id belongs to a recipe in `df`
# categorised (Cat1) as 'World Cuisine' or 'U.S. Recipes'.
df_ingre2 = df_ingre.loc[
    df_ingre['recipe_id'].isin(
        df.loc[df['Cat1'].isin(['World Cuisine', 'U.S. Recipes']), 'recipe_id']
    )
]

In [55]:
# Feature-only table: a new frame equal to df_ingre2 minus the recipe_id column.
df_ingre3 = df_ingre2.drop('recipe_id', axis=1)

#### Apply t-SNE

In [56]:
from sklearn.manifold import TSNE
#from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD

In [57]:
# Feature matrix as a list of rows: one row per record of df_ingre3, one entry
# per column. `.values.tolist()` replaces the former nested comprehension over
# `to_records()`, which depended on the Python 2-only `xrange` and looped over
# every cell in Python.
X = df_ingre3.values.tolist()

In [58]:
#X[:2]

In [150]:
#X_reduced = TruncatedSVD(n_components=40, random_state=0).fit_transform(X)

# Project the feature rows in X down to 2 dimensions with t-SNE.
# random_state is fixed so that re-running the notebook reproduces the same
# embedding (t-SNE is stochastic; without a seed every run gives a different layout).
X_embedded = TSNE(n_components=2, perplexity=10, verbose=2, random_state=0).fit_transform(X)

[t-SNE] Computing pairwise distances...
[t-SNE] Computing 31 nearest neighbors...
[t-SNE] Computed conditional probabilities for sample 1000 / 1642
[t-SNE] Computed conditional probabilities for sample 1642 / 1642
[t-SNE] Mean sigma: 0.866777
[t-SNE] Iteration 25: error = 2.2612438, gradient norm = 0.0148114
[t-SNE] Iteration 50: error = 2.1467071, gradient norm = 0.0150796
[t-SNE] Iteration 75: error = 1.6939532, gradient norm = 0.0118731
[t-SNE] Iteration 100: error = 1.6746691, gradient norm = 0.0123193
[t-SNE] KL divergence after 100 iterations with early exaggeration: 1.674669
[t-SNE] Iteration 125: error = 1.4825355, gradient norm = 0.0087352
[t-SNE] Iteration 150: error = 1.5094644, gradient norm = 0.0105704
[t-SNE] Iteration 175: error = 1.5181737, gradient norm = 0.0109404
[t-SNE] Iteration 175: did not make any progress during the last 30 episodes. Finished.
[t-SNE] Error after 175 iterations: 1.674669


In [60]:
# Number of rows whose recipe_id repeats an earlier row (0 means ids are unique).
int(df_ingre2['recipe_id'].duplicated().sum())

0

In [61]:
# Row count of the filtered ingredient table.
len(df_ingre2)

1642

In [63]:
# Left-join the recipe table onto the filtered ingredient rows via recipe_id,
# then show the row count of the joined frame.
df_merged = pd.merge(df_ingre2, df, how='left', on='recipe_id')
len(df_merged)

1642

In [82]:
def new_column(row):
    """Collapse a row's category columns into a single label.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide 'Cat1'; 'Cat2' is read only for 'World Cuisine' rows.

    Returns
    -------
    'U.S. Recipes' when Cat1 contains 'U.S.', the row's Cat2 value when Cat1
    equals 'World Cuisine', and None otherwise (including a missing Cat1).
    """
    cat1 = row['Cat1']
    # A missing Cat1 (None/NaN) is not a string; the substring test below
    # would raise TypeError on it, so treat it as "no category".
    if pd.isnull(cat1):
        return None
    if 'U.S.' in cat1:
        return 'U.S. Recipes'
    if cat1 == 'World Cuisine':
        return row['Cat2']
    return None
# Derive the combined category label for every row (row-wise apply).
df_merged['new_Cat'] = df_merged.apply(new_column, axis=1)

In [83]:
# Rows with no derived category receive the placeholder label 'NA'.
df_merged['new_Cat'] = df_merged['new_Cat'].fillna(value='NA')

In [84]:
#df_merged['Cat2'].fillna('NA').drop_duplicates().tolist()

In [113]:
from collections import defaultdict  # NOTE(review): not used in this cell; kept in case other cells rely on it

# Map each distinct category label to an integer id, numbered in order of
# first appearance in df_merged['new_Cat'] (missing labels count as 'NA').
d = {
    label: idx
    for idx, label in enumerate(df_merged['new_Cat'].fillna('NA').drop_duplicates().tolist())
}
d

{'African': 5,
 'Asian': 3,
 'Australian and New Zealander': 7,
 'Canadian': 4,
 'European': 1,
 'Latin American': 0,
 'Middle Eastern': 8,
 'NA': 6,
 'U.S. Recipes': 2}

In [88]:
# Translate each category label to its integer id via the lookup dict `d`
# (an unknown label raises KeyError).
df_merged['new_Cat_num'] = df_merged['new_Cat'].map(d.__getitem__)

In [98]:
# Category ids as a plain Python list, one per row of df_merged.
# Series.tolist() replaces the former round trip through
# to_frame().to_records(), which built a record array just to unpack it again.
Y = df_merged['new_Cat_num'].tolist()

In [100]:
# Number of category ids collected in Y.
len(Y)

1642

#### Visualize recipes using t-SNE

In [104]:
# Direct Bokeh output to the notebook.
# NOTE(review): `bokeh.charts` appears to have been split out of later Bokeh
# releases -- confirm the installed version still provides it.
from bokeh.charts import Bar, output_notebook, show
output_notebook()


In [151]:
import bokeh.plotting as bp
from bokeh.models import HoverTool 
from bokeh.palettes import brewer

# One "Spectral" palette colour per category id; the palette size is len(d).
colors = brewer["Spectral"][len(d)]

#colors = [color[item] for item in df_ingre['cluster'].tolist()]]

# Figure with only the reset and hover tools enabled.
fig = bp.figure(tools="reset,hover")

# Scatter of the 2-D embedding; each point is coloured by its category id in Y.
# NOTE(review): legend='Y' passes the literal string 'Y' as the legend value --
# presumably a per-category legend was intended; confirm.
s1 = fig.scatter(x=X_embedded[:, 0],y=X_embedded[:, 1],legend='Y',fill_alpha = 1, color=[colors[item] for item in Y],size=5)
# Hover tooltip fields use Bokeh's "$x" / "$y" special variables.
fig.select(dict(type=HoverTool)).tooltips = {"x":"$x", "y":"$y"}
show(fig)

In [122]:
# NOTE(review): the value of this first expression is discarded -- a notebook
# cell displays only its last expression.
len(X_embedded[:, 0])
# Display the embedded coordinates.
X_embedded

array([[ -0.81237272, -12.5551292 ],
       [ -8.37029146,  -9.40638049],
       [ -1.9676582 , -12.13747254],
       ..., 
       [ -1.62387454,   4.48953435],
       [  8.16016583,  12.62844313],
       [ -1.62664255,  -6.79889806]])

#### Saving data for D3 visualization

In [152]:
# Wrap the two embedding columns in a DataFrame with headers 'X' and 'Y'.
tsv = pd.DataFrame(X_embedded, columns=['X', 'Y'])

In [153]:
# Attach the category label to each embedded point.
# Assigning a Series aligns on index labels; assumes tsv and df_merged share
# the same index -- TODO confirm.
tsv['Cat'] = df_merged['new_Cat']

In [154]:
# Write the coordinates and labels as a tab-separated file with a header row
# and no index column.
tsv.to_csv('scatter.tsv', sep='\t', index=False)

In [134]:
# Preview the first row of the merged frame.
df_merged.iloc[:1]

Unnamed: 0,recipe_id,fava beans,ale,mortadella,skate,parmesan,passion fruit,milk,blueberries,bell peppers,...,lg_no_ratings,lg_rating,lg_prep_time_total,lg_cook_time_total,lg_ready_in_total,lg_no_ingre,lg_no_steps,ingre_only,new_Cat,new_Cat_num
0,70404,0,0,0,0,0,0,0,0,0,...,3.078819,0.740363,1.20412,1.491362,1.662758,1.230449,0.60206,"{""1 pound ground beef"",""1/2 cup chopped onion""...",Latin American,0


In [135]:
# Persist the merged frame (including the new_Cat / new_Cat_num columns added
# above) to 'df_merged.p' as a pickle.
import pickle
with open('df_merged.p', 'wb') as f:
    pickle.dump(df_merged, f)