In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
# Apply seaborn's default theme to matplotlib-based plots.
sb.set()

In [None]:
# Load the training recipes, index them by recipe id and order the rows by that id.
data = (
    pd.read_json("train.json")
    .set_index("id")
    .sort_values("id")
)
# Preview the first rows.
data.head()

In [None]:
# Number of missing values in each column.
data.isna().sum()

In [None]:
# How many distinct cuisines the dataset contains
distinct_cuisines = data.cuisine.unique()
print("Number of Cuisines :", len(distinct_cuisines))

# Recipe count per cuisine: first as a table, then as a count plot with cuisines on the y-axis
print(data.cuisine.value_counts())
sb.catplot(data=data, kind="count", y="cuisine")

In [None]:
#Finding the different ingredients stored in the data
from collections import Counter
from itertools import chain, combinations

# Occurrence count of every ingredient over all recipes.
counted_ingredients = Counter(chain.from_iterable(data.ingredients.tolist()))
single_ingredients = list(counted_ingredients.keys())  # each distinct ingredient once
n_single_ingredients = len(single_ingredients)
print("Number of single ingredients = " + str(n_single_ingredients) + "\n")

# Distinct cuisine labels in order of first appearance. Built here so the loop
# below does not depend on a name left over from another cell or kernel session.
single_cuisines = list(data["cuisine"].unique())

specific_ingredients = {}  # cuisine -> list of ingredients attributed to that cuisine
sorted_ingredients = {}  # cuisine -> Counter of ingredient occurrences within that cuisine
sorted_not_specific_ingredients = {}  # cuisine -> list of its ingredients that are not specific to it
# Copy (not alias) of all ingredients; shrinks as specific ingredients are found.
not_specific_ingredients = list(single_ingredients)

for cuisine in single_cuisines:
    is_cuisine = data.cuisine == cuisine
    sorted_ingredients[cuisine] = Counter(chain.from_iterable(data[is_cuisine].ingredients.tolist()))
    other_cuisines_ingredients = Counter(chain.from_iterable(data[~is_cuisine].ingredients.tolist()))

    # Counter subtraction keeps an ingredient only when it occurs more often in
    # this cuisine than in all other cuisines combined (ingredients that never
    # occur elsewhere always qualify). Key order follows sorted_ingredients[cuisine].
    # NOTE(review): this is looser than "appears in this cuisine only" - confirm
    # which definition of "specific" is intended.
    specific_ingredients[cuisine] = list(sorted_ingredients[cuisine] - other_cuisines_ingredients)
    specific_lookup = set(specific_ingredients[cuisine])

    # Drop this cuisine's specific ingredients from the global "not specific" list.
    not_specific_ingredients = [ing for ing in not_specific_ingredients if ing not in specific_lookup]

    # Filter by membership rather than by Counter subtraction: subtracting a
    # count of one per specific ingredient would leave every specific ingredient
    # that occurs more than once in this cuisine inside the "not specific" list.
    sorted_not_specific_ingredients[cuisine] = [
        ing for ing in sorted_ingredients[cuisine] if ing not in specific_lookup
    ]

    # Print info about current cuisine
    print(cuisine + " cuisine: \t" + str(len(specific_ingredients[cuisine])) + " specific ingredients, \t"+ str(len(sorted_ingredients[cuisine])) + "  different ingredients")

sum_specific_ingredients = sum(len(v) for v in specific_ingredients.values())
sum_not_specific_ingredients = len(not_specific_ingredients)

print("\n" + str(sum_specific_ingredients) + " ingredients are specific to one cuisine and \n" + str(sum_not_specific_ingredients)+" ingredients are not specific to one cuisine")

In [None]:
import plotly.offline as py
import plotly.graph_objs as go
py.init_notebook_mode(connected = False)

# Cuisine labels shared by both bar series (x-axis categories).
cuisine_labels = ['spanish', 'mexican', 'french', 'chinese', 'italian',
                  'southern_us', 'indian', 'thai', 'cajun_creole', 'jamaican',
                  'japanese', 'greek', 'russian', 'irish', 'moroccan', 'korean',
                  'filipino', 'vietnamese', 'british', 'brazilian']

# NOTE(review): the counts below are hard-coded; presumably they were copied
# from the per-cuisine printout of the ingredient analysis - verify they are
# still current whenever the input data changes.
trace1 = go.Bar(
    x=cuisine_labels,
    y=[64,716,288,317,852,
       446,270,121,148,42,
       213,86,56,49,48,64,
       78,65,110,59],
    name='specific ingredients'
)
trace2 = go.Bar(
    x=cuisine_labels,
    y=[1263,2684,2102,1792,2929,
       2462,1664,1376,1576,877,
       1439,1198,872,999,974,898,
       947,1108,1166,853],
    name='different ingredients'
)

# Deliberately not named `data`: that name holds the recipe DataFrame, and
# overwriting it would break every earlier cell when it is re-run.
bar_traces = [trace1, trace2]
layout = go.Layout(
    barmode='group'  # draw the two series side by side for each cuisine
)

fig = go.Figure(data=bar_traces, layout=layout)
py.iplot(fig, filename='grouped-bar')

In [None]:
import operator

# Rank every (ingredient, count) pair by count, most frequent first.
dc_sort = sorted(counted_ingredients.items(), key=operator.itemgetter(1), reverse=True)
# Keep the 50 most frequent ingredients (a [0:49] slice would stop at 49).
# NOTE(review): the name says "top100" but is kept because the plotting cell
# reads it; rename both together if a different cut-off is wanted.
top100_ingredients = dc_sort[:50]

In [None]:

# Scatter plot of the most frequent ingredients: ingredient name on the x-axis,
# occurrence count on the y-axis.
x = np.array([name for name, _ in top100_ingredients])
y = np.array([count for _, count in top100_ingredients])
# One random colour value per plotted point; sized from the data so the colour
# array can never disagree with the number of markers.
colors = np.random.rand(len(x))
sz = 20  # marker size, identical for every point

fig = go.Figure()
fig.add_scatter(x=x,
                y=y,
                mode='markers',
                marker={'size': sz,
                        'color': colors,
                        'opacity': 0.6,
                        'colorscale': 'Viridis'
                       });
py.iplot(fig)