<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Final-Project-Check-in" data-toc-modified-id="Final-Project-Check-in-1">Final Project Check-in</a></span></li><li><span><a href="#Group-Name" data-toc-modified-id="Group-Name-2">Group Name</a></span></li><li><span><a href="#Student-Names" data-toc-modified-id="Student-Names-3">Student Names</a></span></li><li><span><a href="#Load-Data" data-toc-modified-id="Load-Data-4">Load Data</a></span></li><li><span><a href="#Fit-scikit-learn-model" data-toc-modified-id="Fit-scikit-learn-model-5">Fit scikit-learn model</a></span></li><li><span><a href="#Evaluation-Metric" data-toc-modified-id="Evaluation-Metric-6">Evaluation Metric</a></span></li></ul></div>

Final Project Check-in
------

Group Name
-----

The Grace Hoppers 

Student Names
----

1. Akansha Shrivastava
2. Ivette Sulca
3. Bing Wang

Load Data
-----

In [1]:
from collections import Counter
import imblearn
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
import geonamescache  #sudo pip install geonamescache

In [2]:
# data_pd = pd.read_csv('../../data/epi_r.csv')
# data_json = pd.read_json('../../data/full_format_recipes.json',)
data_pd = pd.read_csv('data/epi_r.csv')
data_json = pd.read_json('data/full_format_recipes.json',)

In [3]:
# Deleting null predictive variables from the dataset
data_pd = data_pd.loc[(data_pd.rating>0) & (~data_pd.rating.isna())]

# Deleting drinks
data_pd = data_pd.loc[(data_pd.drink==0) & (data_pd.drinks==0) & (data_pd.cocktail==0)]

# Deleting repeted titles
data_pd.drop_duplicates(subset=['title'], inplace=True)  

In [4]:
# Remove target, "rating", from data_pd
data_pd = pd.concat([data_pd.iloc[:,0], data_pd.iloc[:,2:]], axis=1)
y = data_pd.iloc[:,1]

In [5]:
# Create feature: holidays: Dummy for world holiday 
# (a celebratory day attached to date in calendar-- birthday, graduation, anniversary, etc. not included)
holidays = ["bastille day", "christmas", "christmas eve", "cinco de mayo", "columbus", 
 "diwali", "easter", "father's day", "fourth of july", "friendsgiving", "halloween",
 "hanukkah", "kwanzaa", "labor day", "lunar new year", "mother's day", "new year's day",
 "new year's eve", "oktoberfest", "passover", "persian new year", "purim", "ramadan", 
 "rosh hashanah/yom kippur", "st. patrick's day", "sukkot", "thanksgiving", 
 "valentine's day"]

data_pd["holiday"] = np.where(data_pd[holidays].sum(axis=1)>0, 1, 0)

In [6]:
#FAT: In grams but it can mislead depending of the number of portions (Paella for example)

#Delete bigger portions(Paella) and outliers
y = y.loc[((data_pd.fat>=0) & (data_pd.fat<=200)) | (data_pd.fat.isna())]
data_pd = data_pd.loc[((data_pd.fat>=0) & (data_pd.fat<=200)) | (data_pd.fat.isna())]

#Median imputer
imp = SimpleImputer(missing_values=np.nan, strategy='median')
fat_clean = imp.fit_transform(data_pd.fat.values.reshape(-1,1))
fat_clean = pd.DataFrame(data=fat_clean, columns=['fat_clean'])
data_pd['fat'] = fat_clean.fat_clean.values

In [7]:
#PROTEIN CLEANING

#Again, considering values lower than 200: deleting 62 rows...
y = y.loc[((data_pd.protein>=0) & (data_pd.protein<=200)) | (data_pd.protein.isna())]
data_pd = data_pd.loc[((data_pd.protein>=0) & (data_pd.protein<=200)) | (data_pd.protein.isna())]

#Median imputer
imp = SimpleImputer(missing_values=np.nan, strategy='median')
protein_clean = imp.fit_transform(data_pd.protein.values.reshape(-1,1))
protein_clean = pd.DataFrame(data=protein_clean ,columns=['protein_clean'])
data_pd['protein']=protein_clean.protein_clean.values


In [8]:
# SODIUM

#Unit: miligrams
#Very different values, so we will impute only:

imp = SimpleImputer(missing_values=np.nan, strategy='median')
sodium_clean = imp.fit_transform(data_pd.sodium.values.reshape(-1,1))
sodium_clean = pd.DataFrame(data=sodium_clean ,columns=['sodium_clean'])
data_pd['sodium']=sodium_clean.sodium_clean.values

In [9]:
# filtering columns related to "healthy"

# selecting all the relevant columns

selected = ["fat free", "healthy", "low cal", "quick and healthy", 
"low carb",
"low cholesterol",
"low fat",
"low sodium",
"low sugar",
"low/no sugar"]

In [10]:
# filtering rows which have "selected" columns as 1 -> healthy
data_pd["allhealthy"] = 0
for col in selected:
    data_pd.loc[data_pd[col] == 1, "allhealthy"] = 1

In [11]:
# replacing missing values in calories by mean

median = data_pd["calories"].median()
data_pd.loc[data_pd["calories"].isna(), "calories"] = median

In [12]:
# Adding complexity preparation from JSON file

data_json.drop_duplicates(subset=['title'],inplace=True)  
data_json['directions_n_characters']=data_json['directions'].astype(str).str.len()
data_json['ingredients_quantity']=data_json['ingredients'].str.len()
data_json['directions_n_steps']=data_json['directions'].astype(str).str.replace('[','').str.replace(']','').str.split("',").apply(lambda x: len(x))
data_json.loc[data_json.desc.isna(), "desc_n_characters"] = 0
data_json.loc[data_json.desc.notna(), "desc_n_characters"] = data_json["desc"].astype(str).str.len()

data_json2=data_json[['title','directions_n_characters','ingredients_quantity',
                      'directions_n_steps', 'desc_n_characters']]

data_pd=pd.merge(data_pd, data_json2, on='title', how="left")

#Change by imputer
data_pd.loc[data_pd.directions_n_steps.isna(),'directions_n_steps']=0
data_pd.loc[data_pd.ingredients_quantity.isna(),'ingredients_quantity']=0
data_pd.loc[data_pd.directions_n_characters.isna(),'directions_n_characters']=0

In [13]:
data_pd.shape

(15437, 685)

In [14]:
# Identifying locations:
column_names=np.array(data_pd.columns)
column_names=[c.strip().upper() for c in column_names]

gc = geonamescache.GeonamesCache()
countries = gc.get_countries_by_names()
cities = gc.get_cities()
states = gc.get_us_states()

dict_countries=dict()
for k,v in countries.items():
    dict_countries[k.upper()]=[v['geonameid'],v['iso'],v['iso3']]

dict_countries2=dict()
for k,v in countries.items():
    dict_countries2[v['iso'].upper()]=k.upper()

dict_cities=dict()
for k,v in cities.items():
    dict_cities[v['name'].upper()]=[v['geonameid'],v['countrycode']]

dict_states=dict()    
for k,v in states.items():
    dict_states[v['name'].upper()] = [v['geonameid'],v['code'], 'US' ]


In [15]:
# Make dummy variable of if there is a location listed or not

#1. Identifying countries, states and cities

list_col_drop=[]
for col in data_pd.columns:    
    if col.upper() in dict_countries:
        if col.upper() not in ['TURKEY']:
#            data_pd.loc[data_pd[col]==1,'country_id'] = dict_countries[col.upper()][1]
#            data_pd.loc[data_pd[col]==1,'country_name'] = col.upper()   
            list_col_drop.append(col)
 

    if col.upper() in dict_states:        
#        data_pd.loc[data_pd[col]==1,'state_id'] = dict_states[col.upper()][1]
#        data_pd.loc[data_pd[col]==1,'state_name'] = col.upper()   
#        data_pd.loc[data_pd[col]==1,'country_id'] = 'US'
#        data_pd.loc[data_pd[col]==1,'country_name'] = 'UNITED STATES' 
        list_col_drop.append(col)
 
    if col.upper() in dict_cities:     
        if col.upper() not in ['SPRING','ORANGE','WALNUT','LEEK','WEDDING','PLUM','TEQUILA','DATE','PAPAYA','MARSALA','SAKE','RYE','GOUDA','HOLIDAY']:
 #           data_pd.loc[data_pd[col]==1,'city_id'] = dict_cities[col.upper()][0]
 #           data_pd.loc[data_pd[col]==1,'city_name'] = col.upper()   
 #           data_pd.loc[data_pd[col]==1,'country_id'] = dict_cities[col.upper()][1]
 #           data_pd.loc[data_pd[col]==1,'country_name'] = dict_countries2[dict_cities[col.upper()][1]] 
            list_col_drop.append(col)

#2. Dropping individual location columns
#data_pd=data_pd.drop(list_col_drop, axis=1)

In [16]:
data_pd["locations"] = np.where(data_pd[list_col_drop].sum(axis=1)>0, 1, 0)

Fit scikit-learn model
----

In [17]:
# Using the below features in the model:
X = data_pd[["calories", "fat", "protein", "sodium", "22-minute meals",
             "3-ingredient recipes", "holiday", "allhealthy",
             "directions_n_characters", "ingredients_quantity",
             "directions_n_steps"]]

In [18]:
for col in X:
    print(col)
    print(any(X[col].isna()))
    print()

calories
False

fat
False

protein
False

sodium
False

22-minute meals
False

3-ingredient recipes
False

holiday
False

allhealthy
False

directions_n_characters
False

ingredients_quantity
False

directions_n_steps
False



In [19]:
any(y.isna())

True

In [20]:
# y (rating) only appears as 6 different floats between 1 and 4
# Convert floats to categorical variables, 0 to 6, for classification

y_discrete = pd.cut(y, bins=7, labels=np.arange(7), right=False)
y_discrete.value_counts()

0    10111
1     1896
2      272
3       48
4       13
5        9
6        2
Name: calories, dtype: int64

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y_discrete, test_size=0.20, stratify=y_discrete)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [None]:
clf = SVC(random_state=42).fit(X_train, y_train)

y_pred = clf.predict(X_train)
cm = confusion_matrix(y_train, y_pred)
print(cm)
print(classification_report(y_train, y_pred))

In [None]:
y_pred = clf.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)
print(classification_report(y_test, y_pred))

Evaluation Metric
----

In [None]:
y_pred = clf.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)
print(classification_report(y_test, y_pred))

In [None]:
print("Unweighted F1 score overall: ", round(f1_score(y_test, y_pred, average='macro'), 3))