<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Final-Project-Check-in" data-toc-modified-id="Final-Project-Check-in-1">Final Project Check-in</a></span></li><li><span><a href="#Group-Name" data-toc-modified-id="Group-Name-2">Group Name</a></span></li><li><span><a href="#Student-Names" data-toc-modified-id="Student-Names-3">Student Names</a></span></li><li><span><a href="#Load-Data" data-toc-modified-id="Load-Data-4">Load Data</a></span></li><li><span><a href="#Fit-scikit-learn-model" data-toc-modified-id="Fit-scikit-learn-model-5">Fit scikit-learn model</a></span></li><li><span><a href="#Evaluation-Metric" data-toc-modified-id="Evaluation-Metric-6">Evaluation Metric</a></span></li></ul></div>

Final Project Check-in
------

Group Name
-----

The Grace Hoppers 

Student Names
----

1. Akansha Shrivastava
2. Ivette Sulca
3. Bing Wang

Load Data
-----

In [1]:
from collections import Counter
import imblearn
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
import geonamescache  #sudo pip install geonamescache

In [2]:
data_pd=pd.read_csv('../../data/epi_r.csv')
data_json=pd.read_json('../../data/full_format_recipes.json',)

In [3]:
#Deleting null predictive variables from the dataset
data_pd=data_pd.loc[(data_pd.rating>0) & (~data_pd.rating.isna())]

#Deleting drinks
data_pd=data_pd.loc[(data_pd.drink==0) & (data_pd.drinks==0) & (data_pd.cocktail==0)]

#Deleting repeted titles
data_pd.drop_duplicates(subset=['title'],inplace=True)  

In [4]:
# Get out X and Y
X = pd.concat([data_pd.iloc[:,0], data_pd.iloc[:,2:]], axis=1)
y = data_pd.iloc[:,1]

In [5]:
# Create feature: n_words: Proxy feature for complexity (Sum across word features, from alabama to zuccini)
X["n_words"] = X.iloc[:,11:673].sum(axis=1)

In [6]:
# Create feature: holidays: Dummy for world holiday 
# (a celebratory day attached to date in calendar-- birthday, graduation, anniversary, etc. not included)
holidays = ["bastille day", "christmas", "christmas eve", "cinco de mayo", "columbus", 
 "diwali", "easter", "father's day", "fourth of july", "friendsgiving", "halloween",
 "hanukkah", "kwanzaa", "labor day", "lunar new year", "mother's day", "new year's day",
 "new year's eve", "oktoberfest", "passover", "persian new year", "purim", "ramadan", 
 "rosh hashanah/yom kippur", "st. patrick's day", "sukkot", "thanksgiving", 
 "valentine's day"]

X["holiday"] = np.where(X[holidays].sum(axis=1)>0, 1, 0)

In [7]:
#FAT: In grams but it can mislead depending of the number of portions (Paella for example)

#Delete bigger portions(Paella) and outliers
y=y.loc[((X.fat>=0) & (X.fat<=200)) | (X.fat.isna())]
X=X.loc[((X.fat>=0) & (X.fat<=200)) | (X.fat.isna())]

#Median imputer
imp = SimpleImputer(missing_values=np.nan, strategy='median')
fat_clean = imp.fit_transform(X.fat.values.reshape(-1,1))
fat_clean = pd.DataFrame(data=fat_clean ,columns=['fat_clean'])
X['fat']=fat_clean.fat_clean.values

In [8]:
#PROTEIN CLEANING

#Again, considering values lower than 200: deleting 62 rows...
y=y.loc[((X.protein>=0) & (X.protein<=200)) | (X.protein.isna())]
X=X.loc[((X.protein>=0) & (X.protein<=200)) | (X.protein.isna())]

#Median imputer
imp = SimpleImputer(missing_values=np.nan, strategy='median')
protein_clean = imp.fit_transform(X.protein.values.reshape(-1,1))
protein_clean = pd.DataFrame(data=protein_clean ,columns=['protein_clean'])
X['protein']=protein_clean.protein_clean.values


In [9]:
# SODIUM

#Unit: miligrams
#Very different values, so we will impute only:

imp = SimpleImputer(missing_values=np.nan, strategy='median')
sodium_clean = imp.fit_transform(X.sodium.values.reshape(-1,1))
sodium_clean = pd.DataFrame(data=sodium_clean ,columns=['sodium_clean'])
X['sodium']=sodium_clean.sodium_clean.values

In [10]:
# filtering columns related to "healthy"

# selecting all the relevant columns

selected = ["fat free", "healthy", "low cal", "quick and healthy", 
"low carb",
"low cholesterol",
"low fat",
"low sodium",
"low sugar",
"low/no sugar"]

In [11]:
# filtering rows which have "selected" columns as 1 -> healthy
data_pd = X

data_pd["allhealthy"] = 0
for col in selected:
    data_pd.loc[data_pd[col] == 1, "allhealthy"] = 1

In [12]:
# replacing missing values in calories by mean

median = data_pd["calories"].median()
data_pd.loc[data_pd["calories"].isna(), "calories"] = median

In [13]:
# Adding complexity preparation from JSON file

data_json.drop_duplicates(subset=['title'],inplace=True)  
data_json['directions_n_characters']=data_json['directions'].astype(str).str.len()
data_json['ingredients_quantity']=data_json['ingredients'].str.len()
data_json['directions_n_steps']=data_json['directions'].astype(str).str.replace('[','').str.replace(']','').str.split("',").apply(lambda x: len(x))
data_json=data_json[['title','directions_n_characters','ingredients_quantity','directions_n_steps']]


data_pd=pd.merge(data_pd, data_json, on='title', how="left")

#Change by imputer
data_pd.loc[data_pd.directions_n_steps.isna(),'directions_n_steps']=0
data_pd.loc[data_pd.ingredients_quantity.isna(),'ingredients_quantity']=0
data_pd.loc[data_pd.directions_n_characters.isna(),'directions_n_characters']=0


In [14]:
data_pd.shape

(15437, 685)

In [15]:
# Identifying locations:
column_names=np.array(data_pd.columns)
column_names=[c.strip().upper() for c in column_names]

gc = geonamescache.GeonamesCache()
countries = gc.get_countries_by_names()
cities = gc.get_cities()
states = gc.get_us_states()

dict_countries=dict()
for k,v in countries.items():
    dict_countries[k.upper()]=[v['geonameid'],v['iso'],v['iso3']]

dict_countries2=dict()
for k,v in countries.items():
    dict_countries2[v['iso'].upper()]=k.upper()

dict_cities=dict()
for k,v in cities.items():
    dict_cities[v['name'].upper()]=[v['geonameid'],v['countrycode']]

dict_states=dict()    
for k,v in states.items():
    dict_states[v['name'].upper()] = [v['geonameid'],v['code'], 'US' ]


In [16]:
# Dropping locations:
#DECISION: DROP LOCATION COLUMNS(99 columns, most of them are null and is note helpful)
#CHECK AT THE END HOW MANY OF THEM ARE 

data_pd['country_id']=np.nan
data_pd['country_name']=np.nan
data_pd['state_id']=np.nan
data_pd['state_name']=np.nan
data_pd['city_id']=np.nan
data_pd['city_name']=np.nan


#1. Identifying countries, states and cities

list_col_drop=[]
for col in data_pd.columns:    
    if col.upper() in dict_countries:
        if col.upper() not in ['TURKEY']:
#            data_pd.loc[data_pd[col]==1,'country_id'] = dict_countries[col.upper()][1]
#            data_pd.loc[data_pd[col]==1,'country_name'] = col.upper()   
            list_col_drop.append(col)
 

    if col.upper() in dict_states:        
#        data_pd.loc[data_pd[col]==1,'state_id'] = dict_states[col.upper()][1]
#        data_pd.loc[data_pd[col]==1,'state_name'] = col.upper()   
#        data_pd.loc[data_pd[col]==1,'country_id'] = 'US'
#        data_pd.loc[data_pd[col]==1,'country_name'] = 'UNITED STATES' 
        list_col_drop.append(col)
 
    if col.upper() in dict_cities:     
        if col.upper() not in ['SPRING','ORANGE','WALNUT','LEEK','WEDDING','PLUM','TEQUILA','DATE','PAPAYA','MARSALA','SAKE','RYE','GOUDA','HOLIDAY']:
 #           data_pd.loc[data_pd[col]==1,'city_id'] = dict_cities[col.upper()][0]
 #           data_pd.loc[data_pd[col]==1,'city_name'] = col.upper()   
 #           data_pd.loc[data_pd[col]==1,'country_id'] = dict_cities[col.upper()][1]
 #           data_pd.loc[data_pd[col]==1,'country_name'] = dict_countries2[dict_cities[col.upper()][1]] 
            list_col_drop.append(col)

#2. Dropping individual location columns
data_pd=data_pd.drop(list_col_drop, axis=1)

Fit scikit-learn model
----

In [17]:
# Using the below features in the model:
X = data_pd[["calories", "fat", "protein", "sodium", "22-minute meals",
       "3-ingredient recipes", "n_words", "holiday", "allhealthy",
      "directions_n_characters"
       ,"ingredients_quantity",
       "directions_n_steps"
      ]]


In [18]:
# y (rating) only appears as 6 different floats between 1 and 4
# Convert floats to categorical variables, 0 to 6, for classification

y_discrete = pd.cut(y, bins=7, labels=np.arange(7), right=False)
y_discrete.value_counts()

5    6966
4    4479
6    2044
3    1281
2     445
0     123
1      99
Name: rating, dtype: int64

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y_discrete, test_size=0.20, stratify=y_discrete)

In [20]:
clf = SVC(random_state=42).fit(X_train, y_train)

y_pred = clf.predict(X_train)
cm = confusion_matrix(y_train, y_pred)
print(cm)
print(classification_report(y_train, y_pred))



[[  82    0    0    0    4   11    1]
 [   0   59    0    0    5   14    1]
 [   0    0  307    0   10   38    1]
 [   0    0    0  931   34   59    1]
 [   0    0    0    0 3519   64    0]
 [   0    0    0    0   14 5559    0]
 [   0    0    0    1   40   77 1517]]
              precision    recall  f1-score   support

           0       1.00      0.84      0.91        98
           1       1.00      0.75      0.86        79
           2       1.00      0.86      0.93       356
           3       1.00      0.91      0.95      1025
           4       0.97      0.98      0.98      3583
           5       0.95      1.00      0.98      5573
           6       1.00      0.93      0.96      1635

    accuracy                           0.97     12349
   macro avg       0.99      0.89      0.94     12349
weighted avg       0.97      0.97      0.97     12349



In [21]:
y_pred = clf.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)
print(classification_report(y_test, y_pred))

[[   0    0    0    0    0   25    0]
 [   0    0    0    0    1   19    0]
 [   0    0    0    1    2   86    0]
 [   0    0    0    0    8  247    1]
 [   0    0    0    0   34  860    2]
 [   0    0    0    2   48 1340    3]
 [   0    0    0    0   18  390    1]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        25
           1       0.00      0.00      0.00        20
           2       0.00      0.00      0.00        89
           3       0.00      0.00      0.00       256
           4       0.31      0.04      0.07       896
           5       0.45      0.96      0.61      1393
           6       0.14      0.00      0.00       409

    accuracy                           0.45      3088
   macro avg       0.13      0.14      0.10      3088
weighted avg       0.31      0.45      0.30      3088



  'precision', 'predicted', average, warn_for)


Evaluation Metric
----

In [None]:
y_pred = clf.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)
print(classification_report(y_test, y_pred))

In [None]:
print("Unweighted F1 score overall: ", round(f1_score(y_test, y_pred, average='macro'), 3))