# Final Project - Food.com Ratings Classification


_Name_:  **Jimmy Nguyen**, **Jose Luis Estrada**, **Ashutosh Singh**

_Class Assignment_: **ADS 504 Final Project - Baseline Models**

# Packages

In [1]:
import json
import os
import random
import re

import matplotlib.pylab as pylab
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.sparse
import seaborn as sns

from sklearn import preprocessing
from sklearn import tree
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

import warnings # standard-library warning control, used on the next line
warnings.filterwarnings('ignore') # hide all warnings in cell output (deprecation notices included)
%matplotlib inline  
plt.style.use('seaborn')
# Do not truncate long string columns (e.g. review text) when frames are displayed.
pd.set_option('display.max_colwidth', None)

---- 

# Linear Classifier (Logistic) Model

## Interactions Data 

In [2]:
# Load the raw user/recipe interactions table and preview the first rows.
interact = pd.read_csv(filepath_or_buffer="data/RAW_interactions.csv")
interact.head(n=5)

Unnamed: 0,user_id,recipe_id,date,rating,review
0,38094,40893,2003-02-17,4,Great with a salad. Cooked on top of stove for 15 minutes.Added a shake of cayenne and a pinch of salt. Used low fat sour cream. Thanks.
1,1293707,40893,2011-12-21,5,"So simple, so delicious! Great for chilly fall evening. Should have doubled it ;)<br/><br/>Second time around, forgot the remaining cumin. We usually love cumin, but didn't notice the missing 1/2 teaspoon!"
2,8937,44394,2002-12-01,4,This worked very well and is EASY. I used not quite a whole package (10oz) of white chips. Great!
3,126440,85009,2010-02-27,5,I made the Mexican topping and took it to bunko. Everyone loved it.
4,57222,85009,2011-10-01,5,"Made the cheddar bacon topping, adding a sprinkling of black pepper. Yum!"


### Select only needed columns 

In [3]:
# Keep only the recipe key, the review text and the star rating.
interact = interact.loc[:, ['recipe_id', 'review', 'rating']]
interact.head(n=5)

Unnamed: 0,recipe_id,review,rating
0,40893,Great with a salad. Cooked on top of stove for 15 minutes.Added a shake of cayenne and a pinch of salt. Used low fat sour cream. Thanks.,4
1,40893,"So simple, so delicious! Great for chilly fall evening. Should have doubled it ;)<br/><br/>Second time around, forgot the remaining cumin. We usually love cumin, but didn't notice the missing 1/2 teaspoon!",5
2,44394,This worked very well and is EASY. I used not quite a whole package (10oz) of white chips. Great!,4
3,85009,I made the Mexican topping and took it to bunko. Everyone loved it.,5
4,85009,"Made the cheddar bacon topping, adding a sprinkling of black pepper. Yum!",5


In [4]:
# (rows, columns) of the interactions table after column selection.
interact.shape

(1132367, 3)

### Encode Ratings to Positive and Negative Classes

In [5]:
# Drop 3-star reviews, then binarise what is left.
#
# NOTE(review): despite its name, `Positively_Rated` is 1 when rating < 3 and
# 0 for every other remaining rating, so label 1 marks the low-rated class.
# The column name is kept because later cells reference it.
# NOTE(review): any rating value of 0 is also mapped to 1 here -- confirm
# whether 0 means "no rating given" in this dataset.
#
# .copy() makes the filtered frame independent of the original so that adding
# a column below is not a chained assignment on a slice.
interact = interact[interact['rating'] != 3].copy()
interact['Positively_Rated'] = np.where(interact['rating'] < 3, 1, 0)
interact = interact.drop(columns=['rating'])

In [6]:
# Row count per label value (1 = rating < 3, 0 = otherwise).
interact['Positively_Rated'].value_counts()

0    1003724
1      87788
Name: Positively_Rated, dtype: int64

### Handle Missing Values 

In [7]:
# Remove every row that has a missing value in any remaining column.
interact = interact.dropna(axis=0, how='any')

In [8]:
# Per-column count of missing values; all zeros confirms the drop above worked.
interact.isna().sum()

recipe_id           0
review              0
Positively_Rated    0
dtype: int64

### Final Class Proportion 

In [9]:
# Share of rows per label value.
interact['Positively_Rated'].value_counts() / len(interact)

0    0.919564
1    0.080436
Name: Positively_Rated, dtype: float64

### Merge Interact with Recipes

In [10]:
%%time

# Load the recipe metadata and attach it to every review through the recipe key
# (recipes.id == interact.recipe_id); reviews without a matching recipe are dropped.
recipes = pd.read_csv("data/RAW_recipes.csv")

df = recipes.merge(interact,
                   how="inner",
                   left_on='id',
                   right_on='recipe_id')

CPU times: user 3.8 s, sys: 177 ms, total: 3.98 s
Wall time: 3.99 s


## Convert Review Text into Features

In [11]:
# Pull the review texts out as a plain Python list for the vectoriser.
corpus_df = df.loc[:, ['review']]
corpus = corpus_df['review'].to_list()
corpus[:1]

[' I used an acorn squash and recipe#137681 Sweet Mexican spice blend. Only used 1 tsp honey & 1 tsp butter between both halves,, sprinkled the squash liberally with the spice mix. Baked covered for 45 minutes uncovered or 15.  I basted the squash   with the the butter/honey from the cavity  allowing it to get a golden color.  Lovely Squash recipe Thanks Cookgirl']

In [12]:
%%time

# Turn every review into a TF-IDF vector (English stop words removed).
vectorizer = TfidfVectorizer(stop_words='english')
reviews = vectorizer.fit_transform(corpus)

# scikit-learn 1.0 added get_feature_names_out() and 1.2 removed
# get_feature_names(); use whichever this installation provides.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# Wrap the sparse matrix in a sparse-backed DataFrame, one column per term.
reviews = pd.DataFrame.sparse.from_spmatrix(reviews, columns=feature_names)

# Exclude six terms from the vocabulary before feature selection.
# NOTE(review): presumably HTML-entity residue ('039', 'quot') and
# site-specific words -- confirm the rationale.
reviews = reviews.drop(columns=['wasn', '039',
                                'quot', 'sounds',
                                'tag', 'posting'])
reviews.head()

CPU times: user 1min 48s, sys: 928 ms, total: 1min 49s
Wall time: 1min 49s


Unnamed: 0,00,000,0000,000000,0000001,0000laalaa,000170,000ft,000g,000mg,...,œvolcano,œwasteâ,œwe,œwhat,œwhiteâ,œwow,œyes,œzipâ,šo,šopsky
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Feature Selection - Top 20 Features 

In [13]:
# Binary target taken from the merged frame (1 = rating < 3, 0 = otherwise).
y = df['Positively_Rated']

In [14]:
# Score every TF-IDF term against the label with the chi-squared statistic
# and keep the 20 highest-scoring terms.
selector = SelectKBest(score_func=chi2, k=20)
selector.fit(reviews, y)

# Positional indices of the selected columns, reused by later cells.
cols = selector.get_support(indices=True)
top_20 = reviews.iloc[:, cols]

# One-column table listing the selected terms.
top20 = pd.DataFrame({"Top 20 Features": top_20.columns})
top20

Unnamed: 0,Top 20 Features
0,awful
1,bad
2,bland
3,disappointed
4,disappointing
5,disgusting
6,easy
7,edible
8,horrible
9,inedible


In [15]:
# Baseline feature matrix: only the TF-IDF columns picked by the selector.
X = reviews.iloc[:,cols]
X.head()

Unnamed: 0,awful,bad,bland,disappointed,disappointing,disgusting,easy,edible,horrible,inedible,ok,sorry,tasteless,terrible,thanks,waste,wasted,worst,wrong,yuck
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.046001,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.127171,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
# Names of the selected feature columns.
X.columns

Index(['awful', 'bad', 'bland', 'disappointed', 'disappointing', 'disgusting',
       'easy', 'edible', 'horrible', 'inedible', 'ok', 'sorry', 'tasteless',
       'terrible', 'thanks', 'waste', 'wasted', 'worst', 'wrong', 'yuck'],
      dtype='object')

In [None]:
# (rows, columns) of the baseline feature matrix.
X.shape

# Baseline Model 

### L1

In [None]:
# Regularisation strengths to sweep (passed as SGDClassifier's `alpha`).
alphas = [0.0001,0.001,0.01,0.1,1,10,100,1000]
# Penalty type used by the sweep in the next cell.
penalty = 'l1'

In [None]:
%%time
# Mean 5-fold F-1 for each alpha with the current `penalty`.
l1_scores = []

# The splitter does not depend on alpha, so build it once for the whole sweep.
cv = StratifiedKFold(n_splits=5, random_state=None)

for a in alphas:
    # Logistic-loss SGD classifier with balanced class weights.
    log = make_pipeline(
        SGDClassifier(
            loss="log",
            penalty=penalty,
            alpha=a,
            class_weight="balanced",
            max_iter=1000,
            tol=1e-3,
            random_state=1,
        )
    )
    fold_scores = cross_val_score(log, X, y, cv=cv, scoring='f1', n_jobs=-1)
    avg_score = np.mean(fold_scores)

    print("Current Alpha:", a, ' Average F-1 Score: ', round(avg_score,4))
    l1_scores.append(avg_score)
    

### L2

In [None]:
# Switch the sweep in the next cell to the L2 penalty.
penalty = 'l2'

In [None]:
%%time
# Mean 5-fold F-1 for each alpha with the current `penalty`.
l2_scores = []

# Build the stratified splitter once and actually pass it to cross_val_score.
# Previously it was constructed on every iteration but `cv=5` was passed
# instead, leaving the object unused. For a classifier an integer `cv` also
# resolves to unshuffled stratified folds, so the folds are unchanged; this
# simply matches the L1 sweep.
cv = StratifiedKFold(n_splits=5, random_state=None)

for a in alphas:
    # Logistic-loss SGD classifier with balanced class weights.
    log = make_pipeline(SGDClassifier(loss="log",
                                      penalty=penalty,
                                      alpha=a,
                                      class_weight="balanced",
                                      max_iter=1000,
                                      tol=1e-3,
                                      random_state=1))
    avg_score = np.mean(cross_val_score(log, X, y,
                                        cv=cv, scoring='f1',
                                        n_jobs=-1))

    print("Current Alpha:", a, ' Average F-1 Score: ', round(avg_score,4))
    l2_scores.append(avg_score)

## Table Report 

In [None]:
# Side-by-side sweep results, indexed by alpha.
# NOTE(review): in this baseline section l1_scores / l2_scores were computed
# with scoring='f1', so the "Accuracy" columns actually hold mean F-1 scores.
# The labels are left as-is because the plotting cell looks columns up by name.
table_report = {
    'Alphas': alphas,
    'L1 Penalty - Accuracy': l1_scores,
    'L2 Penalty - Accuracy': l2_scores,
}

table_df = pd.DataFrame(table_report).set_index('Alphas')
table_df

## Regularization F-1 Score Plot

In [None]:
# Score versus alpha for both penalties, on a logarithmic alpha axis.
fig, ax = plt.subplots(figsize=(7, 5))

for column, label in (("L1 Penalty - Accuracy", 'L1 Penalty'),
                      ("L2 Penalty - Accuracy", 'L2 Penalty')):
    ax.plot(table_df.index, table_df[column], label=label)

ax.set_xscale("log")
ax.set_xlabel('Alphas')
ax.set_ylabel('Average F-1 Score')
ax.legend()
plt.show()

## Optimal Baseline Model 

In [None]:
# Hold out 33% of the rows for testing; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state = 1)
# Sanity check on the resulting split sizes.
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

In [None]:
# Hyper-parameters for the baseline model fitted below.
# NOTE(review): presumably chosen from the sweep table above -- confirm.
penalty = 'l2'
alpha = 0.1

In [None]:
# Baseline estimator: logistic-loss SGD with the chosen penalty and alpha,
# balanced class weights and a fixed seed.
log = make_pipeline(
    SGDClassifier(
        loss="log",
        penalty=penalty,
        alpha=alpha,
        class_weight="balanced",
        max_iter=1000,
        tol=1e-3,
        random_state=1,
    )
)

In [None]:
# Fit the baseline on the training split.
log.fit(X_train, y_train)

# Predict labels for the held-out split.
ypred = log.predict(X_test)

# F-1 on the held-out split; f1_score scores label 1 by default.
# (f1_score is imported with the other sklearn names at the top of the
# notebook instead of in the middle of this cell.)
f1 = f1_score(y_test, ypred)
print('F1: ', f1)

## Classification Report 

In [None]:
# Per-class precision / recall / F-1 on the held-out split.
print(classification_report(y_test, ypred))

-------------------

# Improving the Baseline Model 

## Feature Engineering

In [17]:
%%time
# Expand the `nutrition` column into seven numeric columns.
# Each value appears to be a bracketed, comma-separated list stored as one
# string: strip the brackets, split on commas and convert to float.
#
# Fix: the float conversion used to be assigned to a misspelled column
# ('total fat ' with a trailing space). That created an extra column, which
# was then removed by dropping the frame's last column, so 'total fat' itself
# silently stayed as strings. All seven columns are now numeric and no
# positional column drop is needed; the resulting columns are unchanged.
nutrition_cols = ['calories',
                  'total fat',
                  'sugar',
                  'sodium',
                  'protein',
                  'saturated fat',
                  'carbohydrates']

df[nutrition_cols] = (
    df['nutrition']
    .str.strip('[]')
    .str.split(',', expand=True)
    .astype('float')
)

# Drop identifiers and raw columns that are not used as model inputs.
df = df.drop(columns=['id', 'name', 'nutrition',
                      'ingredients', 'recipe_id'])
df.head()

CPU times: user 5.84 s, sys: 732 ms, total: 6.57 s
Wall time: 6.59 s


Unnamed: 0,minutes,contributor_id,submitted,tags,n_steps,steps,description,n_ingredients,review,Positively_Rated,calories,total fat,sugar,sodium,protein,saturated fat,carbohydrates
0,55,47892,2005-09-16,"['60-minutes-or-less', 'time-to-make', 'course', 'main-ingredient', 'cuisine', 'preparation', 'occasion', 'north-american', 'side-dishes', 'vegetables', 'mexican', 'easy', 'fall', 'holiday-event', 'vegetarian', 'winter', 'dietary', 'christmas', 'seasonal', 'squash']",11,"['make a choice and proceed with recipe', 'depending on size of squash , cut into half or fourths', 'remove seeds', 'for spicy squash , drizzle olive oil or melted butter over each cut squash piece', 'season with mexican seasoning mix ii', 'for sweet squash , drizzle melted honey , butter , grated piloncillo over each cut squash piece', 'season with sweet mexican spice mix', 'bake at 350 degrees , again depending on size , for 40 minutes up to an hour , until a fork can easily pierce the skin', 'be careful not to burn the squash especially if you opt to use sugar or butter', 'if you feel more comfortable , cover the squash with aluminum foil the first half hour , give or take , of baking', 'if desired , season with salt']","autumn is my favorite time of year to cook! this recipe \r\ncan be prepared either spicy or sweet, your choice!\r\ntwo of my posted mexican-inspired seasoning mix recipes are offered as suggestions.",7,"I used an acorn squash and recipe#137681 Sweet Mexican spice blend. Only used 1 tsp honey & 1 tsp butter between both halves,, sprinkled the squash liberally with the spice mix. Baked covered for 45 minutes uncovered or 15. I basted the squash with the the butter/honey from the cavity allowing it to get a golden color. Lovely Squash recipe Thanks Cookgirl",0,51.5,0.0,13.0,0.0,2.0,0.0,4.0
1,55,47892,2005-09-16,"['60-minutes-or-less', 'time-to-make', 'course', 'main-ingredient', 'cuisine', 'preparation', 'occasion', 'north-american', 'side-dishes', 'vegetables', 'mexican', 'easy', 'fall', 'holiday-event', 'vegetarian', 'winter', 'dietary', 'christmas', 'seasonal', 'squash']",11,"['make a choice and proceed with recipe', 'depending on size of squash , cut into half or fourths', 'remove seeds', 'for spicy squash , drizzle olive oil or melted butter over each cut squash piece', 'season with mexican seasoning mix ii', 'for sweet squash , drizzle melted honey , butter , grated piloncillo over each cut squash piece', 'season with sweet mexican spice mix', 'bake at 350 degrees , again depending on size , for 40 minutes up to an hour , until a fork can easily pierce the skin', 'be careful not to burn the squash especially if you opt to use sugar or butter', 'if you feel more comfortable , cover the squash with aluminum foil the first half hour , give or take , of baking', 'if desired , season with salt']","autumn is my favorite time of year to cook! this recipe \r\ncan be prepared either spicy or sweet, your choice!\r\ntwo of my posted mexican-inspired seasoning mix recipes are offered as suggestions.",7,This was a nice change. I used butternut squash and the sweet option using a good local honey and unsalted butter. I did not add salt. We ate this on top of recipe#322603 with Balkan yogurt. I may make this again same option. Made for Ramadan Tag 2010.,0,51.5,0.0,13.0,0.0,2.0,0.0,4.0
2,55,47892,2005-09-16,"['60-minutes-or-less', 'time-to-make', 'course', 'main-ingredient', 'cuisine', 'preparation', 'occasion', 'north-american', 'side-dishes', 'vegetables', 'mexican', 'easy', 'fall', 'holiday-event', 'vegetarian', 'winter', 'dietary', 'christmas', 'seasonal', 'squash']",11,"['make a choice and proceed with recipe', 'depending on size of squash , cut into half or fourths', 'remove seeds', 'for spicy squash , drizzle olive oil or melted butter over each cut squash piece', 'season with mexican seasoning mix ii', 'for sweet squash , drizzle melted honey , butter , grated piloncillo over each cut squash piece', 'season with sweet mexican spice mix', 'bake at 350 degrees , again depending on size , for 40 minutes up to an hour , until a fork can easily pierce the skin', 'be careful not to burn the squash especially if you opt to use sugar or butter', 'if you feel more comfortable , cover the squash with aluminum foil the first half hour , give or take , of baking', 'if desired , season with salt']","autumn is my favorite time of year to cook! this recipe \r\ncan be prepared either spicy or sweet, your choice!\r\ntwo of my posted mexican-inspired seasoning mix recipes are offered as suggestions.",7,Excellent recipe! I used butternut squash and the sweet option. The mexican spice mix put this over the top. Thanks for sharing.,0,51.5,0.0,13.0,0.0,2.0,0.0,4.0
3,30,26278,2002-06-17,"['30-minutes-or-less', 'time-to-make', 'course', 'main-ingredient', 'cuisine', 'preparation', 'occasion', 'north-american', 'breakfast', 'main-dish', 'pork', 'american', 'oven', 'easy', 'kid-friendly', 'pizza', 'dietary', 'northeastern-united-states', 'meat', 'equipment']",9,"['preheat oven to 425 degrees f', 'press dough into the bottom and sides of a 12 inch pizza pan', 'bake for 5 minutes until set but not browned', 'cut sausage into small pieces', 'whisk eggs and milk in a bowl until frothy', 'spoon sausage over baked crust and sprinkle with cheese', 'pour egg mixture slowly over sausage and cheese', 's& p to taste', 'bake 15-20 minutes or until eggs are set and crust is brown']",this recipe calls for the crust to be prebaked a bit before adding ingredients. feel free to change sausage to ham or bacon. this warms well in the microwave for those late risers.,6,"Have not tried this, but it sounds delicious. Reminds me of a layover I had at the Atlanta airport. I had a ham, egg, and cheese pizza at one of the pizza chain places on the concourse. About $2.99 with coffee... It was one of the best breakfast dishes I ever had! (But a strange place to find a delicious breakfast...lol)",1,173.4,18.0,0.0,17.0,22.0,35.0,1.0
4,30,26278,2002-06-17,"['30-minutes-or-less', 'time-to-make', 'course', 'main-ingredient', 'cuisine', 'preparation', 'occasion', 'north-american', 'breakfast', 'main-dish', 'pork', 'american', 'oven', 'easy', 'kid-friendly', 'pizza', 'dietary', 'northeastern-united-states', 'meat', 'equipment']",9,"['preheat oven to 425 degrees f', 'press dough into the bottom and sides of a 12 inch pizza pan', 'bake for 5 minutes until set but not browned', 'cut sausage into small pieces', 'whisk eggs and milk in a bowl until frothy', 'spoon sausage over baked crust and sprinkle with cheese', 'pour egg mixture slowly over sausage and cheese', 's& p to taste', 'bake 15-20 minutes or until eggs are set and crust is brown']",this recipe calls for the crust to be prebaked a bit before adding ingredients. feel free to change sausage to ham or bacon. this warms well in the microwave for those late risers.,6,This recipe was wonderful. Instead of using the precooked sausage I substituted uncooked sausage then cooked and drained it. It turned out perfect!,0,173.4,18.0,0.0,17.0,22.0,35.0,1.0


In [18]:
# (rows, columns) after the nutrition columns were added and unused ones dropped.
df.shape

(1091346, 17)

## Combine with Review Data 

In [None]:
%%time
# Put the recipe-level columns and the selected TF-IDF terms side by side.
top_20 = reviews.iloc[:, cols]

# Rows are matched purely by position, so both frames must be the same length.
assert len(df) == len(top_20), "df and top_20 must have one row per review"

# pd.concat keeps each column's dtype. np.hstack, used previously, coerces
# the mixed frame to a single object-dtype array and densifies the sparse
# TF-IDF columns.
final_df = pd.concat([df.reset_index(drop=True),
                      top_20.reset_index(drop=True)],
                     axis=1)

# The raw review text is already represented by the TF-IDF columns.
final_df = final_df.drop(columns=['review'])

In [None]:
# Keep the first row for each unique combination of the recipe-level numeric
# columns listed below.
# NOTE(review): presumably meant to leave one row per recipe; this also
# discards the label and TF-IDF values of any later reviews -- confirm intent.
final_df = final_df.drop_duplicates(
    subset=[
        'minutes',
        'n_steps',
        'n_ingredients',
        'calories',
        'total fat',
        'sugar',
        'sodium',
        'protein',
        'saturated fat',
        'carbohydrates',
    ],
    keep='first',
)

final_df.head(n=5)

In [None]:
# (rows, columns) after de-duplication.
final_df.shape

## Final Linear Classifier

In [None]:
# Feature matrix for the improved model.
#
# Fix: the raw 'rating' column no longer exists (it was replaced by
# 'Positively_Rated' when the label was encoded), so dropping 'rating' raised
# a KeyError. The label column is removed instead, together with the date and
# free-text columns that StandardScaler cannot convert to numbers.
# NOTE(review): 'contributor_id' is left in as a numeric column -- confirm
# whether an identifier should really be used as a feature.
non_feature_cols = ['Positively_Rated', 'submitted', 'tags',
                    'steps', 'description']
X = final_df.drop(columns=non_feature_cols).astype('float')
X.head()

In [None]:
# Target for the improved model.
#
# Fix: 'rating' was dropped when the binary label was created, so the label
# has to be read from 'Positively_Rated' (1 = rating < 3, 0 = otherwise).
y = final_df['Positively_Rated'].astype(int)

# Encode the labels as consecutive integers and keep their string names.
le = preprocessing.LabelEncoder()
y = le.fit_transform(y)
rating_labels = [str(i) for i in le.classes_]
rating_labels

### L1

In [None]:
# Regularisation strengths to sweep (passed as SGDClassifier's `alpha`).
alphas = [0.0001,0.001,0.01,0.1,1,10,100,1000]
# Penalty type used by the sweep in the next cell.
penalty = 'l1'

In [None]:
%%time
# Mean 5-fold accuracy for each alpha with the current `penalty`.
l1_scores = []

for a in alphas:
    print("Current Alpha:", a)

    # Standardise the features, then fit a logistic-loss SGD classifier.
    # Class weights are left at their default here (not "balanced").
    log = make_pipeline(
        StandardScaler(),
        SGDClassifier(
            loss="log",
            penalty=penalty,
            alpha=a,
            max_iter=1000,
            tol=1e-3,
            random_state=1,
        ),
    )
    fold_scores = cross_val_score(log, X, y, cv=5, scoring='accuracy', n_jobs=-1)
    avg_score = np.mean(fold_scores)

    print("Current Average Score:", avg_score)
    l1_scores.append(avg_score)
    

### L2

In [None]:
# Switch the sweep in the next cell to the L2 penalty.
penalty = 'l2'

In [None]:
%%time
# Mean 5-fold accuracy for each alpha with the current `penalty`.
l2_scores = []

for a in alphas:
    print("Current Alpha:", a)

    # Standardise the features, then fit a logistic-loss SGD classifier.
    # Class weights are left at their default here (not "balanced").
    log = make_pipeline(
        StandardScaler(),
        SGDClassifier(
            loss="log",
            penalty=penalty,
            alpha=a,
            max_iter=1000,
            tol=1e-3,
            random_state=1,
        ),
    )
    fold_scores = cross_val_score(log, X, y, cv=5, scoring='accuracy', n_jobs=-1)
    avg_score = np.mean(fold_scores)

    print("Current Average Score:", avg_score)
    l2_scores.append(avg_score)
    

### Table Report 

In [None]:
# Side-by-side mean cross-validated accuracy per alpha for both penalties.
table_report = {
    'Alphas': alphas,
    'L1 Penalty - Accuracy': l1_scores,
    'L2 Penalty - Accuracy': l2_scores,
}

table_df = pd.DataFrame(table_report).set_index('Alphas')
table_df

### Regularization Accuracy Plot 

In [None]:
# Mean cross-validated accuracy versus alpha for both penalties.
fig, ax = plt.subplots(figsize=(7, 5))

ax.plot(table_df.index,
        table_df["L1 Penalty - Accuracy"],
        label='L1 Penalty')
ax.plot(table_df.index,
        table_df["L2 Penalty - Accuracy"],
        label='L2 Penalty')

# Alphas span several orders of magnitude, so use a logarithmic x axis.
ax.set(xscale="log")

# Axis labels were missing here although the baseline plot has them.
ax.set_xlabel('Alphas')
ax.set_ylabel('Average Accuracy')

plt.legend()
plt.show()

## Final Model 

In [None]:
# Hold out 33% of the rows for testing; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state = 1)
# Sanity check on the resulting split sizes.
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

In [None]:
# Hyper-parameters used for the final model.
penalty = 'l1'
alpha = 0.0001

# Standardised features feeding a logistic-loss SGD classifier.
log = make_pipeline(
    StandardScaler(),
    SGDClassifier(
        loss="log",
        penalty=penalty,
        alpha=alpha,
        max_iter=1000,
        tol=1e-3,
        random_state=1,
    ),
)

# Train on the training split, then predict the held-out split.
ypred = log.fit(X_train, y_train).predict(X_test)

# Accuracy on the held-out split.
acc = accuracy_score(y_test, ypred)
print('Accuracy: %.3f' % acc)

## Logistic Model Confusion Matrix

In [None]:
# Confusion matrix of the fitted pipeline on the held-out split.
# NOTE(review): plot_confusion_matrix is deprecated in newer scikit-learn
# releases -- confirm the installed version still provides it.
plot_confusion_matrix(log, X_test, y_test)
plt.show()

## Logistic Model Classification Report

In [None]:
# Per-class precision / recall / F-1 on the held-out split.
print(classification_report(y_test, ypred))