In [1]:
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

%matplotlib inline

In [2]:
# Location of the Epicurious recipe table, relative to the notebook's working directory
recipes_csv = 'epicurious-recipes-with-rating-and-nutrition/epi_r.csv'
raw_data = pd.read_csv(recipes_csv)

# Show the first rows as the cell output
raw_data.head()

Unnamed: 0,title,rating,calories,protein,fat,sodium,#cakeweek,#wasteless,22-minute meals,3-ingredient recipes,...,yellow squash,yogurt,yonkers,yuca,zucchini,cookbooks,leftovers,snack,snack week,turkey
0,"Lentil, Apple, and Turkey Wrap",2.5,426.0,30.0,7.0,559.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,Boudin Blanc Terrine with Red Onion Confit,4.375,403.0,18.0,23.0,1439.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Potato and Fennel Soup Hodge,3.75,165.0,6.0,7.0,165.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Mahi-Mahi in Tomato Olive Sauce,5.0,,,,,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Spinach Noodle Casserole,3.125,547.0,20.0,32.0,452.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
# Turn the numeric rating into a binary outcome:
# ratings of 4 or higher are labelled good recipes (1), anything lower is bad (0).
GOOD_RATING_THRESHOLD = 4.

# This cell overwrites raw_data['rating'] in place. Once binarized, the column holds
# only 0/1, so thresholding it a second time would relabel every recipe as 0.
# Skip the conversion when the column is already binary so the cell is safe to re-run.
if not raw_data['rating'].isin([0, 1]).all():
    raw_data['rating'] = np.where(raw_data['rating'] >= GOOD_RATING_THRESHOLD, 1, 0)

# Share of recipes labelled good. On this dataset it is about 54%, so the two
# classes are roughly balanced.
print((raw_data['rating'].sum())/len(raw_data))

0.5355076800319171


In [4]:
# Number of missing values in each column
null_count = raw_data.isna().sum()

# Keep only the columns that contain nulls; these are the ones we will drop from our dataset
null_count.loc[null_count > 0]

calories    4117
protein     4162
fat         4183
sodium      4119
dtype: int64

In [5]:
# Support-vector classifier that is fitted and scored further down
svm = SVC(gamma='auto')

# Features: everything except the target, the free-text title and the four
# nutrition columns that contain missing values.
# `columns=` replaces the positional axis argument (`drop(labels, 1)`), which
# pandas deprecated in 1.3 and rejects from 2.0 onwards.
X = raw_data.drop(columns=['rating', 'title', 'calories', 'protein', 'fat', 'sodium'])
# Target: the `rating` column of raw_data
Y = raw_data['rating']

In [6]:
# Use PCA to reduce the feature matrix to 30 components
from sklearn.decomposition import PCA
n_components = 30
# Fix the seed: with svd_solver left at 'auto', PCA may pick its randomized SVD
# solver (depending on the data size and the scikit-learn version), in which case
# the components would otherwise differ slightly from run to run.
sklearn_pca = PCA(n_components=n_components, random_state=4171)
# Despite its name, Y_sklearn holds the projected features (one column per
# component), not the target.
Y_sklearn = sklearn_pca.fit_transform(X)

# Note: explained_variance_ratio_ holds fractions of 1, not percentages
print(
    'The percentage of total variance in the dataset explained by each',
    'component from Sklearn PCA.\n',
    sklearn_pca.explained_variance_ratio_
)


# Cumulative share of variance captured by the retained components, as a percentage
print('{} number of components explains {:.2f}% of the total variance'.format(n_components,np.sum(sklearn_pca.explained_variance_ratio_)*100))

The percentage of total variance in the dataset explained by each component from Sklearn PCA.
 [0.12310325 0.04414084 0.03883227 0.02648318 0.02280018 0.01867339
 0.01624925 0.01510961 0.01430328 0.01340298 0.01245195 0.01186377
 0.01149501 0.01031797 0.01002516 0.00932837 0.00909402 0.00888174
 0.00867691 0.00840169 0.00821749 0.0080964  0.00778358 0.00768172
 0.0075551  0.00722979 0.00712855 0.00700151 0.00683848 0.00677058]
30 number of components explains 50.79% of the total variance


In [7]:
# Wrap the PCA output in a DataFrame: one column per component,
# named 'component 1' .. 'component <n_components>'
component_names = ['component {}'.format(idx + 1) for idx in range(n_components)]
# copy=True keeps X_new independent of the Y_sklearn array
X_new = pd.DataFrame(Y_sklearn[:, :n_components], columns=component_names, copy=True)

In [8]:
# Hold out 30% of the PCA-projected samples for testing; the fixed seed keeps the split repeatable
X_train, X_test, Y_train, Y_test = train_test_split(
    X_new,
    Y,
    test_size=0.3,
    random_state=4171,
)

svm.fit(X_train, Y_train)

# Mean accuracy (not R^2: this is a classifier) on the rows the model was fitted on
print('Score on train set:')
train_accuracy = svm.score(X_train, Y_train)
print(train_accuracy)

# 10-fold cross-validation restricted to the training split
print('\nCross Validation on train set:')
cv_train = cross_val_score(svm, X_train, Y_train, cv=10)
print(cv_train)

# Summary line: mean fold score and twice the fold standard deviation, both in percent
train_summary = '\n{mean:.4} +/- {spread:.3} %'.format(
    mean=cv_train.mean() * 100,
    spread=cv_train.std() * 200,
)
print(train_summary)

Score on train set:
0.591194072385295

Cross Validation on train set:
[0.60213523 0.57935943 0.58220641 0.59158945 0.56806842 0.60370634
 0.57448325 0.58873842 0.56450463 0.56664291]

58.21 +/- 2.69 %


In [9]:
# Mean accuracy (not R^2: this is a classifier) of the fitted model on the held-out test split
print('Score on test set:')
test_accuracy = svm.score(X_test, Y_test)
print(test_accuracy)

# NOTE: cross_val_score refits a fresh copy of the estimator on folds of the data it
# is given, so the fold scores below come from models trained on parts of the test
# split, not from the model fitted on the training data.
print('\nCross Validation on test set:')
cv_test = cross_val_score(svm, X_test, Y_test, cv=10)
print(cv_test)

# Summary line: mean fold score and twice the fold standard deviation, both in percent
test_summary = '\n{mean:.4} +/- {spread:.3} %'.format(
    mean=cv_test.mean() * 100,
    spread=cv_test.std() * 200,
)
print(test_summary)

Score on test set:
0.5752992021276596

Cross Validation on test set:
[0.60199005 0.56644518 0.589701   0.55980066 0.56976744 0.54242928
 0.56905158 0.57737105 0.55740433 0.57737105]

57.11 +/- 3.19 %
