In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import ttest_ind
from sqlalchemy import create_engine
from scipy.stats.mstats import winsorize
from scipy.stats import boxcox
from scipy.stats import jarque_bera
from scipy.stats import normaltest
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import cross_val_score
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import make_scorer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV, ElasticNetCV
from statsmodels.tools.eval_measures import mse, rmse
from wordcloud import WordCloud
import statsmodels.api as sm
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.svm import SVC
from sklearn import tree


from IPython.display import Image


import pydotplus
from sklearn import ensemble

import warnings

%matplotlib inline
# Switch matplotlib figures to seaborn's default look.
sns.set()

# Suppress every warning category for the remainder of the session.
# NOTE(review): this also hides deprecation and convergence warnings raised by
# the modelling cells - consider narrowing the filter.
warnings.filterwarnings('ignore')
import time

In [2]:
# Read the recipe data set from its hosted CSV into a DataFrame.
RECIPE_CSV_URL = 'https://tf-assets-prod.s3.amazonaws.com/tf-curric/data-science/epi_r.csv'
recipe_df = pd.read_csv(RECIPE_CSV_URL)

In [3]:
#examine the data frame

# DataFrame.info() writes its report straight to stdout and returns None, so it
# is called on its own; passing it to print() would emit a stray "None" between
# the two tables and show the report out of order.
recipe_df.info()
print(recipe_df.head())
print(recipe_df.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20052 entries, 0 to 20051
Columns: 680 entries, title to turkey
dtypes: float64(679), object(1)
memory usage: 104.0+ MB
                                         title  rating  calories  protein  \
0              Lentil, Apple, and Turkey Wrap    2.500     426.0     30.0   
1  Boudin Blanc Terrine with Red Onion Confit    4.375     403.0     18.0   
2                Potato and Fennel Soup Hodge    3.750     165.0      6.0   
3             Mahi-Mahi in Tomato Olive Sauce    5.000       NaN      NaN   
4                    Spinach Noodle Casserole    3.125     547.0     20.0   

    fat  sodium  #cakeweek  #wasteless  22-minute meals  3-ingredient recipes  \
0   7.0   559.0        0.0         0.0              0.0                   0.0   
1  23.0  1439.0        0.0         0.0              0.0                   0.0   
2   7.0   165.0        0.0         0.0              0.0                   0.0   
3   NaN     NaN        0.0         0.0     

In [4]:
# Work on an independent copy: a plain assignment would only create a second
# name for the same object, so any in-place change made through recipe_df2
# would also alter the raw recipe_df.
recipe_df2 = recipe_df.copy()

In [5]:
# Remove, in place, every row that has a missing value in any column.
recipe_df2.dropna(axis=0, how='any', inplace=True)

In [6]:
# Share of null entries in each column, largest shares first.
null_mask = recipe_df2.isnull()
missing_values_ratios = null_mask.sum() / null_mask.count()
missing_values_ratios.sort_values(ascending=False).head()

turkey            0.0
fortified wine    0.0
frittata          0.0
friendsgiving     0.0
freezer food      0.0
dtype: float64

In [7]:
# Duplicate the rating column under the name used as the modelling target.
recipe_df2['rating2'] = recipe_df2['rating']

# Summarise the rating of the frame that is actually being modelled
# (recipe_df2), rather than going through the other frame name.
recipe_df2['rating'].describe()

count    15864.000000
mean         3.760952
std          1.285518
min          0.000000
25%          3.750000
50%          4.375000
75%          4.375000
max          5.000000
Name: rating, dtype: float64

In [8]:
#check the correlation of the numerical variables with the target
# Correlate every float column directly with the target instead of building the
# full column-by-column correlation matrix and reading a single column of it.
# 'rating' is dropped by name because 'rating2' is an exact copy of it.
numeric_features = recipe_df2.select_dtypes('float64').drop(columns=['rating'])
target_corr = numeric_features.corrwith(recipe_df2['rating2']).abs().rename('rating2')

# 31 rows = the target itself (correlation 1.0) followed by the 30 strongest features.
target_corr.sort_values(ascending=False).head(31)




rating2           1.000000
drink             0.229547
alcoholic         0.209148
house & garden    0.199738
gin               0.184992
spirit            0.135300
cocktail          0.131203
bon appétit       0.129069
bitters           0.127979
cocktail party    0.125811
harpercollins     0.100491
non-alcoholic     0.081777
rum               0.080409
peanut free       0.078845
soy free          0.078212
condiment         0.076352
roast             0.071724
bake              0.070332
tree nut free     0.069766
liqueur           0.066126
fall              0.065290
chartreuse        0.064081
weelicious        0.063155
créme de cacao    0.063103
sauté             0.061349
thanksgiving      0.060912
brandy            0.059976
fortified wine    0.059934
winter            0.058439
vermouth          0.058122
pickles           0.058081
Name: rating2, dtype: float64

In [9]:
#Make a binary version of the target
# 1 when the rating is at least 3.5, otherwise 0. The comparison is done on the
# whole column at once instead of looping over the values one by one.
rating_binary = (recipe_df2['rating2'] >= 3.5).astype(int).tolist()

recipe_df2['rating_binary'] = rating_binary

In [10]:
# Columns fed to the model, standardised before use.
feature_columns = [
    'drink', 'alcoholic', 'house & garden', 'gin',
    'spirit', 'cocktail', 'bon appétit', 'bitters', 'cocktail party',
    'harpercollins', 'non-alcoholic', 'rum', 'peanut free', 'soy free',
    'condiment', 'roast', 'bake', 'tree nut free', 'liqueur', 'fall',
    'chartreuse', 'weelicious', 'créme de cacao', 'sauté', 'thanksgiving',
    'brandy', 'fortified wine', 'winter', 'vermouth', 'pickles',
]
scaler = StandardScaler()
X = scaler.fit_transform(recipe_df2[feature_columns])

# Binary target column.
Y = recipe_df2['rating_binary']

In [11]:
# Candidate component counts for the PCA search: 1 up to the number of columns
# in X. Tied to X's width rather than a literal, and the degenerate
# zero-component candidate (an empty projection) is excluded.
n_comps = np.arange(1, X.shape[1] + 1)

In [12]:
# Search grid: the pipeline's 'pca' step is tried with each candidate component count.
param_grid_pca = [dict(pca__n_components=n_comps)]

In [13]:
# Single-step pipeline whose only stage is a PCA, registered under the name 'pca'.
pipe_tree_pca = Pipeline(steps=[('pca', PCA())])

In [14]:
# 10-fold grid search over the PCA pipeline.
# (gs_pca.get_params().keys() lists the parameter names that can be searched.)
gs_pca = GridSearchCV(
    estimator=pipe_tree_pca,
    param_grid=param_grid_pca,
    cv=10,
)

In [15]:
# Run the search, then report the best parameter combination it found.
gs_pca.fit(X, Y)
best_pca_params = gs_pca.best_params_
print(best_pca_params)

{'pca__n_components': 29}


In [16]:
# Use the component count chosen by the grid search instead of a hard-coded
# copy of its printed result, so the two can never drift apart.
best_n_components = int(gs_pca.best_params_['pca__n_components'])
sklearn_pca = PCA(n_components=best_n_components)

# Project the standardised features onto the selected principal components.
X_sklearn = sklearn_pca.fit_transform(X)

In [17]:
# Hold out 20% of the projected rows for testing; the seed is fixed for repeatability.
split_parts = train_test_split(X_sklearn, Y, test_size=0.20, random_state=1)
X_train, X_test, Y_train, Y_test = split_parts

In [21]:
# Polynomial degrees to try in the SVC search.
# Possible kernels (not searched here): 'linear', 'poly', 'rbf', 'sigmoid'
degrees = list(range(1, 5))

In [22]:
# Search grid: the pipeline's 'svc' step is tried with each candidate degree.
param_grid_svm = [dict(svc__degree=degrees)]

In [23]:
# Single-step pipeline holding a polynomial-kernel SVC, registered under the name 'svc'.
pipe_tree_svm = Pipeline(steps=[('svc', SVC(kernel='poly'))])

In [24]:
# 10-fold grid search over the SVC pipeline.
# (gs_svm.get_params().keys() lists the parameter names that can be searched.)
gs_svm = GridSearchCV(
    estimator=pipe_tree_svm,
    param_grid=param_grid_svm,
    cv=10,
)

In [25]:
# Run the search on the training split, then report the best parameter combination.
gs_svm.fit(X_train, Y_train)
best_svm_params = gs_svm.best_params_
print(best_svm_params)

{'svc__degree': 2}


In [26]:
# Build the final classifier with the degree chosen by the grid search instead
# of a hard-coded copy of its printed result.
best_degree = gs_svm.best_params_['svc__degree']
svm = SVC(kernel='poly', degree=best_degree)

# Train on the training split; fit() returns the estimator, which the cell displays.
svm.fit(X_train, Y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=2, gamma='auto_deprecated',
  kernel='poly', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [27]:
#Setting the predictions for analysis
# svm has already been trained on (X_train, Y_train); predict directly instead
# of refitting the same model on the same data a second time.
Y_pred = svm.predict(X_test)

In [28]:
# Run the 10-fold cross-validation once and reuse the scores for both the
# per-fold listing and the average; calling it twice doubles the training cost.
# NOTE(review): cross_val_score fits fresh copies of the estimator on folds of
# the data it is given, so this measures models trained on test-set folds, not
# the svm fitted on the training split - confirm this is the intended metric.
test_cv_scores = cross_val_score(svm, X_test, Y_test, cv=10)

print(test_cv_scores)
print('The 10-fold cross validation average for the testing set  is ',
      test_cv_scores.mean())

[0.8338558  0.82075472 0.8170347  0.829653   0.81072555 0.829653
 0.82018927 0.79179811 0.82018927 0.80757098]
The 10-fold cross validation average for the testing set  is  0.8181424396480356
