In [0]:
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn import datasets, svm
from sklearn.feature_selection import SelectPercentile, f_classif, SelectKBest
%matplotlib inline

In [0]:
# Load the data set
df = pd.read_csv('https://tf-assets-prod.s3.amazonaws.com/tf-curric/data-science/epi_r.csv')

In [0]:
#Look how our rating is distributed. 
df['rating'].describe()

count    20052.000000
mean         3.714467
std          1.340829
min          0.000000
25%          3.750000
50%          4.375000
75%          4.375000
max          5.000000
Name: rating, dtype: float64

In [48]:
# Get the unique value for each column here. Since the website did not provide us much information about this data set. We will explore the data set ourself. 

unique_col = []

for col in df.columns:
    unique_col.append(df[col].unique())
    
unique_col

[array(['Lentil, Apple, and Turkey Wrap ',
        'Boudin Blanc Terrine with Red Onion Confit ',
        'Potato and Fennel Soup Hodge ', ..., 'Turkey Cream Puff Pie ',
        'Snapper on Angel Hair with Citrus Cream ',
        'Baked Ham with Marmalade-Horseradish Glaze '], dtype=object),
 array([2.5  , 4.375, 3.75 , 5.   , 3.125, 1.875, 0.   , 1.25 ]),
 array([ 426.,  403.,  165., ..., 2006., 2417., 1986.]),
 array([3.00000e+01, 1.80000e+01, 6.00000e+00,         nan, 2.00000e+01,
        1.90000e+01, 7.00000e+00, 2.30000e+01, 4.00000e+00, 1.20000e+01,
        1.10000e+01, 5.00000e+00, 5.90000e+01, 1.00000e+01, 3.00000e+00,
        1.00000e+00, 0.00000e+00, 3.90000e+01, 4.40000e+01, 2.00000e+00,
        1.40000e+01, 8.00000e+00, 8.90000e+01, 3.80000e+01, 2.10000e+01,
        9.00000e+00, 5.40000e+01, 3.60000e+01, 4.50000e+01, 4.80000e+01,
        6.30000e+01, 9.20000e+01, 2.20000e+01, 1.50000e+01, 1.30000e+01,
        5.50000e+01, 1.70000e+01, 1.60000e+01, 1.18000e+02, 2.80000e+01,


According to the unique values for each columns. We see that except titles, rating, and all nutrition information, the rest of the data set are categorial varibles, which contain 0 and 1.

In [0]:
#Here we will convert our rating into binary categorial variable. 
#For rating greating than 4.375, we considered it as high rating, 
#and mark it as 1. For rating below that, we considered it as low rating, and mark it as 0. 

rating_cat = []

for rating in df['rating']:
    if rating >= 4.375:
        rating_cat.append(1)
    else:
        rating_cat.append(0)
        
df['rating_cat'] = rating_cat

In [0]:
#We first will try excluding all nutrition information, and fitting the model with 30% of the data set, and includes all features. 
X = df.drop(['rating', 'title', 'calories', 'protein', 'fat', 'sodium'], 1).sample(frac=0.3, replace=True, random_state=1)
Y = df['rating_cat'].sample(frac=0.3, replace=True, random_state=1)

In [51]:
svc = SVC()
svc.fit(X, Y)

print(svc.score(X, Y))



1.0


In [52]:
cross_val_score(svc, X, Y, cv=5)



array([1., 1., 1., 1., 1.])

After looking at the score and the cross validation scores, we have a perfect model here. This is possible because even with only 30% of the whole data set, we still have 600+ features, and 6000+ data points. In addition, we are using categortial features only to train a classifier, which is possible that we have a perfect model here. However, we also want to find the best 30 features of the data set, and fit the model with 30 best features. Next, we will use feature selection methods from SKLearn to select our 30 best features. 

In [65]:
selector = SelectPercentile(f_classif, percentile=5)
X_filtered_percent = selector.fit_transform(X, Y)

 234 265 269 278 279 280 283 285 288 290 294 295 302 308 319 335 336 338
 349 351 372 373 381 386 389 394 397 400 404 420 428 431 461 479 489 495
 506 536 541 571 576 585 591 599 612 614 638 643 652 653 659 666 669 670
 674] are constant.
  f = msb / msw
  f = msb / msw


In [66]:
X_filtered_percent.shape

(6016, 34)

In [67]:
svc_filtered_percent = SVC()
svc_filtered_percent.fit(X_filtered_percent, Y)

print(svc_filtered_percent.score(X_filtered_percent, Y))



1.0


In [68]:
cross_val_score(svc_filtered_percent, X_filtered_percent, Y, cv=5)



array([1., 1., 1., 1., 1.])

We used the select percentile method to select our 30 best features, and fit the model using these 30 features, we still have the perfect model here. 

In [71]:
select_k = SelectKBest(f_classif, k=30)
x_filtered_k = select_k.fit_transform(X,Y)

 234 265 269 278 279 280 283 285 288 290 294 295 302 308 319 335 336 338
 349 351 372 373 381 386 389 394 397 400 404 420 428 431 461 479 489 495
 506 536 541 571 576 585 591 599 612 614 638 643 652 653 659 666 669 670
 674] are constant.
  f = msb / msw
  f = msb / msw


In [72]:
x_filtered_k.shape

(6016, 30)

In [75]:
svc_filtered_k = SVC()
svc_filtered_k.fit(x_filtered_k, Y)

print(svc_filtered_k.score(x_filtered_k, Y))

1.0




In [76]:
cross_val_score(svc_filtered_k, x_filtered_k, Y, cv=5)



array([1., 1., 1., 1., 1.])

After using the select K best method, the 30 best features still give us a perfect model here. By looking at the data set and thinking about bias here, there is a possible bias in this data set because vistors coming to this website and rate receipts, they have a favorable receipts, or they already have a favorable receipts before coming to this website. Also, there is a possible chances that more vistors favor receipts with high proteins and less fat, then give out higher rating to receipts with high protein. 