In [42]:
import pandas as pd
import numpy as np
import datetime

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression 
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import cross_val_score

# What's Cooking? Data Science Project - http://kaggle.com/c/whats-cooking
Group 26: Leanne David, Matthew Gerlits, and Daniel Kale

We started with the basic setup of reading the train.json file given from the site and setting up the matrix with **ingredients** as our feature column and **cuisine** as our label. 

Because all our data from previous homeworks were formatted in .csv file, we had to use pandas' **read_json** method to parse the data instead. Pandas library has a lot of useful functions and we will be able to convert our final results into a .csv file using **to_csv** at the end.

In [2]:
# training set
df_train = pd.read_json('train.json')

feature_cols = ['ingredients']

X = df_train['ingredients']
Y = df_train['cuisine']

In order to perform the OneHotEncoding, we had to convert the list in the dataframe into a series and replace the current ingredients column with this series. 
By using one hot encoding, processing the data took about ~55 seconds.

In [3]:
print("Start: " + str(datetime.datetime.now()))
#df2['ingredients'] = pd.get_dummies(df2['ingredients'].apply(pd.Series).stack()).sum(level = 0)
dummies_df = pd.get_dummies(
  df_train.join(pd.Series(df_train['ingredients'].apply(pd.Series).stack().reset_index(1, drop=True),
                    name='ingredients1')).drop('ingredients', axis=1).rename(columns={'ingredients1': 'ingredients'}),
  columns=['ingredients'])

dummies_df

Start: 2017-11-27 16:10:54.708426


Unnamed: 0,cuisine,id,ingredients_( oz.) tomato sauce,ingredients_( oz.) tomato paste,ingredients_(10 oz.) frozen chopped spinach,"ingredients_(10 oz.) frozen chopped spinach, thawed and squeezed dry",ingredients_(14 oz.) sweetened condensed milk,ingredients_(14.5 oz.) diced tomatoes,ingredients_(15 oz.) refried beans,ingredients_1% low-fat buttermilk,...,ingredients_yukon gold potatoes,ingredients_yuzu,ingredients_yuzu juice,ingredients_za'atar,ingredients_zest,ingredients_zesty italian dressing,ingredients_zinfandel,ingredients_ziti,ingredients_zucchini,ingredients_zucchini blossoms
0,greek,10259,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0,greek,10259,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0,greek,10259,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0,greek,10259,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0,greek,10259,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0,greek,10259,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0,greek,10259,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0,greek,10259,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0,greek,10259,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,southern_us,25693,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
print("End: " + str(datetime.datetime.now()))

End: 2017-11-27 16:11:46.423204


We must truncate the data here since the program will run 

In [5]:
#print(dummies_df.iloc[:,0])
x = np.array(dummies_df.iloc[:,1:100])
y = dummies_df.iloc[:,0]

### Our testing and splitting data:

We use about 30% of our data as our testing set while the remaining 70% will be our training set. Random state is set as 42 since it seems to be the default number to use, but using any other number would not affect the results.

In [6]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

# Using KNN Classifier:

We create a KNN classifier using 5 random neighbors and fit it with the training data
We then run predict()

In [7]:
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(x_train,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [8]:
pred_knn = knn.predict(x_test)
print(pred_knn)

['spanish' 'mexican' 'filipino' ..., 'french' 'italian' 'indian']


In [9]:
print('Accuracy score using KNN: ' + str(accuracy_score(y_test, pred_knn)))

Accuracy score using KNN: 0.97191068079


In [45]:
df = pd.DataFrame(pred_knn)
df.columns = ['cuisine']

In [46]:
id_df = pd.DataFrame(x_test)
id_df = id_df.rename(columns = {0:'id'})
id_df = pd.DataFrame(id_df['id'])

result = pd.concat([id_df, df], axis=1)

result.to_csv('KNN_Results.csv')

# Using LOGISTIC REGRESSION Classifier:

With Logistic Regression, we first call the function and then start training the model:

In [47]:
logistic = LogisticRegression()
logistic.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

After our model has been trained, we then start testing on the testing set:

In [48]:
pred_logistic = logistic.predict(x_test)
pred_logistic

array(['italian', 'italian', 'italian', ..., 'italian', 'italian',
       'italian'], dtype=object)

We can observe that using logistic regression is nowhere near an ideal classifier to use for our project as its **accuracy score** is about **18%**. The predictions it made gave **'Italian'** as the cuisine for all of the testing data. This is clearly incorrect due to a big variety of recipes by default.

It is also not an ideal classifier since logistic regression provides binary prediction where the outcome is either 0 or 1 and we have multiple labels (chinese, mexican, italian, etc.).

In [49]:
print('Accuracy score using LOGISTIC REGRESSION: ' + str(accuracy_score(y_test, pred_logistic)))

Accuracy score using LOGISTIC REGRESSION: 0.181401430539


We also tested the likelihood of each label:

In [50]:
log_prob_lr = logistic.predict_proba(x_test)
print(log_prob_lr)

[[ 0.00979728  0.01846345  0.04238393 ...,  0.02329513  0.04239093
   0.02258039]
 [ 0.01764762  0.02395257  0.07265878 ...,  0.01390016  0.07310995
   0.01321696]
 [ 0.0099862   0.0186304   0.04313678 ...,  0.0229559   0.04315186
   0.02223823]
 ..., 
 [ 0.01191929  0.02022553  0.05076585 ...,  0.01993457  0.05087182
   0.01920335]
 [ 0.01568699  0.02282293  0.06527711 ...,  0.01562105  0.06559847
   0.01491319]
 [ 0.00505195  0.01355642  0.02299451 ...,  0.03791645  0.02285091
   0.03753151]]


Cross validation of logistic regression:

In [51]:
accuracy_list = cross_val_score(logistic, x, y, cv=10, scoring='accuracy')
print(accuracy_list)

[ 0.18132792  0.18132792  0.18132792  0.18134062  0.18134485  0.18134909
  0.18137026  0.18135538  0.18136385  0.18137656]


In [52]:
df = pd.DataFrame(pred_logistic)
df.columns = ['cuisine']

In [53]:
id_df = pd.DataFrame(x_test)
id_df = id_df.rename(columns = {0:'id'})
id_df = pd.DataFrame(id_df['id'])

result = pd.concat([id_df, df], axis=1)

result.to_csv('LogisticRegression_Results.csv')

# Using LINEAR REGRESSION Classifier:

Cannot use Linear Regression due to not having numeric values to work with.

In [17]:
#my_linreg = LinearRegression()
#my_linreg.fit(x_train, y_train)

# Using DECISION TREE Classifier:

Training our model using Decision Tree:

In [18]:
decision = DecisionTreeClassifier()
decision.fit(x_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

After training, we then use it to predict the testing data:

In [19]:
pred_decision = decision.predict(x_test)
print(pred_decision)

['british' 'mexican' 'filipino' ..., 'french' 'italian' 'indian']


In [20]:
print('Accuracy score using DECISION TREE: ' + str(accuracy_score(y_test, pred_decision)))

Accuracy score using DECISION TREE: 0.862690005682


In [21]:
df = pd.DataFrame(pred_decision)
df.columns = ['cuisine']

In [None]:
id_df = pd.DataFrame(x_test)
id_df = id_df.rename(columns = {0:'id'})
id_df = pd.DataFrame(id_df['id'])

result = pd.concat([id_df, df], axis=1)

result.to_csv('DecisionTree_Results.csv')

# Using RANDOM FOREST Classifier:

Instantiate RandomForest object and train our data set. Set **n_estimator** as 25 trees since more forests means less chance of overfitting:

In [22]:
random_forest = \
RandomForestClassifier(n_estimators = 25, bootstrap = True, random_state=2)

random_forest.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=25, n_jobs=1,
            oob_score=False, random_state=2, verbose=0, warm_start=False)

In [23]:
pred_random = random_forest.predict(x_test)
print(pred_random)

['british' 'mexican' 'filipino' ..., 'french' 'italian' 'indian']


In [24]:
print('Accuracy score using RANDOM FOREST: ' + str(accuracy_score(y_test, pred_random)))

Accuracy score using RANDOM FOREST: 0.831059361939


In [25]:
df = pd.DataFrame(pred_random)
df.columns = ['cuisine']

In [None]:
id_df = pd.DataFrame(x_test)
id_df = id_df.rename(columns = {0:'id'})
id_df = pd.DataFrame(id_df['id'])

result = pd.concat([id_df, df], axis=1)

result.to_csv('RandomForest_Results.csv')

# Using VOTINGCLASSIFIER:

Combined top 3 classifiers (KNN, Decision Tree, and Random Forest) and use majority vote to average out the predicted probabilities.

In [26]:
voting = VotingClassifier(estimators=[('knn', knn), ('decision', decision),
                                    ('randomforest', random_forest)],
                        voting='soft', weights=[2, 1, 2])

voting.fit(x_train, y_train)

pred_voting = voting.predict(x_test)
pred_voting


array(['british', 'mexican', 'filipino', ..., 'french', 'italian', 'indian'], dtype=object)

In [27]:
print('Accuracy score using VOTING CLASSIFIER: ' + str(accuracy_score(y_test, pred_voting)))

Accuracy score using VOTING CLASSIFIER: 0.882575904984


In [28]:
df = pd.DataFrame(pred_voting)
df.columns = ['cuisine']

In [None]:
id_df = pd.DataFrame(x_test)
id_df = id_df.rename(columns = {0:'id'})
id_df = pd.DataFrame(id_df['id'])

result = pd.concat([id_df, df], axis=1)
result.to_csv('VotingClassifier_Results.csv')