In [1]:
import pandas as pd
import datetime
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression 
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from scipy.sparse import *

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [2]:
# training set
df_train = pd.read_json('train.json')

feature_cols = ['ingredients']

X = df_train['ingredients']
Y = df_train['cuisine']


print(X.head())
print()
print(Y.head())

0    [romaine lettuce, black olives, grape tomatoes...
1    [plain flour, ground pepper, salt, tomatoes, g...
2    [eggs, pepper, salt, mayonaise, cooking oil, g...
3                  [water, vegetable oil, wheat, salt]
4    [black pepper, shallots, cornflour, cayenne pe...
Name: ingredients, dtype: object

0          greek
1    southern_us
2       filipino
3         indian
4         indian
Name: cuisine, dtype: object


In [3]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=2)

In [4]:
df_test = pd.read_json("test.json")
df_test.head()

Unnamed: 0,id,ingredients
0,18009,"[baking powder, eggs, all-purpose flour, raisi..."
1,28583,"[sugar, egg yolks, corn starch, cream of tarta..."
2,41580,"[sausage links, fennel bulb, fronds, olive oil..."
3,29752,"[meat cuts, file powder, smoked sausage, okra,..."
4,35687,"[ground black pepper, salt, sausage casings, l..."


In [5]:
print("Start: " + str(datetime.datetime.now()))
#df2['ingredients'] = pd.get_dummies(df2['ingredients'].apply(pd.Series).stack()).sum(level = 0)
dummies_df = pd.get_dummies(
  df_train.join(pd.Series(df_train['ingredients'].apply(pd.Series).stack().reset_index(1, drop=True),
                    name='ingredients1')).drop('ingredients', axis=1).rename(columns={'ingredients1': 'ingredients'}),
  columns=['ingredients'])

dummies_df

print("End: " + str(datetime.datetime.now()))
#print(df2['ingredients'])


Start: 2017-11-26 16:27:47.703317
End: 2017-11-26 16:28:52.629180


In [6]:
print(dummies_df)
#for row, col in enumerate(dummies_df):
 #   print(dummies_df.iloc[:,row](lambda x: reduce(logical_or,x), axis=1))
x = np.array(dummies_df.iloc[:,1:2000])
y = dummies_df.iloc[:,0]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)
print("Done")

           cuisine     id  ingredients_(    oz.) tomato sauce  \
0            greek  10259                                   0   
0            greek  10259                                   0   
0            greek  10259                                   0   
0            greek  10259                                   0   
0            greek  10259                                   0   
0            greek  10259                                   0   
0            greek  10259                                   0   
0            greek  10259                                   0   
0            greek  10259                                   0   
1      southern_us  25693                                   0   
1      southern_us  25693                                   0   
1      southern_us  25693                                   0   
1      southern_us  25693                                   0   
1      southern_us  25693                                   0   
1      southern_us  25693

Done


# USING KNN Classifier:

In [7]:
knn = KNeighborsClassifier(n_neighbors=7)
knn.fit(x_train,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=7, p=2,
           weights='uniform')

In [8]:
pred_knn = knn.predict(x_test)
pred_knn
print('Accuracy score using KNN: ' + str(accuracy_score(y_test, pred_knn)))


Accuracy score using KNN: 0.901792455033


In [9]:
print(x_train)
print(x_test)

[[ 4371     0     0 ...,     0     0     0]
 [29472     0     0 ...,     0     0     0]
 [36440     0     0 ...,     0     0     0]
 ..., 
 [ 9972     0     0 ...,     0     0     0]
 [39735     0     0 ...,     0     0     0]
 [46009     0     0 ...,     0     0     0]]
[[26432     0     0 ...,     0     0     0]
 [43182     0     0 ...,     0     0     0]
 [26951     0     0 ...,     0     0     0]
 ..., 
 [31811     0     0 ...,     0     0     0]
 [39655     0     0 ...,     0     0     0]
 [ 8038     0     0 ...,     0     0     0]]


# USING LOGISTIC REGRESSION Classifier:

In [10]:
logistic = LogisticRegression()
logistic.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [None]:
pred_logistic = logistic.predict(x_test)
pred_logistic


Accuracy score using KNN: 0.92181845069


# USING DECISION TREE Classifier:

In [None]:
print("Start: " + str(datetime.datetime.now()))

decision = DecisionTreeClassifier()
decision.fit(x_train, y_train)
print("End: " + str(datetime.datetime.now()))


Start: 2017-11-26 15:23:07.370969


In [None]:
pred_decision = decision.predict(x_test)
pred_decision

# USE RANDOM FOREST:

In [None]:
print("Start: " + str(datetime.datetime.now()))

random_forest = \
RandomForestClassifier(n_estimators = 10, bootstrap = True, random_state=2)

random_forest.fit(x_train, y_train)
print("End: " + str(datetime.datetime.now()))


In [None]:
pred_random = random_forest.predict(x_test)
pred_random

In [None]:
print('Accuracy score using KNN: ' + str(accuracy_score(y_test, pred_knn)))
print('Accuracy score using LOGISTIC REGRESSION: ' + str(accuracy_score(y_test, pred_logistic)))
print('Accuracy score using DECISION TREE: ' + str(accuracy_score(y_test, pred_decision)))
print('Accuracy score using RANDOM FOREST: ' + str(accuracy_score(y_test, pred_random)))
