In [70]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score,precision_score,confusion_matrix,classification_report, precision_recall_curve
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
import numpy as np

In [2]:
# Load the cleaned cuisines table from the project-relative data folder.
cuisines_df = pd.read_csv(filepath_or_buffer="data/cleaned_cuisines.csv")

# Preview the first five rows.
cuisines_df.head(5)

Unnamed: 0.1,Unnamed: 0,cuisine,almond,angelica,anise,anise_seed,apple,apple_brandy,apricot,armagnac,...,whiskey,white_bread,white_wine,whole_grain_wheat_flour,wine,wood,yam,yeast,yogurt,zucchini
0,0,indian,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,indian,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,indian,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,indian,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,indian,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


### Prepare data

#### Separate out label feature

In [3]:
# The 'cuisine' column is the prediction target; pull it out as its own Series.
cuisines_label_df = cuisines_df.loc[:, 'cuisine']
cuisines_label_df.head(5)

0    indian
1    indian
2    indian
3    indian
4    indian
Name: cuisine, dtype: object

#### Separate out feature columns

In [4]:
# Everything except the leftover 'Unnamed: 0' column and the 'cuisine' label
# is kept as a model feature.
non_feature_columns = ['Unnamed: 0', 'cuisine']
cuisines_feature_df = cuisines_df.drop(columns=non_feature_columns)
cuisines_feature_df.head(5)

Unnamed: 0,almond,angelica,anise,anise_seed,apple,apple_brandy,apricot,armagnac,artemisia,artichoke,...,whiskey,white_bread,white_wine,whole_grain_wheat_flour,wine,wood,yam,yeast,yogurt,zucchini
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


### Choose model

We can choose from the following models, each with its own trade-offs:
1. multi-class logistic regression: fast training, linear model
2. multi-class neural network: accuracy, long training time
3. multi-class decision forest: accuracy, fast training time
4. one-vs-all multiclass: depends on two-class classifier
5. multi-class boosted decision tree: non-parametric, fast training time and scalable

#### Reasoning

* Neural networks are too heavy. Given our clean, but minimal dataset, and the fact that we are running training locally via notebooks, neural networks are too heavyweight for this task.
* No two-class classifier. We do not use a two-class classifier, so that rules out one-vs-all.
* Decision tree or logistic regression could work. A decision tree might work, or logistic regression for multiclass data.
* Multiclass Boosted Decision Trees solve a different problem. The multiclass boosted decision tree is most suitable for nonparametric tasks, e.g. tasks designed to build rankings, so it is not useful for us.

### Split dataset

In [53]:
# Hold out 30% of the rows for evaluation. The split is seeded so that
# Restart & Run All reproduces the same train/test partition (and therefore
# the same scores) on every run; without a seed each execution shuffles
# differently and the reported metrics drift between runs.
X_train, X_test, y_train, y_test = train_test_split(
    cuisines_feature_df,
    cuisines_label_df,
    test_size=0.3,
    random_state=0,
)

### Apply logistic regression

In [6]:
# One-vs-rest logistic regression using the liblinear solver.
# NOTE(review): recent scikit-learn releases may deprecate `multi_class` --
# confirm against the installed version.
lr = LogisticRegression(solver='liblinear', multi_class='ovr')
model = lr.fit(X_train, np.ravel(y_train))

# Score the fitted model on the held-out split.
accuracy = model.score(X_test, y_test)
print(f"Accuracy is {accuracy}")

Accuracy is 0.7964970809007507


#### Test a prediction

In [7]:
# Inspect one held-out recipe: list the ingredient columns that are non-zero
# for it, then show its true cuisine label.
sample_row = X_test.iloc[50]
used_ingredients = sample_row[sample_row != 0].keys()
print(f'ingredients: {used_ingredients}')
print(f'cuisine: {y_test.iloc[50]}')

ingredients: Index(['cabbage', 'carrot', 'cayenne', 'cucumber', 'sesame_oil', 'vegetable',
       'vinegar', 'wheat'],
      dtype='object')
cuisine: korean


#### Check the predicted class probabilities for this sample

In [8]:
# Select the sample as a one-row DataFrame (double brackets) rather than a
# reshaped NumPy array. The model was fitted on a DataFrame, so keeping the
# column names lets scikit-learn validate the features and avoids the
# "X does not have valid feature names" warning; the values are unchanged.
test = X_test.iloc[[50]]

# Per-class probabilities for the single sample, labelled with the class names.
proba = model.predict_proba(test)
classes = model.classes_
resultdf = pd.DataFrame(data=proba, columns=classes)

# Transpose so each class is a row, then rank classes by probability.
topPrediction = resultdf.T.sort_values(by=[0], ascending=[False])
topPrediction.head()



Unnamed: 0,0
korean,0.826499
chinese,0.0979
japanese,0.059359
indian,0.008454
thai,0.007787


#### Print classification report

In [9]:
# Predict a cuisine for every held-out recipe and summarise the per-class
# precision, recall and F1 against the true labels.
y_pred = model.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

     chinese       0.72      0.74      0.73       246
      indian       0.92      0.87      0.90       243
    japanese       0.72      0.75      0.73       236
      korean       0.87      0.79      0.82       230
        thai       0.78      0.84      0.81       244

    accuracy                           0.80      1199
   macro avg       0.80      0.80      0.80      1199
weighted avg       0.80      0.80      0.80      1199



### Linear SVC

In [20]:
# Regularisation parameter passed to the linear-kernel SVC.
C = 10

# Registry of named classifiers, filled one entry at a time.
classifiers = {}
classifiers['Linear SVC'] = SVC(C=C, kernel='linear', random_state=0, probability=True)

#### Train Linear SVC and print classification report

In [21]:
n_classifiers = len(classifiers)

# Fit each configured classifier on the training split, then evaluate it on
# the held-out test split.
for name, classifier in classifiers.items():
    classifier.fit(X_train, np.ravel(y_train))

    y_pred = classifier.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    # The score is computed from X_test / y_test, so it is labelled as test
    # accuracy (the previous "(train)" label was misleading).
    print("Accuracy (test) for %s: %0.1f%% " % (name, accuracy * 100))
    print(classification_report(y_test, y_pred))

Accuracy (train) for Linear SVC: 77.4% 
              precision    recall  f1-score   support

     chinese       0.64      0.74      0.69       246
      indian       0.90      0.83      0.87       243
    japanese       0.76      0.72      0.74       236
      korean       0.85      0.74      0.79       230
        thai       0.77      0.83      0.80       244

    accuracy                           0.77      1199
   macro avg       0.78      0.77      0.78      1199
weighted avg       0.78      0.77      0.78      1199



### K-Neighbors classifier

In [54]:
# Sanity-check the dimensions of the four train/test pieces.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((2796, 380), (1199, 380), (2796,), (1199,))

In [55]:
# k-nearest-neighbours classifier that votes over the 3 closest samples.
classifier = KNeighborsClassifier(n_neighbors=3)

In [58]:
# Fit the K-Neighbors classifier on the training split.
classifier.fit(X=X_train, y=y_train)

In [64]:
# The test features are converted to a contiguous NumPy array before
# prediction.
# NOTE(review): this looks like a workaround for a library-version issue when
# predicting from a DataFrame -- confirm whether it is still needed.
X_test_contiguous = np.ascontiguousarray(X_test)
y_pred = classifier.predict(X_test_contiguous)



In [65]:
# Fraction of held-out recipes whose predicted cuisine matches the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

In [66]:
# `accuracy` and `y_pred` were computed on the held-out test split, so the
# score is labelled as test accuracy (the previous "(train)" label was
# misleading).
print("Accuracy (test) for %s: %0.1f%% " % ('K-Neighbours', accuracy * 100))
print(classification_report(y_test, y_pred))

Accuracy (train) for K-Neighbours: 74.0% 
              precision    recall  f1-score   support

     chinese       0.59      0.78      0.67       242
      indian       0.86      0.77      0.81       228
    japanese       0.70      0.82      0.75       252
      korean       0.90      0.63      0.74       252
        thai       0.79      0.71      0.75       225

    accuracy                           0.74      1199
   macro avg       0.77      0.74      0.74      1199
weighted avg       0.76      0.74      0.74      1199



### SVC classifier

In [67]:
# Regularisation parameter passed to the linear-kernel SVC.
C = 10

# Registry of named classifiers, filled one entry at a time
# (insertion order determines the order they are trained and reported in).
classifiers = {}
classifiers['Linear SVC'] = SVC(C=C, kernel='linear', random_state=0, probability=True)
classifiers['SVC'] = SVC()

In [68]:
n_classifiers = len(classifiers)

# Fit each configured classifier on the training split, then evaluate it on
# the held-out test split.
for name, classifier in classifiers.items():
    classifier.fit(X_train, np.ravel(y_train))

    y_pred = classifier.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    # The score is computed from X_test / y_test, so it is labelled as test
    # accuracy (the previous "(train)" label was misleading).
    print("Accuracy (test) for %s: %0.1f%% " % (name, accuracy * 100))
    print(classification_report(y_test, y_pred))

Accuracy (train) for Linear SVC: 77.7% 
              precision    recall  f1-score   support

     chinese       0.69      0.72      0.70       242
      indian       0.88      0.86      0.87       228
    japanese       0.76      0.73      0.75       252
      korean       0.83      0.75      0.79       252
        thai       0.75      0.84      0.79       225

    accuracy                           0.78      1199
   macro avg       0.78      0.78      0.78      1199
weighted avg       0.78      0.78      0.78      1199

Accuracy (train) for SVC: 82.7% 
              precision    recall  f1-score   support

     chinese       0.74      0.75      0.74       242
      indian       0.88      0.92      0.90       228
    japanese       0.86      0.80      0.83       252
      korean       0.88      0.79      0.83       252
        thai       0.78      0.89      0.83       225

    accuracy                           0.83      1199
   macro avg       0.83      0.83      0.83      1199
weig

### Ensemble Classifier

In [71]:
# Regularisation parameter passed to the linear-kernel SVC.
C = 10
# Create different classifiers.
classifiers = {
    'Linear SVC': SVC(kernel='linear', C=C, probability=True, random_state=0),
    'SVC': SVC(),
    # Random forests and AdaBoost are stochastic; seed them (matching the
    # seed already used for the linear SVC) so re-running the notebook
    # reproduces the same models and scores.
    'RFST': RandomForestClassifier(n_estimators=100, random_state=0),
    'ADA': AdaBoostClassifier(n_estimators=100, random_state=0),
}

In [72]:
n_classifiers = len(classifiers)

# Fit each configured classifier on the training split, then evaluate it on
# the held-out test split.
for name, classifier in classifiers.items():
    classifier.fit(X_train, np.ravel(y_train))

    y_pred = classifier.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    # The score is computed from X_test / y_test, so it is labelled as test
    # accuracy (the previous "(train)" label was misleading).
    print("Accuracy (test) for %s: %0.1f%% " % (name, accuracy * 100))
    print(classification_report(y_test, y_pred))

Accuracy (train) for Linear SVC: 77.7% 
              precision    recall  f1-score   support

     chinese       0.69      0.72      0.70       242
      indian       0.88      0.86      0.87       228
    japanese       0.76      0.73      0.75       252
      korean       0.83      0.75      0.79       252
        thai       0.75      0.84      0.79       225

    accuracy                           0.78      1199
   macro avg       0.78      0.78      0.78      1199
weighted avg       0.78      0.78      0.78      1199

Accuracy (train) for SVC: 82.7% 
              precision    recall  f1-score   support

     chinese       0.74      0.75      0.74       242
      indian       0.88      0.92      0.90       228
    japanese       0.86      0.80      0.83       252
      korean       0.88      0.79      0.83       252
        thai       0.78      0.89      0.83       225

    accuracy                           0.83      1199
   macro avg       0.83      0.83      0.83      1199
weig