# Build Classification Models

In [43]:
import pandas as pd
# Read the cleaned cuisines dataset (path is relative to this notebook)
# and preview its first rows.
cuisines_path = "../data/cleaned_cuisines.csv"
cuisines_df = pd.read_csv(cuisines_path)
cuisines_df.head()

Unnamed: 0.1,Unnamed: 0,cuisine,soy_sauce,cayenne,scallion,vegetable_oil,onion,sesame_oil,black_pepper,vinegar,...,kumquat,raw_beef,red_algae,chervil,sauerkraut,chayote,champagne_wine,catfish,brussels_sprout,liver
0,0,indian,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,indian,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,indian,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,indian,0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,indian,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [44]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix, classification_report, precision_recall_curve
from sklearn.svm import SVC
import numpy as np

In [45]:
# Features: every column except the target label and the leftover
# "Unnamed: 0" column from the CSV.
non_feature_columns = ["cuisine", "Unnamed: 0"]
x = cuisines_df.drop(columns=non_feature_columns)

# Target: the cuisine label of each row.
y = cuisines_df["cuisine"]

In [46]:
# Hold out 20% of the rows for testing.
# random_state expects an integer seed; the previous value `False` only worked
# because Python treats it as the integer 0. Spell the seed out explicitly.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

In [66]:
# Multiclass strategies for logistic regression:
#
# * One-vs-rest (OvR) builds a multiclass classifier out of binary classifiers.
#   With 4 classes (1, 2, 3, 4) it creates one classifier per class:
#   1 vs [2, 3, 4], 2 vs [1, 3, 4], 3 vs [1, 2, 4] and 4 vs [1, 2, 3].
# * One-vs-one (OvO) works the same way, but creates a binary classifier for
#   every pair of classes: 1 vs 2, 1 vs 3, 1 vs 4, ...
# * Multinomial does not split the model into binary classifiers; it uses the
#   softmax regression algorithm, which directly generalizes logistic
#   regression to multi-class problems.

# Let's try different solvers:
# The liblinear solver shuffles the data internally, so random_state is fixed
# to make the fitted model (and the score printed below) reproducible.
# NOTE(review): recent scikit-learn releases deprecate `multi_class` - confirm
# against the installed version before upgrading.
model = LogisticRegression(multi_class="ovr", solver="liblinear", penalty="l1", random_state=0)
model = model.fit(x_train, y_train)

# Mean accuracy on the held-out test set.
print(f'Score {model.score(x_test, y_test)}')


Score 0.8197747183979975


In [85]:
# Inspect the predicted class probabilities for a single test recipe.
sample_row = x_test.iloc[30]

# Raw feature vector of that recipe; ravel just flattens an array
# (kept for compatibility with any later cell that uses `example`).
example = sample_row.values.ravel()

# Show which feature columns are non-zero for this recipe.
print(sample_row[sample_row > 0].keys())

# Predict from a one-row DataFrame (double brackets keep it 2-D) instead of a
# bare list of arrays: the model was fitted on a DataFrame, and passing
# nameless input makes scikit-learn warn about missing feature names.
proba = model.predict_proba(x_test.iloc[[30]])

# One column per class label, one row for the selected recipe.
classes = model.classes_
resultdf = pd.DataFrame(data=proba, columns=classes)

resultdf.head()

Index(['pepper', 'sesame_seed', 'olive_oil', 'potato', 'mandarin_peel'], dtype='object')




Unnamed: 0,chinese,indian,japanese,korean,thai
0,0.066683,0.073784,0.791973,0.055377,0.012183


In [87]:
# Predict a label for every row of the test set, then print the per-class
# precision / recall / F1 summary.
pred = model.predict(x_test)
report = classification_report(y_test, pred)
print(report)

              precision    recall  f1-score   support

     chinese       0.81      0.71      0.75       177
      indian       0.88      0.87      0.88       151
    japanese       0.78      0.83      0.80       164
      korean       0.86      0.79      0.82       149
        thai       0.79      0.91      0.85       158

    accuracy                           0.82       799
   macro avg       0.82      0.82      0.82       799
weighted avg       0.82      0.82      0.82       799

