# Build Classification Models

In [2]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score,precision_score, confusion_matrix, classification_report, precision_recall_curve

In [1]:
#getting data
!wget https://raw.githubusercontent.com/jtracos/ML-For-Beginners/main/4-Classification/data/cleaned_cuisines.csv

--2021-08-20 23:43:19--  https://raw.githubusercontent.com/jtracos/ML-For-Beginners/main/4-Classification/data/cleaned_cuisines.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3087401 (2.9M) [text/plain]
Saving to: ‘cleaned_cuisines.csv’


2021-08-20 23:43:19 (36.5 MB/s) - ‘cleaned_cuisines.csv’ saved [3087401/3087401]



In [36]:
# Load the downloaded dataset and preview its first rows
df = pd.read_csv("cleaned_cuisines.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,cuisine,almond,angelica,anise,anise_seed,apple,apple_brandy,apricot,armagnac,artemisia,artichoke,asparagus,avocado,bacon,baked_potato,balm,banana,barley,bartlett_pear,basil,bay,bean,beech,beef,beef_broth,beef_liver,beer,beet,bell_pepper,bergamot,berry,bitter_orange,black_bean,black_currant,black_mustard_seed_oil,black_pepper,black_raspberry,black_sesame_seed,black_tea,...,sunflower_oil,sweet_potato,swiss_cheese,tabasco_pepper,tamarind,tangerine,tarragon,tea,tequila,thai_pepper,thyme,tomato,tomato_juice,truffle,tuna,turkey,turmeric,turnip,vanilla,veal,vegetable,vegetable_oil,vinegar,violet,walnut,wasabi,watercress,watermelon,wheat,wheat_bread,whiskey,white_bread,white_wine,whole_grain_wheat_flour,wine,wood,yam,yeast,yogurt,zucchini
0,0,indian,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,indian,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,2,indian,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,3,indian,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
4,4,indian,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0


In [73]:
# Helper functions for preprocessing the data and reporting metrics
def preprocessing_data(df, random_state = 22):
  """Split a cuisines DataFrame into train/test features and labels.

  Parameters
  ----------
  df : pandas.DataFrame
      Must contain a "cuisine" column (used as the label) and an
      "Unnamed: 0" column; every remaining column is used as a feature.
  random_state : int, default 22
      Seed forwarded to train_test_split so the split is reproducible.

  Returns
  -------
  The value returned by train_test_split for (features, labels) with 30%
  of the rows held out, unpacked by callers as
  X_train, X_test, y_train, y_test.

  Raises
  ------
  KeyError
      If "cuisine" or "Unnamed: 0" is missing from df.
  """
  cuisines_labels = df["cuisine"]
  # Drop the label together with "Unnamed: 0" so neither ends up as a feature.
  # drop() returns a new frame, so the caller's df is left untouched.
  cuisines_features = df.drop(columns=["Unnamed: 0", "cuisine"])
  return train_test_split(cuisines_features, cuisines_labels, test_size = 0.3, random_state = random_state)

def predict(idx, X, y, model):
  """Print the ingredients, true cuisine and predicted cuisine of one sample.

  Parameters
  ----------
  idx : int
      Positional index of the sample inside X and y.
  X : pandas.DataFrame
      Feature matrix; the names of the non-zero columns of the selected row
      are printed as the sample's ingredients.
  y : pandas.Series
      Labels aligned positionally with X.
  model : object
      Fitted estimator exposing predict().

  Returns
  -------
  None; the three lines are written to stdout.
  """
  row = X.iloc[idx]
  # A list indexer keeps a one-row DataFrame, so the model receives the
  # column names and dtypes it was fitted with rather than a bare ndarray.
  sample = X.iloc[[idx]]
  print(f"ingredients: {row[row != 0].keys()}")
  print(f"cuisine: {y.iloc[idx]}")
  print(f"prediction: {model.predict(sample)}")

def predict_proba(idx, X, y, model):
  """Return the per-class probabilities the model assigns to one sample.

  Parameters
  ----------
  idx : int
      Positional index of the sample inside X.
  X : pandas.DataFrame
      Feature matrix.
  y : any
      Not used; kept so existing calls that pass the labels keep working.
  model : object
      Fitted estimator exposing classes_ and predict_proba().

  Returns
  -------
  pandas.DataFrame
      One row per class, with columns "clase" (taken from model.classes_)
      and "prob" (the flattened output of model.predict_proba).
  """
  # A list indexer keeps a one-row DataFrame, so the model receives the
  # column names and dtypes it was fitted with rather than a bare ndarray.
  sample = X.iloc[[idx]]
  probs = model.predict_proba(sample)
  return pd.DataFrame({"clase": model.classes_,
                       "prob": np.ravel(probs)})
  
def get_metrics(X,y, model):
  """Print the classification report of `model` for features X against labels y."""
  report = classification_report(y, model.predict(X))
  print(report)

In [74]:
# Split the full dataset into train/test sets (30% test, default random_state=22)
X_train, X_test, y_train, y_test = preprocessing_data(df)

In [29]:
# One-vs-rest logistic regression using the liblinear solver
# NOTE(review): recent scikit-learn releases appear to deprecate the `multi_class`
# argument - confirm against the installed version before upgrading.
lr = LogisticRegression(multi_class="ovr", solver = "liblinear")

In [75]:
# Fit on the training split; np.ravel flattens the labels to a 1-D array
lr.fit(X_train, np.ravel(y_train))

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='ovr', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [76]:
# Score the fitted model on the held-out test split
acc = lr.score(X_test, np.ravel(y_test))
print(f"accuracy: {acc}")

accuracy: 0.8031693077564637


Testing entries

In [77]:
# Show ingredients, true cuisine and prediction for test sample 20
predict(20, X_test, y_test, lr)

ingredients: Index(['black_pepper', 'cilantro', 'coconut', 'coriander', 'cumin', 'fish',
       'galanga', 'lemongrass', 'lime', 'soy_sauce', 'thai_pepper',
       'vegetable_oil'],
      dtype='object')
cuisine: thai
prediction: ['thai']


In [78]:
# Per-class probabilities for test sample 20
predict_proba(20, X_test, y_test, lr)

Unnamed: 0,clase,prob
0,chinese,0.000402
1,indian,0.000772
2,japanese,0.000444
3,korean,0.000129
4,thai,0.998253


In [87]:
# Show ingredients, true cuisine and prediction for test sample 115
predict(115, X_test, y_test, lr)

ingredients: Index(['chicken', 'soy_sauce'], dtype='object')
cuisine: thai
prediction: ['chinese']


In [88]:
# Per-class probabilities for test sample 115
predict_proba(115, X_test, y_test, lr)

Unnamed: 0,clase,prob
0,chinese,0.483444
1,indian,0.003606
2,japanese,0.196017
3,korean,0.043847
4,thai,0.273086


In [89]:
# Classification report for the first model on the test split
get_metrics(X_test, y_test, lr)

              precision    recall  f1-score   support

     chinese       0.71      0.74      0.72       216
      indian       0.92      0.87      0.89       269
    japanese       0.71      0.78      0.74       218
      korean       0.86      0.80      0.83       237
        thai       0.82      0.82      0.82       259

    accuracy                           0.80      1199
   macro avg       0.80      0.80      0.80      1199
weighted avg       0.81      0.80      0.80      1199



# No vegetable_oil feature

In [35]:
# Load the second dataset and preview its first rows
# NOTE(review): cleaned_data.csv is not downloaded or written by any cell visible
# in this notebook - confirm where it comes from so Restart & Run All works.
df1 = pd.read_csv("cleaned_data.csv")
df1.head()

Unnamed: 0.1,Unnamed: 0,almond,angelica,anise,anise_seed,apple,apple_brandy,apricot,armagnac,artemisia,artichoke,asparagus,avocado,bacon,baked_potato,balm,banana,barley,bartlett_pear,basil,bay,bean,beech,beef,beef_broth,beef_liver,beer,beet,bell_pepper,bergamot,berry,bitter_orange,black_bean,black_currant,black_mustard_seed_oil,black_pepper,black_raspberry,black_sesame_seed,black_tea,blackberry,...,sunflower_oil,sweet_potato,swiss_cheese,tabasco_pepper,tamarind,tangerine,tarragon,tea,tequila,thai_pepper,thyme,tomato,tomato_juice,truffle,tuna,turkey,turmeric,turnip,vanilla,veal,vegetable,vinegar,violet,walnut,wasabi,watercress,watermelon,wheat,wheat_bread,whiskey,white_bread,white_wine,whole_grain_wheat_flour,wine,wood,yam,yeast,yogurt,zucchini,cuisine
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,indian
1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,indian
2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,indian
3,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,indian
4,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,indian


In [90]:
# Split the second dataset with the same helper (30% test, default random_state=22)
X1_train, X1_test, y1_train, y1_test = preprocessing_data(df1)

In [100]:
# Second model, configured identically to the first (one-vs-rest, liblinear)
lr1 = LogisticRegression(multi_class="ovr", solver = "liblinear")

In [101]:
# Fit the second model and score it on its own test split
# (this rebinds `acc`, which previously held the first model's score)
lr1.fit(X1_train, np.ravel(y1_train))
acc = lr1.score(X1_test, np.ravel(y1_test))
print(f"accuracy: {acc}")

accuracy: 0.8115095913261051


In [94]:
# Show ingredients, true cuisine and prediction for sample 115 of the second test split
predict(115,X1_test, y1_test, lr1)

ingredients: Index(['coconut', 'coriander', 'cumin', 'fenugreek', 'pepper', 'soy_sauce',
       'turmeric'],
      dtype='object')
cuisine: thai
prediction: ['thai']


In [97]:
# Per-class probabilities for sample 115 of the second test split
predict_proba(115,X1_test, y1_test, lr1)

Unnamed: 0,clase,prob
0,chinese,0.024552
1,indian,0.034077
2,japanese,0.082623
3,korean,0.001083
4,thai,0.857665


In [98]:
# Classification report for the second model on its test split
get_metrics(X1_test, y1_test, lr1)

              precision    recall  f1-score   support

     chinese       0.78      0.74      0.76       216
      indian       0.92      0.88      0.90       269
    japanese       0.73      0.77      0.75       218
      korean       0.84      0.82      0.83       237
        thai       0.78      0.83      0.80       259

    accuracy                           0.81      1199
   macro avg       0.81      0.81      0.81      1199
weighted avg       0.81      0.81      0.81      1199

