In [1]:
import json
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from joblib import dump, load

## Data to vector class

In [2]:
class DataToVec():
    def __init__(self):
        self.data = self.datacleaning()
    
    def datacleaning(self):
        df = self.readfile()
        df['ingredients'] = [' '.join(map(str, l)) for l in df['ingredients']]
        return self.preprocessing(df)

    def readfile(self):
        df = pd.read_json('data/train.json')
        return df
    
    def preprocessing(self, df):
        cv = CountVectorizer()
        X = cv.fit_transform(df['ingredients'])

        data = pd.DataFrame(X.toarray(), columns= cv.get_feature_names())
        data.drop([col for col, val in data.sum().iteritems() if int(val)<=10 or col.isnumeric()], axis=1, inplace=True)
        data['label'] = df['cuisine']
        return data

In [3]:
obj = DataToVec()

In [4]:
obj.data

Unnamed: 0,achiote,acorn,acting,active,added,adobo,agave,aged,ajwain,ale,...,yolk,yolks,york,yukon,zest,zesty,zinfandel,ziti,zucchini,label
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,greek
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,southern_us
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,filipino
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,indian
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,indian
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39769,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,irish
39770,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,italian
39771,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,irish
39772,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,chinese


# Train Test Split

In [5]:
shuffled_data = obj.data

In [6]:
shuffled_data.sample(frac=1)
train_size = 0.8
test_size = 0.2

train_data, test_data = shuffled_data.sample(frac=train_size), shuffled_data.sample(frac=test_size)
X_train, y_train = train_data.loc[:, train_data.columns != 'label'], train_data['label']
X_test, y_test = test_data.loc[:, test_data.columns != 'label'], test_data['label']

# Random Forest model

In [20]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=300)

In [21]:
rfc.fit(X_train, y_train)

RandomForestClassifier(n_estimators=300)

## Predictions

In [22]:
# run when using fresly train model
predictions = rfc.predict(X_test)

# Accuracy report

In [7]:
from sklearn.metrics import classification_report, accuracy_score

In [24]:
print("RF Accuracy: ",accuracy_score(y_test, predictions)) # features is 1171 df condition is 20

RF Accuracy:  0.9531112507856694


In [25]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

   brazilian       0.99      0.91      0.95       104
     british       0.94      0.89      0.91       167
cajun_creole       0.96      0.94      0.95       314
     chinese       0.94      0.98      0.96       539
    filipino       0.96      0.85      0.90       143
      french       0.94      0.91      0.93       547
       greek       0.97      0.94      0.95       241
      indian       0.96      0.98      0.97       602
       irish       0.98      0.88      0.93       118
     italian       0.94      0.99      0.96      1513
    jamaican       0.99      0.92      0.95       101
    japanese       0.97      0.93      0.95       291
      korean       0.97      0.94      0.96       161
     mexican       0.97      0.98      0.98      1287
    moroccan       0.99      0.92      0.95       155
     russian       1.00      0.86      0.92        86
 southern_us       0.93      0.96      0.94       909
     spanish       0.99    

# Logistic Regression

In [9]:
print(max_iter)

NameError: name 'max_iter' is not defined

In [10]:
from sklearn.linear_model import LogisticRegression

LR = LogisticRegression(max_iter = 1000)
LR.fit(X_train,y_train)

LogisticRegression(max_iter=1000)

In [11]:
pred = LR.predict(X_test)

## Accuracy Report

In [12]:
print("Logistic Regression Accuracy: ",accuracy_score(y_test, pred))

Logistic Regression Accuracy:  0.8338152105593966


In [None]:
print(classification_report(y_test, pred))

### Storing features in file

In [13]:
f = open('data/dataColumn.txt', 'w')
f.write(json.dumps(list(obj.data.columns)))
f.close()

# Storing model in file

In [14]:
dump(rfc, 'models/rfmodel.joblib')

['models/rfmodel.joblib']

# Load model

In [23]:
clf = load('models/rfmodel.joblib')

## Predictions

In [24]:
# run when using stored model
predictions = clf.predict(X_test)