[View in Colaboratory](https://colab.research.google.com/github/jimmyjamesarnold/Compound_Interest/blob/master/Compound_Interest.ipynb)

In [2]:
import warnings
warnings.filterwarnings('ignore')

# load pydrive
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth, files
from oauth2client.client import GoogleCredentials

# Authenticate and create the PyDrive client.
# Order matters: the Colab user must be authenticated before the
# application-default credentials are fetched and handed to PyDrive.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

print('loading data...')
# load files from drive
# Each Drive file is addressed by its file id and written to the working
# directory under the given name; the local copies are read back with
# pandas further down. The *_import handles are only needed for the download
# and are deleted once the features have been built.
train_import = drive.CreateFile({'id':'1KlafA7iNBDFrjFx4dA7RIEYF3_EqPEYi'})
train_import.GetContentFile('train.json')
test_import = drive.CreateFile({'id':'1SJN9ht0gaNa8OfPsOJU3PfzIFfPqKqX0'})
test_import.GetContentFile('test.json')

print('loading packages...')
# performance tracking
from __future__ import print_function
from pprint import pprint
from time import time
# Load packages
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# text and feature manipulation
import re
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
# Models
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, f1_score
# SVM Classifier
from sklearn.svm import SVC
# OvR Classifier
from sklearn.multiclass import OneVsRestClassifier

print('loading data...')
# Read the downloaded JSON files into pandas and key every recipe by its id.
train = pd.read_json('train.json')
train = train.set_index('id')
test = pd.read_json('test.json')
test = test.set_index('id')

print('engineering ingredient features...')
# Keep an independent copy of the training labels.
y = train['cuisine'].copy()

# TF-IDF vectorizer whose tokens are whole comma-separated entries: each
# document is expected to be one string of entries joined by ',' and every
# entry is stripped of surrounding whitespace before it is counted.
tfidf = TfidfVectorizer(
    tokenizer=lambda joined: [entry.strip() for entry in joined.split(',')]
)

# next, define function to tokenize items in series
def inv_count(x):  # call series from pd
    """Vectorize a Series of token lists into a dense TF-IDF DataFrame.

    Each element of `x` must be an iterable of strings (for example a
    recipe's ingredient list). The strings are joined with ',' so that the
    module-level `tfidf` vectorizer, whose tokenizer splits on ',', sees one
    token per original string.

    Note: `tfidf` is re-fitted on every call, so the columns of the result
    depend on the data passed in; two calls on different Series produce
    different, non-aligned column sets.

    Parameters
    ----------
    x : pandas.Series
        Series whose values are lists of strings.

    Returns
    -------
    pandas.DataFrame
        One row per element of `x` (fresh 0..n-1 index, not `x.index`) and
        one column per token in the fitted vocabulary.
    """
    documents = x.apply(','.join)
    weights = tfidf.fit_transform(documents)
    # scikit-learn 1.0 added get_feature_names_out() and 1.2 removed
    # get_feature_names(); support both so the notebook runs on either.
    if hasattr(tfidf, 'get_feature_names_out'):
        feature_names = tfidf.get_feature_names_out()
    else:
        feature_names = tfidf.get_feature_names()
    # toarray() yields a plain ndarray instead of the deprecated np.matrix
    # returned by todense(); the resulting DataFrame is the same.
    return pd.DataFrame(weights.toarray(), columns=feature_names)

# Vectorize inventory of all ingredients
# inv_count() fits `tfidf` on whatever it is given, so calling it on the test
# set as well would learn a second, different vocabulary and IDF weighting:
# the train and test matrices would not share columns and a model trained on
# one could not score the other. Fit on train only ...
train_ingr = inv_count(train.ingredients)
# ... and project the test recipes onto that same fitted vocabulary.
test_ingr = pd.DataFrame(
    tfidf.transform(test.ingredients.apply(','.join)).toarray(),
    columns=train_ingr.columns,
)

# clean up memory space
del train_import, test_import

loading data...
loading packages...
loading data...
engineering ingredient features...


In [0]:
print('split ingredient data...')
# Hold out 40% of the labelled recipes for evaluation, stratified by cuisine.
X_train, X_test, y_train, y_test = train_test_split(
    train_ingr,
    y,
    test_size=0.4,
    random_state=42,
    stratify=y,
)

# Base SVC; the kernel is left at the library default and C / gamma are
# tuned by the grid search below.
# NOTE(review): confirm coef0 has any effect with the default kernel.
print('preparing model...')
model = SVC(
    coef0=1,          # non-default (library default is 0.0)
    shrinking=True,   # use the shrinking heuristic
    verbose=True,     # emit solver logs
    max_iter=-1,      # no iteration cap
    random_state=42,
)
# Hyperparameter grid: every combination of C and gamma is evaluated.
parameters = dict(
    C=[1, 10, 100, 1000],
    gamma=[1, 0.1, 0.001, 0.0001],
)

print('split data for hyperparameter tuning ...')
# Tune on a stratified 20% slice of the training split to keep the search
# affordable; the remaining 80% (X_tune2 / y_tune2) is not used here.
X_tune, X_tune2, y_tune, y_tune2 = train_test_split(
    X_train,
    y_train,
    test_size=0.8,
    random_state=42,
    stratify=y_train,
)

# Exhaustive search over `parameters` with cross-validation.
print("Performing grid search...")
grid_search = GridSearchCV(model, param_grid=parameters, verbose=2)
grid_search.fit(X_tune, y_tune)

print("Best parameters:")
print(grid_search.best_score_)
print(grid_search.best_params_)

In [0]:
# Run classification report
# `clf` was never defined in this notebook, so this cell failed on a fresh
# kernel. The tuned classifier is the estimator GridSearchCV refits with the
# best parameters on the data passed to grid_search.fit() (default refit=True).
clf = grid_search.best_estimator_
print("The model is trained on the full development set.")
print("The scores are computed on the full evaluation set.")
print()
# Score on the hold-out split, which the grid search never saw.
y_true, y_pred = y_test, clf.predict(X_test)
print(classification_report(y_true, y_pred))

In [0]:
# Compound Engineering

print('loading data...')
# Load foodb content and food CSVs - from other project
cmpd_import = drive.CreateFile({'id':'1Jqx15uTUd264d5T8zN11USUA2DGwoiLo'})
cmpd_import.GetContentFile('labeled_standard_contents_in_foodID_by_compoundID.csv')
cmpds = pd.read_csv('labeled_standard_contents_in_foodID_by_compoundID.csv')
# Reshape to long form (one row per food/compound pair), keep only pairs with
# a positive content, and normalise the text so it can be used as tokens.
cmpds = pd.melt(cmpds, id_vars=['name'], var_name='cmpd', value_name='std_content')
# .copy() so the column assignments below write to an independent frame
# rather than to a filtered view of the melted one.
cmpds = cmpds[cmpds['std_content'] > 0].copy()
# Compound names become single tokens: no spaces, and no commas because ','
# is the token separator used by the vectorizer.
cmpds['cmpd'] = cmpds['cmpd'].str.replace(' ','_')
cmpds['cmpd'] = cmpds['cmpd'].str.replace(',','_')
cmpds['name'] = cmpds['name'].str.lower()
# construct df of measured cmpds groupby ingredient. Ignore std_content - for now.
ing_cmpd_df = cmpds.groupby(['name'])['cmpd'].apply(','.join).reset_index()

# merge train and test sets from above for compound processing
df = pd.concat([train.drop("cuisine", axis=1), test], axis=0)

print('extracting compound features...')
# get list of ingredients
all_ingr = inv_count(df.ingredients)
# Index.get_values() was removed in pandas 1.0; .values works on every version.
pantry = all_ingr.columns.values
# build dict to relate ingredients to compounds
cmpd_dict = {}
# Ingredient names share most of their words, so remember the lookup result
# per word instead of rescanning the food table for every ingredient.
token_cmpds = {}
for i in pantry:
    temp = i.replace('(','').replace(')','').split()
    cmpd_list = []
    for j in temp:
        if j not in token_cmpds:
            # regex=False: the word is matched literally. Treated as a regular
            # expression, characters such as '.', '+' or '[' in an ingredient
            # name would match the wrong foods or raise re.error.
            matches = ing_cmpd_df['name'].str.contains(j, regex=False)
            token_cmpds[j] = ','.join(ing_cmpd_df.loc[matches, 'cmpd'])
        cmpd_list.append(token_cmpds[j])
    cmpd_dict[i] = cmpd_list
del token_cmpds

# map recipes to cmpds using cmpd_dict - there must be a more efficient way to do this
def rec_cmpdr(df, cmpd_map=None):
    """Map every recipe in `df` to the compounds of its ingredients.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``ingredients`` column holding an iterable of
        ingredient names per row; the row index is used as the recipe key.
    cmpd_map : dict, optional
        Mapping of ingredient name -> list of compound strings. Defaults to
        the module-level ``cmpd_dict``.

    Returns
    -------
    collections.defaultdict
        ``{recipe index: [compound strings, ...]}`` with one entry for every
        row of `df`. Ingredients missing from the mapping contribute nothing;
        a recipe with no known ingredients maps to an empty list.
    """
    if cmpd_map is None:
        cmpd_map = cmpd_dict
    recipe_dict = defaultdict(list)  # recipe index -> accumulated compounds
    for i, row in df.iterrows():
        # Touching the key up front guarantees an entry even for recipes
        # without ingredients.
        compounds = recipe_dict[i]
        for k in row.ingredients:
            # An unknown ingredient is skipped. It must NOT reset the list:
            # doing so discarded the compounds already collected from the
            # recipe's earlier ingredients.
            compounds.extend(cmpd_map.get(k, ()))
    return recipe_dict
  
# add compounds to df's
print('engineering compound features...')
train["compounds"] = pd.Series(rec_cmpdr(train))
test["compounds"] = pd.Series(rec_cmpdr(test))

# Now let's take an inventory of all compounds
# inv_count() re-fits `tfidf` on its input, so fit on the training recipes
# only and reuse that fitted vocabulary for the test recipes. Fitting twice
# would give the two matrices different, non-aligned columns.
train_cmpd = inv_count(train.compounds)
test_cmpd = pd.DataFrame(
    tfidf.transform(test.compounds.apply(','.join)).toarray(),
    columns=train_cmpd.columns,
)

# clean up memory space
del cmpd_import
del cmpds
del ing_cmpd_df
del df
del all_ingr
del pantry
del cmpd_dict


In [3]:
print('join train and test cmpd and ingr sets...')
# Place the ingredient and compound features side by side (index-aligned).
# A column name present in both frames is disambiguated with the suffixes.
ComboX_TRAIN = train_ingr.join(
    train_cmpd,
    lsuffix='_train_ingr',
    rsuffix='_train_cmpd',
)
# Same combination for the test recipes, kept for the trained model.
ComboX_final_test = test_ingr.join(
    test_cmpd,
    lsuffix='_test_ingr',
    rsuffix='_test_cmpd',
)

# The individual feature frames are no longer needed; free the memory.
del train_cmpd, train_ingr, test_cmpd, test_ingr

joining feature sets...
join train and test cmpd and ingr sets...


In [0]:
# Predictions
print ("Predict on test data ... ")
# `model` is only the unfitted template handed to GridSearchCV, which fits
# clones of it; the fitted, tuned classifier is reached through `grid_search`.
y_pred = grid_search.predict(X_test)

OvR_conf_matrix = confusion_matrix(y_test, y_pred)
OvR_acc_score = accuracy_score(y_test, y_pred)
print('Tuned OvR:')
print(OvR_conf_matrix)
print(OvR_acc_score*100)

# Submission
print ("Generate Submission File ... ")
# The submission needs one prediction per recipe in `test`, not the hold-out
# predictions above. Rebuild the test features in the feature space the model
# was trained in: an unfitted copy of `tfidf` is fitted on the training
# ingredients (reproducing the columns of X_train) and then applied to test.
submission_tfidf = TfidfVectorizer(**tfidf.get_params())
submission_tfidf.fit(train.ingredients.apply(','.join))
test_features = pd.DataFrame(
    submission_tfidf.transform(test.ingredients.apply(','.join)).toarray(),
    columns=X_train.columns,
    index=test.index,
)
test_pred = grid_search.predict(test_features)
# `test` is indexed by recipe id; iterating the frame itself yields column names.
test_id = list(test.index)
sub = pd.DataFrame({'id': test_id, 'cuisine': test_pred}, columns=['id', 'cuisine'])
sub.to_csv('svm_output.csv', index=False)

In [0]:
# Other models for testing

# cross-validation helper (was used below without being imported)
from sklearn.model_selection import cross_validate
# logistic Regression
from sklearn.linear_model import LogisticRegression
# Decision Trees
from sklearn.tree import DecisionTreeClassifier
# NaiveBayes
from sklearn.naive_bayes import GaussianNB
# RandomForest
from sklearn.ensemble import RandomForestClassifier


def _fit_and_report(label, estimator):
    """Fit `estimator` on the training split and report hold-out metrics.

    Fits on (X_train, y_train), predicts X_test, then prints `label`, the
    confusion matrix and the accuracy as a percentage.

    Returns
    -------
    tuple
        (predictions, confusion matrix, accuracy score)
    """
    estimator.fit(X_train, y_train)
    predictions = estimator.predict(X_test)
    conf_matrix = confusion_matrix(y_test, predictions)
    acc_score = accuracy_score(y_test, predictions)
    print(label)
    print(conf_matrix)
    print(acc_score*100)
    return predictions, conf_matrix, acc_score


# Logistic Regression
lr = LogisticRegression(multi_class= 'ovr')
# Cross-validate on the training split only (the undefined `X` is replaced by
# X_train / y_train) so the hold-out set stays untouched, and print the mean
# score, which was previously computed and discarded.
score = cross_validate(lr, X_train, y_train, return_train_score=False)
print(score["test_score"].mean())
lr_predict, lr_conf_matrix, lr_acc_score = _fit_and_report('Logistic Regression:', lr)

# Decision Trees
dt = DecisionTreeClassifier()
dt_predict, dt_conf_matrix, dt_acc_score = _fit_and_report('Decision Trees:', dt)

# NaiveBayes
nb = GaussianNB()
nb_predict, nb_conf_matrix, nb_acc_score = _fit_and_report('NaiveBayes:', nb)

# RandomForest
rf = RandomForestClassifier()
rf_predict, rf_conf_matrix, rf_acc_score = _fit_and_report('RandomForest:', rf)

# The two SVC sections used to score `rf_predict` (copy-paste), so they
# reported the random forest's numbers; they also overwrote the estimator
# variable with its predictions. Each kernel is now scored on its own
# predictions and the fitted estimator is kept.

#first test the linear kernel first and check the accuracy
lin_svc = SVC(kernel='linear')
lin_svc_predict, lin_svc_conf_matrix, lin_svc_acc_score = _fit_and_report('Linear SVC:', lin_svc)

#second try the rbf kernel
rbf_svc = SVC(kernel='rbf')
rbf_svc_predict, rbf_svc_conf_matrix, rbf_svc_acc_score = _fit_and_report('rbf SVC:', rbf_svc)