***
# Sample Code
***

In [1]:
import numpy as np
import pandas as pd
import scipy as sp
import pickle
import itertools
import os
import string
import re
import math
from io import StringIO, BytesIO
from collections import defaultdict, Counter
import matplotlib.pyplot as plt
from tqdm import tqdm

from IPython.display import display

import nltk
from nltk.corpus import stopwords
stop = set(stopwords.words('english'))
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.stem import PorterStemmer
ps = PorterStemmer()

from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from sklearn.metrics import confusion_matrix, roc_auc_score, f1_score, accuracy_score

from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import Word2Vec

from copy import deepcopy
import operator

In [4]:
# pip install scikit-learn==1.0.2


In [5]:
import sklearn
print(sklearn.__version__)


1.0.2


***
# Load and Clean the Train Set
***

In [6]:
def clean_dataset(df,keep_cols,encode_cols=[]):
    """Clean the dataset
    """
    # Step 1: keep only columns to be used
    df = df[df.columns[df.columns.isin(keep_cols)]].loc[:,keep_cols]
    # Step 2: reformat bytes to strings
    for col in encode_cols:
        df[col] = [s.replace("b'","").replace("'","") for s in df[col]]
    return df

In [7]:
df = pd.read_csv("train.csv")

In [8]:
df.head(3)

Unnamed: 0,id,attributes.HappyHour,attributes.Ambience,hours.Tuesday,postal_code,attributes.AgesAllowed,attributes.GoodForDancing,attributes.OutdoorSeating,hours.Saturday,attributes.Corkage,...,attributes.AcceptsInsurance,attributes.RestaurantsDelivery,attributes.DietaryRestrictions,attributes.BusinessAcceptsBitcoin,address,attributes.GoodForKids,attributes.GoodForMeal,hours,label,review
0,0,b'True',"b""{'romantic': False, 'intimate': False, 'clas...",b'15:0-2:0',b'44107',,,b'False',b'11:30-2:0',,...,,b'False',,,b'17800 Detroit Ave',b'False',"b""{'dessert': False, 'latenight': False, 'lunc...","{'Monday': '16:0-2:0', 'Tuesday': '15:0-2:0', ...",american (traditional),"So, we stopped here on our way to the Side Que..."
1,1,,"b""{'romantic': False, 'intimate': False, 'tour...",b'11:0-21:0',b'85042',,,b'True',b'11:0-20:30',,...,,b'False',,b'False',"b'2160 E Baseline Rd, Ste 128'",b'True',"b""{'dessert': False, 'latenight': False, 'lunc...","{'Monday': '11:0-21:0', 'Tuesday': '11:0-21:0'...",american (new),This is our go-to healthy spot! The food is al...
2,2,,,b'11:0-21:0',b'M4M 3G6',,,,b'11:0-21:0',,...,,,,,b'1000 Gerrard St E',,,"{'Monday': '11:0-21:0', 'Tuesday': '11:0-21:0'...",mexican,Food court meal at Gerrard Square. It's been ...


In [9]:
keep_columns = ['id','label','name','state','postal_code','review','review_count','is_open','stars','attributes','hours']
encode_columns = ['name','state','postal_code']

df = clean_dataset(df,keep_columns,encode_cols=encode_columns)

In [10]:
df.head(3)

Unnamed: 0,id,label,name,state,postal_code,review,review_count,is_open,stars,attributes,hours
0,0,american (traditional),Rush Inn,OH,44107,"So, we stopped here on our way to the Side Que...",70,1,4.0,"{'WiFi': ""u'no'"", 'Caters': 'True', 'HappyHour...","{'Monday': '16:0-2:0', 'Tuesday': '15:0-2:0', ..."
1,1,american (new),GreenMix,AZ,85042,This is our go-to healthy spot! The food is al...,181,1,3.5,"{'WiFi': ""u'no'"", 'HasTV': 'False', 'GoodForMe...","{'Monday': '11:0-21:0', 'Tuesday': '11:0-21:0'..."
2,2,mexican,BarBurrito - Gerrard,ON,M4M 3G6,Food court meal at Gerrard Square. It's been ...,6,1,3.0,"{'RestaurantsTakeOut': 'False', 'RestaurantsRe...","{'Monday': '11:0-21:0', 'Tuesday': '11:0-21:0'..."


***
# Explanatory Data Analysis
***

In [11]:
# the label distribution
labels = dict.fromkeys(set(df['label'].tolist()),0)
for l in  list(set(df['label'].tolist())):
    labels[l] = (df['label'] == l).sum()

# verify:
sum(list(labels.values())) == df.shape[0]

display(labels)

{'japanese': 1063,
 'chinese': 1696,
 'american (new)': 1399,
 'asian fusion': 362,
 'mediterranean': 728,
 'canadian (new)': 484,
 'american (traditional)': 2680,
 'italian': 2032,
 'mexican': 2217,
 'thai': 483}

In [12]:
# reviews
print(f'The minimum number of reviews is {df.review_count.min()}, the maximum is {df.review_count.max()}, the average is {df.review_count.mean()}.')

The minimum number of reviews is 5, the maximum is 5763, the average is 88.82402617163724.


***
# Train Classification Models
***

## Name as Identifier

In [13]:
def process_name(df):
    # if pre-existed, remove:
    if 'name_sep' in df:
        df.drop(columns=['name_sep'],inplace=True)
    # separate name by uppercase letter:
    names = df['name'].tolist()
    new_names = []
    for name in names:
        new_names.append(" ".join(w.lower() for w in re.findall('[A-Z][^A-Z]*', name)))
    # add a new column:
    cols = df.columns.tolist().copy()
    ix = cols.index('name') + 1
    cols.insert(ix,'name_sep')
    df['name_sep'] = new_names
    df = df.loc[:,cols]
    return df

In [14]:
def tokenize_names(df):
    # create name tokens:
    name_tokens = []
    all_name_tokens = {}
    for name in df['name_sep']:
        words = word_tokenize(name)
        name_tokens.append(words)
        for w in words:
            if w.lower() in all_name_tokens:
                all_name_tokens[w.lower()] += 1
            else:
                all_name_tokens[w.lower()] = 1
    return name_tokens,all_name_tokens

In [15]:
df = process_name(df)

In [16]:
df.head(3)

Unnamed: 0,id,label,name,name_sep,state,postal_code,review,review_count,is_open,stars,attributes,hours
0,0,american (traditional),Rush Inn,rush inn,OH,44107,"So, we stopped here on our way to the Side Que...",70,1,4.0,"{'WiFi': ""u'no'"", 'Caters': 'True', 'HappyHour...","{'Monday': '16:0-2:0', 'Tuesday': '15:0-2:0', ..."
1,1,american (new),GreenMix,green mix,AZ,85042,This is our go-to healthy spot! The food is al...,181,1,3.5,"{'WiFi': ""u'no'"", 'HasTV': 'False', 'GoodForMe...","{'Monday': '11:0-21:0', 'Tuesday': '11:0-21:0'..."
2,2,mexican,BarBurrito - Gerrard,bar burrito - gerrard,ON,M4M 3G6,Food court meal at Gerrard Square. It's been ...,6,1,3.0,"{'RestaurantsTakeOut': 'False', 'RestaurantsRe...","{'Monday': '11:0-21:0', 'Tuesday': '11:0-21:0'..."


In [17]:
# split as train and validation
df_train = df.iloc[:-1000,:]
df_test = df.iloc[-1000:,:]

In [18]:
name_tokens,all_name_tokens = tokenize_names(df_train)

In [19]:
def find_strong_name_indicators(df,name_tokens,all_name_tokens,tol=1e-8,threshold=0.95,occurence=10):
    """Find strong indicators in business names to predict restaurant type.
    """
    labels0 = dict.fromkeys(set(df['label'].tolist()),0)
    count_by_name_tokens = {key:deepcopy(labels0) for key,_ in all_name_tokens.items()}
    freq_by_name_tokens = {key:deepcopy(labels0) for key,_ in all_name_tokens.items()}
    
    for i in df['id']:
        l = df['label'][i]
        for w in name_tokens[i]:
            count_by_name_tokens[w][l] += 1

    for t in count_by_name_tokens:
        for l in count_by_name_tokens[t]:
            freq_by_name_tokens[t][l] = count_by_name_tokens[t][l]/all_name_tokens[t]
        try:
            assert abs(sum(freq_by_name_tokens[t].values()) - 1) < tol
        except:
            print(t)

    strong_ind = {}
    for t in count_by_name_tokens:
        counts = count_by_name_tokens[t]
        freqs = freq_by_name_tokens[t]
        label_opt,max_freq = max(freqs.items(), key=operator.itemgetter(1))
        tot = sum(counts.values())
        if max_freq > threshold and tot >= occurence:
            strong_ind[t] = (tot,max_freq,label_opt)
    return strong_ind

In [20]:
def predict_with_name_indicator(df,strong_ind):
    """Predict from business name.
    """
    if 'predicted' in df:
        df.drop(columns=['predicted'],inplace=True)
    df['predicted'] = 'undetermined'
    for i in df['id']:
        words = word_tokenize(df['name_sep'][i])
        for w in words:
            if w in strong_ind:
                df['predicted'][i] = strong_ind[w][2]
    return df

In [21]:
strong_ind = find_strong_name_indicators(df_train,name_tokens,all_name_tokens)

In [22]:
df = predict_with_name_indicator(df,strong_ind)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['predicted'][i] = strong_ind[w][2]


In [23]:
# split as train and validation
df_train = df.iloc[:-1000,:]
df_test = df.iloc[-1000:,:]

In [24]:
# in train set
correct_train = sum(df_train.loc[df_train['predicted'] != 'undetermined']['predicted'] == df_train.loc[df['predicted'] != 'undetermined']['label'])
total_identified_train = len(df_train.loc[df_train['predicted'] != 'undetermined']['predicted'])
accuracy_train = correct_train/total_identified_train
print(f'Using strong name indicators, in the train set, {total_identified_train} restaurants are identified, among which {correct_train} predictions are correct, the accuracy is {accuracy_train}')

Using strong name indicators, in the train set, 3962 restaurants are identified, among which 3931 predictions are correct, the accuracy is 0.9921756688541141


In [25]:
# in train set
correct_test = sum(df_test.loc[df_test['predicted'] != 'undetermined']['predicted'] == df_test.loc[df['predicted'] != 'undetermined']['label'])
total_identified_test = len(df_test.loc[df_test['predicted'] != 'undetermined']['predicted'])
accuracy_test = correct_test/total_identified_test
print(f'Using strong name indicators, in the test set, {total_identified_test} restaurants are identified, among which {correct_test} predictions are correct, the accuracy is {accuracy_test}')

Using strong name indicators, in the test set, 326 restaurants are identified, among which 320 predictions are correct, the accuracy is 0.9815950920245399


In [26]:
# overall
correct = sum(df.loc[df['predicted'] != 'undetermined']['predicted'] == df.loc[df['predicted'] != 'undetermined']['label'])
total_identified = len(df.loc[df['predicted'] != 'undetermined']['predicted'])
accuracy = correct/total_identified
print(f'Using strong name indicators, in entire sample, {total_identified} restaurants are identified, among which {correct} predictions are correct, the accuracy is {accuracy}')

Using strong name indicators, in entire sample, 4288 restaurants are identified, among which 4251 predictions are correct, the accuracy is 0.9913712686567164


***
# Review Text Mining as Identifier
***

In [27]:
df_undeter = df.loc[df['predicted']=='undetermined']
df_undeter_train = df_train.loc[df_train['predicted']=='undetermined']
df_undeter_valid = df_test.loc[df_test['predicted']=='undetermined']

In [28]:
reviews_undeter = df_undeter.review
reviews_train = df_undeter_train.review
reviews_valid = df_undeter_valid.review

In [29]:
def preprocess_reviews(doc,stemming=True,by_sentence= False):
    """Pre-process reviews.
    """
    # step 1: get sentences
    sentences = sent_tokenize(doc)
    # step 2: get tokens
    tokens = []
    for sentence in sentences:
        words = word_tokenize(sentence) # get words  i
        # step 3 (optional): stemming
        if stemming:
            words = [ps.stem(word) for word in words if word.lower() not in stop]
        else:
            words = [word for word in words if word.lower() not in stop]
        if by_sentence:
            tokens.append(words)
        else:
            tokens += words
    if by_sentence:
        return  [[w.lower() for w in t] for t in tokens]
    else:
        return [w.lower() for w in tokens]

In [30]:
print(df_train.review[:3].apply(preprocess_reviews))

0    [,, stop, way, side, quest, ,, street, ., \n\n...
1    [go-to, healthi, spot, !, food, alway, fresh, ...
2    [food, court, meal, gerrard, squar, ., 's, sin...
Name: review, dtype: object


In [31]:
count = CountVectorizer(stop_words=stop)
bag = count.fit_transform(df_train.review)
vocab = count.vocabulary_

In [32]:
tfidf = TfidfVectorizer(vocabulary=vocab, norm='l2', smooth_idf=True)

In [33]:
X_train = tfidf.fit_transform(reviews_train)
X_valid = tfidf.fit_transform(reviews_valid)
y_train = df_undeter_train.label
y_valid = df_undeter_valid.label

In [34]:
assert X_train.shape[0] == len(y_train)
assert X_valid.shape[0] == len(y_valid)

In [35]:
c_list = [0.001,0.01,0.1,1,10,100,1000]
res_collection = {'lbfgs':{},'liblinear':{}}

for solver in list(res_collection.keys()):
    print("=============================")
    res_collection[solver] = {}
    for c in c_list:
        res_collection[solver][str(c)] = {}
        print(f'Solver: {solver}, C: {c}:')
        mod = LogisticRegression(max_iter=10000,random_state=42,solver=solver,penalty='l2',C=c,class_weight="balanced")
        mod.fit(X_train, y_train)
        pred_valid = mod.predict(X_valid)
        pred_prob_valid = mod.predict_proba(X_valid)
        res_collection[solver][str(c)]['accuracy'] = accuracy_score(y_valid,pred_valid)
        res_collection[solver][str(c)]['auroc'] = roc_auc_score(y_valid,pred_prob_valid,multi_class='ovr')
        res_collection[solver][str(c)]['macroF1'] = f1_score(y_valid,pred_valid,average='macro')
        res_collection[solver][str(c)]['microF1'] = f1_score(y_valid,pred_valid,average='micro')


Solver: lbfgs, C: 0.001:
Solver: lbfgs, C: 0.01:
Solver: lbfgs, C: 0.1:
Solver: lbfgs, C: 1:
Solver: lbfgs, C: 10:
Solver: lbfgs, C: 100:
Solver: lbfgs, C: 1000:
Solver: liblinear, C: 0.001:
Solver: liblinear, C: 0.01:
Solver: liblinear, C: 0.1:
Solver: liblinear, C: 1:
Solver: liblinear, C: 10:
Solver: liblinear, C: 100:
Solver: liblinear, C: 1000:


In [36]:
solvers = ['lbfgs','liblinear']
c_list = [0.001,0.01,0.1,1,10,100,1000]

best_grid_word2vec = {'accuracy':None,'auroc':None,'macro_f1':None,'micro_f1':None}
accuracy_star = -np.inf
auroc_star = -np.inf
macro_f1_star = -np.inf
micro_f1_star = -np.inf

for solver in solvers:
    for c in c_list:
        accuracy = res_collection[solver][str(c)]['accuracy']
        auroc = res_collection[solver][str(c)]['auroc']
        macro_f1 = res_collection[solver][str(c)]['macroF1']
        micro_f1 = res_collection[solver][str(c)]['microF1']
        if accuracy > accuracy_star:
            best_grid_word2vec['accuracy'] = solver + "+" + str(c)
            accuracy_star = accuracy
        if auroc > auroc_star:
            best_grid_word2vec['auroc'] = solver + "+" + str(c)
            auroc_star = auroc
        if macro_f1 > macro_f1_star:
            best_grid_word2vec['macro_f1'] = solver + "+" + str(c)
            macro_f1_star = macro_f1
        if micro_f1 > micro_f1_star:
            best_grid_word2vec['micro_f1'] = solver + "+" + str(c)
            micro_f1_star = micro_f1

print(best_grid_word2vec)

{'accuracy': 'liblinear+1', 'auroc': 'liblinear+1', 'macro_f1': 'lbfgs+1', 'micro_f1': 'liblinear+1'}


In [37]:
res_collection['liblinear']['1']

{'accuracy': 0.7091988130563798,
 'auroc': 0.9462834723220401,
 'macroF1': 0.7190222685790466,
 'microF1': 0.7091988130563798}

In [38]:
mod_opt = LogisticRegression(max_iter=10000,random_state=42,solver='liblinear',penalty='l2',C=1,class_weight="balanced")
mod_opt.fit(X_train, y_train)
optpred_train = mod_opt.predict(X_train)
optpred_valid = mod.predict(X_valid)

In [39]:
df_final = df.copy()

In [40]:
df_undeter_train.reset_index(inplace=True,drop=True)
df_undeter_valid.reset_index(inplace=True,drop=True)

In [41]:
for i in range(df_undeter_train.shape[0]):
    ID = df_undeter_train.id[i]
    assert df_final.predicted[ID] == "undetermined"
    df_final.predicted[ID] = optpred_train[i]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final.predicted[ID] = optpred_train[i]


In [42]:
for i in range(df_undeter_valid.shape[0]):
    ID = df_undeter_valid.id[i]
    assert df_final.predicted[ID] == "undetermined"
    df_final.predicted[ID] = optpred_valid[i]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final.predicted[ID] = optpred_valid[i]


In [43]:
final_acc_all = sum(df_final.predicted == df_final.label)/df_final.shape[0]
final_acc_train = sum(df_final.predicted[:-1000] == df_final.label[:-1000])/(df_final.shape[0] - 1000)
final_acc_valid = sum(df_final.predicted[-1000:] == df_final.label[-1000:])/1000

print(f'The entire sample has an accuracy of {final_acc_all}, the train part is {final_acc_train}, the validation part is {final_acc_valid}')

The entire sample has an accuracy of 0.8953895313451005, the train part is 0.9039031620553359, the validation part is 0.792


***
# Predict the test set

In [44]:
df_test = pd.read_csv("test.csv")

In [45]:
keep_columns_test = ['id','name','state','postal_code','review','review_count','is_open','stars','attributes','hours']

df_test = clean_dataset(df_test,keep_columns_test,encode_cols=encode_columns)

In [46]:
df_test.head(3)

Unnamed: 0,id,name,state,postal_code,review,review_count,is_open,stars,attributes,hours
0,0,Fuse,ON,M5A 1T1,Overall wonderful experience. \n\nThe owner Fa...,7,0,4.5,"{'RestaurantsReservations': 'True', 'Restauran...","{'Tuesday': '11:30-0:0', 'Wednesday': '11:30-0..."
1,1,"b""Vickys Fish & Chips""",ON,M6R 2N2,"VIBE: Nieghbourhood hole. TV playing sports, p...",24,1,3.0,"{'RestaurantsAttire': ""u'casual'"", 'GoodForKid...","{'Monday': '12:0-21:0', 'Tuesday': '12:0-21:0'..."
2,2,Viet House,WI,53704,"Viet House is very good. The food was fresh, ...",18,0,3.5,"{'BusinessParking': ""{'garage': False, 'street...","{'Monday': '11:30-21:0', 'Wednesday': '11:30-2..."


In [47]:
df_test['name'] = [s.replace("b\"","").replace("\"","") for s in df_test.name]

In [48]:
df_test

Unnamed: 0,id,name,state,postal_code,review,review_count,is_open,stars,attributes,hours
0,0,Fuse,ON,M5A 1T1,Overall wonderful experience. \n\nThe owner Fa...,7,0,4.5,"{'RestaurantsReservations': 'True', 'Restauran...","{'Tuesday': '11:30-0:0', 'Wednesday': '11:30-0..."
1,1,Vickys Fish & Chips,ON,M6R 2N2,"VIBE: Nieghbourhood hole. TV playing sports, p...",24,1,3.0,"{'RestaurantsAttire': ""u'casual'"", 'GoodForKid...","{'Monday': '12:0-21:0', 'Tuesday': '12:0-21:0'..."
2,2,Viet House,WI,53704,"Viet House is very good. The food was fresh, ...",18,0,3.5,"{'BusinessParking': ""{'garage': False, 'street...","{'Monday': '11:30-21:0', 'Wednesday': '11:30-2..."
3,3,Thai Boat,AB,T2E 6Z3,This is a really good place. Not truly authent...,52,1,3.0,"{'RestaurantsDelivery': 'False', 'RestaurantsA...","{'Monday': '0:0-0:0', 'Tuesday': '17:0-21:30',..."
4,4,Sauce Pizza and Wine,AZ,85085,This place is great. Think fast food Italian s...,213,1,3.5,"{'BusinessAcceptsCreditCards': 'True', 'GoodFo...","{'Monday': '0:0-0:0', 'Tuesday': '11:0-21:0', ..."
...,...,...,...,...,...,...,...,...,...,...
9995,9995,Mexquite Mexican Eatery,NC,28277,"awesome food, great price. Fresh, made to orde...",22,0,4.0,"{'NoiseLevel': ""u'average'"", 'BusinessAcceptsC...","{'Monday': '11:0-21:0', 'Tuesday': '11:0-21:0'..."
9996,9996,Sweet Tomatoes,AZ,85282,What a great experience!! I love this locatio...,180,1,3.5,"{'RestaurantsTakeOut': 'True', 'NoiseLevel': ""...","{'Monday': '11:0-21:0', 'Tuesday': '11:0-21:0'..."
9997,9997,Brewery Bar & Grill,NV,89120,I love hanging out at this bar! It may not be...,8,0,3.5,"{'RestaurantsAttire': ""u'casual'"", 'Alcohol': ...",
9998,9998,Augies Sports Grill,AZ,85338,Here is the scenario... Baseball team and a fe...,147,1,3.5,"{'Ambience': ""{'romantic': False, 'intimate': ...","{'Monday': '10:0-2:0', 'Tuesday': '10:0-2:0', ..."


In [49]:
df_test = process_name(df_test)

In [50]:
name_tokens_final,all_name_tokens_final = tokenize_names(df)

In [51]:
strong_ind_final = find_strong_name_indicators(df,name_tokens_final,all_name_tokens_final)

In [52]:
df_test = predict_with_name_indicator(df_test,strong_ind_final)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['predicted'][i] = strong_ind[w][2]


In [53]:
df_test_undeter = df_test.loc[df_test['predicted']=='undetermined']

In [54]:
reviews_test_undeter = df_test_undeter.review

In [55]:
X_test = tfidf.fit_transform(reviews_test_undeter)

In [56]:
optpred_test = mod_opt.predict(X_test)

In [57]:
df_test_final = df_test.copy()

In [58]:
df_test_undeter.reset_index(inplace=True,drop=True)

In [59]:
for i in range(df_test_undeter.shape[0]):
    ID = df_test_undeter.id[i]
    assert df_test_final.predicted[ID] == "undetermined"
    df_test_final.predicted[ID] = optpred_test[i]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test_final.predicted[ID] = optpred_test[i]


In [60]:
df_test_final

Unnamed: 0,id,name,name_sep,state,postal_code,review,review_count,is_open,stars,attributes,hours,predicted
0,0,Fuse,fuse,ON,M5A 1T1,Overall wonderful experience. \n\nThe owner Fa...,7,0,4.5,"{'RestaurantsReservations': 'True', 'Restauran...","{'Tuesday': '11:30-0:0', 'Wednesday': '11:30-0...",canadian (new)
1,1,Vickys Fish & Chips,vickys fish & chips,ON,M6R 2N2,"VIBE: Nieghbourhood hole. TV playing sports, p...",24,1,3.0,"{'RestaurantsAttire': ""u'casual'"", 'GoodForKid...","{'Monday': '12:0-21:0', 'Tuesday': '12:0-21:0'...",thai
2,2,Viet House,viet house,WI,53704,"Viet House is very good. The food was fresh, ...",18,0,3.5,"{'BusinessParking': ""{'garage': False, 'street...","{'Monday': '11:30-21:0', 'Wednesday': '11:30-2...",thai
3,3,Thai Boat,thai boat,AB,T2E 6Z3,This is a really good place. Not truly authent...,52,1,3.0,"{'RestaurantsDelivery': 'False', 'RestaurantsA...","{'Monday': '0:0-0:0', 'Tuesday': '17:0-21:30',...",thai
4,4,Sauce Pizza and Wine,sauce pizza and wine,AZ,85085,This place is great. Think fast food Italian s...,213,1,3.5,"{'BusinessAcceptsCreditCards': 'True', 'GoodFo...","{'Monday': '0:0-0:0', 'Tuesday': '11:0-21:0', ...",italian
...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9995,Mexquite Mexican Eatery,mexquite mexican eatery,NC,28277,"awesome food, great price. Fresh, made to orde...",22,0,4.0,"{'NoiseLevel': ""u'average'"", 'BusinessAcceptsC...","{'Monday': '11:0-21:0', 'Tuesday': '11:0-21:0'...",mexican
9996,9996,Sweet Tomatoes,sweet tomatoes,AZ,85282,What a great experience!! I love this locatio...,180,1,3.5,"{'RestaurantsTakeOut': 'True', 'NoiseLevel': ""...","{'Monday': '11:0-21:0', 'Tuesday': '11:0-21:0'...",american (traditional)
9997,9997,Brewery Bar & Grill,brewery bar & grill,NV,89120,I love hanging out at this bar! It may not be...,8,0,3.5,"{'RestaurantsAttire': ""u'casual'"", 'Alcohol': ...",,american (traditional)
9998,9998,Augies Sports Grill,augies sports grill,AZ,85338,Here is the scenario... Baseball team and a fe...,147,1,3.5,"{'Ambience': ""{'romantic': False, 'intimate': ...","{'Monday': '10:0-2:0', 'Tuesday': '10:0-2:0', ...",american (traditional)


In [61]:
df_out = df_test_final.loc[:,['id','predicted']]

In [62]:
df_out.columns = ['Id','Predicted']

In [63]:
df_out

Unnamed: 0,Id,Predicted
0,0,canadian (new)
1,1,thai
2,2,thai
3,3,thai
4,4,italian
...,...,...
9995,9995,mexican
9996,9996,american (traditional)
9997,9997,american (traditional)
9998,9998,american (traditional)


In [64]:
df_out.to_csv("predicted_LinlinHe.csv",index=False)