In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import itertools
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim.models import LdaModel, Word2Vec, Doc2Vec
from sklearn.decomposition import PCA
from collections import defaultdict  # For word frequency
import re
import gensim.downloader as api
from sklearn.preprocessing import LabelBinarizer
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from imblearn.over_sampling import RandomOverSampler
import warnings
from sklearn.metrics import hamming_loss
from pandarallel import pandarallel
from thefuzz import process, fuzz
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import MultiLabelBinarizer

pandarallel.initialize()
warnings.simplefilter(action='ignore', category=FutureWarning)
pd.options.mode.chained_assignment = None

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')


def find_root_item_type(word, word_list, limit, min_score=90):
    _words = process.extract(word, word_list, scorer=fuzz.token_sort_ratio, limit=limit)
    score_words = list(filter(lambda x: x[1] >= min_score, _words))
    score_words.sort(key=lambda x: len(x[0]) / (x[1]))
    filtered_words = [x[0] for x in score_words]
    if len(filtered_words) > 1:
        filtered_word = filtered_words[0]
    else:
        filtered_word = word

    return filtered_word


def clean_text(line):
    clean1 = re.sub(r"\b(oz|ml|(\d\w)|on|pc|combo|liter)\b|(\(.+\))|[^A-Za-z\s]", "", line)
    #    clean2=re.sub(r"", "", clean1)
    clean_stop = " ".join([word for word in clean1.split() if word not in stop_words])

    return clean_stop

raw_df = pd.read_csv('./raw.csv')

df = raw_df.copy()
df['total_eater_revenue'] = df[['total_eater_spend','total_eater_discount']].sum(axis=1)
df['total_orders_promo'] = df['first_time_orders_promo'] + df['returning_orders_promo']
df['completion_rate'] = df['completed_orders'] / df['accepted_orders']
df['acceptance_rate'] = df['accepted_orders'] / df['requested_orders']
df['avg_prep_time_min'] = df['avg_prep_time'] / 60.0
df['spend_per_prep_min'] = df['total_eater_spend'] / (df['avg_prep_time']*df['accepted_orders']*1.0)
df['total_eater_revenue'] = df[['total_eater_spend','total_eater_discount']].sum(axis=1)
# preprocess text by removing stop words
stop_words = set(stopwords.words('english'))

df['name'] = df['name'].str.lower()
df['clean_name'] = df['name'].apply(clean_text)

desc_regex=r"(\b(spicy|^classic|ultimate|signature)\b)|(french|fresh cut|home)\s(?<!fries)"
df['item_type'] = df['clean_name'].apply(lambda x: " ".join(w for w in re.sub(desc_regex, "", x).split())).tolist()
df.loc[df.item_type == '', 'item_type'] = df['clean_name']
all_items = set(df['item_type'].tolist())

# create food item dictionary
item_df = df[['item_type']].drop_duplicates()
# Fuzzy match dictionary on self to consolidate item types
item_df['item_type_new'] = item_df.parallel_apply(lambda x: find_root_item_type(x['item_type'], all_items, min_score=90, limit=6), axis=1)

df2 = df.merge(item_df, on='item_type')

cuisine_t={
    'italian':r'fettucini|rigatoni|lasagna|spaghetti|penne|gnocchi|tortellini|pasta|carbonara|pizza|calzone|garlic bread|alfredo|mozzarella|caesar|cacio|fe[t]{1,2}u[c]{1,2}ine|ravioli|burrata|proscuito|chicken parm|alfredo',
    'vietnamese': r'\b(pho)\b|spring roll|vietnamese|\b(ba[nh]{2})\b mi|thit nuong|cha gio|\bcuon\b|summer roll',
    'korean': r'bibimbap|korean|kimchi',
    'indian': r'paneer|tikka|masala|indian|pakora|gobhi|samosa|naan|basmati|lassi|saag|biryani|makhni|vindaloo|tandoori|korma|butter chicken|dolma',
    'southern': r'fried chicken|gumbo|brisket|smoke|bbq|fried zucchini|bbq|coleslaw',
    'mediterranean': r'pita|tabouleh|fattoush|gyro|kebab|kabob|skewer|falafel|greek|kofta|shawarma|hummus|tzatziki',
    'breakfast': r'orange juice|egg|breakfast|bagel|toast|bacon|omelette|croissant|hash brown|lox|waffle|pancake|sausage',
    'american': r'mac.*cheese|burger|\bwing[s]?\b|bacon|reuben|cheesesteak|tater tots|fries|buffalo|ranch|onion rings|grilled cheese|melt|nashville|slider|chili cheese|garlic knots|tender|nuggets|dog',
    'chinese': r'orange chicken|tofu|chinese|mein|dumplings|mongolian|potsticker|fried rice|general tsos|wontons|chow fun|szechuan|beef broccoli|kung pao|\b(beef broccoli)\b',
    'japanese': r'ramen|sushi|sashimi|nigiri|unagi|katsu|((?<!egg)(?<!spring)(?<!lobster)(?<!lamb)(?<!curry)(?<!cinnamon)\sroll)|gyoza|tempura|miso|edamame|udon|wasabi|karaage|teriyaki|\bsoba\b',
    'latin': r'mexican|taco|burrito|guac|chorizo|al pastor|quesadilla|salsa|birria|horchata|carne asada|el verde|refried beans|tostada|nachos|churro|arepa|empanada|tortillas|jerk|caribbean',
    'thai': r'panang|pad thai|pad see ew|\bthai\b|drunken noodle|((red|yellow|green)\scurry)|tom kha|massaman|satay',
    'sandwiches': r'sandwich|blt|turkey club|roast beef',
    'soup': r'(soup)',
    'coffee': r'latte|capuccino|coffee|cappucino|cold brew',
    'drinks': r'water|coke|sprite|ginger ale|lemonade|pepsi|juice|\b(tea)\b|gatorade',
    'hawaiian': r'hawaiian|poke|musubi',
    'healthy': r'salad|juice|healthy|fruit|acai|berry|vegan|vegetables|veggies|smoothie',
    'sweets': r'waffle|ice cream|tiramisu|oreo|cinnamon roll|cheesecake|smoothie|donuts|chocolate|cookie|caramel|pudding',
    'seafood': r'fish|lobster|crab|shrimp',
    'rice': r'rice bowl|white rice',
}

cuisine_list = list(cuisine_t.keys())
for k,v in cuisine_t.items():
    df2.loc[df2['item_type_new'].str.contains(v, regex=True), k] = 1

df2[cuisine_list] = df2[cuisine_list].fillna(0)
df2['sum'] = df2[cuisine_list].sum(axis=1)
df2.loc[df2['sum'] == 0].shape

df2['tags'] = df2[cuisine_list].gt(0).apply(lambda x: x.index[x].tolist(), axis=1)
df2.loc[df2.item_type_new.str.contains('wing')][['clean_name','item_type','item_type_new','tags']]

valid_df = df2.loc[df2['sum'] >= 1]
unlabel_df = df2.loc[df2['sum'] == 0].reset_index(drop=True)
unlabel_df['tags'] = unlabel_df[list(cuisine_t.keys())].gt(0).apply(lambda x: x.index[x].tolist(), axis=1)

xgb_pipe = Pipeline([
    ('tfidf', TfidfVectorizer(strip_accents='unicode', analyzer='word', max_features=200)),
    ('clf', OneVsRestClassifier(GradientBoostingClassifier()))
])

mlb = MultiLabelBinarizer(classes=cuisine_list)
mlb.fit(cuisine_list)

X = valid_df['item_type_new']
y = mlb.transform(valid_df['tags'])

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=.4, random_state=1)

model_list = {'XGB': xgb_pipe}  # ,'Logistic': logi_pipe,'Naive Bayes': nb_pipe}

for k, v in model_list.items():
    print(f"Fitting {k} Model to data")
    v.fit(x_train, y_train)

    test_predict = v.predict(x_test)
    train_predict = v.predict(x_train)
    print(
        f'{k} F1 for train for is {classification_report(y_train, train_predict, target_names=list(cuisine_t.keys()))}')
    print(f'{k} F1 for test for is {classification_report(y_test, test_predict, target_names=list(cuisine_t.keys()))}')
    print(f'{k} Hamming Loss is {hamming_loss(y_test, test_predict)}')

ux = unlabel_df['item_type_new']
uy = mlb.transform(unlabel_df['tags'])

y_pred = xgb_pipe.predict(ux)
y_pred_tags = mlb.inverse_transform(y_pred)

y_tags = pd.Series(map(list, y_pred_tags), name='pred_tags')
pred_df = unlabel_df.merge(y_tags, left_index=True, right_index=True)

for i in cuisine_list:
    pred_df[i] = pred_df.apply(lambda x: 1 if i in x['pred_tags'] else 0, axis = 1)

pred_df['sum'] = pred_df[cuisine_list].sum(axis=1)

final_df = pd.concat([valid_df, unlabel_df])
final_pred_df = pd.concat([valid_df, pred_df])
if (final_df.shape[0] == raw_df.shape[0]) & (final_pred_df.shape[0] == raw_df.shape[0]):
    final_df.to_csv('data.csv', header=True, index=False)
    final_pred_df.to_csv('pred_data.csv', header=True, index=False)
else:
    raise ValueError('Rows didnt match')

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/justintran/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/justintran/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/justintran/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/justintran/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
  df2.loc[df2['item_type_new'].str.contains(v, regex=True), k] = 1
  df2.loc[df2['item_type_new'].str.contains(v, regex=True), k] = 1
  df2.loc[df2['item_type_new'].str.contains(v, regex=True), k] = 1
  df2.loc[df2['item_type_new'].str.contains(v, regex=True), k] = 1
  df2.loc[df2['item_type_new'].str.contains(v, regex=True), k] = 1
  df2.loc[df2['item_type_new'].str.c

Fitting XGB Model to data


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


XGB F1 for train for is                precision    recall  f1-score   support

      italian       1.00      0.93      0.96      5445
   vietnamese       1.00      0.96      0.98       663
       korean       1.00      0.10      0.18       139
       indian       0.99      0.86      0.92      2106
     southern       1.00      0.95      0.97      2684
mediterranean       1.00      0.93      0.96      2400
    breakfast       1.00      0.96      0.98     11412
     american       1.00      0.96      0.98     15640
      chinese       1.00      0.86      0.92      1922
     japanese       0.99      0.89      0.94      3111
        latin       1.00      0.90      0.95      7049
         thai       0.97      0.96      0.96      2014
   sandwiches       1.00      1.00      1.00      2618
         soup       1.00      1.00      1.00       418
       coffee       1.00      0.93      0.96       656
       drinks       1.00      1.00      1.00      3410
     hawaiian       1.00      0.56      

NameError: name 'ux' is not defined