In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
import re
import warnings
from sklearn.metrics import hamming_loss
from pandarallel import pandarallel
from thefuzz import process, fuzz
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import MultiLabelBinarizer

pandarallel.initialize()
warnings.simplefilter(action='ignore', category=FutureWarning)
pd.options.mode.chained_assignment = None

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')


def find_root_item_type(word, word_list, limit, min_score=90):
    _words = process.extract(word, word_list, scorer=fuzz.token_sort_ratio, limit=limit)
    score_words = list(filter(lambda x: x[1] >= min_score, _words))
    score_words.sort(key=lambda x: len(x[0]) / (x[1]))
    filtered_words = [x[0] for x in score_words]
    if len(filtered_words) > 1:
        filtered_word = filtered_words[0]
    else:
        filtered_word = word

    return filtered_word

def clean_text(line):
    clean1 = re.sub(r"\b(oz|ml|(\d\w)|on|pc|combo|liter)\b|(\(.+\))|[^A-Za-z\s]", "", line)
    #    clean2=re.sub(r"", "", clean1)
    clean_stop = " ".join([word for word in clean1.split() if word not in stop_words])

    return clean_stop

raw_df = pd.read_csv('./raw.csv')

df = raw_df.copy()
df['total_eater_revenue'] = df[['total_eater_spend','total_eater_discount']].sum(axis=1)
df['total_orders_promo'] = df['first_time_orders_promo'] + df['returning_orders_promo']
df['completion_rate'] = df['completed_orders'] / df['accepted_orders']
df['acceptance_rate'] = df['accepted_orders'] / df['requested_orders']
df['order_issue_rate'] = df['order_issues'] / df['completed_orders']
df['first_time_order_organic'] = df['first_time_orders'] - df['first_time_orders_promo']
df['returning_order_organic'] = df['returning_orders'] - df['returning_orders_promo']
df['first_time_order_rate'] = df['first_time_orders'] / df['accepted_orders']
df['returning_order_rate'] = df['returning_orders'] / df['accepted_orders']
df['avg_prep_time_min'] = df['avg_prep_time'] / 60.0
df['spend_per_prep_min'] = df['total_eater_spend'] / (df['avg_prep_time']*df['accepted_orders']*1.0)
df['total_eater_revenue'] = df[['total_eater_spend','total_eater_discount']].sum(axis=1)
df['date_str'] = df['date'].copy()
df['date'] = pd.to_datetime(df['date'])
df['dayofweek'] = df['date'].dt.dayofweek
df['dayname'] = df['date'].dt.day_name()

# preprocess text by removing stop words
# Decided against stemming due to poor performance
stop_words = set(stopwords.words('english'))

df['name'] = df['name'].str.lower()
df['clean_name'] = df['name'].apply(clean_text)

desc_regex= r"(\b(spicy|^classic|ultimate|signature)\b)|(french|fresh cut|home)\s(?<!fries)"
df['item_type'] = df['clean_name'].apply(lambda x: " ".join(w for w in re.sub(desc_regex, "", x).split())).tolist()
df.loc[df.item_type == '', 'item_type'] = df['clean_name']
all_items = set(df['item_type'].tolist())

# create food item dictionary
item_df = df[['item_type']].drop_duplicates()
# Fuzzy match dictionary on self to consolidate item types
item_df['item_type_new'] = item_df.parallel_apply(lambda x: find_root_item_type(x['item_type'], all_items, min_score=90, limit=6), axis=1)

df2 = df.merge(item_df, on='item_type')

cuisine_t={
    'italian':r'fettucini|rigatoni|lasagna|spaghetti|penne|gnocchi|tortellini|pasta|carbonara|pizza|calzone|garlic bread|alfredo|mozzarella|caesar|cacio|fe[t]{1,2}u[c]{1,2}ine|ravioli|burrata|proscuito|chicken parm|alfredo',
    'vietnamese': r'\b(pho)\b|spring roll|vietnamese|\b(ba[nh]{2})\b mi|thit nuong|cha gio|\bcuon\b|summer roll',
    'korean': r'bibimbap|korean|kimchi',
    'indian': r'paneer|tikka|masala|indian|pakora|gobhi|samosa|naan|basmati|lassi|saag|biryani|makhni|vindaloo|tandoori|korma|butter chicken|dolma',
    'southern': r'fried chicken|gumbo|brisket|smoke|bbq|fried zucchini|bbq|coleslaw',
    'mediterranean': r'pita|tabouleh|fattoush|gyro|kebab|kabob|skewer|falafel|greek|kofta|shawarma|hummus|tzatziki',
    'breakfast': r'orange juice|egg|breakfast|bagel|toast|bacon|omelette|croissant|hash brown|lox|waffle|pancake|sausage',
    'american': r'mac.*cheese|burger|\bwing[s]?\b|bacon|reuben|cheesesteak|tater tots|fries|buffalo|ranch|onion rings|grilled cheese|melt|nashville|slider|chili cheese|garlic knots|tender|nuggets|dog',
    'chinese': r'orange chicken|tofu|chinese|mein|dumplings|mongolian|potsticker|fried rice|general tsos|wontons|chow fun|szechuan|beef broccoli|kung pao|\b(beef broccoli)\b',
    'japanese': r'ramen|sushi|sashimi|nigiri|unagi|katsu|((?<!egg)(?<!spring)(?<!lobster)(?<!lamb)(?<!curry)(?<!cinnamon)\sroll)|gyoza|tempura|miso|edamame|udon|wasabi|karaage|teriyaki|\bsoba\b',
    'latin': r'mexican|taco|burrito|guac|chorizo|al pastor|quesadilla|salsa|birria|horchata|carne asada|el verde|refried beans|tostada|nachos|churro|arepa|empanada|tortillas|jerk|caribbean',
    'thai': r'panang|pad thai|pad see ew|\bthai\b|drunken noodle|((red|yellow|green)\scurry)|tom kha|massaman|satay',
    'sandwiches': r'sandwich|blt|turkey club|roast beef',
    'soup': r'(soup)',
    'coffee': r'latte|capuccino|coffee|cappucino|cold brew',
    'drinks': r'water|coke|sprite|ginger ale|lemonade|pepsi|juice|\b(tea)\b|gatorade',
    'hawaiian': r'hawaiian|poke|musubi',
    'healthy': r'salad|juice|healthy|fruit|acai|berry|vegan|vegetables|veggies|smoothie',
    'sweets': r'waffle|ice cream|tiramisu|oreo|cinnamon roll|cheesecake|smoothie|donuts|chocolate|cookie|caramel|pudding',
    'seafood': r'fish|lobster|crab|shrimp',
    'rice': r'rice bowl|white rice',
}

cuisine_list = list(cuisine_t.keys())
for k,v in cuisine_t.items():
    df2.loc[df2['item_type_new'].str.contains(v, regex=True), k] = 1

df2[cuisine_list] = df2[cuisine_list].fillna(0)
df2['sum'] = df2[cuisine_list].sum(axis=1)
df2.loc[df2['sum'] == 0].shape

df2['tags'] = df2[cuisine_list].gt(0).apply(lambda x: x.index[x].tolist(), axis=1)
df2.loc[df2.item_type_new.str.contains('wing')][['clean_name','item_type','item_type_new','tags']]

valid_df = df2.loc[df2['sum'] >= 1]
unlabel_df = df2.loc[df2['sum'] == 0].reset_index(drop=True)
unlabel_df['tags'] = unlabel_df[list(cuisine_t.keys())].gt(0).apply(lambda x: x.index[x].tolist(), axis=1)

xgb_pipe = Pipeline([
    ('tfidf', TfidfVectorizer(strip_accents='unicode', analyzer='word', max_features=200)),
    ('clf', OneVsRestClassifier(GradientBoostingClassifier()))
])

mlb = MultiLabelBinarizer(classes=cuisine_list)
mlb.fit(cuisine_list)

X = valid_df['item_type_new']
y = mlb.transform(valid_df['tags'])

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=.4, random_state=1)

model_list = {'XGB': xgb_pipe}  # ,'Logistic': logi_pipe,'Naive Bayes': nb_pipe}

for k, v in model_list.items():
    print(f"Fitting {k} Model to data")
    v.fit(x_train, y_train)

    test_predict = v.predict(x_test)
    train_predict = v.predict(x_train)
    print(
        f'{k} F1 for train for is {classification_report(y_train, train_predict, target_names=list(cuisine_t.keys()))}')
    print(f'{k} F1 for test for is {classification_report(y_test, test_predict, target_names=list(cuisine_t.keys()))}')
    print(f'{k} Hamming Loss is {hamming_loss(y_test, test_predict)}')

ux = unlabel_df['item_type_new']
uy = mlb.transform(unlabel_df['tags'])

y_pred = xgb_pipe.predict(ux)
y_pred_tags = mlb.inverse_transform(y_pred)

y_tags = pd.Series(map(list, y_pred_tags), name='pred_tags')
pred_df = unlabel_df.merge(y_tags, left_index=True, right_index=True)

for i in cuisine_list:
    pred_df[i] = pred_df.apply(lambda x: 1 if i in x['pred_tags'] else 0, axis = 1)

pred_df['sum'] = pred_df[cuisine_list].sum(axis=1)

final_df = pd.concat([valid_df, unlabel_df])
final_pred_df = pd.concat([valid_df, pred_df])
if (final_df.shape[0] == raw_df.shape[0]) & (final_pred_df.shape[0] == raw_df.shape[0]):
    final_df.to_csv('data.csv', header=True, index=False)
    final_pred_df.to_csv('pred_data.csv', header=True, index=False)
else:
    raise ValueError('Rows didnt match')

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/justintran/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/justintran/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/justintran/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/justintran/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
  df2.loc[df2['item_type_new'].str.contains(v, regex=True), k] = 1
  df2.loc[df2['item_type_new'].str.contains(v, regex=True), k] = 1
  df2.loc[df2['item_type_new'].str.contains(v, regex=True), k] = 1
  df2.loc[df2['item_type_new'].str.contains(v, regex=True), k] = 1
  df2.loc[df2['item_type_new'].str.contains(v, regex=True), k] = 1
  df2.loc[df2['item_type_new'].str.c

Fitting XGB Model to data


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


XGB F1 for train for is                precision    recall  f1-score   support

      italian       1.00      0.93      0.96      5406
   vietnamese       0.99      0.85      0.92       658
       korean       0.33      0.52      0.41       131
       indian       0.99      0.86      0.92      2105
     southern       1.00      0.95      0.97      2658
mediterranean       1.00      0.92      0.96      2421
    breakfast       1.00      0.97      0.98     11428
     american       1.00      0.96      0.98     15663
      chinese       1.00      0.85      0.92      1893
     japanese       0.98      0.90      0.94      3105
        latin       1.00      0.90      0.95      7081
         thai       0.97      0.96      0.96      2011
   sandwiches       1.00      1.00      1.00      2567
         soup       1.00      1.00      1.00       421
       coffee       1.00      0.94      0.97       638
       drinks       1.00      1.00      1.00      3412
     hawaiian       1.00      0.57      

In [401]:
raw_df = pd.read_csv('./raw.csv')

In [402]:
df = raw_df.copy()
df['total_eater_revenue'] = df[['total_eater_spend','total_eater_discount']].sum(axis=1)
df['total_orders_promo'] = df['first_time_orders_promo'] + df['returning_orders_promo']
df['completion_rate'] = df['completed_orders'] / df['accepted_orders']
df['acceptance_rate'] = df['accepted_orders'] / df['requested_orders']
df['order_issue_rate'] = df['order_issues'] / df['completed_orders']
df['first_time_order_rate'] = df['first_time_orders'] / df['accepted_orders']
df['returning_order_rate'] = df['returning_orders'] / df['accepted_orders']
df['avg_prep_time_min'] = df['avg_prep_time'] / 60.0
df['spend_per_prep_min'] = df['total_eater_spend'] / (df['avg_prep_time']*df['accepted_orders']*1.0)
df['total_eater_revenue'] = df[['total_eater_spend','total_eater_discount']].sum(axis=1)
df['date_str'] = df['date'].copy()
df['date'] = pd.to_datetime(df['date'])
df['dayofweek'] = df['date'].dt.dayofweek
df['dayname'] = df['date'].dt.day_name()

# preprocess text by removing stop words, lemmatizing, and stemming
stop_words = set(stopwords.words('english'))
df['name'] = df['name'].str.lower()
df['clean_name'] = df['name'].apply(clean_text)
# Clean up words that do not help enrich identification of item
desc_regex=r"(\b(spicy|^classic|ultimate|signature)\b)|(french|fresh cut|home)\s(?<!fries)"
df['item_type'] = df['clean_name'].apply(lambda x: " ".join(w for w in re.sub(desc_regex, "", x).split())).tolist()
df.loc[df.item_type == '', 'item_type'] = df['clean_name']
all_items = set(df['item_type'].tolist())

# create food item dictionary
item_df = df[['item_type']].drop_duplicates()
# Fuzzy match dictionary on self to consolidate item types
item_df['item_type_new'] = item_df.parallel_apply(lambda x: find_root_item_type(x['item_type'], all_items, min_score=90, limit=6), axis=1)

df = df.merge(item_df, on='item_type')
#df.loc[df.name.str.contains('classic')].groupby(['name','clean_name','item_type','item_type_new'], as_index=False, group_keys=False)['accepted_orders'].sum().sort_values(by='accepted_orders', ascending=False).head(30)



In [403]:
# item_df.loc[item_df['item_type_new'] != item_df['item_type']].tail(50)
# item_df.loc[item_df['item_type'].str.contains('basmnati')]

In [404]:
df2 = df.copy()

cuisine_t={
    'italian':r'fettucini|rigatoni|lasagna|spaghetti|penne|gnocchi|tortellini|pasta|carbonara|pizza|calzone|garlic bread|alfredo|mozzarella|caesar|cacio|fe[t]{1,2}u[c]{1,2}ine|ravioli|burrata|proscuito|chicken parm|alfredo',
    'vietnamese': r'\b(pho)\b|spring roll|vietnamese|\b(ba[nh]{2})\b mi|thit nuong|cha gio|\bcuon\b|summer roll',
    'korean': r'bibimbap|korean|kimchi',
    'indian': r'paneer|tikka|masala|indian|pakora|gobhi|samosa|naan|basmati|lassi|saag|biryani|makhni|vindaloo|tandoori|korma|butter chicken|dolma',
    'southern': r'fried chicken|gumbo|brisket|smoke|bbq|fried zucchini|bbq|coleslaw',
    'mediterranean': r'pita|tabouleh|fattoush|gyro|kebab|kabob|skewer|falafel|greek|kofta|shawarma|hummus|tzatziki',
    'breakfast': r'egg|breakfast|bagel|toast|bacon|omelette|hash brown|croissant|lox|waffle|pancake|sausage',
    'american': r'mac.*cheese|burger|\bwing[s]?\b|reuben|cheesesteak|tater tots|fries|buffalo|ranch|onion rings|grilled cheese|melt|nashville|slider|chili cheese|garlic knots|tender|nuggets|dog',
    'chinese': r'orange chicken|tofu|chinese|mein|dumplings|mongolian|potsticker|fried rice|general tsos|wontons|chow fun|szechuan|beef broccoli|kung pao|\b(beef broccoli)\b',
    'japanese': r'ramen|sushi|sashimi|nigiri|unagi|katsu|((?<!egg)(?<!spring)(?<!lobster)(?<!lamb)(?<!curry)(?<!cinnamon)\sroll)|gyoza|tempura|miso|edamame|udon|wasabi|karaage|teriyaki|\bsoba\b',
    'mexican': r'mexican|taco|burrito|guac|chorizo|al pastor|quesadilla|salsa|birria|horchata|carne asada|el verde|refried beans|tostada|nachos|churro|tortillas',
    'latin': r'\barepa\b|empanada|jerk|caribbean',
    'thai': r'panang|pad thai|pad see ew|\bthai\b|drunken noodle|((red|yellow|green)\scurry)|tom kha|massaman|satay',
    'sandwiches': r'sandwich|blt|turkey club|roast beef',
    'soup': r'(soup)',
    'coffee': r'latte|capuccino|coffee|cappucino|cold brew',
    'drinks': r'water|coke|sprite|ginger ale|lemonade|pepsi|juice|\b(tea)\b|gatorade',
    'hawaiian': r'hawaiian|poke|musubi',
    'healthy': r'salad|juice|healthy|fruit|acai|berry|vegan|vegetables|veggies|smoothie',
    'sweets': r'waffle|ice cream|tiramisu|oreo|cinnamon roll|cheesecake|smoothie|donuts|chocolate|cookie|caramel|pudding',
    'seafood': r'fish|lobster|crab|shrimp',
    'rice': r'rice bowl|white rice',
}

cuisine_list = list(cuisine_t.keys())
for k,v in cuisine_t.items():
    df2.loc[df2['item_type_new'].str.contains(v, regex=True), k] = 1 

df2[cuisine_list] = df2[cuisine_list].fillna(0)
df2['sum'] = df2[cuisine_list].sum(axis=1)
df2.loc[df2['sum'] == 0].shape

df2['tags'] = df2[cuisine_list].gt(0).apply(lambda x: x.index[x].tolist(), axis=1)
df2.loc[df2.item_type_new.str.contains('wing')][['clean_name','item_type','item_type_new','tags']]

  df2.loc[df2['item_type_new'].str.contains(v, regex=True), k] = 1
  df2.loc[df2['item_type_new'].str.contains(v, regex=True), k] = 1
  df2.loc[df2['item_type_new'].str.contains(v, regex=True), k] = 1
  df2.loc[df2['item_type_new'].str.contains(v, regex=True), k] = 1
  df2.loc[df2['item_type_new'].str.contains(v, regex=True), k] = 1
  df2.loc[df2['item_type_new'].str.contains(v, regex=True), k] = 1


Unnamed: 0,clean_name,item_type,item_type_new,tags
14908,buffalo mild wings,buffalo mild wings,buffalo mild wings,[american]
14909,buffalo mild wings,buffalo mild wings,buffalo mild wings,[american]
14910,buffalo mild wings,buffalo mild wings,buffalo mild wings,[american]
14911,buffalo mild wings,buffalo mild wings,buffalo mild wings,[american]
14912,buffalo mild wings,buffalo mild wings,buffalo mild wings,[american]
...,...,...,...,...
99474,honey hot bone wings,honey hot bone wings,honey hot bone wings,[american]
99665,honey mustard bone wings,honey mustard bone wings,honey mustard bone wings,[american]
99713,jumbo bonein chicken wings,jumbo bonein chicken wings,jumbo bonein chicken wings,[american]
99714,jumbo bonein chicken wings,jumbo bonein chicken wings,jumbo bonein chicken wings,[american]


In [405]:
valid_df = df2.loc[df2['sum'] >= 1]
# valid_df['tags'] = valid_df[cuisine_list].gt(0).apply(lambda x: x.index[x].tolist(), axis=1)
unlabel_df = df2.loc[df2['sum'] == 0].reset_index(drop=True)
unlabel_df['tags'] = unlabel_df[list(cuisine_t.keys())].gt(0).apply(lambda x: x.index[x].tolist(), axis=1)
# valid_df.loc[valid_df.item_type_new.str.contains('red curry')][['clean_name','item_type','item_type_new','tags']]

In [406]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import MultiLabelBinarizer
'''
logi_pipe = Pipeline([
    ('tfidf', TfidfVectorizer(strip_accents='unicode', analyzer='word', max_features=200)),
    ('clf', OneVsRestClassifier(LogisticRegression(solver='sag'), n_jobs=1))
])

nb_pipe = Pipeline([
    ('tfidf', TfidfVectorizer(strip_accents='unicode', analyzer='word', max_features=200)),
    ('clf', OneVsRestClassifier(MultinomialNB(fit_prior=True, class_prior=None)))
     ])
'''
xgb_pipe = Pipeline([
    ('tfidf', TfidfVectorizer(strip_accents='unicode', analyzer='word', max_features=200)),
    ('clf', OneVsRestClassifier(GradientBoostingClassifier()))
     ])


mlb = MultiLabelBinarizer(classes=cuisine_list)
mlb.fit(cuisine_list)

X = valid_df['item_type_new']
y = mlb.transform(valid_df['tags'])

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=.4, random_state=1)

model_list = {'XGB': xgb_pipe} #,'Logistic': logi_pipe,'Naive Bayes': nb_pipe}


for k,v in model_list.items():
    print(f"Fitting {k} Model to data")
    v.fit(x_train, y_train)
    
    test_predict = v.predict(x_test)
    train_predict = v.predict(x_train)
    print(f'{k} F1 for train for is {classification_report(y_train, train_predict, target_names=list(cuisine_t.keys()))}')
    print(f'{k} F1 for test for is {classification_report(y_test, test_predict, target_names=list(cuisine_t.keys()))}')
    print(f'{k} Hamming Loss is {hamming_loss(y_test, test_predict)}')

    


Fitting XGB Model to data
XGB F1 for train for is                precision    recall  f1-score   support

      italian       1.00      0.93      0.96      5406
   vietnamese       0.99      0.85      0.92       658
       korean       0.33      0.52      0.41       131
       indian       0.99      0.86      0.92      2105
     southern       1.00      0.95      0.97      2658
mediterranean       1.00      0.92      0.96      2421
    breakfast       1.00      0.97      0.98     11231
     american       1.00      0.98      0.99     14050
      chinese       1.00      0.85      0.92      1893
     japanese       0.98      0.90      0.94      3105
      mexican       1.00      0.93      0.96      6849
        latin       0.90      0.08      0.15       232
         thai       0.97      0.96      0.96      2011
   sandwiches       1.00      1.00      1.00      2567
         soup       1.00      1.00      1.00       421
       coffee       1.00      0.94      0.97       638
       drinks 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [407]:


ux = unlabel_df['item_type_new']
uy = mlb.transform(unlabel_df['tags'])

y_pred = xgb_pipe.predict(ux)
y_pred_tags = mlb.inverse_transform(y_pred)

y_tags = pd.Series(map(list, y_pred_tags), name='pred_tags')
pred_df = unlabel_df.merge(y_tags, left_index=True, right_index=True)

for i in cuisine_list:
    pred_df[i] = pred_df.apply(lambda x: 1 if i in x['pred_tags'] else 0, axis = 1)

pred_df['sum'] = pred_df[cuisine_list].sum(axis=1)
#pred_df.loc[(pred_df['sum'] == 0)].groupby(['clean_name','item_type','item_type_new'], as_index=False, group_keys=False)['accepted_orders'].sum().sort_values(by='accepted_orders', ascending=False)
pred_df.loc[(pred_df['sum'] == 0)].shape[0] / df.shape[0]

0.11015

In [412]:
final_cols = ['date',
 'hour',
 'name',
 'requested_orders',
 'accepted_orders',
 'completed_orders',
 'first_time_orders',
 'first_time_orders_promo',
 'returning_orders',
 'returning_orders_promo',
 'order_issues',
 'avg_prep_time',
 'avg_rating',
 'total_eater_spend',
 'total_eater_discount',
 'total_eater_revenue',
 'total_orders_promo',
 'completion_rate',
 'acceptance_rate',
 'order_issue_rate',
 'first_time_order_rate',
 'returning_order_rate',
 'avg_prep_time_min',
 'spend_per_prep_min',
 'date_str',
 'dayofweek',
 'dayname',
 'clean_name',
 'item_type',
 'item_type_new',
 'tags']



final_df = pd.concat([valid_df, unlabel_df])
final_pred_df = pd.concat([valid_df, pred_df])

final_df = final_df[final_cols]
final_pred_df = final_pred_df[final_cols]
if (final_df.shape[0] == raw_df.shape[0]) & (final_pred_df.shape[0] == raw_df.shape[0]):
    final_df.to_csv('data.csv', header=True, index=False)
    final_pred_df.to_csv('pred_data.csv', header=True, index=False)
else:
    print('This shit broke')

In [335]:
unlabel_df.loc[unlabel_df.dayofweek.isnull()]

Unnamed: 0,date,hour,name,requested_orders,accepted_orders,completed_orders,first_time_orders,first_time_orders_promo,returning_orders,returning_orders_promo,...,soup,coffee,drinks,hawaiian,healthy,sweets,seafood,rice,sum,tags


In [58]:
mask = valid_df['tags'].apply(lambda x: x == ['italian','thai'])
valid_df[mask][['name','item_type_new']].drop_duplicates()

Unnamed: 0,name,item_type_new


In [59]:
def get_rolling_df(df, grp, freq, freq_str, sum_col):
    df.sort_values(by=['date', 'hour'], inplace=True)
    if freq=='D':
        df.groupby(['date', f'{grp}'], as_index=False)


    df[f'{freq}_{sum_col}'] = df.groupby(grp, as_index=False, group_keys=False).apply(get_rolling_amount, freq_str, sum_col)

    return df[['date',f'{grp}',f'{sum_col}',f'{freq}_{sum_col}']]

df.groupby(['date', 'item_type'], as_index=False).sum().sort_values(by=['item_type','date']).tail(50)

Unnamed: 0,date,item_type,hour,requested_orders,accepted_orders,completed_orders,first_time_orders,first_time_orders_promo,returning_orders,returning_orders_promo,...,vegetarian,caribbean,drinks,hawaiian,healthy,sweets,seafood,rice,sum,thai
13373,2022-02-20,zak,198,36,36,26,23,22,3,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15376,2022-02-21,zak,160,30,30,25,22,19,2,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17252,2022-02-22,zak,182,20,20,17,13,13,3,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19135,2022-02-23,zak,112,15,15,12,9,8,4,4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
21100,2022-02-24,zak,176,26,26,24,20,20,5,5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
23116,2022-02-25,zak,130,22,22,21,18,18,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25181,2022-02-26,zak,117,16,16,14,10,10,2,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
27262,2022-02-27,zak,170,22,22,19,0,0,4,4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
27537,2022-02-28,zak,0,3,3,2,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1329,2022-02-14,zak breakfast sandwich,19,1,1,1,1,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0


In [None]:
y_train

In [None]:
orders = valid_df[['clean_name','completed_orders']].groupby('clean_name').sum().reset_index()

xp.bar(orders.sort_values(by='completed_orders', ascending=False)[:50], x='clean_name', y='completed_orders')

In [None]:
set(df.loc[df.clean_name.str.contains('fries')]['clean_name'].tolist())

In [None]:
from thefuzz import fuzz, process

word = 'iced coffee  cold brew'
test = process.extract('everything but kitchen sink breakfast burrito', set(df['clean_name'].tolist()), scorer=fuzz.token_sort_ratio, limit=6)
score_words = list(filter(lambda x: x[1] >= 90, test))
# words = [x[0] for x in test if x[0] != word]
#filtered_words = [x[0] for x in score_words]
score_words.sort(key=lambda x: len(x[0])/(x[1]))
score_words

In [None]:
def find_root_item_type(word, word_list, limit, min_score=90):
    _words = process.extract(word, word_list, scorer=fuzz.token_sort_ratio, limit=limit)
    score_words = list(filter(lambda x: x[1] >= min_score, _words))
    filtered_words = [x[0] for x in score_words]
    if len(filtered_words) > 1:
        filtered_words.sort(key=len)
    elif len(filtered_words) == 0:
        filtered_words = list(word)
        
    return filtered_words[0]
 


In [None]:
%%time
from pandarallel import pandarallel

pandarallel.initialize()

item_df = pd.DataFrame(all_items,columns=['item_type'])
item_df['item_type_new'] = item_df.parallel_apply(lambda x: find_root_item_type(x['item_type'], all_items, min_score=85, limit=6), axis=1)
# test['item_type'] = test.apply(lambda x: find_root_item_type(x['clean_name'], all_items, min_score=80, limit=6), axis=1)

In [None]:
df.merge(item_df, on='item_type')[['name','clean_name','item_type','item_type_new']].tail(50)

In [None]:
process.extract('basmnati rice', all_items, scorer=fuzz.token_sort_ratio, limit=10)

In [None]:
item_df.loc[item_df.item_type_new != item_df.item_type].shape

In [None]:
df.loc[df.clean_name.str.contains('bacon egg  cheese  bagel')][['name','clean_name']].drop_duplicates()

[('bacon egg cheese   bagel', 100),
 ('bacon egg  cheese   bagel', 100),
 ('bacon egg cheese bagel', 100),
 ('turkey bacon egg cheese bagel', 86),
 ('egg  cheese   bagel', 84),
 ('bagel egg  cheese', 84),
 ('bacon egg  cheese', 84),
 ('egg cheese bagel', 84),
 ('bacon egg cheese', 84),
 ('egg cheese   bagel', 84),
 ('bangin bacon cheese fries', 77),
 ('ham egg  cheese   bagel', 76),
 ('ham egg cheese bagel', 76),
 ('ham egg cheese   bagel', 76),
 ('beaming bacon cheese fries', 75),
 ('bacon egg cheese tots', 74),
 ('bacon egg cheese avocado', 74),
 ('bacon egg cheese fries', 73),
 ('egg bacon cheeseburger', 73),
 ('booked bacon cheese fries', 72)]

In [None]:
process.extract('bacon breakfast burrito', all_items, scorer=fuzz.token_sort_ratio, limit=20)

In [None]:
item_df.loc[item_df.item_type.str.contains('basmati')]

In [None]:
len(all_items), len(set(df['clean_name'].tolist()))

In [None]:
unlabel_df.loc[unlabel_df['clean_name'].str.contains('basmnati')][['name']]

In [None]:
m = Word2Vec(unlabel_df['item_type'].apply(word_tokenize))
m.wv.key_to_index

katsu plate
[('pork katsu plate', 81), ('meat plate', 76), ('kabob plate', 73), ('chicken katsu plate', 73), ('pakora plate', 70)]
[]
[]


'k'

In [None]:
import plotly.express as xp

figdf = df[list(cuisine_t.keys())].sum().reset_index()
figdf.columns = ['cuisine', 'ct']

xp.bar(figdf.sort_values(by='ct', ascending=False), x='cuisine', y='ct', color='cuisine')

In [None]:
fig2df = df['sum'].value_counts().reset_index()
fig2df.columns = ['label_ct','ct']
xp.bar(fig2df, x='label_ct', y='ct')

In [145]:
df.loc[df.returning_orders > df.returning_orders_promo].columns

Index(['date', 'hour', 'name', 'requested_orders', 'accepted_orders',
       'completed_orders', 'first_time_orders', 'first_time_orders_promo',
       'returning_orders', 'returning_orders_promo', 'order_issues',
       'avg_prep_time', 'avg_rating', 'total_eater_spend',
       'total_eater_discount', 'clean_name', 'item_type', 'item_type_new',
       'italian', 'vietnamese', 'korean', 'indian', 'southern',
       'mediterranean', 'breakfast', 'american', 'chinese', 'japanese',
       'latin', 'thai', 'sandwiches', 'soup', 'coffee', 'vegetarian',
       'caribbean', 'drinks', 'hawaiian', 'healthy', 'sweets', 'seafood',
       'rice', 'sum'],
      dtype='object')

In [146]:
df['completed_orders'] / df['accepted_orders']

0        0.916667
1        0.958904
2        0.957143
3        0.984375
4        0.968750
           ...   
99995    0.000000
99996    0.000000
99997    0.000000
99998    0.000000
99999    0.000000
Length: 100000, dtype: float64

In [339]:
final_df.iloc[:,:20].corr()

Unnamed: 0,hour,requested_orders,accepted_orders,completed_orders,first_time_orders,first_time_orders_promo,returning_orders,returning_orders_promo,order_issues,avg_prep_time,avg_rating,total_eater_spend,total_eater_discount,total_eater_revenue,total_orders_promo,completion_rate,acceptance_rate,avg_prep_time_min
hour,1.0,-0.025221,-0.024909,-0.025301,0.001293,0.023889,-0.056397,-0.032275,-0.033658,-0.055747,-0.03178,-0.011018,0.006451,-0.01195,0.005244,0.012932,-0.000259,-0.055747
requested_orders,-0.025221,1.0,0.999041,0.990927,0.867886,0.748973,0.790888,0.687487,0.464363,-0.009722,0.023685,0.067171,-0.008361,0.119304,0.847814,0.034765,0.009267,-0.009722
accepted_orders,-0.024909,0.999041,1.0,0.990256,0.866827,0.749298,0.790862,0.687814,0.46476,-0.00973,0.023832,0.067194,-0.008427,0.119293,0.848193,0.029892,0.044854,-0.00973
completed_orders,-0.025301,0.990927,0.990256,1.0,0.877803,0.756327,0.795794,0.692181,0.464412,-0.009088,0.025242,0.067999,-0.008611,0.121309,0.855311,0.138427,0.015216,-0.009088
first_time_orders,0.001293,0.867886,0.866827,0.877803,1.0,0.877363,0.561455,0.497322,0.468045,-0.004682,0.005061,0.056878,-0.006067,0.105288,0.869281,0.133298,0.009626,-0.004682
first_time_orders_promo,0.023889,0.748973,0.749298,0.756327,0.877363,1.0,0.449997,0.417835,0.439587,-0.005947,-0.003104,0.050272,-0.009861,0.089975,0.930868,0.113462,0.02268,-0.005947
returning_orders,-0.056397,0.790888,0.790862,0.795794,0.561455,0.449997,1.0,0.868649,0.381081,-0.004293,0.074986,0.075303,-0.012296,0.132347,0.692596,0.068771,0.019383,-0.004293
returning_orders_promo,-0.032275,0.687487,0.687814,0.692181,0.497322,0.417835,0.868649,1.0,0.355163,-0.007441,0.074425,0.074108,-0.018324,0.124976,0.720883,0.067222,0.021093,-0.007441
order_issues,-0.033658,0.464363,0.46476,0.464412,0.468045,0.439587,0.381081,0.355163,1.0,0.00259,0.058897,0.433743,-0.412347,0.423899,0.448516,0.007127,0.029579,0.00259
avg_prep_time,-0.055747,-0.009722,-0.00973,-0.009088,-0.004682,-0.005947,-0.004293,-0.007441,0.00259,1.0,0.006482,0.005918,0.00309,0.010989,-0.007298,0.006317,-0.000204,1.0


In [362]:
df.columns

Index(['date', 'hour', 'name', 'requested_orders', 'accepted_orders',
       'completed_orders', 'first_time_orders', 'first_time_orders_promo',
       'returning_orders', 'returning_orders_promo', 'order_issues',
       'avg_prep_time', 'avg_rating', 'total_eater_spend',
       'total_eater_discount', 'total_eater_revenue', 'total_orders_promo',
       'completion_rate', 'acceptance_rate', 'order_issue_rate',
       'avg_prep_time_min', 'spend_per_prep_min', 'date_str', 'dayofweek',
       'dayname', 'clean_name', 'item_type', 'item_type_new'],
      dtype='object')

In [409]:
df[['dayofweek','dayname']].drop_duplicates()

Unnamed: 0,dayofweek,dayname
0,6,Sunday
2,5,Saturday
7,4,Friday
11,0,Monday
19,1,Tuesday
22,3,Thursday
23,2,Wednesday


In [411]:
final_df.columns.tolist()

['date',
 'hour',
 'name',
 'requested_orders',
 'accepted_orders',
 'completed_orders',
 'first_time_orders',
 'first_time_orders_promo',
 'returning_orders',
 'returning_orders_promo',
 'order_issues',
 'avg_prep_time',
 'avg_rating',
 'total_eater_spend',
 'total_eater_discount',
 'total_eater_revenue',
 'total_orders_promo',
 'completion_rate',
 'acceptance_rate',
 'order_issue_rate',
 'first_time_order_rate',
 'returning_order_rate',
 'avg_prep_time_min',
 'spend_per_prep_min',
 'date_str',
 'dayofweek',
 'dayname',
 'clean_name',
 'item_type',
 'item_type_new',
 'italian',
 'vietnamese',
 'korean',
 'indian',
 'southern',
 'mediterranean',
 'breakfast',
 'american',
 'chinese',
 'japanese',
 'mexican',
 'latin',
 'thai',
 'sandwiches',
 'soup',
 'coffee',
 'drinks',
 'hawaiian',
 'healthy',
 'sweets',
 'seafood',
 'rice',
 'sum',
 'tags']