In [1]:
import pandas as pd
import numpy as np

pd.options.display.float_format = '{:,.2f}'.format

import seaborn as sns

sns.set_palette('dark')
sns.set(font_scale = 1.5)

import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.svm import LinearSVC

from sklearn.metrics import (classification_report, accuracy_score)

In [2]:
# Each entity has an associated unique id.
aisles = pd.read_csv('aisles.csv')
departments = pd.read_csv('departments.csv')

# Which products were purchased in each order
# 'reordered' indicates that the customer has a previous order that contains the product. 
# Note that some orders will have no reordered items. 
# You may predict an explicit 'None' value for orders with no reordered items. 
order_products__prior = pd.read_csv('order_products__prior.csv') # contains previous order contents for all customers
order_products__train = pd.read_csv('order_products__train.csv')

# Which set (prior, train, test) an order belongs. 
# Predict reordered items only for the test set orders. 
orders = pd.read_csv('orders.csv')

products = pd.read_csv('products.csv')

sample_submission = pd.read_csv('sample_submission.csv')

In [3]:
prior_df = order_products__prior.copy()
prior_df = pd.merge(prior_df, orders, on='order_id', how='left')
prior_df = pd.merge(prior_df, products, on='product_id', how='left')
prior_df = pd.merge(prior_df, departments, on='department_id', how='left')
prior_df = pd.merge(prior_df, aisles, on='aisle_id', how='left')
prior_df.shape 

(32434489, 15)

In [4]:
train_df = order_products__train.copy()
train_df = pd.merge(train_df, orders, on='order_id', how='left')
train_df = pd.merge(train_df, products, on='product_id', how='left')
train_df = pd.merge(train_df, departments, on='department_id', how='left')
train_df = pd.merge(train_df, aisles, on='aisle_id', how='left')
train_df.shape

(1384617, 15)

In [5]:
product_info = pd.merge(pd.merge(products, departments, on ='department_id', how = 'left'), aisles, on='aisle_id', how = 'left')
product_info.head(3)

Unnamed: 0,product_id,product_name,aisle_id,department_id,department,aisle
0,1,Chocolate Sandwich Cookies,61,19,snacks,cookies cakes
1,2,All-Seasons Salt,104,13,pantry,spices seasonings
2,3,Robust Golden Unsweetened Oolong Tea,94,7,beverages,tea


In [6]:
def calculate_f1_score(row):
    """
    pred: list of product ids
    true: list of product ids
    """
    pred = row['products_pred'].split()
    true = row['products_true'].split()
    
    pred = set(pred)
    true = set(true)
    correct_predictions = pred.intersection(true)
    try:
        precision = len(correct_predictions)/len(pred)
        recall = len(correct_predictions)/len(true)

        f_score = 2*precision*recall/(precision + recall)
        return f_score
    except ZeroDivisionError:  
        return 0

In [7]:
# transform the training set to the output format

turn_products_to_output_format = lambda x: ' '.join([str(int(e)) for e in set(x)]) if len(x) else 'None'

train = pd.DataFrame(train_df.groupby('order_id')["product_id"].apply(turn_products_to_output_format))\
                    .reset_index().rename(columns={'product_id':'products_true'})

train.head()

Unnamed: 0,order_id,products_true
0,1,11109 10246 47209 22035 43633 49683 49302 13176
1,36,34497 46979 48679 46620 19660 43086 49235 39612
2,38,42625 39693 23622 11913 28842 4461 18159 21616...
3,96,40706 24489 25610 39275 20574 30391 27966
4,98,46720 24964 4357 43654 18441 36364 34065 19731...


In [8]:
df_train, df_validate = train_test_split(train, test_size = 0.8, random_state=42)

In [9]:
validate_orders = pd.merge(df_validate, orders, on='order_id', how = 'left')
validate_orders.head()

Unnamed: 0,order_id,products_true,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,3007878,9366 3682 18531 9092 27845 1862 17862 41004 23...,173665,train,23,5,18,4.0
1,1727889,42058 9389 32047 21137 19348 11351 12218 5450 927,62696,train,48,3,15,7.0
2,2829462,11777 23586 36835 13380 41349 21894 37959 5449...,196050,train,4,0,8,30.0
3,3261021,10017 1090 23815 8138 39180 39470 21903 49520 ...,42910,train,44,0,10,7.0
4,1711491,5451 37508 25194 30391 40749 38159 16978 31990...,137671,train,10,3,16,30.0


In [10]:
%%time
history = prior_df[(prior_df.user_id.isin(validate_orders.user_id))
                             & (prior_df.reordered == 1)]\
.groupby('user_id')['product_id'].apply(turn_products_to_output_format).reset_index()
history.columns = ['user_id', 'products_pred']

result_df = pd.merge(left=history, 
                        right=validate_orders, 
                        how='right', 
                        on='user_id').fillna('None')[['order_id', 'products_pred','products_true']]

print('F1 Score', np.average(result_df.apply(calculate_f1_score, axis=1, raw=True)))

('F1 Score', 0.0063828976449965703)
CPU times: user 22 s, sys: 9.39 s, total: 31.4 s
Wall time: 32.6 s


In [11]:
%%time
history = prior_df[(prior_df.user_id.isin(validate_orders.user_id))]
# This is assuming that order number is ordered. The max number of the order_number is the last order.
last_orders = history.groupby('user_id')['order_number'].max().reset_index()

last_ordered_reordered_only = pd.merge(
            left=pd.merge(
                    left=last_orders,
                    right=history[history.reordered == 1],
                    how='left',
                    on=['user_id', 'order_number']
                )[['user_id', 'product_id']],
            right=validate_orders[['user_id', 'order_id']],
            how='left',
            on='user_id'
        )

CPU times: user 9.11 s, sys: 3.92 s, total: 13 s
Wall time: 14 s


In [12]:
last_ordered_reordered_only.head()

Unnamed: 0,user_id,product_id,order_id
0,1,196.0,1187899
1,1,46149.0,1187899
2,1,25133.0,1187899
3,1,10258.0,1187899
4,1,13032.0,1187899


In [13]:
%%time
result_df = last_ordered_reordered_only.fillna(-1).groupby('order_id')['product_id'].apply(turn_products_to_output_format)\
                .reset_index().replace(to_replace='-1', value='None').rename(columns={'product_id':'products_pred'})
    
result_df = pd.merge(left = result_df, right = validate_orders[['order_id', 'products_true']], on='order_id').fillna('None')

CPU times: user 6.86 s, sys: 118 ms, total: 6.98 s
Wall time: 7.05 s


In [14]:
result_df.head()

Unnamed: 0,order_id,products_pred,products_true
0,1,30881 5707 14947 43633,11109 10246 47209 22035 43633 49683 49302 13176
1,36,35939 24964 26629 44359 47734 16759 581,34497 46979 48679 46620 19660 43086 49235 39612
2,96,24489 27966,40706 24489 25610 39275 20574 30391 27966
3,98,4357 43654 34065 19731 45204 33686 40986 8859 ...,46720 24964 4357 43654 18441 36364 34065 19731...
4,170,13176 25748 18394 6236 37766,49593 40354 39190 37766 43841 43772 17794 1118...


In [15]:
print('F1 Score', np.average(result_df.apply(calculate_f1_score, axis=1, raw=True)))

('F1 Score', 0.014804511851230851)


In [16]:
result_df

Unnamed: 0,order_id,products_pred,products_true
0,1,30881 5707 14947 43633,11109 10246 47209 22035 43633 49683 49302 13176
1,36,35939 24964 26629 44359 47734 16759 581,34497 46979 48679 46620 19660 43086 49235 39612
2,96,24489 27966,40706 24489 25610 39275 20574 30391 27966
3,98,4357 43654 34065 19731 45204 33686 40986 8859 ...,46720 24964 4357 43654 18441 36364 34065 19731...
4,170,13176 25748 18394 6236 37766,49593 40354 39190 37766 43841 43772 17794 1118...
5,218,1194,38557 10305 1194 5578 38159
6,226,45349,39947 28199 7754 28427 47501 39275 36291 24852...
7,349,32864 33198 21982 19862 27695,11520 11361 45633 33000 30830 27695 36968 2571...
8,393,6184 32403 19828,6184 12078 13424 32403 19828 12341 16797 30591
9,456,15130,35365 47626 4972 14992 31506 18196 34229 48118...


### LinearSVC

In [17]:
# Prepocessing

import re

def remove_number_punct(string):
    string = str(string)
    string = re.sub('[0-9\(\)\!\^\%\$\"\.;,\?\{\}\[\]\\/]', ' ', string)
    string = re.sub(' +', ' ', string)
    return string

def preprocess_product_name(row,stops=None):
    string = remove_number_punct(row['product_name'])
    return string.lower()

product_info['cleaned_product_name'] = product_info.apply(preprocess_product_name, axis=1)

In [18]:
train_df = product_info.loc[product_info['aisle']!='missing',['cleaned_product_name','aisle']]
test_df = product_info.loc[product_info['aisle']=='missing',['cleaned_product_name','product_id','product_name']]

In [19]:
vectorizer = TfidfVectorizer(min_df=2,smooth_idf=True,ngram_range=(1, 2),sublinear_tf = True)
clean_pro_tfidf = vectorizer.fit(product_info.cleaned_product_name.tolist())

X_train, X_test, y_train, y_test = train_test_split(train_df.cleaned_product_name,train_df.aisle,
                                                 test_size=0.3, random_state=42)

X_train_tfidf = clean_pro_tfidf.transform(X_train)
X_test_tfidf = clean_pro_tfidf.transform(X_test)

In [20]:
clf = LinearSVC()
clf.fit(X_train_tfidf, y_train)

y_predicted = clf.predict(X_test_tfidf)
accuracy_score(y_test, y_predicted)

0.76089200908527777