In [2]:
import numpy as np
import pandas as pd

from pyspark.ml.fpm import FPGrowth
from pyspark.sql import SparkSession
from pyspark import SparkConf, SparkContext

from sklearn.decomposition import NMF
from sklearn.metrics import mean_absolute_error

In [20]:
# orders = pd.read_csv("../../instacart_2017_05_01/orders.csv")


# order_ids_test = np.loadtxt("../../instacart_2017_05_01/order_ids_test.txt", dtype=np.int)
# order_ids_train = np.loadtxt("../../instacart_2017_05_01/order_ids_train.txt", dtype=np.int)

# orders_train = orders.loc[order_ids_train, :]
# orders_test = orders.loc[order_ids_test, :]

In [24]:
# filter user with more than 10 orders
orders = pd.read_csv("../../instacart_2017_05_01/orders.csv")
print(orders.shape)
orders = orders[orders.eval_set=="prior"]
user_freq = orders.user_id.value_counts()
user_freq = user_freq[user_freq>50]
orders = orders[orders.user_id.isin(user_freq.index)]
print(orders.shape)

(3421083, 7)
(722360, 7)


In [25]:
# user profile
user_dows = orders[["order_dow", "user_id"]].groupby("user_id").agg(lambda x:x.value_counts().index[0])
user_profiles = orders[["order_hour_of_day", "days_since_prior_order", "user_id"]].groupby("user_id").mean()
user_profiles = user_profiles.merge(user_dows, left_index=True, right_index=True)
print(user_profiles.shape)
user_profiles.head()

(10328, 3)


Unnamed: 0_level_0,order_hour_of_day,days_since_prior_order,order_dow
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
27,12.469136,4.4375,2
50,11.253731,5.409091,1
54,12.844156,4.776316,4
90,12.126761,3.471429,5
140,13.986842,4.84,6


In [26]:
# filter product with more than 10 purchases
order_products = pd.read_csv("../../instacart_2017_05_01/order_products__prior.csv")
order_products = order_products[["order_id", "product_id"]]
print(order_products.shape)
product_freq = order_products.product_id.value_counts()
product_freq = product_freq[product_freq>10]
order_products = order_products[order_products.product_id.isin(product_freq.index)]
order_products = order_products[order_products.order_id.isin(orders.order_id.unique())]
order_products = order_products.merge(orders[["order_id", "user_id"]])
print(order_products.shape)
order_products.head()

(32434489, 2)
(7042185, 3)


Unnamed: 0,order_id,product_id,user_id
0,4,46842,178520
1,4,26434,178520
2,4,39758,178520
3,4,27761,178520
4,4,10054,178520


In [27]:
# split train and test
order_ids = order_products.order_id.unique()
np.random.shuffle(order_ids)
order_ids_train = order_ids[0:int(order_ids.shape[0]*4/5)]
order_ids_test = order_ids[int(order_ids.shape[0]*4/5):]
np.savetxt("../../instacart_2017_05_01/order_ids_test.txt", order_ids_test, fmt="%d")
np.savetxt("../../instacart_2017_05_01/order_ids_train.txt", order_ids_train, fmt="%d")

In [28]:
# product profile
product_profiles = pd.read_csv("../../instacart_2017_05_01/products.csv")
print(product_profiles.shape)
product_profiles = product_profiles[product_profiles.product_id.isin(order_products.product_id.unique())]
print(product_profiles.shape)
product_profiles.head()

(49688, 4)
(38061, 4)


Unnamed: 0,product_id,product_name,aisle_id,department_id
0,1,Chocolate Sandwich Cookies,61,19
1,2,All-Seasons Salt,104,13
2,3,Robust Golden Unsweetened Oolong Tea,94,7
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1
6,7,Pure Coconut Water With Orange,98,7


# FP Growth

In [57]:
spark = SparkSession.builder.master("local").appName("Search").config(conf=SparkConf()).getOrCreate()

In [59]:
order_product = order_products[["order_id", "product_id"]].groupby("order_id").agg(lambda x:x.tolist())

In [88]:
order_product_test = order_product[order_product.index.isin(order_ids_test)]
order_product_test["input"] = order_product_test.apply(lambda x:x[0][:int(len(x[0]) *2/3)], axis=1)
order_product_test["output"] = order_product_test.apply(lambda x:x[0][int(len(x[0]) *2/3):], axis=1)
order_product_test = order_product_test[["input", "output"]]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [89]:
order_product_train = order_product[order_product.index.isin(order_ids_train)]
order_product_train.rename({"product_id":"input"}, axis=1, inplace=True)
order_product_test["id"] = order_product_test.index
order_product_train["id"] = order_product_train.index

In [104]:
train_sdf = spark.createDataFrame(order_product_train)
test_sdf = spark.createDataFrame(order_product_test)

In [114]:
fpGrowth = FPGrowth(itemsCol="input", minSupport=0.01, minConfidence=0.005)
model = fpGrowth.fit(train_sdf)

In [115]:
# Display frequent itemsets.
model.freqItemsets.show()

# Display generated association rules.
model.associationRules.show()

+-------+-----+
|  items| freq|
+-------+-----+
|[41220]| 6453|
|[38159]| 6148|
|[27156]| 6868|
|[29447]| 5783|
|[27086]|14954|
|[37687]| 6012|
|[24489]| 6306|
|[46906]| 7130|
| [9387]| 6671|
|[28985]|10837|
|[43352]| 8952|
|[34969]| 8266|
|[39877]|13395|
|[10749]|11751|
|[38689]| 8823|
|[43122]|10374|
|[41950]|12690|
|[26209]|23990|
| [5876]|17890|
|[22825]|11565|
+-------+-----+
only showing top 20 rows

+----------+----------+-------------------+------------------+
|antecedent|consequent|         confidence|              lift|
+----------+----------+-------------------+------------------+
|   [28204]|   [24852]| 0.4098731315161412| 2.823901695861819|
|   [16797]|   [24852]|0.28518255937610776|1.9648214316304606|
|   [24852]|   [16797]|0.07673872348543562|1.9648214316304606|
|   [24852]|   [21137]|0.14309220331707784|1.3925366145768279|
|   [24852]|   [21903]|0.10844292885333079|1.4486266521448545|
|   [24852]|   [47766]|0.10388820660792426|2.2378352358124918|
|   [24852]|   [47626]|

In [116]:
# transform examines the input items against all the association rules and summarize the
# consequents as prediction
test_pred = model.transform(test_sdf).select("*").toPandas()

In [118]:
test_pred

Unnamed: 0,input,output,id,prediction
0,"[46842, 26434, 39758, 27761, 10054, 21351, 225...","[40285, 17616, 25146, 32645, 41276]",4,[]
1,"[43875, 27845, 47049, 8186, 34134, 42345, 28682]","[36342, 12341, 11140, 23060]",69,"[21137, 13176, 24852]"
2,"[28745, 22124]","[26604, 5212]",104,[]
3,[17794],[18644],122,[]
4,"[4421, 18770, 24184, 13176, 42265, 16249]","[28278, 7010, 41290, 39984]",128,"[21137, 27966, 8277, 21903, 19057, 47209, 27845]"
...,...,...,...,...
144454,"[35842, 35951, 27966, 11143, 26209, 42265, 126...","[15689, 9070, 27057, 47209, 43122]",3420991,"[21137, 13176, 47209]"
144455,"[22372, 7625]","[16131, 32295]",3420993,[]
144456,"[10603, 43858, 35958, 35425, 2732, 3793, 21137...","[35547, 40063, 45081, 5322, 38730]",3421004,"[13176, 24852, 27966, 21903, 47209, 27845]"
144457,"[24561, 13113, 46906, 4799, 30949, 26790]","[40706, 8518, 35108, 300]",3421006,[]


In [121]:
# order_product_test = order_product_test.merge(test_pred[["id", "prediction"]], left_on='id', right_on="id") 
test_pred["hit"] = test_pred.apply(lambda x:len(set(x["prediction"]).intersection(x["output"])), axis=1)
test_pred["total"] = test_pred.apply(lambda x:len(x["output"]), axis=1)

In [126]:
sum(test_pred["hit"])/sum(test_pred["total"])

0.08469992234304334

# Negative Matrix Factorization

In [29]:
order_products_train = order_products[order_products.order_id.isin(order_ids_train)]
train = order_products_train.pivot_table(index="user_id", columns="product_id", aggfunc=len, fill_value=0)

In [None]:
test = order_products.pivot_table(index="user_id", columns="product_id", aggfunc=len, fill_value=0)

In [31]:
test = test[test.index.isin(train.index)]
test = test[train.columns]

In [34]:
train.shape
test.shape

(10328, 37398)

In [40]:
# Converting the ratings into binary ratings 1 (Liked) or 0 (Not Liked)
train[train == 0] = -1
train[train >= 1] = 1
test[test == 0] = -1
test[test >= 1] = 1

In [41]:
np.save("trian.npy", np.array(train))
np.save("test.npy", np.array(test))

In [44]:
train[train == -1] = 0
train[train >= 1] = 1
test[test == -1] = 0
test[test >= 1] = 1

In [47]:
nmf = NMF(n_components=3, 
         random_state=0,
         verbose=True)

user_distribution = nmf.fit_transform(train)
item_distribution = nmf.components_

reconstruct_matrix = np.dot(user_distribution, item_distribution)

violation: 1.0
violation: 0.38463191074795183
violation: 0.24976476138246662
violation: 0.18218455988271298
violation: 0.14249327619899194
violation: 0.11601171934354614
violation: 0.0972826386929419
violation: 0.08309583886010563
violation: 0.07207778605495882
violation: 0.06319974937171627
violation: 0.055912324148760746
violation: 0.0498760287152241
violation: 0.04494611935902424
violation: 0.04072943572197626
violation: 0.03713854794819671
violation: 0.03414759867578907
violation: 0.031612379346637964
violation: 0.029568665026005882
violation: 0.027765642513670023
violation: 0.02612471289992962
violation: 0.024607413533461787
violation: 0.02318988998680163
violation: 0.021867146406265407
violation: 0.020607215233743755
violation: 0.019411751053213154
violation: 0.01828650827159878
violation: 0.01722026580979524
violation: 0.016216727219307205
violation: 0.015269201197694243
violation: 0.014395263169070747
violation: 0.013575084233973005
violation: 0.012793117045118733
violation: 0.

In [52]:
print(mean_absolute_error(test, reconstruct_matrix))
print(mean_absolute_error(train, reconstruct_matrix))
print(mean_absolute_error(test, train))

0.007851114130976096
0.007449875120477275
0.00046836665029163343


In [53]:
nmf = NMF(n_components=5, 
         random_state=0,
         verbose=True)

user_distribution = nmf.fit_transform(train)
item_distribution = nmf.components_

reconstruct_matrix = np.dot(user_distribution, item_distribution)

violation: 1.0
violation: 0.4205794797462163
violation: 0.309877636861129
violation: 0.24607846527096727
violation: 0.21267385783331663
violation: 0.1882665843349099
violation: 0.16859568331483146
violation: 0.1514766174128897
violation: 0.13628267426102528
violation: 0.12240522756921024
violation: 0.11025952639457028
violation: 0.09918439753670596
violation: 0.08928878487878253
violation: 0.08033736426392275
violation: 0.07195453196224152
violation: 0.06435007530203904
violation: 0.057311978019191114
violation: 0.05092181523877545
violation: 0.045158147333612265
violation: 0.03990548138178395
violation: 0.03506235219060173
violation: 0.030724707102265394
violation: 0.026883425474123132
violation: 0.023446724513407433
violation: 0.020428550116165647
violation: 0.017754442015485476
violation: 0.015415319108411407
violation: 0.01336837905191067
violation: 0.011590615476934385
violation: 0.010061437956173548
violation: 0.00873704440440301
violation: 0.007610808920189908
violation: 0.00663

In [128]:
print(mean_absolute_error(test, reconstruct_matrix))
print(mean_absolute_error(train, reconstruct_matrix))
print(mean_absolute_error(test, train))

0.005785982087505418
0.007389462159671093
0.00046836665029163343


In [55]:
nmf = NMF(n_components=10, 
         random_state=0,
         verbose=True)

user_distribution = nmf.fit_transform(train)
item_distribution = nmf.components_

reconstruct_matrix = np.dot(user_distribution, item_distribution)

violation: 1.0
violation: 0.37149155997787797
violation: 0.2992321929579902
violation: 0.250459435270304
violation: 0.20672044575889714
violation: 0.17081398018282706
violation: 0.14198657391929978
violation: 0.11892296771310745
violation: 0.10021997607701753
violation: 0.08544693567513147
violation: 0.07367533710524803
violation: 0.06450311409877486
violation: 0.05725313213699997
violation: 0.05155161140722181
violation: 0.04687706011124386
violation: 0.0431560464983216
violation: 0.04017688853131575
violation: 0.037978460590965074
violation: 0.0363837812343241
violation: 0.03512564989107793
violation: 0.03422192174044633
violation: 0.0337451510763066
violation: 0.03362971323612447
violation: 0.033680186687531435
violation: 0.03376806526612612
violation: 0.03387602661314685
violation: 0.03400225184164946
violation: 0.03410632337340607
violation: 0.03416965211408508
violation: 0.03421070041292088
violation: 0.03416335597322255
violation: 0.033874198574798765
violation: 0.03324201440351

In [56]:
print(mean_absolute_error(test, reconstruct_matrix))
print(mean_absolute_error(train, reconstruct_matrix))
print(mean_absolute_error(test, train))

0.007813905387676595
0.007422535762452923
0.00046836665029163343
