### 1. Uvoz in obdelava podatkov

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Stolpci: id prvega izdelka, id drugega izdelka, števec ponovitev teh produktov
pairs = pd.read_csv('../data/pairs.csv')
pairs.head()

Unnamed: 0,product_one,product_two,count
0,21137,13176,57768
1,47209,13176,57706
2,24852,21137,52791
3,24852,47766,49986
4,24852,21903,48151


In [3]:
# Testna množica uporabnikov
orders_test = pd.read_csv('../data/orders_test.csv')
orders_test.head()

Unnamed: 0,order_id,user_id,eval_set,order_number
0,2625565,21,prior,1
1,2698037,21,prior,2
2,1166994,21,prior,3
3,129152,21,prior,4
4,62373,21,prior,5


In [4]:
# Vsa naročila z vsemi izdelki za referenco
all_orders = pd.read_csv('../data/order_products__prior.csv')[['order_id', 'product_id']].merge(pd.read_csv('../data/order_products__train.csv')[['order_id', 'product_id']], how='outer')
all_orders.head()

Unnamed: 0,order_id,product_id
0,2,33120
1,2,28985
2,2,9327
3,2,45918
4,2,30035


In [5]:
# Testna množica, združena s kupljenimi izdelki
orders_test_products = orders_test[orders_test['eval_set'] == 'prior'].merge(all_orders)[['order_id', 'user_id', 'product_id']]
orders_test_products.head()

Unnamed: 0,order_id,user_id,product_id
0,2625565,21,23729
1,2625565,21,43645
2,2625565,21,46842
3,2625565,21,44156
4,2625565,21,18721


In [6]:
# Frekvence napupov izdelkov za uporabnike
products_count = orders_test_products.groupby(['user_id', 'product_id']).size().to_frame('count').reset_index().sort_values(by=['user_id', 'count'], ascending=[True, False]).reset_index(drop=True)
products_count.head()

Unnamed: 0,user_id,product_id,count
0,21,23729,21
1,21,48988,18
2,21,47766,7
3,21,28204,6
4,21,44632,6


In [7]:
# Število izdelkov, ki jih je kupec kupil
num_of_user_products = products_count.groupby('user_id')['count'].sum().to_frame('num_of_products').reset_index()
num_of_user_products.head()

Unnamed: 0,user_id,num_of_products
0,21,205
1,24,38
2,52,169
3,58,95
4,65,132


In [8]:
# Število naročil, ki jih je opravil kupec
num_of_user_orders = orders_test.groupby('user_id')['order_id'].size().to_frame('num_of_orders').reset_index()
num_of_user_orders.head()

Unnamed: 0,user_id,num_of_orders
0,21,34
1,24,19
2,52,28
3,58,15
4,65,15


In [9]:
# Združitev zgornjih dveh podatkov
user_average_basket = num_of_user_orders.merge(num_of_user_products)
user_average_basket.head()

Unnamed: 0,user_id,num_of_orders,num_of_products
0,21,34,205
1,24,19,38
2,52,28,169
3,58,15,95
4,65,15,132


In [10]:
# Izračun povprečne velikosti košarice za vsakega kupca,
# saj mu bo priporočilni sistem priporočil natanko toliko izdelkov
user_average_basket['average_basket_size'] = np.ceil(user_average_basket['num_of_products'] / user_average_basket['num_of_orders']).astype(np.int64)
user_average_basket.head()

Unnamed: 0,user_id,num_of_orders,num_of_products,average_basket_size
0,21,34,205,7
1,24,19,38,2
2,52,28,169,7
3,58,15,95,7
4,65,15,132,9


<br>

### 2. Priporočilni sistem

In [13]:
# V tabeli `products_count` odstranimo izdelke, ki jih ni v parih podobnosti
products_count = products_count[products_count['product_id'].isin(pairs['product_one'].append(pairs['product_two'], ignore_index=True))]

In [18]:
# Enolični izdelki, za katere bomo napovedovali nove
product_ids = products_count[['product_id']].drop_duplicates(keep='first').reset_index(drop=True)
product_ids.head()

Unnamed: 0,product_id
0,23729
1,48988
2,47766
3,28204
4,44632


In [19]:
# Prva tabela parov in njihovih frekvenc skupne pojavitve
frequencies_one = product_ids.merge(pairs, left_on='product_id', right_on='product_one')[['product_id', 'product_two', 'count']].sort_values(by=['product_id', 'count'], ascending=[True, False]).groupby('product_id').head(1).reset_index(drop=True)
frequencies_one.columns = ['product_id', 'recommended', 'count']
frequencies_one.head()

Unnamed: 0,product_id,recommended,count
0,1,11759,222
1,2,21903,8
2,3,13176,48
3,4,15925,30
4,6,18479,2


In [20]:
# Druga tabela parov in njihovih frekvenc skupne pojavitve
frequencies_two = product_ids.merge(pairs, left_on='product_id', right_on='product_two')[['product_id', 'product_one', 'count']].sort_values(by=['product_id', 'count'], ascending=[True, False]).groupby('product_id').head(1).reset_index(drop=True)
frequencies_two.columns = ['product_id', 'recommended', 'count']
frequencies_two.head()

Unnamed: 0,product_id,recommended,count
0,1,6184,242
1,2,28204,9
2,3,24852,40
3,4,4846,81
4,6,48745,3


In [21]:
# Tabeli združimo
frequencies = frequencies_one.append(frequencies_two, ignore_index=True)
frequencies.head()

Unnamed: 0,product_id,recommended,count
0,1,11759,222
1,2,21903,8
2,3,13176,48
3,4,15925,30
4,6,18479,2


In [22]:
frequencies = frequencies.sort_values(by=['product_id', 'count'], ascending=[True, False]).groupby('product_id').head(1).reset_index(drop=True)

<br>

### 3. Preizkus sistema

In [24]:
products = pd.read_csv('../data/products.csv')
products.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id
0,1,Chocolate Sandwich Cookies,61,19
1,2,All-Seasons Salt,104,13
2,3,Robust Golden Unsweetened Oolong Tea,94,7
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1
4,5,Green Chile Anytime Sauce,5,13


In [25]:
frequencies.merge(products)[['recommended', 'product_name']].merge(products, left_on='recommended', right_on='product_id')[['product_name_x', 'product_name_y']].sample(frac=1).head()

Unnamed: 0,product_name_x,product_name_y
39048,Smoked Nova Salmon,Bagel Thins Everything
17349,Venus Embrace Women's Razor Handle and 2 Razor...,Banana
14349,Pepperidge Farm Colors Cheddar Baked Snack Cra...,Banana
30470,Bourbon Kentucky Frontier Whiskey,Vodka
38994,Tasty Treasures With Ocean Fish in Sauce,Chicken & Gibblets Dinner Pate Canned Cat Food


<br>

### 4. Evaluacija priporočilnega sistema (`precision, recall, f1-score`)

In [48]:
recommendations = products_count.merge(frequencies.drop(columns='count'), how='outer').sort_values(by=['user_id', 'count'], ascending=[True, False]).drop(columns='count').reset_index(drop=True)
recommendations.head()

Unnamed: 0,user_id,product_id,recommended
0,21,23729,24852
1,21,48988,24852
2,21,47766,24852
3,21,28204,24852
4,21,44632,24852


In [50]:
recommendations = recommendations.groupby('user_id').apply(lambda x: x.drop_duplicates(subset='recommended', keep='first')).reset_index(drop=True)
recommendations.head()

Unnamed: 0,user_id,product_id,recommended
0,21,23729,24852
1,21,18523,33754
2,21,45603,30776
3,21,24852,21137
4,21,37646,13176


In [54]:
recommendations = recommendations.drop(columns='product_id')

In [59]:
recommendations.head()

Unnamed: 0,user_id,recommended
0,21,24852
1,21,33754
2,21,30776
3,21,21137
4,21,13176


In [75]:
# Ustvarimo tabelo, kjer je za vsakega uporabnika prisotnih le `n` priporočenih izdelkov,
# kjer število `n` predstavlja povprečno število nakupov na naročilo
products_to_recommend = pd.DataFrame()
for user_id, df in recommendations.groupby('user_id'):
    user_basket_size = user_average_basket[user_average_basket['user_id'] == user_id].values[0][3]
    products_to_recommend = products_to_recommend.append(df.head(user_basket_size), ignore_index=True)

products_to_recommend.head()

Unnamed: 0,user_id,recommended
0,21,24852
1,21,33754
2,21,30776
3,21,21137
4,21,13176


In [76]:
bought_products = orders_test[orders_test['eval_set'] == 'train'].merge(all_orders)[['user_id', 'product_id']]
bought_products.head()

Unnamed: 0,user_id,product_id
0,21,25740
1,21,12683
2,21,44632
3,21,10957
4,21,32645


In [77]:
recommendations = products_to_recommend

In [78]:
bought_recommended_products = bought_products.merge(recommendations) # 847349
bought_recommended_products.head()

Unnamed: 0,user_id,product_id,recommended
0,21,25740,24852
1,21,25740,33754
2,21,25740,30776
3,21,25740,21137
4,21,25740,13176


In [84]:
def evaluate(product, product_rec):
    return {'num_of_prod': len(set(product)), 
            'num_of_rec_prod': len(set(product_rec)),
            'num_of_pred_prod': len(set(list(product)).intersection(list(set(product_rec))))}

In [85]:
test_results = bought_recommended_products.groupby('user_id').apply(lambda x: evaluate(x['product_id'], x['recommended']))

In [86]:
test_results = pd.DataFrame(test_results, columns=['dict'])
test_results.head()

Unnamed: 0_level_0,dict
user_id,Unnamed: 1_level_1
21,"{'num_of_prod': 6, 'num_of_rec_prod': 7, 'num_..."
24,"{'num_of_prod': 1, 'num_of_rec_prod': 2, 'num_..."
52,"{'num_of_prod': 11, 'num_of_rec_prod': 7, 'num..."
58,"{'num_of_prod': 2, 'num_of_rec_prod': 7, 'num_..."
65,"{'num_of_prod': 8, 'num_of_rec_prod': 9, 'num_..."


In [87]:
# Zgornji stolpec 'dict' razčlenimo na 3 stolpce
for key in ['num_of_prod', 'num_of_rec_prod', 'num_of_pred_prod']:
    test_results[key] = test_results['dict'].apply(lambda x: x[key])

In [88]:
# 1. stolpec: število izdelkov, ki jih je uporabnik dejansko kupil v svojem zadnjem naročilu
# 2. stolpec: število izdelkov, ki jih je sistem priporočil uporabniku
# 3. stolpec: število izdelkov, ki jih je sistem pravilno napovedal, torej število izdelkov, 
#             ki jih je uporabnik dejansko kupil
test_results = test_results.drop(columns=['dict'])
test_results.head()

Unnamed: 0_level_0,num_of_prod,num_of_rec_prod,num_of_pred_prod
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
21,6,7,1
24,1,2,0
52,11,7,1
58,2,7,0
65,8,9,0


In [89]:
stats = test_results.sum()
stats

num_of_prod         218642
num_of_rec_prod     160091
num_of_pred_prod     15543
dtype: int64

In [90]:
precision = stats[2] / stats[1]
recall = stats[2] / stats[0]
f1_score = 2 * (precision * recall) / (precision + recall)

In [91]:
print('precision: {},\nrecall: {},\nf1-score: {}'.format(precision, recall, f1_score))

precision: 0.09708853089805174,
recall: 0.07108881184767794,
f1-score: 0.08207893159560957
