### 1. Uvoz in filtriranje podatkov

In [97]:
import pandas as pd
import numpy as np
from collections import Counter

from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix

In [2]:
orders = pd.read_csv('../data/orders.csv')[['order_id', 'user_id', 'eval_set', 'order_number']]
orders.head()

Unnamed: 0,order_id,user_id,eval_set,order_number
0,2539329,1,prior,1
1,2398795,1,prior,2
2,473747,1,prior,3
3,2254736,1,prior,4
4,431534,1,prior,5


In [3]:
order_products_prior = pd.read_csv('../data/order_products__prior.csv')[['order_id', 'product_id']]
order_products_prior.head()

Unnamed: 0,order_id,product_id
0,2,33120
1,2,28985
2,2,9327
3,2,45918
4,2,30035


In [4]:
order_products_train = pd.read_csv('../data/order_products__train.csv')[['order_id', 'product_id']]
order_products_train.head()

Unnamed: 0,order_id,product_id
0,1,49302
1,1,11109
2,1,10246
3,1,49683
4,1,43633


In [5]:
len(order_products_prior) + len(order_products_train)

33819106

In [6]:
# Zadnje naročilo pripada množici `test`, ki za našo implementacijo ni relavantna,
# saj je ta del tekmovanja Kaggla
orders[orders['user_id'] == 3]

Unnamed: 0,order_id,user_id,eval_set,order_number
26,1374495,3,prior,1
27,444309,3,prior,2
28,3002854,3,prior,3
29,2037211,3,prior,4
30,2710558,3,prior,5
31,1972919,3,prior,6
32,1839752,3,prior,7
33,3225766,3,prior,8
34,3160850,3,prior,9
35,676467,3,prior,10


In [7]:
# Vidimo, da naročilo `test` ni prisotno v nobeni tabeli...
order_products_prior[order_products_prior['order_id'] == 2774568]

Unnamed: 0,order_id,product_id


In [8]:
order_products_train[order_products_train['order_id'] == 2774568]

Unnamed: 0,order_id,product_id


In [9]:
# ... zato ta naročila izločimo
orders = orders[orders['eval_set'] != 'test']

In [10]:
orders[orders['user_id'] == 3]

Unnamed: 0,order_id,user_id,eval_set,order_number
26,1374495,3,prior,1
27,444309,3,prior,2
28,3002854,3,prior,3
29,2037211,3,prior,4
30,2710558,3,prior,5
31,1972919,3,prior,6
32,1839752,3,prior,7
33,3225766,3,prior,8
34,3160850,3,prior,9
35,676467,3,prior,10


In [11]:
# Vsa zadnja naročila, kjer je ponekod potrebno oznako `prior` spremeniti na `train`
# Istočasno je potrebno ta naročila prestaviti iz tabele `order_products_prior` v tabelo `order_products_train`
last_orders = orders.groupby('user_id').tail(1)

In [12]:
orders['eval_set'] = orders['order_id'].isin(last_orders['order_id']).replace({True: 'train', False: 'prior'})

In [13]:
orders[orders['user_id'] == 3]

Unnamed: 0,order_id,user_id,eval_set,order_number
26,1374495,3,prior,1
27,444309,3,prior,2
28,3002854,3,prior,3
29,2037211,3,prior,4
30,2710558,3,prior,5
31,1972919,3,prior,6
32,1839752,3,prior,7
33,3225766,3,prior,8
34,3160850,3,prior,9
35,676467,3,prior,10


In [14]:
# Omenjena naročila (spodaj) je potrebno prenesti
orders_to_move = last_orders['order_id']
orders_to_move.head()

10    1187899
25    1492625
37    1402502
43    2557754
49    2196797
Name: order_id, dtype: int64

In [15]:
# Premakniti je potrebno 206 209 naročil
len(orders_to_move)

206209

In [16]:
df_to_move = order_products_prior[order_products_prior['order_id'].isin(orders_to_move)]
df_to_move.head()

Unnamed: 0,order_id,product_id
404,51,30274
405,51,3594
406,51,14994
407,51,45433
408,51,44514


In [17]:
# Zgornjo tabelo dodamo tabeli `order_products_train`...
order_products_train = order_products_train.append(df_to_move, ignore_index=True)

In [18]:
# ... in jo odstranimo iz tabele `order_products_prior`
order_products_prior = order_products_prior[~order_products_prior['order_id'].isin(orders_to_move)]

In [19]:
del orders_to_move, df_to_move

In [20]:
# Vsota velikosti obeh tabel je enaka začetni
len(order_products_prior) + len(order_products_train)

33819106

<br>

### 2. Delitev podatkov na učno in testno množico

In [21]:
# 90 % uporabnikov predstavlja učna množica
# 10 % uporabnikov predstavlja testna množica
users = orders['user_id'].unique()
users_mask = np.random.rand(len(users)) > 0.1
orders_train = orders[orders['user_id'].isin(users[users_mask])]
orders_test = orders[orders['user_id'].isin(users[~users_mask])]

In [22]:
del orders, users, users_mask

In [23]:
orders_train[orders_train['user_id'] == 1]

Unnamed: 0,order_id,user_id,eval_set,order_number
0,2539329,1,prior,1
1,2398795,1,prior,2
2,473747,1,prior,3
3,2254736,1,prior,4
4,431534,1,prior,5
5,3367565,1,prior,6
6,550135,1,prior,7
7,3108588,1,prior,8
8,2295261,1,prior,9
9,2550362,1,prior,10


In [24]:
len(orders_train) / len(orders_test)

9.09215117944944

<br>

### 3. Frekvenca nakupa izdelkov za vse uproabnike učne množice

In [25]:
products_count = orders_train.merge(order_products_prior.merge(order_products_train, how='outer')).groupby(['user_id', 'product_id']).size()

In [26]:
products_count = products_count.to_frame('count').reset_index().sort_values(by=['user_id', 'count'], ascending=[True, False]).reset_index(drop=True)

In [27]:
products_count.head()

Unnamed: 0,user_id,product_id,count
0,1,196,11
1,1,10258,10
2,1,12427,10
3,1,25133,9
4,1,13032,4


<br>

### 4. Matrika izdelek-uporabnik

Tabelo `products_count` je potrebno najprej filtrirati (zmanjšati), saj je ta prevelika in bi pri kreiranju tabele `pivot`, ki je potrebna za kreiranje matrike podobnosti izdelkov, povzročila napako `MemoryError`. Iz omenjene tabele bomo odstranili uporabnike, ki so skupno kupili manj kot 100 izdelkov, obenem pa izločili izdelke, ki so bili s strani kupcev kupljeni manj kot 100-krat.

In [29]:
# Filtriranje (izbor) uporabnikov, ki so kupili 100 ali več izdelkov
users_filter = products_count.groupby('user_id')['count'].sum() >= 100
users_filter = users_filter[users_filter]
products_count = products_count[products_count['user_id'].isin(users_filter.index)]

In [30]:
# Filtriranje (izbor) izdelkov, ki so bili kupljeni vsaj 100-krat
products_filter = products_count.groupby('product_id')['product_id'].size() >= 100
products_filter = products_filter[products_filter]
products_count = products_count[products_count['product_id'].isin(products_filter.index)].reset_index(drop=True)

In [31]:
del users_filter, products_filter

In [32]:
# Matrika izdelek-uporabnik
item_user_matrix = products_count.pivot(index='product_id', columns='user_id', values='count').fillna(0)

In [33]:
item_user_matrix

user_id,2,7,10,14,19,21,27,29,31,35,...,206197,206199,206200,206201,206202,206203,206206,206207,206208,206209
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
23,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49668,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49677,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49678,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49680,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [34]:
# Ustvarjanje matrike za bolj učinkovito izvajanje operacij
sparse_matrix = csr_matrix(item_user_matrix)

In [35]:
# Normalizacija matrike
sparse_matrix_norm = (sparse_matrix - sparse_matrix.min()) / (sparse_matrix.max() - sparse_matrix.min())

In [36]:
# Kosinusna podobnost izdelkov
product_similarity = pd.DataFrame(cosine_similarity(sparse_matrix_norm), index=item_user_matrix.index, columns=item_user_matrix.index)

In [37]:
del item_user_matrix, sparse_matrix, sparse_matrix_norm

In [38]:
product_similarity

product_id,1,4,10,23,25,28,32,34,35,37,...,49647,49652,49655,49659,49667,49668,49677,49678,49680,49683
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000139,0.000000,0.000398,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000576,0.002870
4,0.000000,1.000000,0.000000,0.004822,0.000000,0.001102,0.000000,0.000896,0.000000,0.000641,...,0.000000,0.000000,0.000000,0.000000,0.000644,0.000000,0.000000,0.000000,0.002471,0.010244
10,0.000000,0.000000,1.000000,0.003249,0.016246,0.000167,0.000137,0.026180,0.001071,0.002719,...,0.000000,0.006003,0.002983,0.001224,0.001171,0.000000,0.000856,0.000000,0.006368,0.020809
23,0.000000,0.004822,0.003249,1.000000,0.002703,0.000328,0.000268,0.007914,0.002103,0.000000,...,0.001233,0.008511,0.003390,0.006006,0.002682,0.000000,0.000000,0.000000,0.001471,0.031112
25,0.000000,0.000000,0.016246,0.002703,1.000000,0.003751,0.001706,0.014773,0.020498,0.000000,...,0.000000,0.001110,0.002351,0.005091,0.000244,0.000770,0.000000,0.000000,0.001402,0.018019
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49668,0.000000,0.000000,0.000000,0.000000,0.000770,0.000000,0.000000,0.005777,0.000000,0.000869,...,0.001405,0.000000,0.000351,0.000000,0.000000,1.000000,0.000000,0.001504,0.006705,0.002158
49677,0.000000,0.000000,0.000856,0.000000,0.000000,0.000000,0.000000,0.003513,0.000000,0.007031,...,0.003248,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.013899,0.002905,0.010799
49678,0.000000,0.000000,0.000000,0.000000,0.000000,0.016274,0.001110,0.000000,0.000000,0.007099,...,0.000000,0.000000,0.000637,0.000000,0.000000,0.001504,0.013899,1.000000,0.000760,0.012268
49680,0.000576,0.002471,0.006368,0.001471,0.001402,0.000000,0.000000,0.009535,0.000000,0.001759,...,0.000711,0.005284,0.000355,0.000000,0.003534,0.006705,0.002905,0.000760,1.000000,0.034072


<br>

### 5. Kreiranje tabele produktov

In [39]:
products = pd.read_csv('../data/products.csv')[['product_id', 'product_name']]
products.head()

Unnamed: 0,product_id,product_name
0,1,Chocolate Sandwich Cookies
1,2,All-Seasons Salt
2,3,Robust Golden Unsweetened Oolong Tea
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...
4,5,Green Chile Anytime Sauce


In [40]:
# Izberemo le izdelke, ki so prisotni v matriki `product_similarity`, izdelki pa so postavljeni istoležno z indeksi
products = products.set_index('product_id').loc[product_similarity.index].reset_index()
products.head()

Unnamed: 0,product_id,product_name
0,1,Chocolate Sandwich Cookies
1,4,Smart Ones Classic Favorites Mini Rigatoni Wit...
2,10,Sparkling Orange Juice & Prickly Pear Beverage
3,23,Organic Turkey Burgers
4,25,Salted Caramel Lean Protein & Fiber Bar


<br>

#### 5.1 Preizkus tabele podobnosti izdelkov `product_similarity`

In [41]:
products[products['product_id'] == 1]

Unnamed: 0,product_id,product_name
0,1,Chocolate Sandwich Cookies


In [42]:
def recommend_similar_products(product_id, n):
    '''
    Opis
    ----
    Funkcija za podan argument `product_id` in število
    priporočenih izdelkov `n` vrne tabelo podobnih izdelkov `similar_products`
    
    Parametra
    ---------
    `product_id` : int
        enolična številka izdelka, az katerega želimo generirati podobne izdelke
    `n` : int
        število podobnih izdelkov, ki jih želimo generirati
        
    Vrne
    ----
    'similar_products' : DataFrame
        tabela podobnih izdelkov izdelku z enolično številko `product_id`
    '''
    
    try:
        similar_products = pd.DataFrame(product_similarity[product_id].drop(product_id).sort_values(ascending=False)[:n+1]).merge(products, left_index=True, right_on='product_id', how='inner').reset_index(drop=True)
        return similar_products.rename(columns={product_id: 'similarity'})[['product_id', 'product_name', 'similarity']]
    except KeyError:
        print('Tega produkta žal ni v matriki podobnosti.')

In [43]:
recommend_similar_products(1, 10)

Unnamed: 0,product_id,product_name,similarity
0,5971,Salt & Pepper Pistachios,0.225867
1,19972,Semi-Sweet Chocolate Morsels,0.217441
2,7987,Hazelnut Spread with Cocoa,0.189342
3,11759,Organic Simply Naked Pita Chips,0.181792
4,21572,Cheez-It Cheddar Cracker,0.179387
5,39657,Milk Chocolate Almonds,0.175095
6,10441,Dry Roasted Almonds,0.157294
7,32455,Whole Grain Cheddar Baked Snack Crackers,0.144142
8,41803,Uncrustables Peanut Butter & Strawberry Jam Sa...,0.142619
9,16974,Sea Salt Brown Rice Crackers,0.135169


<br>

### 6. Frekvenca nakupa izdelkov za vse uproabnike testne množice

In [44]:
products_count = orders_test[orders_test['eval_set'] == 'prior'].merge(order_products_prior).groupby(['user_id', 'product_id']).size()

In [45]:
products_count = products_count.to_frame('count').reset_index().sort_values(by=['user_id', 'count'], ascending=[True, False]).reset_index(drop=True)

In [47]:
# Število naročil, ki jih je opravil kupec
num_of_user_orders = orders_test[orders_test['eval_set'] == 'prior'].groupby('user_id')['order_id'].size().to_frame('num_of_orders').reset_index()

In [48]:
# Število izdelkov, ki jih je kupec kupil
num_of_user_products = products_count.groupby('user_id')['count'].sum().to_frame('num_of_products').reset_index()

In [49]:
# Združitev zgornjih dveh podatkov
user_average_basket = num_of_user_orders.merge(num_of_user_products)

In [50]:
user_average_basket.head()

Unnamed: 0,user_id,num_of_orders,num_of_products
0,13,12,81
1,17,40,294
2,24,18,38
3,28,23,168
4,32,4,92


In [51]:
# Izračun povprečne velikosti košarice za vsakega kupca,
# saj mu bo priporočilni sistem priporočil natanko toliko izdelkov
user_average_basket['average_basket_size'] = np.ceil(user_average_basket['num_of_products'] / user_average_basket['num_of_orders']).astype(np.int64)

In [52]:
user_average_basket

Unnamed: 0,user_id,num_of_orders,num_of_products,average_basket_size
0,13,12,81,7
1,17,40,294,8
2,24,18,38,3
3,28,23,168,8
4,32,4,92,23
...,...,...,...,...
20505,206161,14,131,10
20506,206164,5,5,1
20507,206169,8,39,5
20508,206181,14,127,10


In [53]:
# Na koncu v tabeli `products_count` odstranimo izdelke, ki jih ni v matriki podobnosti
products_count = products_count[products_count['product_id'].isin(product_similarity.index)]

In [54]:
products_count.head()

Unnamed: 0,user_id,product_id,count
0,13,27086,12
1,13,4210,11
2,13,27435,9
3,13,1689,5
4,13,33735,5


<br>

### 7. Priporočanje novih izdelkov

In [80]:
def recommend_similar_product(product_id, n):
    '''
    Opis
    ----
    Funkcija za podan argument izdelka z enolično številko `product_id` 
    vrne `n` dolg seznam enoličnih številk izdelkov, ki so temu najbolj podobni
    
    Parameter
    ---------
    `product_id` : int
        enolična številka izdelka, za katerega želimo generirati najbolj podobne izdelke
    `n` : int
        število podobnih izdelkov, ki jih želimo generirati
        
    Vrne
    ----
    seznam noličnih številk najbolj podobnih izdekov izdelku z enolično številko `product_id`
    '''
    
    return list(product_similarity[product_id].sort_values(ascending=False).drop(product_id)[:n].index)

In [58]:
# Ustvarimo tabelo, kjer je za vsakega uporabnika prisotnih le `n` njegovih najbolj kupljenih izdelkov,
# kjer število `n` predstavlja povprečno število nakupov na naročilo
products_to_process = pd.DataFrame()
for user_id, df in products_count.groupby('user_id'):
    user_basket_size = user_average_basket[user_average_basket['user_id'] == user_id].values[0][3]
    products_to_process = products_to_process.append(df.head(user_basket_size), ignore_index=True)
products_to_process.head()

Unnamed: 0,user_id,product_id,count
0,13,27086,12
1,13,4210,11
2,13,27435,9
3,13,1689,5
4,13,33735,5


In [59]:
# Izdelki, za katere bomo generirali priporočila
test_products = products_to_process['product_id'].unique()
len(test_products)

11039

In [201]:
# `product_id` iz testne množice (test_products): `product_id` (priporočeni izdelek)
product_recommendations = {}
for product_id in test_products:                                       
    product_recommendations[product_id] = recommend_similar_product(product_id, 10)

In [202]:
recommendations = pd.DataFrame(list(product_recommendations.items()), columns=['product_id', 'product_rec_id_list'])

In [203]:
recommendations.head()

Unnamed: 0,product_id,product_rec_id_list
0,27086,"[24852, 5785, 44632, 21137, 26209, 13176, 4720..."
1,4210,"[24852, 44799, 39407, 19511, 11256, 23909, 193..."
2,27435,"[31802, 5314, 41557, 7533, 2713, 45116, 21351,..."
3,1689,"[43721, 21573, 15680, 21385, 11759, 38928, 268..."
4,33735,"[17634, 43205, 45427, 21169, 36180, 17931, 591..."


In [204]:
recommendations = recommendations.merge(products_to_process)

In [205]:
recommendations.head()

Unnamed: 0,product_id,product_rec_id_list,user_id,count
0,27086,"[24852, 5785, 44632, 21137, 26209, 13176, 4720...",13,12
1,27086,"[24852, 5785, 44632, 21137, 26209, 13176, 4720...",523,2
2,27086,"[24852, 5785, 44632, 21137, 26209, 13176, 4720...",729,49
3,27086,"[24852, 5785, 44632, 21137, 26209, 13176, 4720...",1449,3
4,27086,"[24852, 5785, 44632, 21137, 26209, 13176, 4720...",1782,2


In [206]:
recommendations = recommendations.drop(columns='product_id')

In [207]:
recommendations.head()

Unnamed: 0,product_rec_id_list,user_id,count
0,"[24852, 5785, 44632, 21137, 26209, 13176, 4720...",13,12
1,"[24852, 5785, 44632, 21137, 26209, 13176, 4720...",523,2
2,"[24852, 5785, 44632, 21137, 26209, 13176, 4720...",729,49
3,"[24852, 5785, 44632, 21137, 26209, 13176, 4720...",1449,3
4,"[24852, 5785, 44632, 21137, 26209, 13176, 4720...",1782,2


In [208]:
def recommend_products(user_id, product_rec_id_list):
    user_bought = Counter(product_rec_id_list.sum()).most_common(user_average_basket[user_average_basket['user_id'] == user_id[0]].values[0][3])
    return pd.DataFrame({'product_rec_id': [x[0] for x in user_bought]})

In [209]:
recommendations = recommendations.groupby('user_id').apply(lambda x: recommend_products(x['user_id'], x['product_rec_id_list'])).reset_index()[['user_id', 'product_rec_id']]

In [210]:
recommendations.head()

Unnamed: 0,user_id,product_rec_id
0,13,45116
1,13,24852
2,13,21137
3,13,13176
4,13,19638


In [211]:
# Enako strorimo še z dejansko kupljenimi izdelki uporabnikov v njihovem zadnjem naročilu
# 20876
bought_products = orders_test[orders_test['eval_set'] == 'train'][['user_id', 'order_id']].reset_index(drop=True).merge(order_products_train)[['user_id', 'product_id']]

In [212]:
bought_products.head()

Unnamed: 0,user_id,product_id
0,13,27435
1,13,27086
2,13,4210
3,13,47078
4,13,19934


In [213]:
bought_recommended_products = bought_products.merge(recommendations)

In [214]:
bought_recommended_products.head()

Unnamed: 0,user_id,product_id,product_rec_id
0,13,27435,45116
1,13,27435,24852
2,13,27435,21137
3,13,27435,13176
4,13,27435,19638


<br>

### 8. Evaluacija priporočilnega sistema

In [215]:
def evaluate(product, product_rec):
    return {'num_of_prod': len(set(product)), 
            'num_of_rec_prod': len(set(product_rec)),
            'num_of_pred_prod': len(set(list(product)).intersection(list(set(product_rec))))}

In [216]:
test_results = bought_recommended_products.groupby('user_id').apply(lambda x: evaluate(x['product_id'], x['product_rec_id']))

In [217]:
test_results = pd.DataFrame(test_results, columns=['dict'])
test_results.head()

Unnamed: 0_level_0,dict
user_id,Unnamed: 1_level_1
13,"{'num_of_prod': 5, 'num_of_rec_prod': 7, 'num_..."
17,"{'num_of_prod': 6, 'num_of_rec_prod': 8, 'num_..."
24,"{'num_of_prod': 1, 'num_of_rec_prod': 3, 'num_..."
28,"{'num_of_prod': 15, 'num_of_rec_prod': 8, 'num..."
32,"{'num_of_prod': 22, 'num_of_rec_prod': 23, 'nu..."


In [218]:
# Zgornji stolpec 'dict' razčlenimo na 3 stolpce
for key in ['num_of_prod', 'num_of_rec_prod', 'num_of_pred_prod']:
    test_results[key] = test_results['dict'].apply(lambda x: x[key])

In [219]:
# 1. stolpec: število izdelkov, ki jih je uporabnik dejansko kupil v svojem zadnjem naročilu
# 2. stolpec: število izdelkov, ki jih je sistem priporočil uporabniku
# 3. stolpec: število izdelkov, ki jih je sistem pravilno napovedal, torej število izdelkov, 
#             ki jih je uporabnik dejansko kupil
test_results = test_results.drop(columns=['dict'])
test_results.head()

Unnamed: 0_level_0,num_of_prod,num_of_rec_prod,num_of_pred_prod
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
13,5,7,0
17,6,8,0
24,1,3,0
28,15,8,1
32,22,23,3


In [220]:
stats = test_results.sum()
stats

num_of_prod         213405
num_of_rec_prod     212485
num_of_pred_prod     17320
dtype: int64

In [221]:
precision = stats[2] / stats[1]
recall = stats[2] / stats[0]
f1_score = 2 * (precision * recall) / (precision + recall)

In [222]:
print('precision: {},\nrecall: {},\nf1-score: {}'.format(precision, recall, f1_score))

precision: 0.0815116361154905,
recall: 0.08116023523347625,
f1-score: 0.08133555612951701
