### 1. Uvoz in filtriranje podatkov

In [1]:
import pandas as pd
import numpy as np

from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix

In [2]:
orders = pd.read_csv('../data/orders.csv')[['order_id', 'user_id', 'eval_set', 'order_number']]
orders.head()

Unnamed: 0,order_id,user_id,eval_set,order_number
0,2539329,1,prior,1
1,2398795,1,prior,2
2,473747,1,prior,3
3,2254736,1,prior,4
4,431534,1,prior,5


In [3]:
order_products_prior = pd.read_csv('../data/order_products__prior.csv')[['order_id', 'product_id']]
order_products_prior.head()

Unnamed: 0,order_id,product_id
0,2,33120
1,2,28985
2,2,9327
3,2,45918
4,2,30035


In [4]:
order_products_train = pd.read_csv('../data/order_products__train.csv')[['order_id', 'product_id']]
order_products_train.head()

Unnamed: 0,order_id,product_id
0,1,49302
1,1,11109
2,1,10246
3,1,49683
4,1,43633


In [5]:
len(order_products_prior) + len(order_products_train)

33819106

In [6]:
# Zadnje naročilo pripada množici `test`, ki za našo implementacijo ni relavantna,
# saj je ta del tekmovanja Kaggla
orders[orders['user_id'] == 3]

Unnamed: 0,order_id,user_id,eval_set,order_number
26,1374495,3,prior,1
27,444309,3,prior,2
28,3002854,3,prior,3
29,2037211,3,prior,4
30,2710558,3,prior,5
31,1972919,3,prior,6
32,1839752,3,prior,7
33,3225766,3,prior,8
34,3160850,3,prior,9
35,676467,3,prior,10


In [7]:
# Vidimo, da naročilo `test` ni prisotno v nobeni tabeli...
order_products_prior[order_products_prior['order_id'] == 2774568]

Unnamed: 0,order_id,product_id


In [8]:
order_products_train[order_products_train['order_id'] == 2774568]

Unnamed: 0,order_id,product_id


In [9]:
# ... zato ta naročila izločimo
orders = orders[orders['eval_set'] != 'test']

In [11]:
# Vsa zadnja naročila, kjer je ponekod potrebno oznako `prior` spremeniti na `train`
# Istočasno je potrebno ta naročila prestaviti iz tabele `order_products_prior` v tabelo `order_products_train`
last_orders = orders.groupby('user_id').tail(1)

In [12]:
orders['eval_set'] = orders['order_id'].isin(last_orders['order_id']).replace({True: 'train', False: 'prior'})

In [14]:
# Omenjena naročila (spodaj) je potrebno prenesti
orders_to_move = last_orders['order_id']
orders_to_move.head()

10    1187899
25    1492625
37    1402502
43    2557754
49    2196797
Name: order_id, dtype: int64

In [15]:
# Premakniti je potrebno 206 209 naročil
len(orders_to_move)

206209

In [16]:
df_to_move = order_products_prior[order_products_prior['order_id'].isin(orders_to_move)]
df_to_move.head()

Unnamed: 0,order_id,product_id
404,51,30274
405,51,3594
406,51,14994
407,51,45433
408,51,44514


In [17]:
# Zgornjo tabelo dodamo tabeli `order_products_train`...
order_products_train = order_products_train.append(df_to_move, ignore_index=True)

In [18]:
# ... in jo odstranimo iz tabele `order_products_prior`
order_products_prior = order_products_prior[~order_products_prior['order_id'].isin(orders_to_move)]

In [19]:
del orders_to_move, df_to_move

In [20]:
# Vsota velikosti obeh tabel je enaka začetni
len(order_products_prior) + len(order_products_train)

33819106

<br>

### 2. Delitev podatkov na učno in testno množico

In [22]:
# 90 % naročil predstavlja učna množica
# 10 % naročil predstavlja testna množica
orders_prior = orders[orders['eval_set'] == 'prior'].sample(frac=1).reset_index(drop=True)
test_orders = orders[orders['eval_set'] == 'train']
orders_mask = np.random.rand(len(orders_prior)) > 0.1
orders_train = orders_prior[orders_mask]
orders_test = orders_prior[~orders_mask]

In [23]:
len(orders_train) / len(orders_test)

8.990340162460905

In [24]:
del orders_mask, orders

<br>

### 3. Frekvenca nakupa izdelkov za vse uporabnike učne množice

In [27]:
products_count = orders_train.merge(order_products_prior).groupby(['user_id', 'product_id']).size()

In [28]:
products_count = products_count.to_frame('count').reset_index().sort_values(by=['user_id', 'count'], ascending=[True, False]).reset_index(drop=True)

In [29]:
products_count.head()

Unnamed: 0,user_id,product_id,count
0,1,196,10
1,1,12427,10
2,1,10258,9
3,1,25133,8
4,1,13032,3


<br>

### 4. Matrika uporabnik-izdelek

Tabelo `products_count` je potrebno najprej filtrirati (zmanjšati), saj je ta prevelika in bi pri kreiranju tabele `pivot`, ki je potrebna za kreiranje matrike podobnosti izdelkov, povzročila napako `MemoryError`. Iz omenjene tabele bomo odstranili uporabnike, ki so skupno kupili manj kot 300 izdelkov, obenem pa izločili izdelke, ki so bili s strani kupcev kupljeni manj kot 300-krat.

In [31]:
# Filtriranje (izbor) uporabnikov, ki so kupili 300 ali več izdelkov
users_filter = products_count.groupby('user_id')['count'].sum() >= 300
users_filter = users_filter[users_filter]
products_count = products_count[products_count['user_id'].isin(users_filter.index)]

In [32]:
# Filtriranje (izbor) izdelkov, ki so bili kupljeni vsaj 300-krat
products_filter = products_count.groupby('product_id')['product_id'].size() >= 300
products_filter = products_filter[products_filter]
products_count = products_count[products_count['product_id'].isin(products_filter.index)].reset_index(drop=True)

In [33]:
del users_filter, products_filter

In [34]:
# Matrika izdelek-uporabnik
item_user_matrix = products_count.pivot(index='user_id', columns='product_id', values='count').fillna(0)

In [35]:
item_user_matrix

product_id,10,34,45,49,79,95,123,141,148,160,...,49481,49520,49533,49583,49585,49605,49610,49621,49628,49683
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
27,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
54,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,1.0
63,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0
71,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
206174,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
206193,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0
206199,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
206201,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0


In [36]:
# Ustvarjanje matrike za bolj učinkovito izvajanje operacij
sparse_matrix = csr_matrix(item_user_matrix)

In [37]:
# Kosinusna podobnost izdelkov
user_similarity = pd.DataFrame(cosine_similarity(sparse_matrix), index=item_user_matrix.index, columns=item_user_matrix.index)

In [38]:
del item_user_matrix, sparse_matrix

In [39]:
user_similarity

user_id,27,50,54,63,71,90,110,133,140,142,...,206108,206117,206124,206126,206165,206174,206193,206199,206201,206208
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
27,1.000000,0.045312,0.039330,0.058848,0.061498,0.009428,0.128069,0.007847,0.098690,0.093007,...,0.087310,0.086678,0.078253,0.099437,0.104543,0.056378,0.094092,0.199226,0.015158,0.057192
50,0.045312,1.000000,0.062626,0.215927,0.126449,0.000000,0.041706,0.000000,0.085550,0.194092,...,0.111432,0.236901,0.017933,0.156666,0.035683,0.128179,0.298534,0.000000,0.004707,0.159175
54,0.039330,0.062626,1.000000,0.061553,0.054811,0.000000,0.196894,0.038176,0.101480,0.032172,...,0.166297,0.142453,0.058087,0.081983,0.026299,0.275266,0.066917,0.022861,0.014606,0.085192
63,0.058848,0.215927,0.061553,1.000000,0.100276,0.006070,0.064000,0.002862,0.137047,0.256941,...,0.150942,0.375962,0.109874,0.213927,0.110054,0.164488,0.249250,0.114312,0.012424,0.289960
71,0.061498,0.126449,0.054811,0.100276,1.000000,0.005918,0.072107,0.006046,0.040270,0.236240,...,0.046604,0.101084,0.014211,0.058484,0.004713,0.040094,0.113157,0.099389,0.015747,0.103045
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
206174,0.056378,0.128179,0.275266,0.164488,0.040094,0.002940,0.188083,0.025231,0.136383,0.136448,...,0.168986,0.226467,0.012238,0.132606,0.112388,1.000000,0.150927,0.017848,0.012353,0.166484
206193,0.094092,0.298534,0.066917,0.249250,0.113157,0.008020,0.117904,0.008193,0.138852,0.218324,...,0.141268,0.349549,0.040656,0.211947,0.053220,0.150927,1.000000,0.010818,0.037439,0.232737
206199,0.199226,0.000000,0.022861,0.114312,0.099389,0.000000,0.108333,0.099036,0.025882,0.053084,...,0.039308,0.015651,0.076756,0.017041,0.031887,0.017848,0.010818,1.000000,0.014758,0.001359
206201,0.015158,0.004707,0.014606,0.012424,0.015747,0.055433,0.024068,0.013411,0.019676,0.005010,...,0.021161,0.020118,0.012260,0.016381,0.039495,0.012353,0.037439,0.014758,1.000000,0.021636


<br>

### 5. Iskanje podobnih uporabnikov

In [40]:
# Funkcija, zasnovana za matriko user_similarity, ki za podanega uporabnika vrne najbolj podobnega uporabnika
def get_similar_user(user_id, n):
    try:
        return list(user_similarity[user_id].drop(labels=[user_id]).sort_values(ascending=False)[:n].index)
    except KeyError:
        # Tega kupca ni v matriki podobnosti
        pass

In [42]:
orders_test.head()

Unnamed: 0,order_id,user_id,eval_set,order_number
3,1559185,134606,prior,2
23,492276,34306,prior,61
28,2853222,78775,prior,32
44,1238941,50525,prior,5
53,3145824,132976,prior,36


In [43]:
# Kupci testne množice
test_users = orders_test['user_id'].unique()
test_users

array([134606,  34306,  78775, ..., 169954, 164753, 167592], dtype=int64)

In [44]:
# Število kupcev v testni množici
len(test_users)

128056

In [45]:
#### Slovar - user_id iz testne množice (test_users): user_id (najbolj podoben uporabnik, pridobljen
# s pomočjo učne množice)
similar_users = {}
for user_id in test_users:
    try:                                   
        similar_user = get_similar_user(user_id, 5)
        if similar_user:
            similar_users[user_id] = similar_user
    except:
        # Ni možno predlagati uporabnika, ker se ta ne nahaja v učni množici
        pass

In [47]:
len(similar_users)

24058

<br>

### 6. Frekvenca pogostosti nakupa izdelkov

In [48]:
products_frequency = products_count.groupby('product_id')['count'].sum().to_frame().sort_values(by='count', ascending=False)
products_frequency.head()

Unnamed: 0_level_0,count
product_id,Unnamed: 1_level_1
24852,198256
13176,167542
21137,130557
47209,107488
21903,102847


<br>

In [49]:
products_count = orders_prior.merge(order_products_prior).groupby(['user_id', 'product_id']).size()
products_count = products_count.to_frame('count').reset_index().sort_values(by=['user_id', 'count'], ascending=[True, False]).reset_index(drop=True)

In [50]:
products_count.head()

Unnamed: 0,user_id,product_id,count
0,1,196,10
1,1,12427,10
2,1,10258,9
3,1,25133,8
4,1,13032,3


In [51]:
# Število izdelkov, ki jih je kupec kupil
num_of_user_products = products_count.groupby('user_id')['count'].sum().to_frame('num_of_products').reset_index()
num_of_user_products.head()

Unnamed: 0,user_id,num_of_products
0,1,59
1,2,195
2,3,82
3,4,15
4,5,37


In [52]:
# Število naročil, ki jih je opravil kupec
num_of_user_orders = orders_prior.groupby('user_id')['order_id'].size().to_frame('num_of_orders').reset_index()
num_of_user_orders.head()

Unnamed: 0,user_id,num_of_orders
0,1,10
1,2,14
2,3,11
3,4,4
4,5,4


In [53]:
# Združitev zgornjih dveh podatkov
user_average_basket = num_of_user_orders.merge(num_of_user_products)
user_average_basket.head()

Unnamed: 0,user_id,num_of_orders,num_of_products
0,1,10,59
1,2,14,195
2,3,11,82
3,4,4,15
4,5,4,37


In [54]:
# Izračun povprečne velikosti košarice za vsakega kupca,
# saj mu bo priporočilni sistem priporočil natanko toliko izdelkov
user_average_basket['average_basket_size'] = np.ceil(user_average_basket['num_of_products'] / user_average_basket['num_of_orders']).astype(np.int64)
user_average_basket.head()

Unnamed: 0,user_id,num_of_orders,num_of_products,average_basket_size
0,1,10,59,6
1,2,14,195,14
2,3,11,82,8
3,4,4,15,4
4,5,4,37,10


### 7. Priporočanje izdelkov

In [58]:
train_orders = orders_prior.merge(order_products_prior)[['user_id', 'order_id', 'product_id']]
train_orders.head()

Unnamed: 0,user_id,order_id,product_id
0,23153,3014821,27845
1,23153,3014821,18362
2,23153,3014821,25659
3,23153,3014821,17794
4,23153,3014821,45007


In [60]:
def get_user_products(user_test_id):
    
    '''
    Parametri
    ---------
    `user_test_id` : int
        id uporabnika testne množice, za katerega napovedujemo nove izdelke
        
    Opis
    ----
    Funkcija, ki za podana parametra, ki soupadata z zgornjim slovarjem,
    poišče vse relavantne izdelke, ki bi jih morda želel kupiti
    kupec `user_test_id`, nato pa uporabi frekvenco pogostosti
    nakupa vseh artiklov in izbere n artiklov, ki so 
    nasplošno najbolj prodajani, med tistimi, ki jih kupec
    še ni kupil
    '''
    
    # Izdelki podobnih uporabnikov
    rec_products = train_orders[train_orders['user_id'] == similar_users[user_test_id][0]]['product_id']
    for user_id in similar_users[user_test_id]:
        rec_products = rec_products.append(train_orders[train_orders['user_id'] == user_id]['product_id'])

    # Izločitev že kupljenih izdelkov
    rec_products = rec_products[~rec_products.isin(train_orders[train_orders['user_id'] == user_test_id]['product_id'])]
    
    # Pridobitev najbolj prodajanih izdelkov
    rec_products = rec_products.to_frame()
    most_freq_bought_products = rec_products.merge(products_frequency, left_on='product_id', right_index=True).sort_values(by='count', ascending=False)
    most_freq_bought_products = most_freq_bought_products.drop_duplicates(subset=['product_id', 'count'], keep='first')
    
    # Preveri povprečno število izdelkov, ki jih kupec kupi na naročilo
    user_test_bought_len = user_average_basket[user_average_basket['user_id'] == user_test_id].values[0][3]
        
    return most_freq_bought_products.head(user_test_bought_len)[['product_id']].reset_index(drop=True)

In [62]:
from IPython.display import clear_output

test_orders = test_orders.merge(order_products_train)

total = len(similar_users)
current = 0
num_of_bought_products = num_of_recommended_products = num_of_predicted_products = 0
for user_id in similar_users:
    bought = test_orders[test_orders['user_id'] == user_id][['product_id']]
    recommended = get_user_products(user_id)
    predicted = bought.merge(recommended)
    
    num_of_bought_products += len(bought)
    num_of_recommended_products += len(recommended)
    num_of_predicted_products += len(predicted)
    
    clear_output()
    current += 1
    print('{} / {}'.format(current, total))
    
print('bought: {},\nrecommended: {},\npredicted: {}'.format(num_of_bought_products, num_of_recommended_products, num_of_predicted_products))

24058 / 24058
bought: 365249,
recommended: 353316,
predicted: 3222


In [63]:
precision = num_of_predicted_products / num_of_recommended_products
recall = num_of_predicted_products / num_of_bought_products
f1_score = 2 * (precision * recall) / (precision + recall)

In [64]:
print('precision: {},\nrecall: {},\nf1-score: {}'.format(precision, recall, f1_score))

precision: 0.00911931528716503,
recall: 0.00882137938776013,
f1-score: 0.008967873470040983
