# KNN Model

### Import Needed Packages

In [18]:
# General
import random
import os
import numpy as np
import pandas as pd
# import pandas_profiling

# warning
import warnings
warnings.filterwarnings("ignore")

# display, plots
from IPython.display import display_html
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
import seaborn as sns
import plotly.express as px


#knn systems 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score,roc_auc_score

### Load Data info Dataframes

In [19]:
articles = pd.read_csv('articles.csv')
customers = pd.read_csv('customers.csv')
transactions = pd.read_csv('transactions_2019_interaction.csv')

In [20]:
print('Size of articles metadata', articles.shape)
# print('Size of customers metadata data', customers.shape)
print('Size of user-articles transactions training data', transactions.shape)

Size of articles metadata (105542, 25)
Size of user-articles transactions training data (14089417, 3)


### Randomly Select Customers for Training

In [21]:
# randomly select x customers for training data
rand_cust = random.sample(list(transactions['customer_id'].unique()), k=100000)
transactions = transactions[transactions['customer_id'].isin(
               rand_cust)]

### Merge Transaction and Article Data

In [22]:
#Merged transaction and article data
transactions_w_articles = pd.merge(transactions, articles, on=['article_id'], how='left')

### Model 1 - Cust ID, Article ID and Cust Art Int

In [23]:
#narrow down 
transactions_w_articles_a = transactions_w_articles[['customer_id','article_id','cust_art_int']]

#model fails using customer id that is alphanumeric.. so converting customer ids to UUID for unique ids
import uuid
original_ids = transactions_w_articles_a['customer_id'].unique()
new_ids = {cid: uuid.uuid4().int for cid in original_ids}
transactions_w_articles_a['customer_id'] = transactions_w_articles_a['customer_id'].map(new_ids)

x = transactions_w_articles_a[['customer_id','cust_art_int']]

y = transactions_w_articles_a['article_id']

In [25]:
knn = KNeighborsClassifier(n_neighbors=12,metric='minkowski')

In [26]:
knn.fit(x[:1000000], y[:1000000])

KNeighborsClassifier(n_neighbors=12)

In [27]:
#Compute accuracy on the training set
train_accuracy = knn.score(x[:1000000], y[:1000000])
    
#Compute accuracy on the test set
test_accuracy = knn.score(x[1000000:], y[1000000:]) 

In [28]:
print(train_accuracy)

0.0472


In [29]:
print(test_accuracy)

0.0001604461759574795


In [30]:
y_pred = knn.predict(x[1000000:])

In [32]:
precision_score(y[1000000:],y_pred, average= 'macro')

1.4248251570878094e-05

In [33]:
recall_score(y[1000000:],y_pred, average= 'macro')

2.1148547330737304e-05

### Model 2 - Cust ID, Article ID, Cust Art Int and Product Type

In [37]:
transactions_w_articles_b = transactions_w_articles[['customer_id','article_id','cust_art_int','product_type_no']]

#model fails using customer id that is alphanumeric.. so converting customer ids to UUID for unique ids
import uuid
original_ids = transactions_w_articles_b['customer_id'].unique()
new_ids = {cid: uuid.uuid4().int for cid in original_ids}
transactions_w_articles_b['customer_id'] = transactions_w_articles_b['customer_id'].map(new_ids)

x = transactions_w_articles_b[['customer_id','cust_art_int','product_type_no']]

y = transactions_w_articles_b['article_id']

In [38]:
knn.fit(x[:1000000], y[:1000000])

KNeighborsClassifier(n_neighbors=12)

In [39]:
    #Compute accuracy on the training set
train_accuracy = knn.score(x[:1000000], y[:1000000])
    
    #Compute accuracy on the test set
test_accuracy = knn.score(x[1000000:], y[1000000:]) 

In [40]:
print(train_accuracy)

0.083503


In [41]:
print(test_accuracy)

0.00018304422890923718


In [42]:
y_pred = knn.predict(x[1000000:])

In [43]:
precision_score(y[1000000:],y_pred, average= 'macro')

5.809471472582997e-06

In [44]:
recall_score(y[1000000:],y_pred, average= 'macro')

2.4563481952458243e-05