In [31]:
!jupyter nbconvert --to script model.ipynb


[NbConvertApp] Converting notebook model.ipynb to script
[NbConvertApp] Writing 1967 bytes to model.py


In [2]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
articles = pd.read_csv('shared_articles.csv')
users = pd.read_csv('users_interactions.csv')

In [3]:
users['eventType'].unique()
rating_map = {"VIEW": 1, "LIKE": 2, "FOLLOW": 3, "BOOKMARK": 4, "COMMENT CREATED": 5}
users["rating"] = users["eventType"].map(rating_map)

In [4]:
merged = pd.merge(right=articles, left=users, right_on='contentId', left_on='contentId', how='left')

In [5]:
triple = merged[['personId', 'contentId', 'rating']]

In [6]:
from scipy.sparse import coo_matrix
from sklearn.model_selection import train_test_split


triple['mappingPerson'] = triple['personId'].astype('category').cat.codes
triple['mappingcontent'] = triple['contentId'].astype('category').cat.codes


train, test = train_test_split(triple, test_size=0.2, random_state=42)

sparse_matrix = coo_matrix((train['rating'], (train['mappingPerson'], train['mappingcontent'])))
sparse_test = coo_matrix((test['rating'], (test['mappingPerson'], test['mappingcontent'])))


In [7]:
personMapping = {row.mappingPerson:row.personId for row in triple.itertuples()}
contentMapping = {row.mappingcontent:row.contentId for row in triple.itertuples()}
test.drop(columns=['personId', 'contentId'], inplace=True)

In [32]:
personMapping

{33: -8845298781299428018,
 831: -1032019229384696495,
 826: -1130272294246983140,
 974: 344280948527967603,
 887: -445337111692715325,
 42: -8763398617720485024,
 1319: 3609194402293569455,
 1390: 4254153380739593270,
 1147: 1908339160857512799,
 1739: 7781822014935525018,
 1780: 8239286975497580612,
 1225: 2766187446275090740,
 796: -1479311724257856983,
 926: -108842214936804958,
 1658: 7022645187549453002,
 1643: 6879394870211872116,
 804: -1387464358334758758,
 81: -8424644554119645763,
 782: -1602833675167376798,
 1511: 5621833459783231486,
 682: -2626634673110551643,
 306: -6153009241569363021,
 478: -4585796377251906117,
 715: -2339455719722814827,
 1398: 4340306774493623681,
 1522: 5683029675627635125,
 798: -1443636648652872475,
 1353: 3891637997717104548,
 737: -2050699458865052139,
 1301: 3429602690322213789,
 671: -2772844562500836582,
 940: 22763587941636338,
 127: -7990997793599977496,
 1265: 3094513233385472738,
 223: -6998647087289883231,
 1840: 8766802480854827422,
 1

In [33]:
{row.personId:row.mappingPerson for row in triple.itertuples()}

{-8845298781299428018: 33,
 -1032019229384696495: 831,
 -1130272294246983140: 826,
 344280948527967603: 974,
 -445337111692715325: 887,
 -8763398617720485024: 42,
 3609194402293569455: 1319,
 4254153380739593270: 1390,
 1908339160857512799: 1147,
 7781822014935525018: 1739,
 8239286975497580612: 1780,
 2766187446275090740: 1225,
 -1479311724257856983: 796,
 -108842214936804958: 926,
 7022645187549453002: 1658,
 6879394870211872116: 1643,
 -1387464358334758758: 804,
 -8424644554119645763: 81,
 -1602833675167376798: 782,
 5621833459783231486: 1511,
 -2626634673110551643: 682,
 -6153009241569363021: 306,
 -4585796377251906117: 478,
 -2339455719722814827: 715,
 4340306774493623681: 1398,
 5683029675627635125: 1522,
 -1443636648652872475: 798,
 3891637997717104548: 1353,
 -2050699458865052139: 737,
 3429602690322213789: 1301,
 -2772844562500836582: 671,
 22763587941636338: 940,
 -7990997793599977496: 127,
 3094513233385472738: 1265,
 -6998647087289883231: 223,
 8766802480854827422: 1840,
 5

In [8]:
from sklearn.neighbors import NearestNeighbors

knn_model = NearestNeighbors(n_neighbors=5, metric='cosine', algorithm='auto')
knn_model.fit(sparse_matrix)

In [None]:
knn_model

In [30]:
vars(sparse_test.getrow(22))

{'_shape': (1, 2987),
 'maxprint': 50,
 'indices': array([2797, 1466]),
 'indptr': array([0, 2]),
 'data': array([2, 2], dtype=int64)}

In [16]:
distance, indices = knn_model.kneighbors(sparse_test.getrow(22), n_neighbors=5)

In [17]:
reccs = [contentMapping[content] for content in indices[0]]

In [18]:
for rec in reccs:
    print(articles[articles['contentId'] == rec]['title'])

122    Microsoft: Windows Phone is not the place to d...
Name: title, dtype: object
71    Drupal How-To: Responsive or Adaptive Images? ...
Name: title, dtype: object
174    How Machine Learning Will Change Our Relations...
Name: title, dtype: object
1209    How Apple Keyboards Lost a Logo and Windows PC...
Name: title, dtype: object
671    The Skills You'll Need and the Salary You Can ...
Name: title, dtype: object


In [19]:
import joblib

joblib.dump(knn_model, 'knn_model.sav')

['knn_model.sav']