In [1]:
import time
import random
import numpy as np
import pandas as pd
import datetime as dt


In [2]:
## Load the purchase log: a headerless, comma-separated file of which only
## the first three columns are read.
raw_data = pd.read_csv(
    './yoochoose-buys.dat',
    sep=',',
    header=None,
    usecols=[0, 1, 2],
    dtype={0: np.int32, 1: str, 2: np.int64},
)
# The file carries no header row, so the columns are named here.
raw_data.columns = ['SessionId', 'TimeStamp', 'ItemId']
raw_data.shape

(1150753, 3)

In [3]:
# Discard the timestamp column; only SessionId and ItemId remain.
dropped_timestamp = raw_data.drop(columns=['TimeStamp'])
# Collapse repeated (session id, item id) pairs, keeping the first occurrence.
dropped_duplicates = dropped_timestamp.drop_duplicates(keep='first')

In [4]:
# Number of rows left after removing duplicate (SessionId, ItemId) pairs.
len(dropped_duplicates)

In [16]:
def filter_session(data, length):
    """Keep only the rows of sessions that have more than ``length`` rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``SessionId`` column. In this notebook it is the
        de-duplicated (SessionId, ItemId) purchase pairs.
    length : int
        Exclusive lower bound: a session survives when its row count is
        strictly greater than ``length``.

    Returns
    -------
    pandas.DataFrame
        The surviving rows with their original index labels. Rows are grouped
        by session in ``value_counts`` order (largest sessions first) and stay
        in input order within each session. When no session qualifies, an
        empty frame with the same columns as ``data`` is returned.
    """
    session_sizes = data['SessionId'].value_counts()
    kept_sessions = session_sizes.index[session_sizes.to_numpy() > length]

    # One vectorised membership test replaces a full scan of `data` per
    # session plus a repeated pd.concat, which was quadratic in practice.
    kept_rows = data[data['SessionId'].isin(kept_sessions)]

    # A stable sort on each session's rank in value_counts order keeps the
    # sessions contiguous and in the same sequence the per-session loop produced.
    session_rank = pd.Series(np.arange(len(kept_sessions)), index=kept_sessions)
    row_order = np.argsort(
        kept_rows['SessionId'].map(session_rank).to_numpy(), kind='stable'
    )
    return kept_rows.iloc[row_order]

In [None]:
# Keep only sessions with more than 4 rows in the de-duplicated purchases.
filtered_df = filter_session(dropped_duplicates,4)

In [15]:
# Preview the filtered purchases.
filtered_df

Unnamed: 0,SessionId,ItemId
576452,6149111,214848373
576453,6149111,214848384
576454,6149111,214835167
576455,6149111,214829765
576456,6149111,214848926
...,...,...
990380,10087141,214854547
990381,10087141,214854300
990382,10087141,214854585
990383,10087141,214854579


In [17]:
# Extract the two id columns as standalone NumPy arrays, in row order.
session_id, item_id = (
    np.array(filtered_df[column]) for column in ('SessionId', 'ItemId')
)

In [20]:
from scipy import sparse as sp

# One entry of 1.0 per (session, item) row of the filtered purchases.
# NOTE(review): the raw ids are used directly as row/column indices and no
# explicit shape is passed, so the matrix is presumably sized by the largest
# SessionId / ItemId rather than by the number of distinct ids; confirm this
# is intended before relying on num_users / num_items.
session_item_spm = sp.csr_matrix(
    (np.ones(session_id.shape), (session_id, item_id))
)
num_users = session_item_spm.shape[0]
num_items = session_item_spm.shape[1]

In [32]:
from sklearn.metrics.pairwise import cosine_similarity

def cosine_sim(session_item_spm):
    """Return the pairwise cosine similarity between the columns of the input.

    The matrix is transposed first so that each original column becomes a row
    for ``cosine_similarity``. ``dense_output=False`` asks for a sparse result
    when the input is sparse.
    """
    return cosine_similarity(session_item_spm.T, dense_output=False)

In [33]:
# Cosine similarity between the columns (items) of the session-item matrix.
item_sim = cosine_sim(session_item_spm)

In [34]:
# Printing the sparse result lists its stored entries as "(row, col) value".
print(item_sim)

  (214507331, 214853413)	0.04926646390821466
  (214507331, 214848986)	0.037371754637596795
  (214507331, 214848641)	0.04767312946227962
  (214507331, 214845590)	0.036661778755338326
  (214507331, 214716645)	0.0436852028330519
  (214507331, 214553837)	0.07808688094430304
  (214507331, 214849059)	0.25
  (214507331, 214848600)	0.15811388300841897
  (214507331, 214837213)	0.5
  (214507331, 214710101)	0.10206207261596577
  (214507331, 214708372)	0.5
  (214507331, 214845581)	0.03747658444979307
  (214507331, 214831946)	0.02151657414559676
  (214507331, 214829880)	0.02344036154692477
  (214507331, 214819395)	0.15811388300841897
  (214507331, 214832557)	0.0562543950463012
  (214507331, 214820458)	0.11180339887498948
  (214507331, 214800262)	0.07624928516630233
  (214507331, 214783352)	0.11180339887498948
  (214507331, 214748334)	0.07808688094430304
  (214507331, 214743821)	0.08451542547285165
  (214507331, 214716120)	0.14433756729740646
  (214507331, 214648250)	0.15811388300841897
  (214507331

In [35]:
from SLIM import SLIM, SLIMatrix
# from slim_load import read_csr

def gen_slim(session_item_spm):
    """Train a SLIM model on the session-item matrix and return its weights.

    Parameters
    ----------
    session_item_spm : scipy sparse matrix
        Interaction matrix. It is converted with ``.tocsr()`` before being
        wrapped in a ``SLIMatrix``.

    Returns
    -------
    The trained model exported as a sparse matrix via ``SLIM.to_csr()``.

    Side effects
    ------------
    Saves the model to ``slim_model.csr`` and its id map to ``slim_map.csr``
    (relative paths, so they land in the current working directory).
    """
    model = SLIM()
    # Training options handed unchanged to SLIM.train.
    params = {'algo': 'cd', 'nthreads': 2, 'l1r': 1.0, 'l2r': 1.0}
    trainmat = SLIMatrix(session_item_spm.tocsr())
    model.train(params, trainmat)
    model.save_model(modelfname='slim_model.csr', mapfname='slim_map.csr')

    # The earlier version re-read the saved file with `read_csr`, but the only
    # import of that helper is commented out, so the call raised NameError
    # after the whole training run. Export from the trained model instead.
    # NOTE(review): the exported matrix presumably uses SLIM's internal item
    # numbering (see slim_map.csr); confirm before indexing with raw ItemIds.
    return model.to_csr()

In [None]:
# Train SLIM on the session-item matrix and keep the matrix gen_slim returns.
item_spm = gen_slim(session_item_spm)

In [None]:
# Show the matrix returned by gen_slim.
item_spm