### import modules

In [1]:
import pandas as pd
import numpy as np
from numpy import dot
from numpy.linalg import norm
import turicreate as tc
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

### dealing with data

In [39]:
# Load the raw purchase log and keep only the User_ID / Product_ID columns.
path = 'BlackFriday.csv'
data = pd.read_csv(path)
unused_columns = [
    'Gender', 'Age', 'Occupation', 'City_Category',
    'Stay_In_Current_City_Years', 'Marital_Status',
    'Product_Category_1', 'Product_Category_2',
    'Product_Category_3', 'Purchase',
]
data = data.drop(columns=unused_columns)

# Attach to every row the number of rows recorded for its user.
data2 = data.groupby(['User_ID']).count()
df = data2['Product_ID'].rename('countProduct').reset_index()
counts_by_user = df.set_index('User_ID')
data = data.join(counts_by_user, on='User_ID')
data = data.sort_values(['User_ID'])

# Keep only users with at least 300 rows; the index is renumbered from 0.
data = data.loc[data['countProduct'] >= 300].reset_index(drop=True)

# data about users (312 users in the original author's run)
users = pd.DataFrame({'User_ID': data['User_ID'].unique()})
users_length = users.shape[0]

# data about products
products = pd.DataFrame({'Product_ID': data['Product_ID'].unique()})
products_length = products.shape[0]

# Every remaining row is an observed purchase.
data['purchased'] = 1

# User x product table; pairs that never occur are filled with 0.
df_matrix = data.pivot_table(values='purchased', index='User_ID',
                             columns='Product_ID', fill_value=0)

# One row per (User_ID, countProduct) pair among the users kept above.
user_cnt_info = data[['User_ID', 'countProduct']]
user_cnt_info = (user_cnt_info
                 .groupby(['User_ID', 'countProduct'])
                 .count()
                 .reset_index())

### split the data and store

In [None]:
'''Cell that builds the train, val and test data => run only once, at the start.'''

# `data` is sorted by User_ID with a 0..n-1 index, so each user's rows form one
# contiguous run whose length is that user's countProduct.
# Slices are collected in lists and concatenated once at the end:
# DataFrame.append was removed in pandas 2.0 and re-copied the whole frame on
# every call.
test_parts = []
val_parts = []
train_parts = []

i = 0
while i < len(data):
    cnt = int(data.loc[i, 'countProduct'])
    # Shuffle this user's rows; reset_index keeps the old row label in an
    # 'index' column, which the loading cell drops again.
    tmp = shuffle(data[i: i + cnt]).reset_index()
    test_parts.append(tmp[0:10])    # first 10 shuffled rows -> test
    val_parts.append(tmp[10:20])    # next 10 rows -> validation
    train_parts.append(tmp[20:])    # everything else -> train
    i = i + cnt

# pd.concat raises on an empty list, so fall back to an empty frame.
test = pd.concat(test_parts) if test_parts else pd.DataFrame()
val = pd.concat(val_parts) if val_parts else pd.DataFrame()
train = pd.concat(train_parts) if train_parts else pd.DataFrame()

train.to_csv("train_data.csv", mode='w')
val.to_csv("val_data.csv", mode='w')
test.to_csv("test_data.csv", mode='w')

In [64]:
'''Run this cell when reusing the train, val and test data already saved to disk.'''

# Both columns are leftovers of how the split was written out: 'Unnamed: 0' is
# the saved DataFrame index and 'index' comes from the reset_index() call made
# while splitting.
leftover_columns = ['Unnamed: 0', 'index']

train = pd.read_csv('train_data.csv').drop(columns=leftover_columns)
val = pd.read_csv('val_data.csv').drop(columns=leftover_columns)
test = pd.read_csv('test_data.csv').drop(columns=leftover_columns)

In [158]:
# User x product purchase table for the training split (0 where no purchase).
train_matrix = train.pivot_table(values='purchased', index='User_ID',
                                 columns='Product_ID', fill_value=0)

# Distinct products seen in the training split, in order of first appearance.
train_products = pd.DataFrame({'Product_ID': train['Product_ID'].unique()})
train_products_length = train_products.shape[0]

# Same table for the validation split.
val_matrix = val.pivot_table(values='purchased', index='User_ID',
                             columns='Product_ID', fill_value=0)

# Distinct validation products, sorted by Product_ID.
val_products = pd.DataFrame({'Product_ID': val['Product_ID'].unique()})
val_products = val_products.sort_values('Product_ID')
val_products_length = val_products.shape[0]

# Distinct validation users, in order of first appearance.
val_user_info = pd.DataFrame({'User_ID': val['User_ID'].unique()})
val_users_length = val_user_info.shape[0]

### defining cosine similarity module

In [229]:
def cos_sim(A, B):
    """Return the cosine similarity of the vectors A and B.

    Args:
        A: 1-D array-like of numbers.
        B: 1-D array-like of numbers with the same length as A.

    Returns:
        dot(A, B) / (norm(A) * norm(B)), or 0.0 when either vector has zero
        norm (the similarity is undefined in that case).
    """
    # Each norm is needed twice (zero check and denominator); compute it once.
    norm_a = norm(A)
    norm_b = norm(B)
    if norm_a == 0 or norm_b == 0:
        return 0.0
    return dot(A, B) / (norm_a * norm_b)

### user based CF

#### user - user 사이의 유사도를 계산함.

In [None]:
'''Cell that builds the user_user matrix => run only once, at the start.'''
# Compute the cosine similarity between every pair of users.
# Entry [i][j] compares the training purchase rows of the i-th and j-th users
# in `users`.
user_user_matrix = np.zeros(shape=(users_length, users_length))

# Look every user's training row up once instead of once per pair.
user_vectors = [np.array(train_matrix.loc[user]) for user in users['User_ID']]

for i, user_1 in enumerate(user_vectors):
    for j, user_2 in enumerate(user_vectors):
        user_user_matrix[i][j] = cos_sim(user_1, user_2)

np.savetxt('user_user_matrix.csv', user_user_matrix, delimiter=',')

In [21]:
'''Run this cell when reusing the saved user_user matrix.'''
# The file has no header row, so the columns get integer labels.
user_user_matrix = pd.read_csv(filepath_or_buffer='user_user_matrix.csv',
                               header=None)

#### 유사도가 높은 5명의 user 로 부터 추천 받아 recom_matrix 에 저장.

In [None]:
'''Run this cell when recom_matrix has to be computed.'''
# recommendation matrix: row = position of the user in `users`,
# column = position of the product in `products`.
# The column order must be that of `products` because the comparison cells
# look columns up through `swaped_products`, which is built from `products`;
# indexing columns by position in `train_products` would misalign them.
recom_matrix = np.zeros(shape=(users_length, products_length))

# Works for both the in-memory ndarray and the DataFrame reloaded from CSV.
similarity = np.asarray(user_user_matrix)

# Training purchases aligned to the row order of `users` and the column order
# of `products`; products that never occur in train become all-zero columns.
purchases = train_matrix.reindex(index=users['User_ID'],
                                 columns=products['Product_ID'],
                                 fill_value=0).to_numpy()

# If users i and j are similar, recommend to i the products that j bought.
# NOTE(review): only users j > i are scanned, and the first five whose
# similarity exceeds 0.2 are used (not the five most similar) -- confirm that
# this is the intended neighbour selection.
# NOTE(review): this cell does not write 'recom_matrix.csv', which the loading
# cell reads -- presumably it is saved elsewhere; verify.
for i in range(users_length):
    cnt = 0
    for j in range(i + 1, users_length):
        if similarity[i, j] <= 0.2:
            continue
        cnt = cnt + 1
        # Products that i has not bought but j has bought.
        recommended = (purchases[i] == 0) & (purchases[j] == 1)
        recom_matrix[i, recommended] = 1
        if cnt == 5:
            break

In [6]:
'''Run this cell when using the saved recom_matrix.'''
# The file has no header row, so the columns get integer labels.
recom_matrix = pd.read_csv(filepath_or_buffer='recom_matrix.csv',
                           header=None)

#### recom_matrix (predicted) 와 val_matrix (observed) 비교.<br>=> 각 user 별 vector 사이의 유사도를 계산하여 similarity_matrix 에 저장함.

In [99]:
# To make recom_matrix and val_matrix easy to compare, a dictionary between
# product ids and indices is built and maintained.
def swap_dictionary(original_dict):
    """Return a new dict that maps each value of original_dict to its key.

    Args:
        original_dict: mapping whose values are hashable.

    Returns:
        A dict of {value: key}. When several keys share a value, the key that
        comes last in iteration order wins.
    """
    return {value: key for key, value in original_dict.items()}

In [98]:
# products.to_dict() is {column name: {row index: value}}; inverting the
# 'Product_ID' entry gives a lookup from product id to its row index.
tmp = products.to_dict()
index_to_product = tmp['Product_ID']
swaped_products = swap_dictionary(index_to_product)

In [236]:
# recom_matrix has a column for every product.
# new_recom keeps only the products that appear in val_matrix.
# With the dimensions matched this way, the similarity between row vectors of
# new_recom and val_matrix can be computed.

val_product_ids = list(val_products['Product_ID'])
recom_columns = [swaped_products[product] for product in val_product_ids]

# Select every needed column in one step instead of one outer merge per
# product, then label the columns with the product ids.
new_recom = recom_matrix[recom_columns].copy()
new_recom.columns = val_product_ids

In [239]:
# Define the similarity matrix and fill it with the cosine similarity between
# matching row vectors of new_recom (predicted) and val_matrix (observed).

similarity_matrix = np.zeros(shape=(val_users_length, 1))

for row, user_id in enumerate(val_user_info['User_ID']):
    predicted = np.asarray(new_recom.loc[row]).astype(int)
    observed = np.asarray(val_matrix.loc[user_id])
    similarity_matrix[row, 0] = cos_sim(predicted, observed)

# The index of similarity_matrix is the same as the index of users, so the
# User_ID can be looked up through users.
similarity_matrix = pd.DataFrame(similarity_matrix)

0 결과가 나온 것은, user_recom, user_val 둘 중 하나의 norm 이 0 인 경우도 포함.

In [241]:
similarity_matrix.describe()

Unnamed: 0,0
count,312.0
mean,0.012546
std,0.024038
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,0.088544


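가장 유사도가 높은 것은 약 0.089임!!!! 즉, 1606 개의 product 중 128 개 정도의 product 를 맞춘 것이라고 할 수 있음. (단, cosine similarity 는 "맞춘 product 수 / sqrt(추천한 product 수 × 실제 구매한 product 수)" 이므로, 전체 product 중 맞춘 비율과 정확히 같지는 않음.)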

In [233]:
val_products_length

1606