# 基于用户的协同过滤


## 导入相关库

In [None]:
import numpy as np
import pandas as pd
from sklearn import preprocessing

import warnings
warnings.filterwarnings("ignore")

## 读取数据

In [None]:
# Each source file is read as a pipe-delimited table without a header row,
# so the column names are supplied explicitly.

# Goods master data: one row per good.
names1 = ['good_id', 'good_name', 'good_type_id', 'good_price']
df_good_info = pd.read_csv('./XJTU_GOOD_INFO.txt',header=None, names=names1, sep='|')

# Goods type lookup: good_type_id -> description.
names2 = ['good_type_id', 'good_type_desc']
df_good_type = pd.read_csv('./XJTU_GOOD_TYPE.txt',header=None, names=names2, sep='|')

# User master data.
names3 = ['user_name','user_id','province_id','province_name','join_date']
df_user_info = pd.read_csv('./XJTU_USERINFO.txt',header=None, names=names3, sep='|')

# Transaction detail lines; 'buy_num' is the column aggregated by the pivot table later on.
# NOTE(review): 'tans_no' looks like a typo for 'trans_no' — confirm nothing
# depends on the current spelling before renaming.
names4 = ['tans_no', 'user_id', 'trans_date', 'good_id', 'buy_num', 'amt']
df_trans_detail = pd.read_csv('./XJTU_GOOD_TRANS_DETAIL.txt', header=None, names=names4, sep='|')

## 数据维度审查

In [None]:
# Shape review of the loaded tables. The expressions are kept inside an inert
# string literal (they are not executed); the trailing comments record the
# shapes that were observed.
'''
df_good_info.shape  # (6, 4)
df_good_type.shape  # (3, 2)
df_user_info.shape  # (252, 5)
df_trans_detail.shape # (100, 6)
'''

## 基于用户的协同过滤实现

In [None]:
def cal_sim(arr1, arr2):
    """Return the Manhattan (L1) distance between two equal-length sequences.

    Despite the name, the result is a distance, not a similarity: 0 means the
    two inputs are identical and larger values mean they are less alike.

    Args:
        arr1: Sequence of numbers (list, tuple, 1-D NumPy array, ...).
        arr2: Sequence of numbers with the same length as ``arr1``.

    Returns:
        The sum of ``abs(arr1[k] - arr2[k])`` over all positions ``k``
        (``0`` for two empty sequences).

    Raises:
        ValueError: If the two sequences differ in length. Previously a
            shorter ``arr2`` failed with an incidental IndexError and a longer
            ``arr2`` had its trailing elements silently ignored.
    """
    if len(arr1) != len(arr2):
        raise ValueError(
            "cal_sim expects sequences of equal length, got %d and %d"
            % (len(arr1), len(arr2))
        )
    return sum(abs(a - b) for a, b in zip(arr1, arr2))

In [None]:
# Pivot table: one row per user_id, one column per good_id; each cell holds the
# total quantity (sum of buy_num) of that good bought by that user.
# Combinations with no purchase come out of the pivot as NaN and are set to 0.
df_data_pivot_goodid_userid = (
    df_trans_detail
    .pivot_table(index='user_id', columns='good_id', values='buy_num', aggfunc='sum')
    .fillna(0)
)

In [None]:
# Preview the first rows of the user x good purchase-quantity table.
df_data_pivot_goodid_userid.head()

In [None]:
# Min-max scale every column (i.e. every good) into the interval [0, 1].
# The result is a plain NumPy array with the same row order as the pivot table.
scaler = preprocessing.MinMaxScaler()
arr_scaler = scaler.fit_transform(df_data_pivot_goodid_userid.values)

In [None]:
# Appended as the LAST column; it is later addressed by position (-1).
df_data_pivot_goodid_userid['Recommendation'] = 0  # holds the good_id recommended to each user

In [None]:
# For every user (row i) find the most similar OTHER user, i.e. the row with the
# smallest cal_sim distance over the min-max scaled purchase vectors, and
# recommend the good that this nearest neighbour bought the most of.
for i in range(0, arr_scaler.shape[0]):
    dis = float('inf')  # smallest distance seen so far for user i
    flag = i            # row position of the most similar user; stays i only
                        # when there is no other user to compare against
    for j in range(0, arr_scaler.shape[0]):
        if i == j:
            continue
        cur_dis = cal_sim(arr_scaler[i, :], arr_scaler[j, :])  # computed once per pair
        if cur_dis < dis:  # strict '<' keeps the first of several equally close users
            dis = cur_dis
            flag = j

    # df_data_pivot_goodid_userid.index[flag] is the most similar user's user_id.
    # The slice 0:-1 excludes the trailing 'Recommendation' column, so the row
    # holds purchase totals only. idxmax() returns the column LABEL (the
    # good_id) of the largest total; np.argmax would give the 0-based column
    # position on current pandas versions, which is not a good_id.
    df_data_pivot_goodid_userid.iloc[i, -1] = df_data_pivot_goodid_userid.iloc[flag, 0:-1].idxmax()


In [None]:
'''
sum(df_data_pivot_goodid_userid.values[:,0]), sum(df_data_pivot_goodid_userid.values[:,1]), sum(df_data_pivot_goodid_userid.values[:,2]), \
   sum(df_data_pivot_goodid_userid.values[:,3]), sum(df_data_pivot_goodid_userid.values[:,3])
'''
# (130.0, 116.0, 88.0, 92.0, 92.0)
# Total quantity purchased per good (good_id); the good with the largest total
# is used as the "baseline" to deal with the cold-start problem.
# I.e. every user without any purchase history is recommended the baseline
# good, which here is the good with good_id = 1001.
# NOTE(review): the inert expression above sums column 3 twice, which is why
# the last two recorded totals are equal; the fifth term was presumably meant
# to be column 4 — confirm before relying on these numbers.

In [None]:
def recom(user_id, default_good_id=1001):
    """Return the good_id recommended to ``user_id``.

    Args:
        user_id: Label to look up in the index of
            ``df_data_pivot_goodid_userid`` (users with purchase history).
        default_good_id: Cold-start fallback returned when ``user_id`` has no
            row in the pivot table. Defaults to 1001, the baseline good.

    Returns:
        The value of the 'Recommendation' column for that user as an ``int``,
        or ``default_good_id`` when the user is unknown.
    """
    if user_id in df_data_pivot_goodid_userid.index:
        # .loc replaces the removed DataFrame.ix indexer; the column is
        # addressed by name rather than by its position (-1).
        return int(df_data_pivot_goodid_userid.loc[user_id, 'Recommendation'])
    return default_good_id

In [None]:
# Users with purchase history: their past purchase totals next to the
# recommendation stored for them.
df_data_pivot_goodid_userid.head()

## 推荐测试

In [None]:
# test case: hanwei pangjie  (user_ids with purchase history)
# test case: shichuan  (user_id without purchase history)
s1 = input("请输入需要推荐的顾客的user_id: ")
recommended_good_id = recom(s1)  # look the recommendation up once and reuse it
matching_names = df_good_info[df_good_info['good_id'] == recommended_good_id]['good_name'].values
# A good_id missing from df_good_info gives an empty match; print None for the
# name in that case instead of failing with IndexError on [0].
recommended_good_name = matching_names[0] if len(matching_names) > 0 else None
print(recommended_good_id, recommended_good_name)