# 基于用户的协同过滤
- 东南大学软件学院 第15组
- 组长：205512 王颢迪
- 组员：205557 林泓宇、205554 宋希宁、205458 李乐翔、205558 罗世威、205550 陈章权

## 导入相关库

In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from math import *

import warnings
warnings.filterwarnings("ignore")

## 读取数据

In [2]:
# Column names for each of the four input files. The files are read with
# header=None, so these lists supply the column labels.
names1 = ['good_id', 'good_name', 'good_type_id', 'good_price']
names2 = ['good_type_id', 'good_type_desc']
names3 = ['user_name', 'user_id', 'province_id', 'province_name', 'join_date']
names4 = ['trans_no', 'user_id', 'trans_date', 'good_id', 'buy_num', 'amt']

# Every file is '|'-separated and is loaded from the current working directory.
df_good_info = pd.read_csv('./XJTU_GOOD_INFO.txt', sep='|', header=None, names=names1)
df_good_type = pd.read_csv('./XJTU_GOOD_TYPE.txt', sep='|', header=None, names=names2)
df_user_info = pd.read_csv('./XJTU_USERINFO.txt', sep='|', header=None, names=names3)
df_trans_detail = pd.read_csv('./XJTU_GOOD_TRANS_DETAIL.txt', sep='|', header=None, names=names4)

## 数据维度审查

In [3]:
# Data dimension check.
# The statements below are wrapped in a string literal, so nothing is
# evaluated; the trailing comments presumably record the shapes the authors
# observed when they last ran them.
'''
df_good_info.shape  # (6, 4)
df_good_type.shape  # (3, 2)
df_user_info.shape  # (252, 5)
df_trans_detail.shape # (100, 6)
'''

'\ndf_good_info.shape  # (6, 4)\ndf_good_type.shape  # (3, 2)\ndf_user_info.shape  # (252, 5)\ndf_trans_detail.shape # (100, 6)\n'

## 基于用户的协同过滤实现

In [4]:
# Distance between two numeric vectors, using the Manhattan (L1) distance.
def cal_sim(arr1, arr2):
    """Return the sum of absolute element-wise differences of two sequences.

    Despite the name, the result is a distance rather than a similarity:
    0 means no difference over the compared positions, and larger values
    mean the sequences are further apart.

    The compared positions are ``0 .. len(arr1) - 1``. ``arr2`` must be
    indexable at each of those positions; any extra trailing elements of
    ``arr2`` are ignored.
    """
    # Index both inputs by position so the lookups match arr1's length exactly.
    return sum(abs(arr1[pos] - arr2[pos]) for pos in range(len(arr1)))

In [5]:
# Build the user_id x good_id pivot table: each cell holds the summed buy_num
# of that good for that user.
df_data_pivot_goodid_userid = (
    df_trans_detail
    .pivot_table(index='user_id', columns='good_id', values='buy_num', aggfunc='sum')
    .fillna(0)  # user/good pairs without any transaction come out as NaN -> 0
)

In [6]:
df_data_pivot_goodid_userid.head()

good_id,1001,1002,2001,2002,3001
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
baiyc,0.0,0.0,0.0,0.0,5.0
borh,6.0,0.0,0.0,0.0,0.0
caolin_bds,5.0,0.0,0.0,0.0,0.0
chengyy,6.0,0.0,0.0,0.0,0.0
chenhx,6.0,0.0,0.0,0.0,0.0


In [7]:
# Min-max scale each column (one column per good_id) of the pivot table.
# MinMaxScaler is used with its default settings, which the original note
# describes as mapping every column into the interval [0, 1].
scaler = preprocessing.MinMaxScaler()
scaler.fit(df_data_pivot_goodid_userid.values)
arr_scaler = scaler.transform(df_data_pivot_goodid_userid.values)

In [8]:
arr_scaler

array([[0.        , 0.        , 0.        , 0.        , 0.625     ],
       [0.33333333, 0.        , 0.        , 0.        , 0.        ],
       [0.27777778, 0.        , 0.        , 0.        , 0.        ],
       [0.33333333, 0.        , 0.        , 0.        , 0.        ],
       [0.33333333, 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.25      ],
       [0.        , 0.88888889, 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.25      ],
       [0.        , 1.        , 0.        , 0.        , 0.        ],
       [0.05555556, 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.875     ],
       [0.        , 1.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.30769231, 0.        ],
       [0.        , 0.        , 0.        , 0.23076923, 0.        ],
       [0.        , 0.        , 0.

In [9]:
# Build the purchase-indicator table used to look up whether a user has ever
# bought a given good: 1 means "has bought", 0 means "has not".
# DataFrame.mask returns a new frame in which every cell greater than 0 is
# replaced by 1 and every other cell keeps its value, so the pivot table itself
# is left untouched. This is done in one vectorised step instead of visiting
# each cell through .iloc in a Python double loop.
df_buy_or_not = df_data_pivot_goodid_userid.mask(df_data_pivot_goodid_userid > 0, 1)

In [10]:
def recom(user_id):
    """Recommend one good to ``user_id`` from the purchases of similar users.

    Every row of ``arr_scaler`` is ranked by its Manhattan distance
    (``cal_sim``) to the target user's row, closest first. Walking that
    ranking, the function returns the first good (in column order) that the
    neighbour has bought and the target user has not.

    Args:
        user_id: A label present in ``df_data_pivot_goodid_userid.index``,
            for example 'zhunl'.

    Returns:
        The column label (good_id) of the recommended good, or ``None`` when
        no row has bought anything the target user has not.

    Raises:
        ValueError: If ``user_id`` is not a row label of the pivot table.
    """
    # Row position of the target user in the pivot table (and in arr_scaler).
    row_labels = df_data_pivot_goodid_userid.index.tolist()
    index_of_user = row_labels.index(user_id)
    target_row = arr_scaler[index_of_user, :]

    # Distance from the target user to every row, the user's own row included
    # (distance 0; it can never contribute a recommendation below).
    distances = [
        cal_sim(target_row, arr_scaler[row, :])
        for row in range(arr_scaler.shape[0])
    ]

    # Rank all rows by ascending Manhattan distance; the Series index holds
    # the row positions.
    ranked = pd.Series(data=distances).nsmallest(arr_scaler.shape[0])

    # The target user's purchase indicators do not change while scanning the
    # neighbours, so fetch them once.
    user_bought = df_buy_or_not.loc[user_id, :].values

    for neighbour_row in ranked.index:
        # Positive entries mark goods the neighbour bought and the user did not.
        extra_goods = df_buy_or_not.iloc[neighbour_row, :].values - user_bought
        for col in range(len(extra_goods)):
            if extra_goods[col] > 0:
                return df_data_pivot_goodid_userid.columns[col]

    # No neighbour owns anything new for this user.
    return None

In [11]:
# Cold-start problem
'''
sum(df_data_pivot_goodid_userid.values[:,0]), sum(df_data_pivot_goodid_userid.values[:,1]), sum(df_data_pivot_goodid_userid.values[:,2]), \
   sum(df_data_pivot_goodid_userid.values[:,3]), sum(df_data_pivot_goodid_userid.values[:,3])
'''
# (130.0, 116.0, 88.0, 92.0, 92.0)
# Count the total quantity bought for each good (good_id) and take the largest
# one as the "baseline" to deal with the cold-start problem.
# That is: every user who has not bought anything is recommended the baseline
# good, which here is the good with good_id = 1001.
# NOTE(review): the fifth term of the disabled expression above indexes column 3
# again, so the last recorded value repeats the fourth and the total of the
# fifth column was never computed - confirm that 1001 really is the best seller.

'\nsum(df_data_pivot_goodid_userid.values[:,0]), sum(df_data_pivot_goodid_userid.values[:,1]), sum(df_data_pivot_goodid_userid.values[:,2]),    sum(df_data_pivot_goodid_userid.values[:,3]), sum(df_data_pivot_goodid_userid.values[:,3])\n'

In [12]:
# for i in range(0, arr_scaler.shape[0]):
#     dis = 9999
#     flag = -1
#     for j in range(0, arr_scaler.shape[0]):
#         if (i != j):
#             if (dis > cal_sim(arr_scaler[i, :], arr_scaler[j, :])):
#                 dis = cal_sim(arr_scaler[i, :], arr_scaler[j, :])
#                 flag = j

#     #df_data_pivot_goodid_userid.iloc[i, -1] = flag
#     #df_data_pivot_goodid_userid.iloc[i, -1] = np.argmax(df_data_pivot_goodid_userid.iloc[24, 0:-1].values)
    
#     # flag : 最相似用户的所在的行数
#     # df_data_pivot_goodid_userid.index[flag] : 最相似用户的 user_id
#     # np.argmax(df_data_pivot_goodid_userid.iloc[flag,0:-1]) ： 最相似用户购买总数最多的商品对应的 good_id
#     df_data_pivot_goodid_userid.iloc[i, -1] = np.argmax(df_data_pivot_goodid_userid.iloc[flag, 0:-1])


## 推荐测试

In [13]:
# Interactive console test, disabled by wrapping it in a string literal:
# it would prompt for a user_id and print the result of recom().
'''
# test case: hanwei pangjie  （有过购买行为的user_id)
# test case: shichuan  (没有过购买行为的user_id)
s1 = input("请输入需要推荐的顾客的user_id: ")
print(recom(s1))
'''

'\n# test case: hanwei pangjie  （有过购买行为的user_id)\n# test case: shichuan  (没有过购买行为的user_id)\ns1 = input("请输入需要推荐的顾客的user_id: ")\nprint(recom(s1))\n'

In [14]:
def recom_output(user_id, default_good_id=1001):
    """Return the recommendation for ``user_id`` as display text.

    Users that appear in the pivot table get the good chosen by ``recom``.
    The cold-start baseline ``default_good_id`` is used instead when the user
    is not in the pivot table, or when ``recom`` finds nothing to recommend
    and returns ``None`` (which would otherwise end in an IndexError during
    the name lookup).

    Args:
        user_id: The user label to recommend for.
        default_good_id: good_id used as the fallback recommendation.

    Returns:
        ``"<good_id> <good_name>"``, or just ``"<good_id>"`` when
        ``df_good_info`` has no row for that good_id.
    """
    recom_id = None
    if user_id in df_data_pivot_goodid_userid.index:
        recom_id = recom(user_id)
    if recom_id is None:
        recom_id = default_good_id

    # Look the name up once; an empty result means the good is not listed.
    good_names = df_good_info.loc[df_good_info['good_id'] == recom_id, 'good_name'].values
    if len(good_names) == 0:
        return str(recom_id)

    return str(recom_id) + ' ' + good_names[0]

In [15]:
recom_output('zhunl')

'2002 洗发水'

## 推荐可视化

In [16]:
# Import tkinter first
import tkinter as tk

# Main window of the recommendation GUI.
window=tk.Tk()
window.title('商品推荐系统')
window.geometry('500x350+300+200')  # window size (500x350) and screen offset (+300+200)

# Text entry for the user_id. `show` controls how typed characters are
# displayed; for a password field one would pass show='*' to display asterisks.
ipt=tk.Entry(window,show=None,font=('微软雅黑',20))

ipt.pack()  # place the entry in the window

def button():
    """Button callback: look up the recommendation for the typed user_id.

    Reads the entry text, passes it to ``recom_output`` and puts the returned
    string into ``value``, the variable bound to the result label.
    """
    val=ipt.get()  # text currently typed into the entry
    #print(val) # could be used to log the input
    rs = recom_output(val)
    #rs = str(recom(val)) + str(' ') + df_good_info[df_good_info['good_id']==recom(val)]['good_name'].values[0]
    value.set(rs)  # publish the recommendation text through `value`

# The button's caption is given by `text`; `command` binds the callback above.
b = tk.Button(window,text='推荐',width=15,height=2,command=button)
b.pack()
# String-typed tk variable (tk.IntVar() would be the integer-typed counterpart).
# It is created after `button` is defined but before the callback can run.
value=tk.StringVar()
# Label that displays whatever `value` currently holds.
l=tk.Label(window,textvariable=value,bg='white',font=('微软雅黑',20),width=20,height=2)
l.pack()

window.mainloop()  # show the window and enter the Tk event loop