**users数据处理**

In [1]:
import pandas as pd

import numpy as np
import scipy.sparse as ss
import scipy.io as sio

#保存数据
import pickle

#event的特征需要编码
from datacleaner import DataCleaner #自定义的数据处理类
from sklearn.preprocessing import normalize
#相似度/距离
import scipy.spatial.distance as ssd

In [2]:
#读取训练集和测试集中出现过的用户列表
userIndex = pickle.load(open("PE_userIndex.pkl", 'rb'))
n_users = len(userIndex)

print("number of users in train & test :%d" % n_users)

number of users in train & test :3391


In [4]:
#读取数据
users_dpath = "C:/Users/Lzg/Desktop/data/w4/users.csv"
users = pd.read_csv(users_dpath)
users.head()

Unnamed: 0,user_id,locale,birthyear,gender,joinedAt,location,timezone
0,3197468391,id_ID,1993,male,2012-10-02T06:40:55.524Z,Medan Indonesia,480.0
1,3537982273,id_ID,1992,male,2012-09-29T18:03:12.111Z,Medan Indonesia,420.0
2,823183725,en_US,1975,male,2012-10-06T03:14:07.149Z,Stratford Ontario,-240.0
3,1872223848,en_US,1991,female,2012-11-04T08:59:43.783Z,Tehran Iran,210.0
4,3429017717,id_ID,1995,female,2012-09-10T16:06:53.132Z,,420.0


In [5]:
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38209 entries, 0 to 38208
Data columns (total 7 columns):
user_id      38209 non-null int64
locale       38209 non-null object
birthyear    38209 non-null object
gender       38100 non-null object
joinedAt     38152 non-null object
location     32745 non-null object
timezone     37773 non-null float64
dtypes: float64(1), int64(1), object(5)
memory usage: 2.0+ MB


In [6]:
FE = DataCleaner()

#locale	birthyear	gender	joinedAt	location	timezone
#去掉user_id列
n_cols = users.shape[1] - 1
cols = ['LocaleId', 'BirthYearInt', 'GenderId', 'JoinedYearMonth', 'CountryId', 'TimezoneInt']

#users编码后的特征
#userMatrix = np.zeros((n_users, n_cols), dtype=np.int)
userMatrix = ss.dok_matrix((n_users, n_cols))

for u in range(users.shape[0]): 
    userId = str(users.loc[u,'user_id'])
    
    if userId in userIndex:  #在训练集或测试集中出现
        i = userIndex[userId]
    
        userMatrix[i, 0] = FE.getLocaleId(users.loc[u,'locale'])
        userMatrix[i, 1] = FE.getBirthYearInt(users.loc[u,'birthyear'])
        userMatrix[i, 2] = FE.getGenderId(users.loc[u,'gender'])
        userMatrix[i, 3] = FE.getJoinedYearMonth(users.loc[u,'joinedAt'])
        
        #由于地点的写法不规范，该编码似乎不起作用（所有样本的特征都被编码成0了）
        userMatrix[i, 4] = FE.getCountryId(users.loc[u,'location'])
        
        userMatrix[i, 5] = FE.getTimezoneInt(users.loc[u,'timezone'])


In [7]:
# 归一化用户矩阵
userMatrix = normalize(userMatrix, norm="l2", axis=0, copy=False)
sio.mmwrite("US_userMatrix", userMatrix)


**计算用户相似度矩阵**

In [8]:
# 计算用户相似度矩阵，之后用户推荐系统
userSimMatrix = ss.dok_matrix((n_users, n_users))

#读取在测试集和训练集中出现的用户对
uniqueUserPairs = pickle.load(open("FE_uniqueUserPairs.pkl", 'rb'))

#对角线元素
for i in range(0, n_users):
    userSimMatrix[i, i] = 1.0
    
#对称
for u1, u2 in uniqueUserPairs:
    #i = userIndex[u1]
    #j = userIndex[u2]
    i = u1
    j = u2
    if (i, j) not in userSimMatrix :
        #Person相关系数做为相似度度量
        #特征：国家（locale、location）、年龄、性别、时区、地点
        #usim = ssd.correlation(userMatrix[i,:],
            #userMatrix[j,:])
    
        usim = ssd.correlation(userMatrix.getrow(i).todense(),
          userMatrix.getrow(j).todense())
        userSimMatrix[i, j] = usim
        userSimMatrix[j, i] = usim
    
sio.mmwrite("US_userSimMatrix", userSimMatrix)

In [9]:
print (userSimMatrix)

  (0, 0)	1.0
  (1, 1)	1.0
  (2, 2)	1.0
  (3, 3)	1.0
  (4, 4)	1.0
  (5, 5)	1.0
  (6, 6)	1.0
  (7, 7)	1.0
  (8, 8)	1.0
  (9, 9)	1.0
  (10, 10)	1.0
  (11, 11)	1.0
  (12, 12)	1.0
  (13, 13)	1.0
  (14, 14)	1.0
  (15, 15)	1.0
  (16, 16)	1.0
  (17, 17)	1.0
  (18, 18)	1.0
  (19, 19)	1.0
  (20, 20)	1.0
  (21, 21)	1.0
  (22, 22)	1.0
  (23, 23)	1.0
  (24, 24)	1.0
  :	:
  (1348, 2728)	0.18944291748015085
  (3374, 875)	1.7508482464924335
  (875, 3374)	1.7508482464924335
  (1786, 443)	1.6711917858624399
  (443, 1786)	1.6711917858624399
  (165, 3237)	1.2747808363355873e-06
  (3237, 165)	1.2747808363355873e-06
  (3129, 248)	0.9023596129688828
  (248, 3129)	0.9023596129688828
  (103, 2989)	0.18949586609665192
  (2989, 103)	0.18949586609665192
  (1313, 1361)	3.1968807145954514e-07
  (1361, 1313)	3.1968807145954514e-07
  (165, 1754)	2.872339052384376e-06
  (1754, 165)	2.872339052384376e-06
  (2345, 241)	0.17460383090392984
  (241, 2345)	0.17460383090392984
  (2382, 992)	0.0010500002226244698
  (992, 2382