# 用户数据处理
（只取训练集和测试集中出现的用户ID）

数据来源于Kaggle竞赛：Event Recommendation Engine Challenge，根据以下信息：
events they’ve responded to in the past
user demographic information
what events they’ve seen and clicked on in our app
预测用户对某个活动是否感兴趣

竞赛官网：
https://www.kaggle.com/c/event-recommendation-engine-challenge/data

用户描述信息在users.csv文件：共7维特征
user_id
locale：地区，语言
birthyear：出生年
gender：性别
joinedAt：用户加入APP的时间，ISO-8601 UTC time
location：地点
timezone：时区

# 导入工具包

In [1]:
import pandas as pd

import numpy as np
import scipy.sparse as ss
import scipy.io as sio

#保存数据
import pickle

#event的特征需要编码
from PE_utils import FeatureEng
from sklearn.preprocessing import normalize
#相似度/距离
import scipy.spatial.distance as ssd

总的用户数目超过训练集和测试集中出现的用户数目，
为节省处理时间和内存，先处理train和test，得到竞赛需要用到的事件和用户，
然后对在训练集和测试集中出现过的事件和用户建立新的ID索引。
先运行user_event.ipynb，
得到用户索引文件：PE_userIndex.pkl

In [2]:
# Directory prefix (with trailing slash) that is prepended to every data file name
path = "../../event_recommendation_engine_challenge_data/"

# 读取之前算好的测试集和训练集中出现过的用户

In [6]:
# Load the mapping of users that appear in the training or test set.
# The context manager guarantees the file handle is closed after loading.
# NOTE(review): pickle executes arbitrary code on load; only use it on files
# produced locally by the earlier preprocessing step, never on untrusted data.
with open(path + "PE_userIndex.pkl", "rb") as f:
    userIndex = pickle.load(f)
n_users = len(userIndex)

print("number of users in train & test :%d" % n_users)

number of users in train & test :3391


In [7]:
# Display the loaded mapping (keys are user ids as bytes, values are integer indices)
userIndex

{b'3929074393': 0,
 b'2833011810': 1,
 b'3188496109': 2,
 b'3804368962': 3,
 b'3813354209': 4,
 b'1485684679': 5,
 b'402625828': 6,
 b'3064452030': 7,
 b'4243934665': 8,
 b'2626409021': 9,
 b'2424565945': 10,
 b'1800952806': 11,
 b'4170453600': 12,
 b'1665746866': 13,
 b'3765513583': 14,
 b'246547106': 15,
 b'3719575513': 16,
 b'3599572670': 17,
 b'3191212256': 18,
 b'2954540407': 19,
 b'1210062900': 20,
 b'2824647204': 21,
 b'1336647234': 22,
 b'3913824397': 23,
 b'292529766': 24,
 b'1301945337': 25,
 b'3162849104': 26,
 b'2251800772': 27,
 b'1039126403': 28,
 b'2461816241': 29,
 b'2401692695': 30,
 b'3391264132': 31,
 b'2196432988': 32,
 b'2701755218': 33,
 b'2505058685': 34,
 b'634885878': 35,
 b'767269269': 36,
 b'724978354': 37,
 b'528289771': 38,
 b'2378242378': 39,
 b'3797537201': 40,
 b'1640786388': 41,
 b'1747042580': 42,
 b'3514195773': 43,
 b'929368698': 44,
 b'2196784941': 45,
 b'2041677127': 46,
 b'1637866930': 47,
 b'3093978794': 48,
 b'19283444': 49,
 b'3387377708': 50,


In [9]:
# Check whether one specific user id (as bytes) is a key of userIndex
b'3166414361' in userIndex

True

In [12]:
# Load the raw user table from users.csv and preview its first rows
users = pd.read_csv(path+"users.csv")
users.head()

Unnamed: 0,user_id,locale,birthyear,gender,joinedAt,location,timezone
0,3197468391,id_ID,1993,male,2012-10-02T06:40:55.524Z,Medan Indonesia,480.0
1,3537982273,id_ID,1992,male,2012-09-29T18:03:12.111Z,Medan Indonesia,420.0
2,823183725,en_US,1975,male,2012-10-06T03:14:07.149Z,Stratford Ontario,-240.0
3,1872223848,en_US,1991,female,2012-11-04T08:59:43.783Z,Tehran Iran,210.0
4,3429017717,id_ID,1995,female,2012-09-10T16:06:53.132Z,,420.0


In [13]:
# Summarize the column dtypes and non-null counts of the user table
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38209 entries, 0 to 38208
Data columns (total 7 columns):
user_id      38209 non-null int64
locale       38209 non-null object
birthyear    38209 non-null object
gender       38100 non-null object
joinedAt     38152 non-null object
location     32745 non-null object
timezone     37773 non-null float64
dtypes: float64(1), int64(1), object(5)
memory usage: 2.0+ MB


In [25]:
# Encoder for the raw user attributes (helper class imported from PE_utils)
FE = FeatureEng()

# Raw columns: locale, birthyear, gender, joinedAt, location, timezone
# One encoded feature per raw column, excluding the user_id column
n_cols = users.shape[1] - 1
cols = ['LocaleId', 'BirthYearInt', 'GenderId', 'JoinedYearMonth', 'CountryId', 'TimezoneInt']

# Encoded user features: one row per train/test user, one column per feature
userMatrix = ss.dok_matrix((n_users, n_cols))

# itertuples walks the frame once instead of doing a label-based .loc lookup
# for every single cell
for row in users.itertuples(index=False):
    # userIndex is keyed by the user id encoded as UTF-8 bytes
    userId = bytes(str(row.user_id), 'utf-8')
    if userId not in userIndex:
        # User appears in neither the training set nor the test set
        continue

    i = userIndex[userId]
    userMatrix[i, 0] = FE.getLocaleId(row.locale)
    userMatrix[i, 1] = FE.getBirthYearInt(row.birthyear)
    userMatrix[i, 2] = FE.getGenderId(row.gender)
    userMatrix[i, 3] = FE.getJoinedYearMonth(row.joinedAt)

    # NOTE (original author): location strings are not written consistently,
    # so this encoding seems ineffective (every sample appears to be encoded as 0)
    userMatrix[i, 4] = FE.getCountryId(row.location)

    userMatrix[i, 5] = FE.getTimezoneInt(row.timezone)

# L2-normalize each feature column (axis=0) and save in Matrix Market format
userMatrix = normalize(userMatrix, norm="l2", axis=0, copy=False)
sio.mmwrite(path+"US_userMatrix", userMatrix)




In [36]:
# 计算用户相似度矩阵，之后用户推荐系统
userSimMatrix = ss.dok_matrix((n_users, n_users))

#读取在测试集和训练集中出现的用户对
uniqueUserPairs = pickle.load(open(path+"FE_uniqueUserPairs.pkl", 'rb'))

#对角线元素
for i in range(0, n_users):
    userSimMatrix[i, i] = 1.0
    
#对称
for u1, u2 in uniqueUserPairs:
    #i = userIndex[u1]
    #j = userIndex[u2]
    i = u1
    j = u2
    if  (i, j) not in userSimMatrix:
        #Person相关系数做为相似度度量
        #特征：国家（locale、location）、年龄、性别、时区、地点
        #usim = ssd.correlation(userMatrix[i,:],
            #userMatrix[j,:])
    
        usim = ssd.correlation(userMatrix.getrow(i).todense(),
          userMatrix.getrow(j).todense())
        userSimMatrix[i, j] = usim
        userSimMatrix[j, i] = usim
    
sio.mmwrite(path+"US_userSimMatrix", userSimMatrix)

In [37]:
# Inspect row 0 of the similarity matrix in dense form
userSimMatrix.getrow(0).todense()

matrix([[1., 0., 0., ..., 0., 0., 0.]])

In [38]:
# Spot check: compute scipy's correlation distance between the feature rows
# of users 2806 and 2814
usim = ssd.correlation(userMatrix.getrow(2806).todense(),
          userMatrix.getrow(2814).todense())
usim

5.0475417160411595e-06

In [39]:
# Look up the stored matrix entry for the same pair of users (2806, 2814)
userSimMatrix[2806, 2814]

5.0475417160411595e-06