# 对活动数据进行分析
（只取训练集和测试集中出现的样本）

数据来源于Kaggle竞赛：Event Recommendation Engine Challenge，根据
    events they’ve responded to in the past
    user demographic information
    what events they’ve seen and clicked on in our app
预测用户对某个活动是否感兴趣

竞赛官网：
https://www.kaggle.com/c/event-recommendation-engine-challenge/data


活动描述信息在events.csv文件：共110维特征
前9列：event_id, user_id, start_time, city, state, zip, country, lat, and lng.
event_id：id of the event, 
user_id：id of the user who created the event.  
city, state, zip, and country： more details about the location of the venue (if known).
lat and lng： floats（latitude and longitude coordinates of the venue）
start_time： 字符串，ISO-8601 UTC time，表示活动开始时间

后101列为词频：count_1, count_2, ..., count_100，count_other
count_N：活动描述出现第N个词的次数
count_other：除了最常用的100个词之外的其余词出现的次数

这里我们用count_1, count_2, ..., count_100，count_other属性做聚类，即活动用这些关键词来描述，可表示活动的类别

# 导入工具包

In [2]:
#数据量太大，pdandas不能一次讲所有数据读入
#也可以用pandas,一次读取部分数据，可以参考：https://www.cnblogs.com/datablog/p/6127000.html
#import pandas as pd

import numpy as np
import scipy.sparse as ss
import scipy.io as sio

#保存数据
import pickle

#event的特征需要编码
from PE_utils import FeatureEng
from sklearn.preprocessing import normalize
#相似度/距离
import scipy.spatial.distance as ssd

In [3]:
path='../../event_recommendation_engine_challenge_data/'

# 统计活动数目

In [4]:
#读取数据，并统计有多少不同的events
#其实EDA.ipynb中用read_csv已经统计过了
lines = 0
fin = open(path+"used_Event.csv", 'rb')
#找到用C/C++的感觉了
#字段：event_id, user_id,start_time, city, state, zip, country, lat, and lng， 101 columns of words count
fin.readline() # skip header，列名行
for line in fin:
    cols = line.strip().split(b",")
    lines += 1
fin.close()

print("number of records :%d" % lines)

number of records :13418


活动数目太多（300w+），训练+测试集的活动没这么多，所有先去处理train和test，得到竞赛需要用到的活动和用户
然后对在训练集和测试集中出现过的活动和用户建立新的ID索引
先运行user_event.ipynb,
得到活动列表文件：PE_eventIndex.pkl

# 读取之前算好的测试集和训练集中出现过的活动
详见user_event.ipynb

In [5]:
#读取训练集和测试集中出现过的活动列表
eventIndex = pickle.load(open(path+"PE_eventIndex.pkl", 'rb'))
n_events = len(eventIndex)

print("number of events in train & test :%d" % n_events)

number of events in train & test :13418


# 处理events.csv --> 特征编码、活动之间的相似度

In [14]:
FE = FeatureEng()

fin = pd.read(path+"used_Event.csv", 'rb')

#字段：event_id, user_id,start_time, city, state, zip, country, lat, and lng， 101 columns of words count
fin.readline() # skip header

#start_time, city, state, zip, country, lat, and lng
eventPropMatrix = ss.dok_matrix((n_events, 7))

#词频特征
eventContMatrix = ss.dok_matrix((n_events, 101))

for line in fin.readlines():
    cols = line.strip().split(b",")
    eventId = str(cols[1])
    print(eventId)
    if eventId in eventIndex:  #在训练集或测试集中出现
        i = eventIndex[eventId]
        print(i)
        #event的特征编码，这里只是简单处理，其实开始时间，地点等信息很重要
        eventPropMatrix[i, 0] = FE.getJoinedYearMonth(cols[3]) # start_time
        eventPropMatrix[i, 1] = FE.getFeatureHash(cols[4]) # city
        eventPropMatrix[i, 2] = FE.getFeatureHash(cols[5]) # state
        eventPropMatrix[i, 3] = FE.getFeatureHash(cols[6]) # zip
        eventPropMatrix[i, 4] = FE.getFeatureHash(cols[7]) # country
        eventPropMatrix[i, 5] = FE.getFloatValue(cols[8]) # lat
        eventPropMatrix[i, 6] = FE.getFloatValue(cols[9]) # lon
        
        #词频
        for j in range(10, 111):
            eventContMatrix[i, j-9] = cols[j]
fin.close()

#用L2模归一化
eventPropMatrix = normalize(eventPropMatrix,
    norm="l2", axis=0, copy=False)
sio.mmwrite(path+"EV_eventPropMatrix", eventPropMatrix)

#词频，可以考虑我们用这部分特征进行聚类，得到活动的genre
eventContMatrix = normalize(eventContMatrix,
    norm="l2", axis=0, copy=False)
sio.mmwrite(path+"EV_eventContMatrix", eventContMatrix)


# calculate similarity between event pairs based on the two matrices
eventPropSim = ss.dok_matrix((n_events, n_events))
eventContSim = ss.dok_matrix((n_events, n_events))

#读取在测试集和训练集中出现的活动对
uniqueEventPairs = pickle.load(open(path+"PE_uniqueEventPairs.pkl", 'rb'))



b'684921758'
b'244999119'
b'3928440935'
b'2582345152'
b'1051165850'
b'1212611096'
b'3689283674'
b'2584113432'
b'3365728297'
b'2912638473'
b'1609864127'
b'1304227508'
b'2608543989'
b'298169907'
b'2953099360'
b'615449287'
b'1922719636'
b'1261820355'
b'2773204108'
b'2285783902'
b'1873976153'
b'1820269907'
b'1929622843'
b'2312158323'
b'1091130052'
b'1888241344'
b'3436633625'
b'1511862915'
b'3980763324'
b'2259674237'
b'104397174'
b'2352676247'
b'163395593'
b'268215542'
b'4202927804'
b'40348900'
b'940258698'
b'3154390339'
b'3685386977'
b'31247346'
b'2587616435'
b'4219761316'
b'297856962'
b'3191079645'
b'4057770067'
b'4004496621'
b'739705932'
b'2219919748'
b'2998372996'
b'920467258'
b'3555202073'
b'1145166049'
b'895832538'
b'2828937719'
b'2165261948'
b'1081122807'
b'3161434996'
b'905503405'
b'3028497756'
b'2243576360'
b'3950286482'
b'2311647821'
b'2038082048'
b'679148240'
b'2439597708'
b'735593165'
b'2126750158'
b'3611263082'
b'963138294'
b'2622371373'
b'732997628'
b'4216192606'
b'2569196977'

b'3044687724'
b'3573758696'
b'878979384'
b'3149791577'
b'3344442885'
b'2119816010'
b'641594474'
b'2471195693'
b'2143656308'
b'1901044782'
b'2059512399'
b'2532068732'
b'1164582156'
b'3658812660'
b'3723741723'
b'1262217378'
b'3549638872'
b'1916128709'
b'1215684223'
b'2280692735'
b'618917454'
b'2508841697'
b'3742929052'
b'2962466426'
b'1515381265'
b'2837973453'
b'1876364254'
b'1175478989'
b'698797515'
b'750399105'
b'1343107637'
b'682667459'
b'3728742842'
b'2676714275'
b'2757258727'
b'4073488660'
b'2733928937'
b'1753430045'
b'627006212'
b'1279105661'
b'2414905031'
b'2842497966'
b'1729757002'
b'1872538468'
b'3090997583'
b'1146784351'
b'2230413417'
b'2837476507'
b'3980699306'
b'1630490032'
b'1988027954'
b'339349606'
b'2731275763'
b'1992296647'
b'71532506'
b'3783327786'
b'1263636699'
b'1075717146'
b'3955420874'
b'2883538293'
b'98535486'
b'2901846698'
b'1598393308'
b'590488063'
b'140441656'
b'2434532267'
b'53942731'
b'572783070'
b'2981167077'
b'442959264'
b'4191185830'
b'277703982'
b'293318287

b'3270322093'
b'3407311067'
b'3866466210'
b'2489624476'
b'1371415647'
b'1352928000'
b'3307793700'
b'2790637805'
b'2683060847'
b'1349521729'
b'3929264175'
b'1037264024'
b'100961464'
b'1548928340'
b'1698013825'
b'602394192'
b'1432813792'
b'491161389'
b'1281457075'
b'3392885130'
b'872988283'
b'4198564536'
b'713643675'
b'2995369168'
b'1404500601'
b'3741020356'
b'2559164171'
b'2716503088'
b'3167570493'
b'429783158'
b'955692500'
b'1811322299'
b'3463879879'
b'3658087051'
b'1334001859'
b'1607345926'
b'3635358150'
b'931663656'
b'949540452'
b'3890693055'
b'2463159061'
b'3931516549'
b'2403415191'
b'1315709702'
b'2520129462'
b'1175479323'
b'4101475674'
b'2413506174'
b'1731720858'
b'3008504566'
b'1764880922'
b'767782490'
b'1775142948'
b'891266900'
b'1405785624'
b'1243097755'
b'406031080'
b'613851355'
b'3421650325'
b'3272296205'
b'1485857067'
b'3915188082'
b'161579832'
b'2899171328'
b'3076799165'
b'3061224963'
b'3264734489'
b'3702386514'
b'2033808525'
b'3641591598'
b'2073417105'
b'2740924995'
b'1548

b'3668324750'
b'1327533901'
b'811008612'
b'804619862'
b'736236994'
b'2322818360'
b'872605952'
b'612644906'
b'4252287704'
b'3601961332'
b'4080537914'
b'3612754071'
b'4214464123'
b'393394477'
b'3067222491'
b'207490276'
b'2169756279'
b'3745441097'
b'1670889080'
b'1277292991'
b'2532930501'
b'2965867805'
b'510796094'
b'2589273566'
b'3599562821'
b'4182255726'
b'294072770'
b'2705765590'
b'173841577'
b'3185730461'
b'1750473893'
b'1153082957'
b'533904091'
b'2393885244'
b'2604206550'
b'3197423454'
b'3291858715'
b'3427185275'
b'708064207'
b'3665425666'
b'2534283645'
b'3632963428'
b'1708950739'
b'2425935882'
b'2097501529'
b'2167442343'
b'1753679456'
b'3319862596'
b'2470509282'
b'2680781138'
b'3566218350'
b'3222805942'
b'2104725357'
b'3514254161'
b'1759182938'
b'3610905380'
b'1103896952'
b'2695471827'
b'1332759137'
b'2468101903'
b'939765972'
b'3106536799'
b'1670218702'
b'1680138'
b'1975759984'
b'4100846290'
b'1106103765'
b'724813702'
b'1522597700'
b'3426952204'
b'268233790'
b'1775336793'
b'23056899

b'679447655'
b'2802110996'
b'1862970065'
b'3968395368'
b'3951780197'
b'990323474'
b'1409053740'
b'1401946527'
b'2713137893'
b'1686015754'
b'1319055966'
b'2264712932'
b'2587175764'
b'2798862419'
b'4192982636'
b'4035812423'
b'532732480'
b'1116079728'
b'1553332170'
b'1554642353'
b'1832065589'
b'3358028399'
b'2828360569'
b'2504949294'
b'3145188541'
b'4199166902'
b'3722331766'
b'3989395314'
b'3771339943'
b'2010172644'
b'508487235'
b'2175520870'
b'2421809173'
b'573700757'
b'4193798139'
b'2129049769'
b'2000635564'
b'3470999302'
b'2374000439'
b'2062625787'
b'1615563600'
b'3339191264'
b'1679778495'
b'3849306291'
b'1439840835'
b'1239204675'
b'1796930021'
b'3213727494'
b'1718512557'
b'1170112049'
b'41642879'
b'1778085609'
b'160567780'
b'2657802620'
b'918139741'
b'2271049585'
b'1812388516'
b'4156355007'
b'3967878081'
b'3921130037'
b'2586730902'
b'1805425126'
b'3869122811'
b'1303026980'
b'18300872'
b'3266931065'
b'2700042611'
b'826425232'
b'2085933053'
b'882150572'
b'1110206058'
b'1958801669'
b'259

In [None]:
for e1, e2 in uniqueEventPairs:
    #i = eventIndex[e1]
    #j = eventIndex[e2]
    i = e1
    j = e2
    
    #非词频特征，采用Person相关系数作为相似度
    if not (i,j) in eventPropSim:
        epsim = ssd.correlation(eventPropMatrix.getrow(i).todense(),
            eventPropMatrix.getrow(j).todense())
        #print(eventPropMatrix.getrow(i).todense())
        eventPropSim[i, j] = epsim
        eventPropSim[j, i] = epsim
    #对词频特征，采用余弦相似度，也可以用直方图交/Jacard相似度
    if not (i,j) in eventContSim:
        ecsim = ssd.cosine(eventContMatrix.getrow(i).todense(),
            eventContMatrix.getrow(j).todense())
    
        eventContSim[i, j] = epsim
        eventContSim[j, i] = epsim
    
sio.mmwrite(path+"EV_eventPropSim", eventPropSim)
sio.mmwrite(path+"EV_eventContSim", eventContSim)

In [6]:
eventPropSim.getrow(0).todense()

matrix([[0., 0., 0., ..., 0., 0., 0.]])

In [16]:
a=eventPropSim>0
a.getrow(0).todense()

  res = self._with_data(op(self.data, other), copy=True)


matrix([[False, False, False, ..., False, False, False]])