In [1]:
import pandas as pd
import numpy as np
import pickle
import gc
from tqdm import tqdm_notebook
import os
import time
import logging
from sklearn.preprocessing import LabelEncoder

In [2]:
tic = time.time()

In [3]:
# 减少内存占用
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [4]:
# 解析列表， 重编码id
def parse_str(d):
    return np.array(list(map(float, d.split())))

def parse_list_1(d):
    if d == '-1':
        return [0]
    return list(map(lambda x: int(x[1:]), str(d).split(',')))

def parse_list_2(d):
    if d == '-1':
        return [0]
    return list(map(lambda x: int(x[2:]), str(d).split(',')))

def parse_map(d):
    if d == '-1':
        return {}
    return dict([int(z.split(':')[0][1:]), float(z.split(':')[1])] for z in d.split(','))

In [10]:
PATH = '/home/huangqiao/DataMiningCode/project-expert/data'
SAVE_PATH = 'pkl'
if not os.path.exists(SAVE_PATH):
    print('create dir: %s' % SAVE_PATH)
    os.mkdir(SAVE_PATH)

In [11]:
#--------------------------------------------single word---------------------------#
single_word = pd.read_csv(os.path.join(PATH, 'single_word_vectors_64d.txt'), names=['id', 'embed'], sep='\t')
single_word.head()

Unnamed: 0,id,embed
0,SW1,-0.985937 0.11307016 0.012898494 -0.6822068 -0...
1,SW2,-0.3367663 0.039051324 0.8155926 0.8351733 -0....
2,SW3,0.3074205 -1.0977745 0.7528213 0.6299011 0.197...
3,SW4,0.61494493 0.5444025 2.0673835 3.2731245 0.779...
4,SW5,-1.0922098 -2.088952 -1.9467407 -0.095274135 0...


In [12]:
#把embed变成列表  id变成int
single_word['embed'] = single_word['embed'].apply(parse_str)
single_word['id'] = single_word['id'].apply(lambda x: int(x[2:]))
single_word.head()

Unnamed: 0,id,embed
0,1,"[-0.985937, 0.11307016, 0.012898494, -0.682206..."
1,2,"[-0.3367663, 0.039051324, 0.8155926, 0.8351733..."
2,3,"[0.3074205, -1.0977745, 0.7528213, 0.6299011, ..."
3,4,"[0.61494493, 0.5444025, 2.0673835, 3.2731245, ..."
4,5,"[-1.0922098, -2.088952, -1.9467407, -0.0952741..."


In [13]:
with open('pkl/single_word.pkl', 'wb') as file:
    pickle.dump(single_word, file)

del single_word
gc.collect()

0

In [14]:
#---------------------word-----------------------------#

word = pd.read_csv(os.path.join(PATH, 'word_vectors_64d.txt'), 
                          names=['id', 'embed'], sep='\t')
word.head()

Unnamed: 0,id,embed
0,W1,0.12561196 -0.57268924 -0.14478925 -0.05249426...
1,W2,3.224765 2.2482696 -0.511986 -0.5329892 -0.943...
2,W3,-0.985937 0.11307016 0.012898494 -0.6822068 -0...
3,W4,-0.3367663 0.039051324 0.8155926 0.8351733 -0....
4,W5,0.3074205 -1.0977745 0.7528213 0.6299011 0.197...


In [15]:
#把embed变成列表  id变成int
word['embed'] = word['embed'].apply(parse_str)
word['id'] = word['id'].apply(lambda x: int(x[1:]))
word.head()

Unnamed: 0,id,embed
0,1,"[0.12561196, -0.57268924, -0.14478925, -0.0524..."
1,2,"[3.224765, 2.2482696, -0.511986, -0.5329892, -..."
2,3,"[-0.985937, 0.11307016, 0.012898494, -0.682206..."
3,4,"[-0.3367663, 0.039051324, 0.8155926, 0.8351733..."
4,5,"[0.3074205, -1.0977745, 0.7528213, 0.6299011, ..."


In [16]:
with open('pkl/word.pkl', 'wb') as file:
    pickle.dump(word, file)
    
del word
gc.collect()

0

In [17]:
#---------------------topic-----------------------------#
topic = pd.read_csv(os.path.join(PATH, 'topic_vectors_64d.txt'), 
                          names=['id', 'embed'], sep='\t')
topic.head()

Unnamed: 0,id,embed
0,T1,0.16508673 -0.0037432343 -0.058245048 -0.00134...
1,T2,1.608256 -1.0515573 -1.1897708 1.1820835 -0.80...
2,T3,3.3307428 -0.43252096 -2.1518784 -1.4390031 2....
3,T4,2.4698818 -0.12998039 -0.4648351 0.8796743 -0....
4,T5,1.562477 -1.3560516 -0.3271215 -0.063419074 -0...


In [18]:
#把embed变成列表  id变成int
topic['embed'] = topic['embed'].apply(parse_str)
topic['id'] = topic['id'].apply(lambda x: int(x[1:]))
topic.head()

Unnamed: 0,id,embed
0,1,"[0.16508673, -0.0037432343, -0.058245048, -0.0..."
1,2,"[1.608256, -1.0515573, -1.1897708, 1.1820835, ..."
2,3,"[3.3307428, -0.43252096, -2.1518784, -1.439003..."
3,4,"[2.4698818, -0.12998039, -0.4648351, 0.8796743..."
4,5,"[1.562477, -1.3560516, -0.3271215, -0.06341907..."


In [19]:
with open('pkl/topic.pkl', 'wb') as file:
    pickle.dump(topic, file)
    
del topic
gc.collect()

0

In [21]:
#---------------------invite-----------------------------#
train = pd.read_csv(os.path.join(PATH, 'invite_info_0926.txt'), 
                          names=['qid', 'uid', 'dt', 'label'], sep='\t')
test = pd.read_csv(os.path.join(PATH, 'invite_info_evaluate_1_0926.txt'), 
                          names=['qid', 'uid', 'dt'], sep='\t')
train.head()

Unnamed: 0,qid,uid,dt,label
0,Q2166419046,M401693808,D3865-H22,0
1,Q1550017551,M3392373099,D3844-H11,0
2,Q604029601,M2317670257,D3862-H15,0
3,Q2350061229,M1618461867,D3849-H11,0
4,Q2443223942,M3544409350,D3867-H4,0


In [22]:
# train['invite_day'] = train['invite_time'].apply(lambda x: int(x.split('-')[0][1:])).astype(np.int16)
# train['invite_hour'] = train['invite_time'].apply(lambda x: int(x.split('-')[1][1:])).astype(np.int8)
# test['invite_day'] = test['invite_time'].apply(lambda x: int(x.split('-')[0][1:])).astype(np.int16)
# test['invite_hour'] = test['invite_time'].apply(lambda x: int(x.split('-')[1][1:])).astype(np.int8)
# train = reduce_mem_usage(train)
# train.head()


def extract_day(s):
    return s.apply(lambda x: int(x.split('-')[0][1:]))


def extract_hour(s):
    return s.apply(lambda x: int(x.split('-')[1][1:]))

logging.info("test %s", test.shape)

sub = test.copy()

sub_size = len(sub)

train['day'] = extract_day(train['dt'])
train['hour'] = extract_hour(train['dt'])

test['day'] = extract_day(test['dt'])
test['hour'] = extract_hour(test['dt'])

del train['dt'], test['dt']


In [23]:
train.head()

Unnamed: 0,qid,uid,label,day,hour
0,Q2166419046,M401693808,0,3865,22
1,Q1550017551,M3392373099,0,3844,11
2,Q604029601,M2317670257,0,3862,15
3,Q2350061229,M1618461867,0,3849,11
4,Q2443223942,M3544409350,0,3867,4


In [24]:
test.head()

Unnamed: 0,qid,uid,day,hour
0,Q1493039281,M64135255,3870,9
1,Q2023398782,M2536956560,3872,22
2,Q4151338694,M3294926344,3874,15
3,Q3271436624,M3744310794,3873,4
4,Q3314287018,M1349051752,3872,19


In [25]:
with open('pkl/train.pkl', 'wb') as file:
    pickle.dump(train, file)
    
with open('pkl/test.pkl', 'wb') as file:
    pickle.dump(test, file)
    
del train, test
gc.collect()

50

In [26]:
#---------------------member-----------------------------#
user = pd.read_csv(os.path.join(PATH, 'member_info_0926.txt'), 
                          names=['uid', 'gender', 'creat_keyword', 'level', 'hot', 'reg_type', 'reg_plat', 'freq', 'uf_b1', 'uf_b2',
                'uf_b3', 'uf_b4', 'uf_b5', 'uf_c1', 'uf_c2', 'uf_c3', 'uf_c4', 'uf_c5', 'score', 'follow_topic',
                'inter_topic'], sep='\t')
user.head()

Unnamed: 0,uid,gender,creat_keyword,level,hot,reg_type,reg_plat,freq,uf_b1,uf_b2,...,uf_b4,uf_b5,uf_c1,uf_c2,uf_c3,uf_c4,uf_c5,score,follow_topic,inter_topic
0,M1934753188,male,-1,0.0,0.0,unknown,unknown,monthly,0,1,...,1,0,MD470265,BR470265,PV929066,CT929066,PF470265,764,T540,"T21107:1.7915097,T405:1.6123838,T4436:1.518003..."
1,M595924114,male,-1,0.0,0.0,unknown,unknown,daily,0,0,...,1,1,MD195122,BR596936,PV002320,CT840234,PF470265,671,"T44126,T15940,T839,T8978,T2934,T1113,T3914,T12...","T18016:2.0650618,T2384:1.2503042,T1142:1.13569..."
2,M1473482940,female,-1,0.0,0.0,unknown,unknown,weekly,0,1,...,1,0,MD116493,BR641329,PV170953,CT470265,PF470265,454,"T30874,T2113,T8656,T21,T523,T8,T116,T5727,T68,...","T46:1.330939,T2159:1.1296458,T379:1.1241927,T1..."
3,M578477092,male,-1,0.0,0.0,unknown,unknown,daily,1,1,...,1,0,MD889589,BR803759,PV545833,CT545833,PF470265,588,"T946,T7323,T297,T2660,T36067,T53107,T2654,T507...","T15918:1.9479566,T8106:1.8578106,T4787:1.58486..."
4,M1088794709,male,-1,0.0,0.0,unknown,unknown,weekly,0,1,...,0,0,MD825760,BR641329,PV071037,CT470265,PF470265,361,"T582,T558,T28776,T5186,T9081,T2419,T2693,T2299...","T65:1.5992582,T867:1.3179373,T93:1.2095009,T31..."


In [27]:

logging.info("user %s", user.shape)

unq = user.nunique()
logging.info("user unq %s", unq)

for x in unq[unq == 1].index:
    del user[x]
    logging.info('del unq==1 %s', x)

t = user.dtypes
cats = [x for x in t[t == 'object'].index if x not in ['follow_topic', 'inter_topic', 'uid']]
logging.info("user cat %s", cats)

for d in cats:
    lb = LabelEncoder()
    user[d] = lb.fit_transform(user[d])
    logging.info('encode %s', d)

user.columns
#删除了'creat_keyword', 'level', 'hot', 'reg_type', 'reg_plat',

Index(['uid', 'gender', 'freq', 'uf_b1', 'uf_b2', 'uf_b3', 'uf_b4', 'uf_b5',
       'uf_c1', 'uf_c2', 'uf_c3', 'uf_c4', 'uf_c5', 'score', 'follow_topic',
       'inter_topic'],
      dtype='object')

In [28]:
user.head()

Unnamed: 0,uid,gender,freq,uf_b1,uf_b2,uf_b3,uf_b4,uf_b5,uf_c1,uf_c2,uf_c3,uf_c4,uf_c5,score,follow_topic,inter_topic
0,M1934753188,1,1,0,1,0,1,0,1190,130,396,1438,1,764,T540,"T21107:1.7915097,T405:1.6123838,T4436:1.518003..."
1,M595924114,1,0,0,0,0,1,1,486,170,0,1311,1,671,"T44126,T15940,T839,T8978,T2934,T1113,T3914,T12...","T18016:2.0650618,T2384:1.2503042,T1142:1.13569..."
2,M1473482940,0,4,0,1,0,1,0,294,190,73,758,1,454,"T30874,T2113,T8656,T21,T523,T8,T116,T5727,T68,...","T46:1.330939,T2159:1.1296458,T379:1.1241927,T1..."
3,M578477092,1,0,1,1,0,1,0,2282,232,237,864,1,588,"T946,T7323,T297,T2660,T36067,T53107,T2654,T507...","T15918:1.9479566,T8106:1.8578106,T4787:1.58486..."
4,M1088794709,1,4,0,1,0,0,0,2113,190,32,758,1,361,"T582,T558,T28776,T5186,T9081,T2419,T2693,T2299...","T65:1.5992582,T867:1.3179373,T93:1.2095009,T31..."


In [29]:
user['follow_topic'] = user['follow_topic'].apply(parse_list_1)
user['inter_topic'] = user['inter_topic'].apply(parse_map)
user = reduce_mem_usage(user)
user.head()

Memory usage of dataframe is 235.80 MB
Memory usage after optimization is: 77.37 MB
Decreased by 67.2%


Unnamed: 0,uid,gender,freq,uf_b1,uf_b2,uf_b3,uf_b4,uf_b5,uf_c1,uf_c2,uf_c3,uf_c4,uf_c5,score,follow_topic,inter_topic
0,M1934753188,1,1,0,1,0,1,0,1190,130,396,1438,1,764,[540],"{21107: 1.7915097, 405: 1.6123838, 4436: 1.518..."
1,M595924114,1,0,0,0,0,1,1,486,170,0,1311,1,671,"[44126, 15940, 839, 8978, 2934, 1113, 3914, 12...","{18016: 2.0650618, 2384: 1.2503042, 1142: 1.13..."
2,M1473482940,0,4,0,1,0,1,0,294,190,73,758,1,454,"[30874, 2113, 8656, 21, 523, 8, 116, 5727, 68,...","{46: 1.330939, 2159: 1.1296458, 379: 1.1241927..."
3,M578477092,1,0,1,1,0,1,0,2282,232,237,864,1,588,"[946, 7323, 297, 2660, 36067, 53107, 2654, 507...","{15918: 1.9479566, 8106: 1.8578106, 4787: 1.58..."
4,M1088794709,1,4,0,1,0,0,0,2113,190,32,758,1,361,"[582, 558, 28776, 5186, 9081, 2419, 2693, 2299...","{65: 1.5992582, 867: 1.3179373, 93: 1.2095009,..."


In [30]:
with open('pkl/user.pkl', 'wb') as file:
    pickle.dump(user, file)
    
del user
gc.collect()

0

In [31]:
#---------------------question-----------------------------#
question_info = pd.read_csv(os.path.join(PATH, 'question_info_0926.txt'),
                          names=['qid', 'q_dt', 'title_t1', 'title_t2', 'desc_t1', 'desc_t2', 'topic'], sep='\t')
question_info.head()

Unnamed: 0,qid,q_dt,title_t1,title_t2,desc_t1,desc_t2,topic
0,Q2234111670,D1018-H5,"SW211,SW204,SW1715,SW69,SW2033,SW138,SW57,SW13...","W22414,W963,W10458",-1,-1,"T321,T730,T5784,T4389"
1,Q760329790,D1745-H20,"SW69,SW2033,SW138,SW2616,SW2668,SW36,SW2594,SW...","W12677,W16829,W15201,W6419,W101839","SW146,SW982,SW401,SW297,SW17,SW2616,SW2668,SW3...","W1296,W2118,W12677,W16829,W15201,W6419,W101839...","T278,T12673,T4677"
2,Q741313548,D2032-H21,"SW153,SW662,SW1218,SW853,SW325,SW1056,SW467,SW...","W700,W2781,W3280,W81215","SW1956,SW3583,SW153,SW34,SW35,SW1016,SW586,SW5...","W732,W24400,W48321,W39608,W20788,W219486,W1183...",T226
3,Q3481466230,D2185-H15,"SW22,SW179,SW57,SW451,SW594,SW118,SW882,SW655,...","W3312,W1823,W1505,W638,W166,W461","SW323,SW37,SW1,SW606,SW1227,SW29,SW22,SW179,SW...","W6642,W4214,W3312,W1505,W2205,W232,W294,W7177,...","T51,T4468"
4,Q3966197028,D2269-H17,"SW1622,SW223,SW1218,SW853,SW390,SW220,SW753,SW...","W700,W895,W2253",-1,-1,"T54700,T81,T57,T17670,T43574"


In [32]:
logging.info("ques %s", question_info.shape)

question_info['q_day'] = extract_day(question_info['q_dt'])
question_info['q_hour'] = extract_hour(question_info['q_dt'])
del question_info['q_dt']

In [33]:
question_info['title_t1'] = question_info['title_t1'].apply(parse_list_2)#.apply(sw_lbl_enc.transform).apply(list)
question_info['title_t2'] = question_info['title_t2'].apply(parse_list_1)#.apply(w_lbl_enc.transform).apply(list)
question_info['desc_t1'] = question_info['desc_t1'].apply(parse_list_2)#.apply(sw_lbl_enc.transform).apply(list)
question_info['desc_t2'] = question_info['desc_t2'].apply(parse_list_1)#.apply(w_lbl_enc.transform).apply(list)
question_info['topic'] = question_info['topic'].apply(parse_list_1)# .apply(topic_lbl_enc.transform).apply(list)

gc.collect()
question_info = reduce_mem_usage(question_info)
question_info.head()

Memory usage of dataframe is 111.69 MB
Memory usage after optimization is: 89.00 MB
Decreased by 20.3%


Unnamed: 0,qid,title_t1,title_t2,desc_t1,desc_t2,topic,q_day,q_hour
0,Q2234111670,"[211, 204, 1715, 69, 2033, 138, 57, 138, 8, 28...","[22414, 963, 10458]",[0],[0],"[321, 730, 5784, 4389]",1018,5
1,Q760329790,"[69, 2033, 138, 2616, 2668, 36, 2594, 1165, 20...","[12677, 16829, 15201, 6419, 101839]","[146, 982, 401, 297, 17, 2616, 2668, 36, 2594,...","[1296, 2118, 12677, 16829, 15201, 6419, 101839...","[278, 12673, 4677]",1745,20
2,Q741313548,"[153, 662, 1218, 853, 325, 1056, 467, 398, 102...","[700, 2781, 3280, 81215]","[1956, 3583, 153, 34, 35, 1016, 586, 586, 716,...","[732, 24400, 48321, 39608, 20788, 219486, 1183...",[226],2032,21
3,Q3481466230,"[22, 179, 57, 451, 594, 118, 882, 655, 1, 433,...","[3312, 1823, 1505, 638, 166, 461]","[323, 37, 1, 606, 1227, 29, 22, 179, 7, 44, 27...","[6642, 4214, 3312, 1505, 2205, 232, 294, 7177,...","[51, 4468]",2185,15
4,Q3966197028,"[1622, 223, 1218, 853, 390, 220, 753, 909, 557...","[700, 895, 2253]",[0],[0],"[54700, 81, 57, 17670, 43574]",2269,17


In [34]:
with open('pkl/question_info.pkl', 'wb') as file:
    pickle.dump(question_info, file)
    
del question_info
gc.collect()

0

In [50]:
%%time
#---------------------answer-----------------------------#


answer_info = pd.read_csv(os.path.join(PATH, 'answer_info_0926.txt'), 
                          names=['aid', 'qid', 'uid', 'ans_dt', 'ans_t1', 'ans_t2', 'is_good', 'is_rec', 'is_dest', 'has_img',
               'has_video', 'word_count', 'reci_cheer', 'reci_uncheer', 'reci_comment', 'reci_mark', 'reci_tks',
               'reci_xxx', 'reci_no_help', 'reci_dis'], sep='\t')
answer_info.head()


CPU times: user 1min, sys: 5.59 s, total: 1min 5s
Wall time: 1min 5s


Unnamed: 0,aid,qid,uid,ans_dt,ans_t1,ans_t2,is_good,is_rec,is_dest,has_img,has_video,word_count,reci_cheer,reci_uncheer,reci_comment,reci_mark,reci_tks,reci_xxx,reci_no_help,reci_dis
0,A2502060945,Q1867533817,M625498202,D3808-H7,"SW13,SW19,SW44,SW150,SW23,SW594,SW1254,SW91,SW...","W239,W10528,W142,W20372,W6473,W10,W24,W4527,W2...",0,0,0,0,0,41,1,0,1,0,1,0,0,0
1,A2847829478,Q3366788616,M142330444,D3810-H17,"SW898,SW3656,SW2,SW413,SW601,SW2,SW2541,SW681,...","W4628,W66060,W1607,W2647,W53385,W109029,W319,W...",0,0,0,0,0,204,1,0,0,3,1,0,0,0
2,A2005999231,Q4264694221,M771499642,D3853-H13,"SW9,SW510,SW573,SW1348,SW1304,SW326,SW509,SW22...","W3532,W69108,W12395,W81311,W498,W850,W107789,W...",0,0,0,0,0,54,2,0,1,0,0,0,0,0
3,A14821523,Q1088851650,M2282072267,D3859-H18,"SW7,SW16,SW5,SW144,SW302,SW20,SW1752,SW60,SW75...","W1425,W1676,W3700,W325,W6485,W1090,W8342,W9689...",0,0,0,0,0,42,1,0,1,0,1,0,0,0
4,A731550034,Q1023877868,M2282072267,D3855-H22,"SW7,SW15,SW596,SW407,SW1698,SW68,SW2,SW56,SW59...","W23420,W2610,W5506,W1489,W8749,W504,W1718,W252...",0,0,0,0,0,44,0,0,0,0,0,0,0,0


In [53]:
def extract_day(s):
    return s.apply(lambda x: int(x.split('-')[0][1:]))


def extract_hour(s):
    return s.apply(lambda x: int(x.split('-')[1][1:]))

# 非纯数字组成的字符串强转为整型会报错：ValueError: invalid literal for int() with base 10: ''
answer_info['ans_t1'] = answer_info['ans_t1'].apply(parse_list_2)
answer_info['ans_t2'] = answer_info['ans_t2'].apply(parse_list_1) 

logging.info("ans %s", answer_info.shape)

answer_info['a_day'] = extract_day(answer_info['ans_dt'])
answer_info['a_hour'] = extract_hour(answer_info['ans_dt'])
del answer_info['ans_dt']

# 回答距提问的天数
# answer_info['diff_qa_days'] = answer_info['a_day'] - answer_info['q_day']

gc.collect()
answer_info = reduce_mem_usage(answer_info)
answer_info.head()

ValueError: invalid literal for int() with base 10: ''

In [54]:
with open('pkl/answer_info.pkl', 'wb') as file:
    pickle.dump(answer_info, file)

del answer_info
gc.collect()
toc = time.time()
print('Used time: %d' % int(toc-tic))

Used time: 4614
