In [2]:
import pandas as pd
import numpy as np
import pickle
import gc
from tqdm import tqdm_notebook
import os
import time

In [3]:
# Record the start time; the elapsed time is computed from it at the end of the notebook.
tic = time.time()

In [4]:
# Reduce memory usage
def reduce_mem_usage(df):
    """Downcast signed-integer columns of ``df`` to the smallest dtype that fits.

    Every column whose dtype name starts with ``int`` (int8/16/32/64) is
    converted to the narrowest of int8, int16 or int32 that can hold its
    observed minimum and maximum. A column is never widened, and columns of
    any other dtype (object, float, unsigned, ...) are left untouched.
    Memory usage before and after is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; its columns are reassigned on this same object.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, so the call can be used in an assignment.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        # Only signed integer columns are candidates; skipping the rest early
        # also avoids computing min/max on dtypes that may not support it.
        if str(col_type)[:3] != 'int':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        for candidate in (np.int8, np.int16, np.int32):
            limits = np.iinfo(candidate)
            # Inclusive bounds: a value equal to the dtype limit is still
            # representable. For an empty column min/max are NaN, every
            # comparison is False and the column is left as it is.
            if limits.min <= c_min and c_max <= limits.max:
                # Only convert when the candidate is actually narrower.
                if np.dtype(candidate).itemsize < col_type.itemsize:
                    df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard against a zero starting size so the report never divides by zero.
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem > 0 else 0.0
    print('Decreased by {:.1f}%'.format(saved_pct))

    return df

In [5]:
# Parse list-like string fields and re-encode ids as integers
def parse_str(d):  # used for the embedding vectors
    """Convert a whitespace-separated string of numbers into a float array."""
    return np.array([float(token) for token in d.split()])

def parse_list_1(d):
    """Parse a comma-separated list of one-character-prefixed ids into ints.

    Example: ``'T321,T730'`` -> ``[321, 730]``.

    The missing-value marker ``-1`` yields ``[0]``. The marker is recognised
    in its string form so that it is handled the same whether it arrives as
    the string ``'-1'`` or as the integer ``-1`` (which the CSV reader can
    produce when it infers a numeric type for the value).
    """
    text = str(d)
    if text == '-1':
        return [0]
    # Drop the single-letter prefix of every item and keep the numeric id.
    return [int(item[1:]) for item in text.split(',')]

def parse_list_2(d):  # used for single-word ids
    """Parse a comma-separated list of two-character-prefixed ids into ints.

    Example: ``'SW211,SW204'`` -> ``[211, 204]``.

    The missing-value marker ``-1`` yields ``[0]``. The marker is recognised
    in its string form so that it is handled the same whether it arrives as
    the string ``'-1'`` or as the integer ``-1`` (which the CSV reader can
    produce when it infers a numeric type for the value).
    """
    text = str(d)
    if text == '-1':
        return [0]
    # Drop the two-letter prefix of every item and keep the numeric id.
    return [int(item[2:]) for item in text.split(',')]

def parse_map(d):
    """Parse ``'<prefix><id>:<weight>,...'`` into a ``{id: weight}`` dict.

    Example: ``'T46:1.33,T2159:1.12'`` -> ``{46: 1.33, 2159: 1.12}``.

    The missing-value marker ``-1`` yields an empty dict. The marker is
    recognised in its string form so that an integer ``-1`` (which the CSV
    reader can produce when it infers a numeric type) no longer fails on
    ``.split``.
    """
    text = str(d)
    if text == '-1':
        return {}
    parsed = {}
    for item in text.split(','):
        # Split once per item; the key loses its one-letter prefix.
        parts = item.split(':')
        parsed[int(parts[0][1:])] = float(parts[1])
    return parsed

In [6]:
# Input data directory and output directory for the pickled frames.
PATH = '../data_set_0926'
SAVE_PATH = '../pkl'
if not os.path.exists(SAVE_PATH):
    print('create dir: %s' % SAVE_PATH)
    # makedirs also creates missing parent directories, and exist_ok tolerates
    # the directory appearing between the check above and this call.
    os.makedirs(SAVE_PATH, exist_ok=True)

### single word

In [7]:
# Load the single-word vector file (tab-separated) into columns `id` and `embed`.
single_word = pd.read_csv(
    os.path.join(PATH, 'single_word_vectors_64d.txt'),
    sep='\t',
    names=['id', 'embed'],
)
single_word.head()

Unnamed: 0,id,embed
0,SW1,-0.985937 0.11307016 0.012898494 -0.6822068 -0...
1,SW2,-0.3367663 0.039051324 0.8155926 0.8351733 -0....
2,SW3,0.3074205 -1.0977745 0.7528213 0.6299011 0.197...
3,SW4,0.61494493 0.5444025 2.0673835 3.2731245 0.779...
4,SW5,-1.0922098 -2.088952 -1.9467407 -0.095274135 0...


In [8]:
# Turn each embedding string into a float array and strip the two-letter
# 'SW' prefix so that ids become plain integers.
single_word['embed'] = single_word['embed'].map(parse_str)
single_word['id'] = single_word['id'].map(lambda sw_id: int(sw_id[2:]))
single_word.head()

Unnamed: 0,id,embed
0,1,"[-0.985937, 0.11307016, 0.012898494, -0.682206..."
1,2,"[-0.3367663, 0.039051324, 0.8155926, 0.8351733..."
2,3,"[0.3074205, -1.0977745, 0.7528213, 0.6299011, ..."
3,4,"[0.61494493, 0.5444025, 2.0673835, 3.2731245, ..."
4,5,"[-1.0922098, -2.088952, -1.9467407, -0.0952741..."


In [9]:
# Persist the parsed table inside SAVE_PATH (the directory created above),
# then drop the in-memory copy.
with open(os.path.join(SAVE_PATH, 'single_word.pkl'), 'wb') as file:
    pickle.dump(single_word, file)

del single_word
gc.collect()

0

### word

In [10]:
# Load the word vector file (tab-separated) into columns `id` and `embed`.
word = pd.read_csv(
    os.path.join(PATH, 'word_vectors_64d.txt'),
    sep='\t',
    names=['id', 'embed'],
)
word.head()

Unnamed: 0,id,embed
0,W1,0.12561196 -0.57268924 -0.14478925 -0.05249426...
1,W2,3.224765 2.2482696 -0.511986 -0.5329892 -0.943...
2,W3,-0.985937 0.11307016 0.012898494 -0.6822068 -0...
3,W4,-0.3367663 0.039051324 0.8155926 0.8351733 -0....
4,W5,0.3074205 -1.0977745 0.7528213 0.6299011 0.197...


In [11]:
# Turn each embedding string into a float array and strip the one-letter
# 'W' prefix so that ids become plain integers.
word['embed'] = word['embed'].map(parse_str)
word['id'] = word['id'].map(lambda word_id: int(word_id[1:]))
word.head()

Unnamed: 0,id,embed
0,1,"[0.12561196, -0.57268924, -0.14478925, -0.0524..."
1,2,"[3.224765, 2.2482696, -0.511986, -0.5329892, -..."
2,3,"[-0.985937, 0.11307016, 0.012898494, -0.682206..."
3,4,"[-0.3367663, 0.039051324, 0.8155926, 0.8351733..."
4,5,"[0.3074205, -1.0977745, 0.7528213, 0.6299011, ..."


In [12]:
# Persist the parsed table inside SAVE_PATH (the directory created above),
# then drop the in-memory copy.
with open(os.path.join(SAVE_PATH, 'word.pkl'), 'wb') as file:
    pickle.dump(word, file)

del word
gc.collect()

0

### topic

In [13]:
# Load the topic vector file (tab-separated) into columns `id` and `embed`.
topic = pd.read_csv(
    os.path.join(PATH, 'topic_vectors_64d.txt'),
    sep='\t',
    names=['id', 'embed'],
)
topic.head()

Unnamed: 0,id,embed
0,T1,0.16508673 -0.0037432343 -0.058245048 -0.00134...
1,T2,1.608256 -1.0515573 -1.1897708 1.1820835 -0.80...
2,T3,3.3307428 -0.43252096 -2.1518784 -1.4390031 2....
3,T4,2.4698818 -0.12998039 -0.4648351 0.8796743 -0....
4,T5,1.562477 -1.3560516 -0.3271215 -0.063419074 -0...


In [14]:
# Turn each embedding string into a float array and strip the one-letter
# 'T' prefix so that ids become plain integers.
topic['embed'] = topic['embed'].map(parse_str)
topic['id'] = topic['id'].map(lambda topic_id: int(topic_id[1:]))
topic.head()

Unnamed: 0,id,embed
0,1,"[0.16508673, -0.0037432343, -0.058245048, -0.0..."
1,2,"[1.608256, -1.0515573, -1.1897708, 1.1820835, ..."
2,3,"[3.3307428, -0.43252096, -2.1518784, -1.439003..."
3,4,"[2.4698818, -0.12998039, -0.4648351, 0.8796743..."
4,5,"[1.562477, -1.3560516, -0.3271215, -0.06341907..."


In [15]:
# Persist the parsed table inside SAVE_PATH (the directory created above),
# then drop the in-memory copy.
with open(os.path.join(SAVE_PATH, 'topic.pkl'), 'wb') as file:
    pickle.dump(topic, file)

del topic
gc.collect()

0

### invite

In [16]:
# Load the labelled invitations and the evaluation invitations; both files are
# tab-separated, and the evaluation file has no `label` column.
invite_info = pd.read_csv(
    os.path.join(PATH, 'invite_info_0926.txt'),
    sep='\t',
    names=['question_id', 'author_id', 'invite_time', 'label'],
)
invite_info_evaluate = pd.read_csv(
    os.path.join(PATH, 'invite_info_evaluate_1_0926.txt'),
    sep='\t',
    names=['question_id', 'author_id', 'invite_time'],
)
invite_info.head()

Unnamed: 0,question_id,author_id,invite_time,label
0,Q2166419046,M401693808,D3865-H22,0
1,Q1550017551,M3392373099,D3844-H11,0
2,Q604029601,M2317670257,D3862-H15,0
3,Q2350061229,M1618461867,D3849-H11,0
4,Q2443223942,M3544409350,D3867-H4,0


In [17]:
# invite_time looks like 'D<day>-H<hour>'; store each numeric part as a
# compact integer column.
invite_info['invite_day'] = (
    invite_info['invite_time']
    .map(lambda stamp: int(stamp.split('-')[0][1:]))
    .astype(np.int16)
)
invite_info['invite_hour'] = (
    invite_info['invite_time']
    .map(lambda stamp: int(stamp.split('-')[1][1:]))
    .astype(np.int8)
)

In [18]:
# Same 'D<day>-H<hour>' split as for the labelled invitations, applied to the
# evaluation set.
invite_info_evaluate['invite_day'] = (
    invite_info_evaluate['invite_time']
    .map(lambda stamp: int(stamp.split('-')[0][1:]))
    .astype(np.int16)
)
invite_info_evaluate['invite_hour'] = (
    invite_info_evaluate['invite_time']
    .map(lambda stamp: int(stamp.split('-')[1][1:]))
    .astype(np.int8)
)

In [19]:
# Downcast the integer columns to smaller dtypes and print the memory saving.
invite_info = reduce_mem_usage(invite_info)

Memory usage of dataframe is 316.74 MB
Memory usage after optimization is: 253.39 MB
Decreased by 20.0%


In [20]:
# Persist both invitation tables inside SAVE_PATH (the directory created
# above), then drop the in-memory copies.
with open(os.path.join(SAVE_PATH, 'invite_info.pkl'), 'wb') as file:
    pickle.dump(invite_info, file)

with open(os.path.join(SAVE_PATH, 'invite_info_evaluate.pkl'), 'wb') as file:
    pickle.dump(invite_info_evaluate, file)

del invite_info, invite_info_evaluate
gc.collect()

0

### member

In [21]:
# Load the member profile table (tab-separated); the column names are supplied
# here because they are not read from the file.
member_info = pd.read_csv(
    os.path.join(PATH, 'member_info_0926.txt'),
    sep='\t',
    names=[
        'author_id', 'gender', 'keyword', 'grade', 'hotness',
        'reg_type', 'reg_plat', 'freq',
        'A1', 'B1', 'C1', 'D1', 'E1',
        'A2', 'B2', 'C2', 'D2', 'E2',
        'score', 'topic_attent', 'topic_interest',
    ],
)
member_info.head()

Unnamed: 0,author_id,gender,keyword,grade,hotness,reg_type,reg_plat,freq,A1,B1,...,D1,E1,A2,B2,C2,D2,E2,score,topic_attent,topic_interest
0,M1934753188,male,-1,0.0,0.0,unknown,unknown,monthly,0,1,...,1,0,MD470265,BR470265,PV929066,CT929066,PF470265,764,T540,"T21107:1.7915097,T405:1.6123838,T4436:1.518003..."
1,M595924114,male,-1,0.0,0.0,unknown,unknown,daily,0,0,...,1,1,MD195122,BR596936,PV002320,CT840234,PF470265,671,"T44126,T15940,T839,T8978,T2934,T1113,T3914,T12...","T18016:2.0650618,T2384:1.2503042,T1142:1.13569..."
2,M1473482940,female,-1,0.0,0.0,unknown,unknown,weekly,0,1,...,1,0,MD116493,BR641329,PV170953,CT470265,PF470265,454,"T30874,T2113,T8656,T21,T523,T8,T116,T5727,T68,...","T46:1.330939,T2159:1.1296458,T379:1.1241927,T1..."
3,M578477092,male,-1,0.0,0.0,unknown,unknown,daily,1,1,...,1,0,MD889589,BR803759,PV545833,CT545833,PF470265,588,"T946,T7323,T297,T2660,T36067,T53107,T2654,T507...","T15918:1.9479566,T8106:1.8578106,T4787:1.58486..."
4,M1088794709,male,-1,0.0,0.0,unknown,unknown,weekly,0,1,...,0,0,MD825760,BR641329,PV071037,CT470265,PF470265,361,"T582,T558,T28776,T5186,T9081,T2419,T2693,T2299...","T65:1.5992582,T867:1.3179373,T93:1.2095009,T31..."


In [22]:
# Followed topics become a list of int ids; topic interests become an
# {id: weight} dict.
member_info['topic_attent'] = member_info['topic_attent'].map(parse_list_1)
member_info['topic_interest'] = member_info['topic_interest'].map(parse_map)

In [23]:
# Downcast the integer columns to smaller dtypes and print the memory saving.
member_info = reduce_mem_usage(member_info)

Memory usage of dataframe is 309.48 MB
Memory usage after optimization is: 221.06 MB
Decreased by 28.6%


In [24]:
# Persist the parsed table inside SAVE_PATH (the directory created above),
# then drop the in-memory copy.
with open(os.path.join(SAVE_PATH, 'member_info.pkl'), 'wb') as file:
    pickle.dump(member_info, file)

del member_info
gc.collect()

0

### question

In [25]:
# Load the question table (tab-separated); the column names are supplied here
# because they are not read from the file.
question_info = pd.read_csv(
    os.path.join(PATH, 'question_info_0926.txt'),
    sep='\t',
    names=[
        'question_id', 'question_time',
        'title_sw_series', 'title_w_series',
        'desc_sw_series', 'desc_w_series',
        'topic',
    ],
)
question_info.head()

Unnamed: 0,question_id,question_time,title_sw_series,title_w_series,desc_sw_series,desc_w_series,topic
0,Q2234111670,D1018-H5,"SW211,SW204,SW1715,SW69,SW2033,SW138,SW57,SW13...","W22414,W963,W10458",-1,-1,"T321,T730,T5784,T4389"
1,Q760329790,D1745-H20,"SW69,SW2033,SW138,SW2616,SW2668,SW36,SW2594,SW...","W12677,W16829,W15201,W6419,W101839","SW146,SW982,SW401,SW297,SW17,SW2616,SW2668,SW3...","W1296,W2118,W12677,W16829,W15201,W6419,W101839...","T278,T12673,T4677"
2,Q741313548,D2032-H21,"SW153,SW662,SW1218,SW853,SW325,SW1056,SW467,SW...","W700,W2781,W3280,W81215","SW1956,SW3583,SW153,SW34,SW35,SW1016,SW586,SW5...","W732,W24400,W48321,W39608,W20788,W219486,W1183...",T226
3,Q3481466230,D2185-H15,"SW22,SW179,SW57,SW451,SW594,SW118,SW882,SW655,...","W3312,W1823,W1505,W638,W166,W461","SW323,SW37,SW1,SW606,SW1227,SW29,SW22,SW179,SW...","W6642,W4214,W3312,W1505,W2205,W232,W294,W7177,...","T51,T4468"
4,Q3966197028,D2269-H17,"SW1622,SW223,SW1218,SW853,SW390,SW220,SW753,SW...","W700,W895,W2253",-1,-1,"T54700,T81,T57,T17670,T43574"


In [26]:
# Replace each comma-separated id string with a list of ints: parse_list_2
# strips the two-letter 'SW' prefix, parse_list_1 the one-letter 'W'/'T' prefix.
question_info['title_sw_series'] = question_info['title_sw_series'].map(parse_list_2)
question_info['title_w_series'] = question_info['title_w_series'].map(parse_list_1)
question_info['desc_sw_series'] = question_info['desc_sw_series'].map(parse_list_2)
question_info['desc_w_series'] = question_info['desc_w_series'].map(parse_list_1)
question_info['topic'] = question_info['topic'].map(parse_list_1)
question_info.head()

Unnamed: 0,question_id,question_time,title_sw_series,title_w_series,desc_sw_series,desc_w_series,topic
0,Q2234111670,D1018-H5,"[211, 204, 1715, 69, 2033, 138, 57, 138, 8, 28...","[22414, 963, 10458]",[0],[0],"[321, 730, 5784, 4389]"
1,Q760329790,D1745-H20,"[69, 2033, 138, 2616, 2668, 36, 2594, 1165, 20...","[12677, 16829, 15201, 6419, 101839]","[146, 982, 401, 297, 17, 2616, 2668, 36, 2594,...","[1296, 2118, 12677, 16829, 15201, 6419, 101839...","[278, 12673, 4677]"
2,Q741313548,D2032-H21,"[153, 662, 1218, 853, 325, 1056, 467, 398, 102...","[700, 2781, 3280, 81215]","[1956, 3583, 153, 34, 35, 1016, 586, 586, 716,...","[732, 24400, 48321, 39608, 20788, 219486, 1183...",[226]
3,Q3481466230,D2185-H15,"[22, 179, 57, 451, 594, 118, 882, 655, 1, 433,...","[3312, 1823, 1505, 638, 166, 461]","[323, 37, 1, 606, 1227, 29, 22, 179, 7, 44, 27...","[6642, 4214, 3312, 1505, 2205, 232, 294, 7177,...","[51, 4468]"
4,Q3966197028,D2269-H17,"[1622, 223, 1218, 853, 390, 220, 753, 909, 557...","[700, 895, 2253]",[0],[0],"[54700, 81, 57, 17670, 43574]"


In [27]:
# question_time looks like 'D<day>-H<hour>'; store each numeric part as a
# compact integer column, then drop the raw string column.
question_info['question_day'] = (
    question_info['question_time']
    .map(lambda stamp: int(stamp.split('-')[0][1:]))
    .astype(np.int16)
)
question_info['question_hour'] = (
    question_info['question_time']
    .map(lambda stamp: int(stamp.split('-')[1][1:]))
    .astype(np.int8)
)
del question_info['question_time']
gc.collect()

0

In [28]:
# Downcast the integer columns to smaller dtypes and print the memory saving.
question_info = reduce_mem_usage(question_info)

Memory usage of dataframe is 89.00 MB
Memory usage after optimization is: 89.00 MB
Decreased by 0.0%


In [29]:
# Persist the parsed table inside SAVE_PATH (the directory created above),
# then drop the in-memory copy.
with open(os.path.join(SAVE_PATH, 'question_info.pkl'), 'wb') as file:
    pickle.dump(question_info, file)

del question_info
gc.collect()

0

### answer

In [30]:
%%time
answer_info = pd.read_csv(os.path.join(PATH, 'answer_info_0926.txt'), 
                          names=['answer_id', 'question_id', 'author_id', 'answer_time', 'content_sw_series', 'content_w_series', 
                                 'excellent', 'recommend', 'round_table', 'figure', 'video', 
                                 'num_word', 'num_like', 'num_unlike', 'num_comment',
                                 'num_favor', 'num_thank', 'num_report', 'num_nohelp', 'num_oppose'], sep='\t')
answer_info.head()

CPU times: user 1min 11s, sys: 7.66 s, total: 1min 19s
Wall time: 1min 42s


In [31]:
# Replace each comma-separated id string with a list of ints: parse_list_2
# strips the two-letter 'SW' prefix, parse_list_1 the one-letter 'W' prefix.
answer_info['content_sw_series'] = answer_info['content_sw_series'].map(parse_list_2)
answer_info['content_w_series'] = answer_info['content_w_series'].map(parse_list_1)
answer_info.head()

Unnamed: 0,answer_id,question_id,author_id,answer_time,content_sw_series,content_w_series,excellent,recommend,round_table,figure,video,num_word,num_like,num_unlike,num_comment,num_favor,num_thank,num_report,num_nohelp,num_oppose
0,A2502060945,Q1867533817,M625498202,D3808-H7,"[13, 19, 44, 150, 23, 594, 1254, 91, 3, 87, 48...","[239, 10528, 142, 20372, 6473, 10, 24, 4527, 2...",0,0,0,0,0,41,1,0,1,0,1,0,0,0
1,A2847829478,Q3366788616,M142330444,D3810-H17,"[898, 3656, 2, 413, 601, 2, 2541, 681, 2, 4368...","[4628, 66060, 1607, 2647, 53385, 109029, 319, ...",0,0,0,0,0,204,1,0,0,3,1,0,0,0
2,A2005999231,Q4264694221,M771499642,D3853-H13,"[9, 510, 573, 1348, 1304, 326, 509, 221, 40, 1...","[3532, 69108, 12395, 81311, 498, 850, 107789, ...",0,0,0,0,0,54,2,0,1,0,0,0,0,0
3,A14821523,Q1088851650,M2282072267,D3859-H18,"[7, 16, 5, 144, 302, 20, 1752, 60, 75, 75, 655...","[1425, 1676, 3700, 325, 6485, 1090, 8342, 9689...",0,0,0,0,0,42,1,0,1,0,1,0,0,0
4,A731550034,Q1023877868,M2282072267,D3855-H22,"[7, 15, 596, 407, 1698, 68, 2, 56, 598, 544, 6...","[23420, 2610, 5506, 1489, 8749, 504, 1718, 252...",0,0,0,0,0,44,0,0,0,0,0,0,0,0


In [32]:
# answer_time looks like 'D<day>-H<hour>'; store each numeric part as a
# compact integer column, then drop the raw string column.
answer_info['answer_day'] = (
    answer_info['answer_time']
    .map(lambda stamp: int(stamp.split('-')[0][1:]))
    .astype(np.int16)
)
answer_info['answer_hour'] = (
    answer_info['answer_time']
    .map(lambda stamp: int(stamp.split('-')[1][1:]))
    .astype(np.int8)
)
del answer_info['answer_time']
gc.collect()

0

In [33]:
# Downcast the integer columns to smaller dtypes and print the memory saving.
answer_info = reduce_mem_usage(answer_info)

Memory usage of dataframe is 667.22 MB
Memory usage after optimization is: 318.54 MB
Decreased by 52.3%


In [34]:
# Persist the parsed table inside SAVE_PATH (the directory created above),
# then drop the in-memory copy.
with open(os.path.join(SAVE_PATH, 'answer_info.pkl'), 'wb') as file:
    pickle.dump(answer_info, file)

del answer_info
gc.collect()

0

In [35]:
# Record the end time of the preprocessing steps above.
toc = time.time()

In [36]:
# Report the elapsed time between tic and toc, truncated to whole seconds.
print('Used time: %d' % int(toc-tic))

Used time: 1137


In [37]:
# Reload the saved answer table from SAVE_PATH for inspection.
# NOTE: pickle.load can execute arbitrary code from the file, so only load
# pickles that this notebook wrote itself.
with open(os.path.join(SAVE_PATH, 'answer_info.pkl'), 'rb') as file:
    answer_info = pickle.load(file)

In [46]:
# Lift the column/row display limits and allow up to 100 characters per cell
# so wide frames and long list values are shown.
pd.options.display.max_columns = None
pd.options.display.max_rows = None
pd.options.display.max_colwidth = 100

In [47]:
# Preview the first rows of the reloaded answer table.
answer_info.head()

Unnamed: 0,answer_id,question_id,author_id,content_sw_series,content_w_series,excellent,recommend,round_table,figure,video,num_word,num_like,num_unlike,num_comment,num_favor,num_thank,num_report,num_nohelp,num_oppose,answer_day,answer_hour
0,A2502060945,Q1867533817,M625498202,"[13, 19, 44, 150, 23, 594, 1254, 91, 3, 87, 48, 13, 1, 1144, 429, 6606, 3240, 1348, 8, 49, 8, 13...","[239, 10528, 142, 20372, 6473, 10, 24, 4527, 20372, 13100, 147, 1491, 10, 24, 4527, 13100]",0,0,0,0,0,41,1,0,1,0,1,0,0,0,3808,7
1,A2847829478,Q3366788616,M142330444,"[898, 3656, 2, 413, 601, 2, 2541, 681, 2, 4368, 601, 2, 18, 148, 184, 453, 1186, 3, 413, 601, 16...","[4628, 66060, 1607, 2647, 53385, 109029, 319, 443, 4092, 1607, 2647, 6138, 4628, 66060, 11864, 3...",0,0,0,0,0,204,1,0,0,3,1,0,0,0,3810,17
2,A2005999231,Q4264694221,M771499642,"[9, 510, 573, 1348, 1304, 326, 509, 221, 40, 153, 896, 1685, 42, 195, 2, 274, 349, 72, 1206, 141...","[3532, 69108, 12395, 81311, 498, 850, 107789, 31394, 2608, 878, 9429, 5367, 2368, 2344, 2852, 2370]",0,0,0,0,0,54,2,0,1,0,0,0,0,0,3853,13
3,A14821523,Q1088851650,M2282072267,"[7, 16, 5, 144, 302, 20, 1752, 60, 75, 75, 655, 3863, 28, 26, 2, 536, 853, 256, 118, 503, 25, 16...","[1425, 1676, 3700, 325, 6485, 1090, 8342, 9689, 3727]",0,0,0,0,0,42,1,0,1,0,1,0,0,0,3859,18
4,A731550034,Q1023877868,M2282072267,"[7, 15, 596, 407, 1698, 68, 2, 56, 598, 544, 684, 3, 2928, 732, 91, 410, 767, 1432, 16, 66, 2, 1...","[23420, 2610, 5506, 1489, 8749, 504, 1718, 2525, 155, 9867, 452, 703]",0,0,0,0,0,44,0,0,0,0,0,0,0,0,3855,22


In [39]:
# Row and column count of the reloaded answer table.
answer_info.shape

(4513735, 21)

In [49]:
# How often each num_nohelp value occurs across the answers.
answer_info['num_nohelp'].value_counts()

0       4464217
1         38183
2          5923
3          2149
4          1059
5           625
6           419
7           265
8           183
9           133
10          102
11           69
12           58
13           44
16           43
14           33
15           31
19           25
17           18
22           12
21           11
24           11
18           10
23           10
28            8
20            8
25            7
29            7
32            7
31            6
27            6
34            5
30            5
26            4
33            3
37            3
45            3
40            2
53            2
48            2
46            2
38            2
36            2
35            2
77            1
68            1
102           1
63            1
67            1
177           1
66            1
43            1
58            1
57            1
47            1
42            1
41            1
39            1
845           1
2065          1
Name: num_nohelp, dtype: int64

In [44]:
# How often each value of the recommend flag occurs across the answers.
answer_info['recommend'].value_counts()

0    4513621
1        114
Name: recommend, dtype: int64

In [43]:
# How often each value of the round_table flag occurs across the answers.
answer_info['round_table'].value_counts()

0    4513735
Name: round_table, dtype: int64