In [1]:
import warnings
# Suppress every warning for the rest of the session. Note that this also hides
# pandas warnings raised by later cells.
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
import pickle
import gc
import os
import time
import multiprocessing as mp

In [3]:
from sklearn.preprocessing import LabelEncoder

In [4]:
# Wall-clock start time; the final cell subtracts it from `toc` to report the elapsed seconds.
tic = time.time()

In [5]:
# Directory that is prepared for the generated feature files.
SAVE_PATH = '../feats'
# Announce the creation only when the directory is not there yet.
if not os.path.exists(SAVE_PATH):
    print('create dir: %s' % SAVE_PATH)
# makedirs with exist_ok=True does not fail if the directory appears between the
# check above and this call, and it also creates missing parent directories.
os.makedirs(SAVE_PATH, exist_ok=True)

### member_info: 用户特征

In [6]:
# Load the pickled member_info object and preview its first two rows.
# pickle can execute arbitrary code while loading, so only open trusted files.
member_pkl_path = '../pkl/member_info.pkl'
with open(member_pkl_path, 'rb') as fh:
    member_info = pickle.load(fh)
member_info.head(2)

Unnamed: 0,author_id,gender,keyword,grade,hotness,reg_type,reg_plat,freq,A1,B1,...,D1,E1,A2,B2,C2,D2,E2,score,topic_attent,topic_interest
0,M1934753188,male,-1,0.0,0.0,unknown,unknown,monthly,0,1,...,1,0,MD470265,BR470265,PV929066,CT929066,PF470265,764,[540],"{21107: 1.7915097, 405: 1.6123838, 4436: 1.518..."
1,M595924114,male,-1,0.0,0.0,unknown,unknown,daily,0,0,...,1,1,MD195122,BR596936,PV002320,CT840234,PF470265,671,"[44126, 15940, 839, 8978, 2934, 1113, 3914, 12...","{18016: 2.0650618, 2384: 1.2503042, 1142: 1.13..."


In [7]:
# Raw categorical features: replace each column's values with integer codes.
member_cat_feats = [
    'gender', 'freq',
    'A1', 'B1', 'C1', 'D1', 'E1',
    'A2', 'B2', 'C2', 'D2', 'E2',
]
for feat in member_cat_feats:
    # A fresh encoder per column, so the codes of one column never depend on another.
    encoder = LabelEncoder()
    member_info[feat] = encoder.fit_transform(member_info[feat])

In [8]:
# Number of entries in each user's 'topic_attent' and 'topic_interest' collections.
for count_col, source_col in (
    ('num_atten_topic', 'topic_attent'),
    ('num_interest_topic', 'topic_interest'),
):
    member_info[count_col] = member_info[source_col].apply(len)

In [9]:
def most_interest_topic(d):
    """Return the key of `d` that has the largest value, or -1 if `d` is empty.

    When several keys share the largest value, the first one in the dict's
    iteration order is returned (np.argmax picks the first maximum).
    """
    if len(d) > 0:
        topics = list(d.keys())
        scores = list(d.values())
        best_pos = np.argmax(scores)
        return topics[best_pos]
    return -1

In [10]:
# The topic each user is most interested in (-1 when there is none),
# stored as integer codes produced by a LabelEncoder.
top_topic = member_info['topic_interest'].apply(most_interest_topic)
member_info['most_interest_topic'] = LabelEncoder().fit_transform(top_topic)

In [11]:
def get_interest_values(d):
    """Return the values of `d` as a list; an empty `d` yields the placeholder [0]."""
    if len(d) > 0:
        return list(d.values())
    return [0]

In [12]:
# Statistical features (min / max / mean / std) of each user's topic interest values.
member_info['interest_values'] = member_info['topic_interest'].apply(get_interest_values)
interest_values = member_info['interest_values']
member_info['min_interest_values'] = interest_values.apply(np.min)
member_info['max_interest_values'] = interest_values.apply(np.max)
member_info['mean_interest_values'] = interest_values.apply(np.mean)
member_info['std_interest_values'] = interest_values.apply(np.std)

In [13]:
# Assemble the member feature table: id, encoded categoricals, score and the derived topic features.
feats = [
    'author_id', 'gender', 'freq',
    'A1', 'B1', 'C1', 'D1', 'E1',
    'A2', 'B2', 'C2', 'D2', 'E2',
    'score',
    'num_atten_topic', 'num_interest_topic', 'most_interest_topic',
    'min_interest_values', 'max_interest_values',
    'mean_interest_values', 'std_interest_values',
]
member_feat = member_info[feats]

In [14]:
# Preview the first three rows of the assembled member features.
member_feat.head(3)

Unnamed: 0,author_id,gender,freq,A1,B1,C1,D1,E1,A2,B2,...,D2,E2,score,num_atten_topic,num_interest_topic,most_interest_topic,min_interest_values,max_interest_values,mean_interest_values,std_interest_values
0,M1934753188,1,1,0,1,0,1,0,1190,130,...,1438,1,764,1,10,14964,1.326969,1.79151,1.459742,0.139273
1,M595924114,1,0,0,0,0,1,1,486,170,...,1311,1,671,30,10,13662,0.851706,2.065062,1.114788,0.339216
2,M1473482940,0,4,0,1,0,1,0,294,190,...,758,1,454,24,10,45,0.811265,1.330939,1.010897,0.151768


In [15]:
# Write under SAVE_PATH, the directory this notebook creates at start-up.
# The previous hard-coded './feats/...' target was a different directory that
# nothing in this notebook guarantees to exist.
member_feat.to_hdf(os.path.join(SAVE_PATH, 'member_feat.h5'), key='data')

# Release the member tables before the next section.
del member_feat, member_info
gc.collect()

55

### question_info: 问题特征

In [16]:
# Load the pickled question_info object and preview its first two rows.
# pickle can execute arbitrary code while loading, so only open trusted files.
question_pkl_path = '../pkl/question_info.pkl'
with open(question_pkl_path, 'rb') as fh:
    question_info = pickle.load(fh)

question_info.head(2)

Unnamed: 0,question_id,title_sw_series,title_w_series,desc_sw_series,desc_w_series,topic,question_day,question_hour
0,Q2234111670,"[211, 204, 1715, 69, 2033, 138, 57, 138, 8, 28...","[22414, 963, 10458]",[0],[0],"[321, 730, 5784, 4389]",1018,5
1,Q760329790,"[69, 2033, 138, 2616, 2668, 36, 2594, 1165, 20...","[12677, 16829, 15201, 6419, 101839]","[146, 982, 401, 297, 17, 2616, 2668, 36, 2594,...","[1296, 2118, 12677, 16829, 15201, 6419, 101839...","[278, 12673, 4677]",1745,20


In [17]:
# Length of each title / description series and of the topic list, per question.
for count_col, source_col in (
    ('num_title_sw', 'title_sw_series'),
    ('num_title_w', 'title_w_series'),
    ('num_desc_sw', 'desc_sw_series'),
    ('num_desc_w', 'desc_w_series'),
    ('num_qtopic', 'topic'),
):
    question_info[count_col] = question_info[source_col].apply(len)

In [18]:
# Columns kept in the question feature table: id, the length counts and the posting hour.
feats = [
    'question_id',
    'num_title_sw', 'num_title_w',
    'num_desc_sw', 'num_desc_w',
    'num_qtopic',
    'question_hour',
]
question_feat = question_info[feats]

In [19]:
# Preview the first three rows of the question features.
question_feat.head(3)

Unnamed: 0,question_id,num_title_sw,num_title_w,num_desc_sw,num_desc_w,num_qtopic,question_hour
0,Q2234111670,13,3,1,1,4,5
1,Q760329790,17,5,42,13,3,20
2,Q741313548,13,4,68,10,1,21


In [20]:
# Write under SAVE_PATH, the directory this notebook creates at start-up,
# instead of the hard-coded './feats/...' target, which may not exist.
question_feat.to_hdf(os.path.join(SAVE_PATH, 'question_feat.h5'), key='data')

In [21]:
# Release the question tables; the next section reloads question_info from its pickle.
del question_info, question_feat
gc.collect()

44

### member_info & question_info: 用户和问题的交互特征

In [22]:
def _read_pickle(path):
    """Unpickle and return the object stored at `path` (only use on trusted files)."""
    with open(path, 'rb') as fh:
        return pickle.load(fh)

# Inputs for the member x question interaction features.
invite_info = _read_pickle('../pkl/invite_info.pkl')
invite_info_evaluate = _read_pickle('../pkl/invite_info_evaluate.pkl')
member_info = _read_pickle('../pkl/member_info.pkl')
question_info = _read_pickle('../pkl/question_info.pkl')

In [23]:
# Stack invite_info and invite_info_evaluate and keep one row per
# (author_id, question_id) pair.
invite = pd.concat([invite_info, invite_info_evaluate])
# .copy() makes invite_id an independent frame, so adding a column below is not
# a chained assignment on a selection of `invite`.
invite_id = invite[['author_id', 'question_id']].copy()
# Concatenated id string used as the unique key of a member/question pair.
invite_id['author_question_id'] = invite_id['author_id'] + invite_id['question_id']
invite_id = invite_id.drop_duplicates(subset='author_question_id')
# Attach the member's topic collections and the question's topics.
# Left joins keep every invite pair even when no match is found.
invite_id_qm = (
    invite_id
    .merge(member_info[['author_id', 'topic_attent', 'topic_interest']], how='left', on='author_id')
    .merge(question_info[['question_id', 'topic']], how='left', on='question_id')
)
invite_id_qm.head(2)

Unnamed: 0,author_id,question_id,author_question_id,topic_attent,topic_interest,topic
0,M401693808,Q2166419046,M401693808Q2166419046,"[1727, 5310, 3402, 916, 1506, 26329, 7293, 180...","{2794: 1.1527717, 9701: 0.94830835, 9533: 0.85...","[456, 112, 9566, 5310]"
1,M3392373099,Q1550017551,M3392373099Q1550017551,"[42595, 3, 8520, 597, 6485, 6212, 25664, 148, ...","{1470: 1.4682752, 235: 1.4412646, 4692: 1.3516...","[2, 3095]"


#### 注：为了加快运算，这里使用了多进程（multiprocessing）；Windows + multiprocessing + Jupyter 的组合可能有 bug，建议在 Linux 上运行。

In [24]:
# Split a DataFrame into row chunks so they can be handed to worker processes.
def split_df(df, n):
    """Split `df` row-wise into at most `n` consecutive, non-empty chunks.

    Every chunk has ceil(len(df) / n) rows except possibly the last one.
    Concatenating the chunks in order reproduces `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split; rows are selected by position.
    n : int
        Maximum number of chunks, must be >= 1.

    Returns
    -------
    list of pandas.DataFrame
        Fewer than `n` chunks when the rows run out early; an empty list when
        `df` has no rows.

    Raises
    ------
    ValueError
        If `n` is smaller than 1.
    """
    if n < 1:
        raise ValueError('n must be >= 1, got %r' % (n,))
    total = len(df)
    chunk_size = int(np.ceil(total / n))
    if chunk_size == 0:
        return []
    # Stepping over the real row count never yields an empty trailing chunk.
    # Empty chunks are a problem because DataFrame.apply(axis=1) on an empty
    # frame returns a DataFrame rather than a Series.
    return [df.iloc[start:start + chunk_size] for start in range(0, total, chunk_size)]

In [25]:
def gc_mp(pool, ret, chunk_list):
    """Shut down a worker pool and run a garbage-collection pass.

    Parameters
    ----------
    pool : multiprocessing.Pool-like or None
        Pool whose work is finished. It is closed and joined so its worker
        processes exit instead of staying alive until the kernel stops.
    ret, chunk_list : sequences
        Results and input chunks of the finished job. They are only unbound
        locally; the caller's own references are not affected.
    """
    if pool is not None:
        # close() stops new work from being submitted; join() waits for the
        # workers to exit. Both are harmless on an already terminated pool.
        pool.close()
        pool.join()
    # `del` only removes this function's local names. The per-element
    # `for x in seq: del x` loops were no-ops and have been dropped.
    del pool, ret, chunk_list
    gc.collect()

In [26]:
# Intersection of the topics the user follows and the question's topics.
def process(df):
    """For each row, list the topics found in both 'topic_attent' and 'topic'."""
    return df.apply(lambda row: list(set(row['topic_attent']) & set(row['topic'])),axis=1)

chunk_list = split_df(invite_id_qm, 100)
# The context manager shuts the pool down even if map() raises, so worker
# processes are never left running.
with mp.Pool() as pool:
    ret = pool.map(process, chunk_list)
# Results are matched back to rows by index label.
invite_id_qm['topic_attent_intersection'] = pd.concat(ret)
gc_mp(pool, ret, chunk_list)

In [27]:
# Intersection of the topics the user is interested in and the question's topics.
def process(df):
    """For each row, list the topics found in both the 'topic_interest' keys and 'topic'."""
    return df.apply(lambda row: list(set(row['topic_interest'].keys()) & set(row['topic'])),axis=1)

chunk_list = split_df(invite_id_qm, 100)
# The context manager shuts the pool down even if map() raises, so worker
# processes are never left running.
with mp.Pool() as pool:
    ret = pool.map(process, chunk_list)
# Results are matched back to rows by index label.
invite_id_qm['topic_interest_intersection'] = pd.concat(ret)
gc_mp(pool, ret, chunk_list)

In [28]:
# Interest values of the topics shared by the user's interests and the question.
def process(df):
    """For each row, look up the 'topic_interest' value of every topic in 'topic_interest_intersection'."""
    return df.apply(lambda row: [row['topic_interest'][t] for t in row['topic_interest_intersection']],axis=1)

chunk_list = split_df(invite_id_qm, 100)
# The context manager shuts the pool down even if map() raises, so worker
# processes are never left running.
with mp.Pool() as pool:
    ret = pool.map(process, chunk_list)
# Results are matched back to rows by index label.
invite_id_qm['topic_interest_intersection_values'] = pd.concat(ret)
gc_mp(pool, ret, chunk_list)

In [29]:
# Number of shared topics in each of the two intersections.
for count_col, source_col in (
    ('num_topic_attent_intersection', 'topic_attent_intersection'),
    ('num_topic_interest_intersection', 'topic_interest_intersection'),
):
    invite_id_qm[count_col] = invite_id_qm[source_col].apply(len)

In [30]:
# Statistics of the interest values over the shared topics.
# Rows without any shared topic get the placeholder [0] so every statistic is defined.
invite_id_qm['topic_interest_intersection_values'] = invite_id_qm['topic_interest_intersection_values'].apply(
    lambda vals: vals if len(vals) != 0 else [0]
)
shared_values = invite_id_qm['topic_interest_intersection_values']
invite_id_qm['min_topic_interest_intersection_values'] = shared_values.apply(np.min)
invite_id_qm['max_topic_interest_intersection_values'] = shared_values.apply(np.max)
invite_id_qm['mean_topic_interest_intersection_values'] = shared_values.apply(np.mean)
invite_id_qm['std_topic_interest_intersection_values'] = shared_values.apply(np.std)

In [31]:
# Columns kept in the member x question feature table: pair id, shared-topic
# counts and the statistics of the shared interest values.
feats = [
    'author_question_id',
    'num_topic_attent_intersection',
    'num_topic_interest_intersection',
    'min_topic_interest_intersection_values',
    'max_topic_interest_intersection_values',
    'mean_topic_interest_intersection_values',
    'std_topic_interest_intersection_values',
]
member_question_feat = invite_id_qm[feats]
member_question_feat.head(3)

Unnamed: 0,author_question_id,num_topic_attent_intersection,num_topic_interest_intersection,min_topic_interest_intersection_values,max_topic_interest_intersection_values,mean_topic_interest_intersection_values,std_topic_interest_intersection_values
0,M401693808Q2166419046,1,0,0.0,0.0,0.0,0.0
1,M3392373099Q1550017551,0,0,0.0,0.0,0.0,0.0
2,M2317670257Q604029601,0,0,0.0,0.0,0.0,0.0


In [32]:
# Write under SAVE_PATH, the directory this notebook creates at start-up,
# instead of the hard-coded './feats/...' target, which may not exist.
member_question_feat.to_hdf(os.path.join(SAVE_PATH, 'member_question_feat.h5'), key='data')

In [33]:
# Drop the interaction tables and run a garbage-collection pass.
del invite_id_qm, member_question_feat
gc.collect()

44

In [34]:
# Report the wall-clock seconds elapsed since `tic` was taken, truncated to an integer.
toc = time.time()
print('Used time: %d' % int(toc-tic))

Used time: 1293
