In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from lightgbm import LGBMClassifier
import logging
import pickle

# Root directory holding the raw data files that the loading cells read.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
base_path = '/home/hisense/hzy/data'

import warnings
# Suppress all warnings for the rest of the session.
warnings.filterwarnings('ignore')


def extract_day(s):
    """Extract the integer day from every timestamp string in ``s``.

    Each value is split on '-'; the first piece loses its leading
    one-character prefix and the remainder is converted to ``int``
    (e.g. 'D3865-H22' -> 3865; the prefix letter itself is not checked).

    Returns the result of ``s.apply`` (a Series of ints for a Series input).
    """
    def _day_number(stamp):
        day_token = stamp.split('-')[0]
        # Skip the one-character prefix in front of the number.
        return int(day_token[1:])

    return s.apply(_day_number)


def extract_hour(s):
    """Extract the integer hour from every timestamp string in ``s``.

    Each value is split on '-'; the second piece loses its leading
    one-character prefix and the remainder is converted to ``int``
    (e.g. 'D3865-H22' -> 22; the prefix letter itself is not checked).

    Returns the result of ``s.apply`` (a Series of ints for a Series input).
    """
    def _hour_number(stamp):
        hour_token = stamp.split('-')[1]
        # Skip the one-character prefix in front of the number.
        return int(hour_token[1:])

    return s.apply(_hour_number)

def parse_list_1(d):
    """Parse a comma-separated string of prefixed ids into a list of ints.

    The sentinel string '-1' yields the placeholder list ``[0]``.
    Otherwise ``d`` is stringified, split on ',', and each token is
    converted to ``int`` after dropping its first character
    (e.g. 'T321,T730' -> [321, 730]).
    """
    if d == '-1':
        return [0]
    tokens = str(d).split(',')
    return [int(token[1:]) for token in tokens]

def parse_str(d):
    """Convert a whitespace-separated string of numbers into a NumPy float array."""
    values = [float(token) for token in d.split()]
    return np.array(values)


In [2]:
# Load the question table
ques = pd.read_csv(f'{base_path}/question_info_0926.txt', header=None, sep='\t')
# Columns: [question ID, question creation time, title char-level encoded sequence, title word-level encoded sequence, description char-level encoded sequence, description word-level encoded sequence, topic IDs bound to the question]
ques.columns = ['qid', 'q_dt', 'title_t1', 'title_t2', 'desc_t1', 'desc_t2', 'topic']
# The encoded title/description sequences are not used by this notebook; drop them right away.
del ques['title_t1'], ques['title_t2'], ques['desc_t1'], ques['desc_t2']

# Split the creation timestamp into integer day and hour columns, then drop the raw string.
ques['q_day'] = extract_day(ques['q_dt'])
ques['q_hour'] = extract_hour(ques['q_dt'])
del ques['q_dt']

print(ques.shape)
ques.head()


(1829900, 4)


Unnamed: 0,qid,topic,q_day,q_hour
0,Q2234111670,"T321,T730,T5784,T4389",1018,5
1,Q760329790,"T278,T12673,T4677",1745,20
2,Q741313548,T226,2032,21
3,Q3481466230,"T51,T4468",2185,15
4,Q3966197028,"T54700,T81,T57,T17670,T43574",2269,17


In [4]:
# Load the topic embedding table: one row per topic holding its id and a
# whitespace-separated vector string. The path is built from base_path so this
# loader follows the same data-directory setting as the other loading cells.
topic = pd.read_csv(f'{base_path}/topic_vectors_64d.txt', names=['id', 'embed'], sep='\t')
# Turn each vector string into a float array.
topic['embed'] = topic['embed'].apply(parse_str)
# Drop the one-character prefix so topic ids are plain integers
# (the same convention parse_list_1 applies to topic lists).
topic['id'] = topic['id'].apply(lambda x: int(x[1:]))
topic.head()

Unnamed: 0,id,embed
0,1,"[0.16508673, -0.0037432343, -0.058245048, -0.0..."
1,2,"[1.608256, -1.0515573, -1.1897708, 1.1820835, ..."
2,3,"[3.3307428, -0.43252096, -2.1518784, -1.439003..."
3,4,"[2.4698818, -0.12998039, -0.4648351, 0.8796743..."
4,5,"[1.562477, -1.3560516, -0.3271215, -0.06341907..."


In [5]:
# Lookup table: integer topic id -> embedding array.
topic_vector_dict = {
    topic_id: embedding
    for topic_id, embedding in zip(np.array(topic['id']), np.array(topic['embed']))
}
type(topic_vector_dict.keys())

dict_keys

In [6]:
def topic2v(x, dim=64):
    """Average the embeddings of the topic ids in ``x``.

    Parameters
    ----------
    x : sequence of int
        Topic ids (e.g. the output of ``parse_list_1``).
    dim : int, default 64
        Length of the embedding vectors; used for the zero vector that
        stands in for unknown ids and for the empty-input result.

    Returns
    -------
    numpy.ndarray
        Sum of the looked-up vectors divided by ``len(x)``. Ids missing from
        ``topic_vector_dict`` (such as the ``0`` placeholder) contribute a
        zero vector but still count in the divisor. An empty ``x`` yields a
        zero vector.

    Notes
    -----
    The previous version tolerated an unknown id only in the first position
    (through a bare ``except``), raised ``KeyError`` for unknown ids later in
    the list, and divided by zero on an empty list.
    """
    if len(x) == 0:
        return np.zeros(dim)

    unknown = np.zeros(dim)
    total = np.zeros(dim)
    for topic_id in x:
        # Unknown ids are treated uniformly, whatever their position.
        total = total + topic_vector_dict.get(topic_id, unknown)
    return total / len(x)

In [7]:

# Parse each question's comma-separated topic string into a list of integer
# topic ids ('-1' becomes the placeholder list [0]).
# NOTE: not idempotent — re-running this cell on the already-parsed lists
# fails inside parse_list_1.
ques['topic'] = ques['topic'].apply(parse_list_1)

In [8]:

from tqdm import tqdm, tqdm_notebook, _tqdm_notebook, tqdm_pandas

# Register progress_apply on pandas objects, labelling the progress bar.
tqdm.pandas(desc="topic2v...")
# One averaged topic embedding per question.
ques['topic_attent_vector'] = ques['topic'].progress_apply(topic2v)
print('finished!')

topic2v...: 100%|██████████| 1829900/1829900 [00:53<00:00, 34346.65it/s]

finished!





In [9]:
def listi(x,i):
    """Return element ``i`` of the sequence ``x``."""
    return x[i]

# Expand the per-question embedding arrays into 64 scalar columns.
# The arrays are assembled into one (n_rows, 64) float matrix a single time and
# each column is sliced from it, instead of running 64 separate element-wise
# ``apply`` passes over the whole frame. The per-iteration tqdm.pandas() call
# was dropped: it only configures progress_apply, which this loop never used.
topic_matrix = np.array(list(ques['topic_attent_vector']), dtype=float).reshape(len(ques), 64)
for i in range(64):
    col_name = 'topic_vector_{}'.format(str(i))
    ques[col_name] = topic_matrix[:, i]
del topic_matrix
        

In [11]:
# Remove the list-valued and array-valued helper columns from the question frame.
ques.drop(columns=['topic_attent_vector', 'topic'], inplace=True)
ques.head()

Unnamed: 0,qid,q_day,q_hour,topic_vector_0,topic_vector_1,topic_vector_2,topic_vector_3,topic_vector_4,topic_vector_5,topic_vector_6,...,topic_vector_54,topic_vector_55,topic_vector_56,topic_vector_57,topic_vector_58,topic_vector_59,topic_vector_60,topic_vector_61,topic_vector_62,topic_vector_63
0,Q2234111670,1018,5,-1.121143,2.920241,-0.097328,-2.640352,1.895282,-1.84664,-6.146694,...,-2.036547,4.687829,-1.446055,-6.856635,3.313601,5.76956,-4.032069,-0.049398,3.134176,-3.258174
1,Q760329790,1745,20,-2.680529,-5.466671,-3.977088,-1.476769,0.415902,2.088326,-0.741878,...,-0.914641,-2.041675,-2.618623,2.232831,-0.208937,-1.164138,-1.395941,-3.594548,-1.831595,-0.726349
2,Q741313548,2032,21,1.167998,-0.990261,2.861458,2.517082,-0.629097,0.9638,3.996967,...,-5.092326,7.119682,2.863095,-5.55294,1.790051,3.361393,-0.737555,-2.759971,3.144501,-1.107774
3,Q3481466230,2185,15,-1.445572,-1.401717,1.141537,3.502522,-0.491603,-1.247067,1.491883,...,0.938385,-1.170631,0.593306,0.802875,1.326276,0.015383,0.098349,0.474371,-0.240829,-2.68337
4,Q3966197028,2269,17,1.281498,1.846888,-2.340193,2.048831,-0.290757,-0.484098,-3.582294,...,-2.516062,4.341034,2.401198,-3.433695,1.377361,3.901017,0.714511,-0.669386,-1.145614,1.592404


In [10]:

# Persist the question feature frame to disk.
with open('/home/hisense/hzy/pkl/question_feat.pkl', 'wb') as out_file:
    pickle.dump(ques, out_file)

In [18]:

# Load the answer table
ans = pd.read_csv(f'{base_path}/answer_info_0926.txt', header=None, sep='\t')
# Columns: [answer ID, question ID, user ID, answer creation time, answer content char-level encoded sequence,
# answer content word-level encoded sequence, marked as excellent, recommended, included in a round table,
# has image, has video, word count, likes, likes withdrawn, comments, bookmarks, thanks, reports,
# "not helpful" votes, dislikes]
ans.columns = ['aid', 'qid', 'uid', 'ans_dt', 'ans_t1', 'ans_t2', 'is_good', 'is_rec', 'is_dest', 'has_img',
               'has_video', 'word_count', 'reci_cheer', 'reci_uncheer', 'reci_comment', 'reci_mark', 'reci_tks',
               'reci_xxx', 'reci_no_help', 'reci_dis']
# The encoded answer text is not used by this notebook.
del ans['ans_t1'], ans['ans_t2']

# Split the answer timestamp into integer day and hour columns, then drop the raw string.
ans['a_day'] = extract_day(ans['ans_dt'])
ans['a_hour'] = extract_hour(ans['ans_dt'])
del ans['ans_dt']

print(ans.shape)

# Attach the question features to every answer (default inner join on qid).
ans = pd.merge(ans, ques, on='qid')
#del ques

# Number of days between the question being asked and the answer being written
ans['diff_qa_days'] = ans['a_day'] - ans['q_day']

print(ans.shape)
ans.head()

(4513735, 19)
(4513735, 86)


Unnamed: 0,aid,qid,uid,is_good,is_rec,is_dest,has_img,has_video,word_count,reci_cheer,...,topic_vector_55,topic_vector_56,topic_vector_57,topic_vector_58,topic_vector_59,topic_vector_60,topic_vector_61,topic_vector_62,topic_vector_63,diff_qa_days
0,A2502060945,Q1867533817,M625498202,0,0,0,0,0,41,1,...,-1.356179,-2.396433,3.064392,-2.633905,-0.282311,-0.075044,2.701669,1.612907,-0.017875,2
1,A247365975,Q1867533817,M279074599,0,0,0,0,0,123,0,...,-1.356179,-2.396433,3.064392,-2.633905,-0.282311,-0.075044,2.701669,1.612907,-0.017875,15
2,A2661909364,Q1867533817,M3094956744,0,0,0,0,0,27,1,...,-1.356179,-2.396433,3.064392,-2.633905,-0.282311,-0.075044,2.701669,1.612907,-0.017875,2
3,A2881302290,Q1867533817,M2926469058,0,0,0,0,0,144,0,...,-1.356179,-2.396433,3.064392,-2.633905,-0.282311,-0.075044,2.701669,1.612907,-0.017875,15
4,A2847829478,Q3366788616,M142330444,0,0,0,0,0,204,1,...,1.817319,3.441553,5.823989,-0.373841,1.58302,-1.654161,1.016138,-3.912368,6.956683,2


In [19]:
# Persist the answer feature frame to disk.
# NOTE(review): this writes under .../pre/ while the question and user frames go to .../pkl/ — confirm this is intended.
with open('/home/hisense/hzy/pre/ans_feat.pkl', 'wb') as out_file:
    pickle.dump(ans, out_file)

In [11]:
# Load the user (member) table
user = pd.read_csv(f'{base_path}/member_info_0926.txt', header=None, sep='\t')
user.columns = ['uid', 'gender', 'creat_keyword', 'level', 'hot', 'reg_type', 'reg_plat', 'freq', 'uf_b1', 'uf_b2',
                'uf_b3', 'uf_b4', 'uf_b5', 'uf_c1', 'uf_c2', 'uf_c3', 'uf_c4', 'uf_c5', 'score', 'follow_topic',
                'inter_topic']
#del user['follow_topic'], user['inter_topic']
#print("user ", user.shape)

unq = user.nunique()
print("user unq ", unq) # nunique() gives the number of distinct values in each column

# Drop the columns that only ever take a single value
for x in unq[unq == 1].index:
    del user[x]
    print('del unq==1 ', x)

print("user ", user.shape)
t = user.dtypes
#print(t) label-encode the object-typed columns
#cats = [x for x in t[t == 'object'].index if x not in ['follow_topic', 'inter_topic', 'uid']]
#print("user cat ", cats) #user cat  ['gender', 'freq', 'uf_c1', 'uf_c2', 'uf_c3', 'uf_c4', 'uf_c5']
# Categorical columns to integer-encode (hard-coded result of the commented-out selection above).
cats = ['gender', 'freq', 'uf_c1', 'uf_c2', 'uf_c3', 'uf_c4', 'uf_c5']
for d in cats:
    # A fresh encoder per column; only the encoder of the last column remains in `lb`.
    lb = LabelEncoder()
    user[d] = lb.fit_transform(user[d])
    print('encode ', d)
    

user unq  uid              1931654
gender                 3
creat_keyword          1
level                  1
hot                    1
reg_type               1
reg_plat               1
freq                   5
uf_b1                  2
uf_b2                  2
uf_b3                  2
uf_b4                  2
uf_b5                  2
uf_c1               2561
uf_c2                291
uf_c3                428
uf_c4               1556
uf_c5                  2
score                732
follow_topic     1301436
inter_topic      1399721
dtype: int64
del unq==1  creat_keyword
del unq==1  level
del unq==1  hot
del unq==1  reg_type
del unq==1  reg_plat
user  (1931654, 16)
encode  gender
encode  freq
encode  uf_c1
encode  uf_c2
encode  uf_c3
encode  uf_c4
encode  uf_c5


In [12]:
# Parse each user's followed-topics string into a list of integer topic ids
# ('-1' becomes the placeholder list [0]).
user['follow_topic'] = user['follow_topic'].apply(parse_list_1)

from tqdm import tqdm, tqdm_notebook, _tqdm_notebook, tqdm_pandas

# One averaged topic embedding per user, with a progress bar.
tqdm.pandas(desc="topic2v...")
user['follow_topic_vector']=user['follow_topic'].progress_apply(lambda x:topic2v(x))
print('finished!')

# Expand the per-user embedding arrays into 64 scalar columns.
# The arrays are assembled into one (n_rows, 64) float matrix a single time and
# each column is sliced from it, instead of running 64 separate element-wise
# ``apply`` passes over the whole frame. The per-iteration tqdm.pandas() call
# was dropped: it only configures progress_apply, which this loop never used.
follow_matrix = np.array(list(user['follow_topic_vector']), dtype=float).reshape(len(user), 64)
for i in range(64):
    col_name = 'follow_topic_{}'.format(str(i))
    user[col_name] = follow_matrix[:, i]
del follow_matrix
        

topic2v...: 100%|██████████| 1931654/1931654 [02:59<00:00, 10767.63it/s]


finished!


In [13]:
def topic_interest2v(x):
    """Average the topic embeddings in ``x`` weighted by interest score.

    ``x`` maps topic id -> weight (as produced by ``parse_map``). Each topic's
    vector from ``topic_vector_dict`` is multiplied by its weight, the products
    are summed, and the sum is divided by the number of topics. An empty
    mapping yields a 64-element zero vector. A topic id missing from
    ``topic_vector_dict`` raises ``KeyError``.
    """
    if len(x) == 0:
        return np.zeros(64)

    weighted_sum = np.zeros(64)
    for topic_id in x:
        weight = x[topic_id]
        weighted_sum = weighted_sum + topic_vector_dict[topic_id] * weight
    return weighted_sum / len(x)

def parse_map(d):
    """Parse 'T<id>:<weight>,T<id>:<weight>,...' into a {int id: float weight} dict.

    The sentinel string '-1' yields an empty dict. For every comma-separated
    entry, the part before the first ':' loses its first character and becomes
    the int key; the part after it becomes the float value.
    """
    if d == '-1':
        return {}

    parsed = {}
    for entry in d.split(','):
        fields = entry.split(':')
        key = int(fields[0][1:])
        parsed[key] = float(fields[1])
    return parsed

# Parse each user's interest string into a {topic id: weight} dict.
user['inter_topic'] = user['inter_topic'].apply(parse_map)

# One weight-averaged topic embedding per user, with a progress bar.
tqdm.pandas(desc="topic_interest2v...")
user['inter_topic_vector']=user['inter_topic'].progress_apply(lambda x:topic_interest2v(x))

# Expand the per-user embedding arrays into 64 scalar columns.
# The arrays are assembled into one (n_rows, 64) float matrix a single time and
# each column is sliced from it, instead of running 64 separate element-wise
# ``apply`` passes over the whole frame. The per-iteration tqdm.pandas() call
# was dropped: it only re-registered the description already set above.
inter_matrix = np.array(list(user['inter_topic_vector']), dtype=float).reshape(len(user), 64)
for i in range(64):
    col_name = 'inter_topic_{}'.format(str(i))
    user[col_name] = inter_matrix[:, i]
del inter_matrix

# NOTE(review): the cleanup below is commented out, so the raw list/dict/array
# columns stay in `user` and are carried along wherever `user` is used afterwards.
#del user['follow_topic'], user['inter_topic'], user['follow_topic_vector'], user['inter_topic_vector']
#user.head()

topic_interest2v...: 100%|██████████| 1931654/1931654 [02:13<00:00, 14489.21it/s]


In [14]:

# Persist the user feature frame to disk.
with open('/home/hisense/hzy/pkl/user_feat.pkl', 'wb') as out_file:
    pickle.dump(user, out_file)

In [None]:
# Load the invitation records (labelled data)

train = pd.read_csv(f'{base_path}/invite_info_0926.txt', sep='\t', header=None)
# Columns: invited question ID, invited user ID, invitation creation time, whether the invitation was answered
train.columns = ['qid', 'uid', 'dt', 'label']
# Interpolate the shape into the message; the earlier call printed the raw
# tuple ("invite %s", shape) because print() does not do %-formatting.
print("invite %s" % (train.shape,))
train.head()

In [None]:

# Load the evaluation set
# Columns: invited question ID, invited user ID, invitation creation time

test = pd.read_csv(f'{base_path}/invite_info_evaluate_1_0926.txt', sep='\t', header=None)
test.columns = ['qid', 'uid', 'dt']
# Interpolate the shape into the message; the earlier call printed the raw
# tuple ("test %s", shape) because print() does not do %-formatting.
print("test %s" % (test.shape,))

test.head()

In [None]:
# Keep an untouched copy of the evaluation rows (taken before `dt` is replaced
# by day/hour) and remember how many rows it has.
sub = test.copy()

sub_size = len(sub)

# Split the invitation timestamp into integer day and hour columns for both frames.
train['day'] = extract_day(train['dt'])
train['hour'] = extract_hour(train['dt'])

test['day'] = extract_day(test['dt'])
test['hour'] = extract_hour(test['dt'])
# The raw timestamp strings are no longer needed.
del train['dt'], test['dt']

print(sub_size)
test.head()

In [None]:
# Time-window split (all values are day numbers as produced by extract_day)
# train
# val
train_start = 3838
train_end = 3867

val_start = 3868
val_end = 3874

# Label window of the training set.
label_end = 3867
label_start = label_end - 6  # 3861

# Invitation history used to build features for the training labels.
train_label_feature_end = label_end - 7  # 3860
train_label_feature_start = train_label_feature_end - 22  # 3838

# Answer history used to build features for the training labels.
train_ans_feature_end = label_end - 7  # 3860
train_ans_feature_start = train_ans_feature_end - 50  # 3810

# Invitation history used to build features for the evaluation rows.
val_label_feature_end = val_start - 1  # 3867
val_label_feature_start = val_label_feature_end - 22  # 3845

# Answer history used to build features for the evaluation rows.
val_ans_feature_end = val_start - 1  # 3867
val_ans_feature_start = val_ans_feature_end - 50  # 3817

In [None]:
# Invitations inside the training feature window (inclusive on both ends).
train_label_feature = train[(train['day'] >= train_label_feature_start) & (train['day'] <= train_label_feature_end)]
print("train_label_feature ", train_label_feature.shape)

# Invitations inside the evaluation feature window (inclusive on both ends).
val_label_feature = train[(train['day'] >= val_label_feature_start) & (train['day'] <= val_label_feature_end)]
print("val_label_feature ", val_label_feature.shape)

# Rows that serve as training labels: every invitation after the training feature window.
train_label = train[(train['day'] > train_label_feature_end)]
print("train_label ", train_label.shape)

# The messages are %-formatted explicitly; print() itself does not fill in
# "%s" placeholders from extra positional arguments the way logging does.
print("train feature start %s end %s, label start %s end %s" % (
    train_label_feature['day'].min(), train_label_feature['day'].max(),
    train_label['day'].min(), train_label['day'].max()))

print("test feature start %s end %s, label start %s end %s" % (
    val_label_feature['day'].min(), val_label_feature['day'].max(),
    test['day'].min(), test['day'].max()))

In [None]:

# Attach the question features to the training label rows (inner join on qid).
train_label = pd.merge(train_label, ques, on='qid')
# The evaluation rows use a left join so that no row is dropped when a
# question id is absent from `ques`: the row count must stay equal to
# sub_size, which is asserted after feature extraction.
test = pd.merge(test, ques, on='qid', how='left')
train_label.head()

In [None]:
# Select the answer history windows
# 3807~3874 (original note; presumably the day range covered by the answer data — TODO confirm)
train_ans_feature = ans[(ans['a_day'] >= train_ans_feature_start) & (ans['a_day'] <= train_ans_feature_end)]

val_ans_feature = ans[(ans['a_day'] >= val_ans_feature_start) & (ans['a_day'] <= val_ans_feature_end)]

# The messages are %-formatted explicitly; print() itself does not fill in
# "%s" placeholders from extra positional arguments the way logging does.
print("train ans feature %s, start %s end %s" % (
    train_ans_feature.shape, train_ans_feature['a_day'].min(), train_ans_feature['a_day'].max()))

print("val ans feature %s, start %s end %s" % (
    val_ans_feature.shape, val_ans_feature['a_day'].min(), val_ans_feature['a_day'].max()))

In [None]:

# Answer-level columns that extract_feature1 aggregates per user and per question
fea_cols = ['is_good', 'is_rec', 'is_dest', 'has_img', 'has_video', 'word_count',
            'reci_cheer', 'reci_uncheer', 'reci_comment', 'reci_mark', 'reci_tks',
            'reci_xxx', 'reci_no_help', 'reci_dis', 'diff_qa_days']

# Original notes (meaning of the numbers is not evident from the code — TODO confirm):
# train_label: 105
# train_label_feature: 5
# train_ans_feature: 23

def extract_feature1(target, label_feature, ans_feature):
    """Left-join historical invitation and answer statistics onto ``target``.

    Parameters
    ----------
    target : DataFrame
        Rows to enrich; must contain 'qid' and 'uid'.
    label_feature : DataFrame
        Past invitations with 'qid', 'uid' and 'label'.
    ans_feature : DataFrame
        Past answers with 'aid', 'qid', 'uid' and every column in ``fea_cols``.

    Returns
    -------
    DataFrame
        ``target`` with these columns appended, in this order:
        invitation label mean/sum/std/count per question, then per user;
        answer count per question, then per user; and, for each column in
        ``fea_cols``, its sum/max/mean per user, then per question.
        Keys without history get NaN (all joins are left joins).
    """
    def _left_join_stats(frame, stats, key, names):
        # Flatten the group key into a column, rename, and left-join onto frame.
        flat = stats.reset_index()
        flat.columns = [key] + names
        return pd.merge(frame, flat, on=key, how='left')

    # Invitation outcome statistics: question level first, then user level.
    invite_specs = (
        ('qid', ['q_inv_mean', 'q_inv_sum', 'q_inv_std', 'q_inv_count']),
        ('uid', ['u_inv_mean', 'u_inv_sum', 'u_inv_std', 'u_inv_count']),
    )
    for key, names in invite_specs:
        invite_stats = label_feature.groupby(key)['label'].agg(['mean', 'sum', 'std', 'count'])
        target = _left_join_stats(target, invite_stats, key, names)

    # Number of answers: question level first, then user level.
    for key, name in (('qid', 'q_ans_count'), ('uid', 'u_ans_count')):
        answer_counts = ans_feature.groupby(key)['aid'].count()
        target = _left_join_stats(target, answer_counts, key, [name])

    # sum / max / mean of every answer attribute: user level first, then question level.
    for col in fea_cols:
        for key, prefix in (('uid', 'u'), ('qid', 'q')):
            attr_stats = ans_feature.groupby(key)[col].agg(['sum', 'max', 'mean'])
            names = [f'{prefix}_{col}_sum', f'{prefix}_{col}_max', f'{prefix}_{col}_mean']
            target = _left_join_stats(target, attr_stats, key, names)
        logging.info("extract %s", col)
    return target


# Build features for the training label rows from the training history windows,
# and for the evaluation rows from the evaluation history windows.
train_label = extract_feature1(train_label, train_label_feature, train_ans_feature)
test = extract_feature1(test, val_label_feature, val_ans_feature)

# Feature extraction finished
logging.info("train shape %s, test shape %s", train_label.shape, test.shape)
# The evaluation frame must still have exactly one row per original evaluation record.
assert len(test) == sub_size


In [None]:

# Integer-encode question ids over the union of train and evaluation questions.
q_lb = LabelEncoder()
q_lb.fit(list(train_label['qid'].astype(str).values) + list(test['qid'].astype(str).values))
train_label['qid_enc'] = q_lb.transform(train_label['qid'])
test['qid_enc'] = q_lb.transform(test['qid'])

# Integer-encode user ids using every uid present in the user table.
u_lb = LabelEncoder()
u_lb.fit(user['uid'])
train_label['uid_enc'] = u_lb.transform(train_label['uid'])
test['uid_enc'] = u_lb.transform(test['uid'])

# merge user
train_label = pd.merge(train_label, user, on='uid', how='left')
test = pd.merge(test, user, on='uid', how='left')
logging.info("train shape %s, test shape %s", train_label.shape, test.shape)

# Record the number of training rows BEFORE the source frames are deleted.
# The split at the end of this cell previously called len(train_label) after
# `del train_label`, which raises NameError.
train_size = len(train_label)

# Stack train on top of evaluation so the encodings below see both together.
data = pd.concat((train_label, test), axis=0, sort=True)
del train_label, test

# Count encoding
count_fea = ['uid_enc', 'qid_enc', 'gender', 'freq', 'uf_c1', 'uf_c2', 'uf_c3', 'uf_c4', 'uf_c5']
for feat in count_fea:
    col_name = '{}_count'.format(feat)
    # Frequency of each value of this feature.
    data[col_name] = data[feat].map(data[feat].value_counts().astype(int))
    # Values seen fewer than twice are set to -1; the +1 shift then maps them to 0.
    data.loc[data[col_name] < 2, feat] = -1
    data[feat] += 1
    # Recount after merging the rare values, then min-max scale the counts.
    data[col_name] = data[feat].map(data[feat].value_counts().astype(int))
    data[col_name] = (data[col_name] - data[col_name].min()) / (data[col_name].max() - data[col_name].min())

# Number of times each question was answered (original placeholder note; no code follows it)


# Downcast 64-bit numeric columns to 32 bits to save memory
t = data.dtypes
for x in t[t == 'int64'].index:
    data[x] = data[x].astype('int32')

for x in t[t == 'float64'].index:
    data[x] = data[x].astype('float32')

# Invitation day modulo 7.
data['wk'] = data['day'] % 7

# NOTE(review): `user` still carries follow_topic / inter_topic and their
# *_vector object columns (their removal is commented out in the user cell),
# so they are part of feature_cols — confirm the downstream model expects that.
feature_cols = [x for x in data.columns if x not in ('label', 'uid', 'qid', 'dt', 'day')]
# Target encoding (original note; only the feature count is logged here)
logging.info("feature size %s", len(feature_cols))

# Split the stacked frame back into training and evaluation parts by position.
X_train_all = data.iloc[:train_size][feature_cols]
y_train_all = data.iloc[:train_size]['label']
test = data.iloc[train_size:]
del data
assert len(test) == sub_size

# train_label no longer exists at this point, so report the training matrix instead.
logging.info("train shape %s, test shape %s", X_train_all.shape, test.shape)