In [1]:
%load_ext autotime

In [2]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

# Show every column when a DataFrame is displayed
pd.set_option('display.max_columns', None)
# Show every row when a DataFrame is displayed
pd.set_option('display.max_rows', None)
# Set the displayed length of a value to 100 (the default is 50)
pd.set_option('max_colwidth',100)

def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype covering their range.

    The frame is modified in place and the same object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. Only columns whose dtype is one of int16, int32,
        int64, float16, float32 or float64 are considered; every other
        column is left untouched.
    verbose : bool, default True
        When true, print the final memory footprint and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, after the in-place downcast.

    Notes
    -----
    Float columns are narrowed based on their min/max only, so values cast to
    float16 or float32 can lose precision. A column whose min/max is NaN
    (e.g. all-NaN) fails every range check and ends up as float64.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Candidate target types, narrowest first.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the type's min/max still fits.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Range exceeds float32 (or min/max is NaN): keep full precision.
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        # Guard the ratio so a zero baseline cannot produce a NaN percentage.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

# Load the train and test-b invitation tables.
invite_info = pd.read_hdf('./my_feat/convert_train.h5', key='data')
invite_info_evaluate = pd.read_hdf('./my_feat/convert_test_b.h5', key='data')

print('test b shape = ', invite_info_evaluate.shape)

# Attach member features to both tables with a left join on author_id.
member_feat = pd.read_hdf('./feats/member_feat.h5', key='data')

invite_info = invite_info.merge(member_feat, 'left', 'author_id')
invite_info_evaluate = invite_info_evaluate.merge(member_feat, 'left', 'author_id')

# Give test rows a sentinel label of -1 so they can be split off again after the concat below.
invite_info_evaluate['label'] = -1

# Row counts per author and per question, computed over train + test together.
# NOTE(review): 'anthor_count' looks like a typo for 'author_count', but it is a
# runtime column name (it appears in the printed feature list), so renaming it
# would change the feature set.
all_invite_data = pd.concat([invite_info,invite_info_evaluate],ignore_index=True)
all_invite_data['anthor_count'] = all_invite_data.groupby('author_id')['author_id'].transform('count')
all_invite_data['question_count'] = all_invite_data.groupby('question_id')['author_id'].transform('count')

# Split back into train (label != -1) and test (label == -1); the test side drops the sentinel label.
invite_info = all_invite_data[all_invite_data['label'] != -1].reset_index(drop=True)
invite_info_evaluate = all_invite_data[all_invite_data['label'] == -1].drop('label',axis=1).reset_index(drop=True)

question_feat = pd.read_hdf('./feats/question_feat.h5', key='data')

member_question_feat = pd.read_hdf('./feats/member_question_feat_final.h5', key='data')

# Composite key used to join the member-question features.
# NOTE(review): presumably both ids are strings, so `+` concatenates them — confirm.
invite_info['author_question_id'] = invite_info['author_id'] + invite_info['question_id']
invite_info_evaluate['author_question_id'] = invite_info_evaluate['author_id'] + invite_info_evaluate['question_id']

# Build the training frame: question features by question_id, then pair features by the composite key.
train = invite_info.merge(question_feat, 'left', 'question_id')
train = train.merge(member_question_feat, 'left', 'author_question_id')

test b shape =  (1141718, 7)
time: 1min 34s


In [3]:
# Extend `train` with precomputed feature files.
# Two attachment styles are used in this cell:
#   * merge(..., on=['question_id', 'author_id']) joins on the key pair;
#   * pd.concat(..., axis=1) aligns on the index, not on any key.
# NOTE(review): the concat calls assume each feature file has the same row
# order/index as `train` — confirm against the scripts that produced them.
train_user_current_answer_count =  pd.read_hdf('./my_feat/user_current_answer_count.h5', key='data')

# NOTE(review): this .head() is not the cell's last expression, so its result is discarded.
train_user_current_answer_count.head()
train_user_current_answer_count.columns = ['question_id', 'author_id', 'current_answer_count']

train = train.merge(train_user_current_answer_count, how='left', on=['question_id', 'author_id'])

current_invite_count = pd.read_hdf('./my_feat/current_invite_count_b.h5',key='data') # this holds all of the data, so it can be merged with test directly

train = train.merge(current_invite_count, how='left', on=['question_id', 'author_id'])

current_invite_q_count = pd.read_hdf('./my_feat/current_invite_q_count_test_b.h5',key='data') # this holds all of the data, so it can be merged with test directly

current_invite_q_count.columns=['question_id','author_id','current_invite_q_count']

train = train.merge(current_invite_q_count, how='left', on=['question_id', 'author_id'])

topic_cv_probs_train_df = pd.read_hdf('./my_feat/topic_cv_probs_train_df.h5',key='data')
train = pd.concat([train, topic_cv_probs_train_df],axis=1)

# Load the historical-topic intersection features
history_topic_with_current_topic_train = pd.read_hdf('./my_feat/history_topic_with_current_topic_train.h5', key='data')
history_topic_with_current_topic_train = history_topic_with_current_topic_train[['question_id','author_id','intersect1d_topic_nums']]

history_topic_vec_distance = pd.read_hdf('./my_feat/history_topic_vec_distance.h5',key='data')

train = train.merge(history_topic_with_current_topic_train, how='left', on=['question_id', 'author_id'])
train = train.merge(history_topic_vec_distance, how='left', on=['question_id', 'author_id'])

train_time_gap = pd.read_hdf('./my_feat/train_time_gap.h5', key='data')
train = pd.concat([train, train_time_gap], axis=1)

# The two columns are renamed here; 'old_idx' is listed in drop_feats in a later cell,
# so only 'history_topic_vec_probs' is used as a feature.
history_topic_vec_cv_train = pd.read_hdf('./my_feat/history_topic_vec_cv_train_1118.h5', key='data')
history_topic_vec_cv_train.columns = ['old_idx', 'history_topic_vec_probs']
train = pd.concat([train, history_topic_vec_cv_train], axis=1)

history_invite_situation_train = pd.read_hdf('./my_feat/current_invite_user_situation_train.h5', key='data')
train = train.merge(history_invite_situation_train, how='left', on=['question_id', 'author_id'])

time: 3min 55s


In [4]:
# Second batch of precomputed training features.
# The pd.concat(..., axis=1) calls align on the index.
# NOTE(review): they assume each file's rows line up with `train` — confirm.
count_and_unique_feature_train_1124 = pd.read_hdf('./my_feat/count_and_unique_feature_train_final_train.h5', key='data')

train = pd.concat([train, count_and_unique_feature_train_1124], axis=1)

uid_seq_feature_train = pd.read_hdf('./my_feat/uid_seq_feature_train_b.h5', key='data')

train = pd.concat([train, uid_seq_feature_train], axis=1)

topic_convert_df = pd.read_hdf('./my_feat/topic_convert_train_df.h5', key='data')
train = pd.concat([train, topic_convert_df], axis=1)

topic_count_statistics_train = pd.read_hdf('./my_feat/topic_count_statistics_train.h5', key='data')
train = pd.concat([train, topic_count_statistics_train], axis=1)

q_title_kw_convert = pd.read_hdf('./my_feat/q_title_kw_convert.h5', key='data')
train = pd.concat([train, q_title_kw_convert], axis=1)

kw_count_statistics_train = pd.read_hdf('./my_feat/kw_count_statistics_train.h5', key='data')
train = pd.concat([train, kw_count_statistics_train], axis=1)

# DeepWalk tables are downcast with reduce_mem_usage before being left-joined:
# question-side by question_id, author-side by author_id.
# Both tables are reused when the test frame is assembled.
# NOTE(review): read_pickle executes arbitrary code from the file — only load trusted pickles.
deepwalk_question = pd.read_pickle('./deepwalk/author_id_question_id_question_id_deepwalk_8_final.pkl')
deepwalk_question = reduce_mem_usage(deepwalk_question)
train = train.merge(deepwalk_question, 'left', 'question_id')

deepwalk_author = pd.read_pickle('./deepwalk/author_id_question_id_author_id_deepwalk_8_final.pkl')
deepwalk_author = reduce_mem_usage(deepwalk_author)
train = train.merge(deepwalk_author, 'left', 'author_id')

time_diff_train = pd.read_hdf('./my_feat/time_diff_train.h5', key='data')
print('time_diff_train shape = ', time_diff_train.shape)

train = pd.concat([train, time_diff_train], axis=1)

Mem. usage decreased to 24.81 Mb (66.7% reduction)
Mem. usage decreased to 32.49 Mb (66.7% reduction)
time_diff_train shape =  (9489162, 2)
time: 3min 43s


In [5]:
from sklearn.model_selection import train_test_split
import lightgbm as lgb

# Columns kept out of the model input: identifiers/keys, the target itself,
# raw time columns, and a few features that are deliberately excluded.
drop_feats = ['question_id', 'author_id', 'author_question_id', 'invite_time', 'label', 'invite_day','old_idx'] + ['mlp_topic_probs_new'] + ['q_i_time_gap_day'] + ['uid_seq_feature']

# Every remaining column of `train`, in frame order, is a model feature.
# The test cells reuse `used_feats` to select the same columns.
used_feats = [f for f in train.columns if f not in drop_feats]
print(len(used_feats))
print(used_feats)

train_x = train[used_feats].reset_index(drop=True)
train_y = train[['label']].reset_index(drop=True)

# Random 80/20 hold-out; the validation part drives early stopping below.
train_X, val_X, train_Y, val_Y = train_test_split(train_x, train_y , test_size=0.2, random_state=42)

# 'bagging_fraction' and 'bagging_freq' are LightGBM's primary parameter names.
# This dict previously also set their aliases ('subsample': 0.9 and
# 'subsample_freq': 1) with different values; LightGBM ignores an alias (with a
# warning) when the primary name is present, so the aliases were removed and the
# effective values below are unchanged.
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'num_leaves': 120,
    'reg_alpha': 0,
    'reg_lambda': 0.,
    'max_depth': -1,
    'colsample_bytree': 0.5,
    'learning_rate': 0.035,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 1,
    'random_state': 42,
    'n_jobs': 16,
    'device':'gpu'
}

lgb_train = lgb.Dataset(train_X, train_Y)
lgb_eval = lgb.Dataset(val_X, val_Y, reference=lgb_train)

# Train up to 2000 rounds, stopping once validation AUC has not improved for
# 200 rounds, and log every 50 rounds.
# NOTE(review): the early_stopping_rounds / verbose_eval keyword arguments were
# removed from lgb.train in LightGBM 4.0 in favour of callbacks — confirm the
# installed version before upgrading.
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=2000,
                valid_sets=lgb_eval,
                early_stopping_rounds=200,
                verbose_eval=50)

94
['A1', 'A2', 'B1', 'B2', 'C1', 'C2', 'D1', 'D2', 'E1', 'E2', 'author_id_convert', 'author_id_label_count', 'freq', 'gender', 'invite_hour', 'max_interest_values', 'mean_interest_values', 'min_interest_values', 'most_interest_topic', 'num_atten_topic', 'num_interest_topic', 'score', 'std_interest_values', 'anthor_count', 'question_count', 'num_title_sw', 'num_title_w', 'num_desc_sw', 'num_desc_w', 'num_qtopic', 'question_hour', 'num_topic_attent_intersection', 'num_topic_interest_intersection', 'min_topic_interest_intersection_values', 'max_topic_interest_intersection_values', 'mean_topic_interest_intersection_values', 'std_topic_interest_intersection_values', 'current_answer_count', 'current_invite_count', 'current_invite_q_count', 'topic_vec_probs', 'intersect1d_topic_nums', 'history_topic_vec_distance', 'time_gap', 'history_topic_vec_probs', 'current_invite_user_convert', 'current_invite_user_success_rate', 'uid_label_count', 'qid_label_count', 'uid_day_label_count', 'qid_day_labe

In [6]:
# Assemble the test-b frame from the test-side counterpart of each training feature file.
# merge(...) joins on keys; pd.concat(..., axis=1) aligns on the index.
# NOTE(review): only some of the concatenated frames are passed through
# reset_index(drop=True) first; the others assume the file's index already
# matches `test` — confirm.
test = invite_info_evaluate.merge(question_feat, 'left', 'question_id')
test = test.merge(member_question_feat, 'left', 'author_question_id')

# current_invite_count was loaded while building `train` and is reused here.
test = test.merge(current_invite_count, how='left', on=['question_id', 'author_id'])

print(test.shape)

# NOTE(review): unlike the train-side file, this frame's columns are not renamed,
# so it presumably already has question_id / author_id / current_answer_count — confirm.
test_user_current_answer_count =  pd.read_hdf('./my_feat/user_current_answer_count_test_b.h5', key='data')

test = test.merge(test_user_current_answer_count, how='left', on=['question_id', 'author_id'])

test = test.merge(current_invite_q_count, how='left', on=['question_id', 'author_id'])

print(test.shape)

topic_cv_probs_test_df = pd.read_hdf('./my_feat/topic_cv_probs_test_df_b.h5',key='data')
test = pd.concat([test, topic_cv_probs_test_df],axis=1)

# Load the historical-topic intersection features
history_topic_with_current_topic_test = pd.read_hdf('./my_feat/history_topic_with_current_topic_test_b.h5', key='data')
history_topic_with_current_topic_test = history_topic_with_current_topic_test[['question_id','author_id','intersect1d_topic_nums']]

test = test.merge(history_topic_with_current_topic_test, how='left', on=['question_id', 'author_id'])

print(test.shape)

history_topic_vec_distance_test = pd.read_hdf('./my_feat/history_topic_vec_distance_test_b.h5',key='data')
test = test.merge(history_topic_vec_distance_test, how='left', on=['question_id', 'author_id'])

test_time_gap = pd.read_hdf('./my_feat/test_time_gap_b.h5', key='data')
test = pd.concat([test, test_time_gap], axis=1)

count_and_unique_feature_test_1124 = pd.read_hdf('./my_feat/count_and_unique_feature_train_final_test.h5', key='data').reset_index(drop=True)
test = pd.concat([test, count_and_unique_feature_test_1124], axis=1)


# The test-side file is given a single column name (the train side also names an 'old_idx' column).
history_topic_vec_cv_test = pd.read_hdf('./my_feat/history_topic_vec_cv_test_1118_test_b.h5', key='data')
history_topic_vec_cv_test.columns = ['history_topic_vec_probs']
test = pd.concat([test, history_topic_vec_cv_test], axis=1)

history_invite_situation_test = pd.read_hdf('./my_feat/current_invite_user_situation_test_b.h5', key='data')
test = test.merge(history_invite_situation_test, how='left', on=['question_id', 'author_id'])

uid_seq_feature_test = pd.read_hdf('./my_feat/uid_seq_feature_test_b.h5', key='data').reset_index(drop=True)

test = pd.concat([test, uid_seq_feature_test], axis=1)

topic_convert_df_test = pd.read_hdf('./my_feat/topic_convert_test_df_b.h5', key='data')
test = pd.concat([test, topic_convert_df_test], axis=1)

topic_count_statistics_test = pd.read_hdf('./my_feat/topic_count_statistics_test_b.h5', key='data').reset_index(drop=True)
test = pd.concat([test, topic_count_statistics_test], axis=1)


kw_count_statistics_test = pd.read_hdf('./my_feat/kw_count_statistics_test_b.h5', key='data')

test = pd.concat([test, kw_count_statistics_test], axis=1)

q_title_kw_convert_test = pd.read_hdf('./my_feat/q_title_kw_convert_test_b.h5', key='data')

test = pd.concat([test, q_title_kw_convert_test],axis=1)

# DeepWalk tables were loaded and downcast in the train feature cell; reuse them here.
test = test.merge(deepwalk_question, 'left', 'question_id')

test = test.merge(deepwalk_author, 'left', 'author_id')

time_diff_test_b = pd.read_hdf('./my_feat/time_diff_test_b.h5', key='data')

test = pd.concat([test, time_diff_test_b], axis=1)

(1141718, 43)
(1141718, 45)
(1141718, 47)
time: 57.2 s


In [8]:
# Sanity check: (rows, columns) of the fully assembled test-b frame.
test.shape

(1141718, 100)

time: 17.2 ms


In [9]:
# Sanity check: selecting used_feats fails with a KeyError if any training feature is missing from `test`.
test[used_feats].shape

(1141718, 94)

time: 1.09 s


In [10]:
# Persist the test-b model-input columns (same columns and order as used_feats) to HDF5.
test[used_feats].to_hdf('./my_feat/test_b_feat_sunrui.h5', key='data', index=None)

time: 2.46 s


In [7]:
# Score the test-b rows with the booster truncated at its best (early-stopped) iteration.
test_x = test[used_feats].reset_index(drop=True)

test_pred = gbm.predict(test_x, num_iteration=gbm.best_iteration)

# Build the submission frame with .assign(), which returns a new frame.
# Adding a column to a column-subset of invite_info_evaluate is the pattern that
# raises pandas' SettingWithCopyWarning (hidden in this notebook because warnings
# are globally ignored).
# NOTE(review): this relies on `test` rows being in the same order as
# invite_info_evaluate — confirm that none of the left merges duplicated rows.
result = invite_info_evaluate[['question_id', 'author_id', 'invite_time']].assign(result=test_pred)

# Tab-separated, no header, no index.
# Original author's note: keeps three seq features — all, 3 and 5.
result.to_csv('./submit/lgb_new.txt', sep='\t', index=False, header=False)

time: 34.4 s


In [11]:
# Save the booster to disk, truncated at the best early-stopped iteration.
gbm.save_model(filename='./model/lgb_sunrui.model', num_iteration=gbm.best_iteration)

<lightgbm.basic.Booster at 0x7fabf0ce7748>

time: 245 ms
