In [8]:
import ast

import lightgbm as lgb
import numpy as np
import pandas as pd
import scipy as sp
import sklearn as sk
from matplotlib import pyplot as plt
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold
%pylab inline

Populating the interactive namespace from numpy and matplotlib


# 1 读入数据

In [9]:
# Locations of the pre-processed train / test files, relative to the notebook's
# working directory.
train_data_path, test_data_path = r'./pre_train.txt', r'./pre_test.txt'

In [10]:
# Columns shared by both files. The train file additionally carries `label`
# right after `pid`; the test file does not.
common_features = [
    'gender',
    'age',
    'tagid',
    'time',
    'province',
    'city',
    'model',
    'make',
]
trained_features = ['pid', 'label', *common_features]
tested_features = ['pid', *common_features]

In [11]:
# The train file has no header row, so the column names are supplied explicitly.
train_data = pd.read_csv(
    train_data_path,
    sep=',',
    header=None,
    names=trained_features,
)

In [12]:
# The test file has no header row either; it uses the train layout minus `label`.
test_data = pd.read_csv(
    test_data_path,
    sep=',',
    header=None,
    names=tested_features,
)

In [13]:
# Stack train on top of test so that feature engineering is applied to both at
# once; they are separated again later by row position.
# ignore_index=True gives the combined frame a fresh 0..N-1 index. Without it
# the test rows keep their own 0..m-1 labels and collide with the train rows,
# which makes any later label-based alignment or `.loc` lookup ambiguous.
data = pd.concat([train_data, test_data], axis=0, ignore_index=True)

In [14]:
# Normalise every column except the target: missing values become -1 and the
# whole column is then cast to string. `label` is left untouched.
non_label_columns = [name for name in data.columns if name != 'label']
for col in non_label_columns:
    data[col] = data[col].fillna(-1).astype('str')

In [15]:
# Parse the stringified tag lists back into Python objects (str -> list).
# ast.literal_eval accepts only Python literals, so a malformed or malicious
# cell in the input file raises an error instead of executing code, which is
# what the previous eval() call would have done.
# NOTE(review): a missing tagid was filled with -1 earlier and therefore parses
# to the int -1, not to a list -- same as before this change.
data['tagid'] = data['tagid'].apply(ast.literal_eval)

In [16]:
# Parse the stringified time lists back into Python objects (str -> list).
# ast.literal_eval accepts only Python literals, so the file contents cannot
# execute arbitrary code the way they could through eval().
# NOTE(review): assumes the lists contain plain numeric/string literals only; a
# bare `nan` token would be rejected here -- confirm against the raw file.
data['time'] = data['time'].apply(ast.literal_eval)

In [17]:
# def make_rm_model(x):
#     a = str(x[0]).strip()
#     b = str(x[1]).strip()
#     if b.__contains__(a):
#         b = b.replace(a, '')
#     return b

In [18]:
# data['make'] = data[['model', 'make']].apply(make_rm_model, axis=1)

# 2 特征处理

In [19]:
from sklearn import preprocessing

# Integer-encode each categorical column into a new `<col>_category` column and
# register that column as a model feature. The single encoder instance is
# re-fitted for every column, so after the loop it only holds the mapping of
# the last column processed.
categorical_columns = ['gender', 'age', 'province', 'city', 'model', 'make']
le = preprocessing.LabelEncoder()
used_features = []

for col in categorical_columns:
    encoded_name = '{}_category'.format(col)
    data[encoded_name] = le.fit_transform(data[col])
    used_features.append(encoded_name)

In [37]:
from gensim.models import Word2Vec
import warnings

# Silences every warning for the rest of the session.
warnings.filterwarnings('ignore')

# Dimensionality of the tag embedding.
emb_size = 16

# One "sentence" per row: that row's tag ids, each converted to a string token.
sentences = [
    [str(tag) for tag in tags]
    for tags in data['tagid'].values.tolist()
]


In [38]:
# Quick check of the type of one token in the first sentence (expected: str).
type(sentences[0][1])

str

In [39]:
# Train Word2Vec on the tag sequences (sg=0, hs=1, fixed seed).
# Tags seen fewer than `min_count` times get no vector, hence the vocabulary
# check in the helper below.
model = Word2Vec(sentences, size=emb_size, window=5, min_count=5, sg=0, hs=1, seed=42)


def _mean_tag_vector(tokens, keyed_vectors, width):
    """Return the element-wise mean of the vectors of all in-vocabulary tokens.

    Averaging turns a variable-length tag list into one fixed-length row that
    can be used as tabular features. If no token is in the vocabulary, a list
    of `width` zeros is returned instead.
    """
    known = [keyed_vectors.get_vector(tok) for tok in tokens if tok in keyed_vectors]
    if known:
        return np.mean(known, axis=0)
    return [0] * width


emb_matrix = np.array([_mean_tag_vector(seq, model.wv, emb_size) for seq in sentences])

# Expose each embedding dimension as its own feature column.
for dim in range(emb_size):
    column = '{}_emb_{}'.format('tagid', dim)
    data[column] = emb_matrix[:, dim]
    used_features.append(column)

# Release the model and the token lists.
del model, sentences

In [None]:
# data = data.drop(['tagid','time'],axis=1)

In [40]:
# Split the combined frame back into its train / test parts by row position
# (train rows were concatenated first).
# .copy() makes both parts independent frames: columns are added to `train`
# later on, and doing that on a plain slice of `data` is chained assignment
# (SettingWithCopyWarning, and it is undefined whether `data` is modified too).
n_train = train_data.shape[0]
train = data.iloc[:n_train].copy()
test = data.iloc[n_train:].copy()

In [44]:
# Shuffled 5-fold stratified splitter with a fixed seed.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# One prediction slot per train row (out-of-fold) and per test row.
train_res = np.zeros(len(train))
test_res = np.zeros(len(test))

In [42]:
# List all columns currently present in the combined frame.
data.columns

Index(['pid', 'label', 'gender', 'age', 'tagid', 'time', 'province', 'city',
       'model', 'make', 'gender_category', 'age_category', 'province_category',
       'city_category', 'model_category', 'make_category', 'tagid_emb_0',
       'tagid_emb_1', 'tagid_emb_2', 'tagid_emb_3', 'tagid_emb_4',
       'tagid_emb_5', 'tagid_emb_6', 'tagid_emb_7', 'tagid_emb_8',
       'tagid_emb_9', 'tagid_emb_10', 'tagid_emb_11', 'tagid_emb_12',
       'tagid_emb_13', 'tagid_emb_14', 'tagid_emb_15'],
      dtype='object')

# 3 模型：lightgbm

In [31]:
# LightGBM training parameters: binary objective, AUC as validation metric.
params = dict(
    boosting_type='gbdt',
    objective='binary',
    metric='auc',
    max_depth=-1,
    num_leaves=31,
    learning_rate=0.1,
    verbose=0,
    random_state=42,
    n_jobs=-1,
)

# Feature-importance table: one row per feature; the training loop adds one
# column per fold.
imp_Df = pd.DataFrame({'feature': used_features})

In [33]:
# Stratified K-fold training.
# For every fold: fit LightGBM on the training part with early stopping on the
# held-out part, store the held-out predictions in `train_res` (out-of-fold),
# record feature importances, and add this fold's share of the test prediction
# to `test_res`.

# Start from zeroed buffers so that re-running this cell does not stack a
# second round of fold predictions on top of an already filled `test_res`.
train_res = np.zeros(train.shape[0])
test_res = np.zeros(test.shape[0])

# The feature matrices are identical for every fold, so extract them once
# instead of re-materialising them inside the loop.
train_X = train[used_features].values
train_y = train['label']
test_X = test[used_features].values

for index, (train_index, valid_index) in enumerate(skf.split(train, train['label'])):
    X_train, X_valid = train_X[train_index], train_X[valid_index]
    y_train, y_valid = train_y.iloc[train_index], train_y.iloc[valid_index]

    dtrain = lgb.Dataset(X_train, label=y_train)
    dval = lgb.Dataset(X_valid, label=y_valid)
    # NOTE(review): `early_stopping_rounds` / `verbose_eval` are the keyword
    # form this notebook was run with; newer LightGBM releases expect the
    # equivalent callbacks instead -- confirm against the installed version.
    lgb_model = lgb.train(
        params,
        dtrain,
        num_boost_round=10000,
        valid_sets=[dval],
        early_stopping_rounds=50,
        verbose_eval=50,
    )
    X_valid_pred = lgb_model.predict(X_valid, num_iteration=lgb_model.best_iteration)
    imp_Df['cv' + str(index)] = lgb_model.feature_importance()

    train_res[valid_index] = X_valid_pred
    # Each fold contributes 1/n_splits, so `test_res` ends up as the fold mean.
    test_res = test_res + lgb_model.predict(
        test_X, num_iteration=lgb_model.best_iteration
    ) / skf.n_splits

0
You can set `force_col_wise=true` to remove the overhead.
Training until validation scores don't improve for 50 rounds
[50]	valid_0's auc: 0.770295
[100]	valid_0's auc: 0.782066
[150]	valid_0's auc: 0.784925
[200]	valid_0's auc: 0.786333
[250]	valid_0's auc: 0.787371
[300]	valid_0's auc: 0.788072
[350]	valid_0's auc: 0.788799
[400]	valid_0's auc: 0.789186
[450]	valid_0's auc: 0.789686
[500]	valid_0's auc: 0.789968
[550]	valid_0's auc: 0.78997
Early stopping, best iteration is:
[501]	valid_0's auc: 0.79001
1
You can set `force_col_wise=true` to remove the overhead.
Training until validation scores don't improve for 50 rounds
[50]	valid_0's auc: 0.769544
[100]	valid_0's auc: 0.7823
[150]	valid_0's auc: 0.786334
[200]	valid_0's auc: 0.787963
[250]	valid_0's auc: 0.788763
[300]	valid_0's auc: 0.789336
[350]	valid_0's auc: 0.789818
[400]	valid_0's auc: 0.790315
[450]	valid_0's auc: 0.790574
[500]	valid_0's auc: 0.790721
[550]	valid_0's auc: 0.790876
[600]	valid_0's auc: 0.791041
[650]	val

In [37]:
# Turn the out-of-fold scores into hard labels by rank: rows whose rank falls
# in the lower half get 0, all others 1. Then score the result with F1.
half_of_rows = train.shape[0] * 0.5

train['predict'] = train_res
train['rank'] = train['predict'].rank()

in_lower_half = train['rank'] <= half_of_rows
train['p'] = 1
train.loc[in_lower_half, 'p'] = 0

best_f1_train = f1_score(train['label'].values, train['p'].values)
print(best_f1_train)

0.7115933333333333


In [39]:
# Build the submission: rank the averaged test predictions, label the lower
# half 0 and the rest 1, and write `user_id, category_id` to CSV.
# .copy() detaches `submit` from `test`; adding columns to a plain column
# slice is chained assignment (SettingWithCopyWarning).
submit = test[['pid']].copy()
submit['rank'] = test_res
submit.columns = ['user_id', 'rank']

# Replace the raw score by its rank, then threshold at half the row count.
submit['rank'] = submit['rank'].rank()
submit['category_id'] = 1
submit.loc[submit['rank'] <= int(submit.shape[0] * 0.5), 'category_id'] = 0

# The file name embeds the digits after the decimal point of the train F1.
# NOTE(review): this relies on str(best_f1_train) containing a '.'; a value
# rendered in scientific notation without one would raise IndexError.
submit[['user_id', 'category_id']].to_csv('f1_{}.csv'.format(str(best_f1_train).split('.')[1]), index=False)