In [1]:
import lightgbm as lgb
from sklearn.model_selection import KFold
from scipy import sparse
import warnings
import time
import sys
import os
import datetime
from sklearn.metrics import mean_squared_error
from sklearn.metrics import log_loss
from model_zoo import my_lgb,my_xgb
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn import linear_model
%matplotlib inline

In [2]:
train_df = pd.read_csv('dataset/train_dataset.csv')
test_df = pd.read_csv('dataset/test_dataset.csv')

In [3]:
def generate_feature(df):
    df['用户前五个月平均消费值（元）'] = (df['用户近6个月平均消费值（元）']*6 - df['用户账单当月总费用（元）'])/5
    
    df['当月消费值较前五个月平均消费值'] = df['用户账单当月总费用（元）'] - df['用户前五个月平均消费值（元）']
    
    app_col = []
    for col in df.columns:
        if '应用' in col:
            app_col.append(col)
    df['各类应用使用总和'] = df[app_col].sum(axis=1)
    
    # count 特征
    df['用户近6个月平均消费值（元）'] = np.rint(df['用户近6个月平均消费值（元）'])
    feature = ['用户网龄（月）','用户近6个月平均消费值（元）']
    for f in feature:
        new_feature = f + '_count'
        temp = df.groupby(f).size().reset_index().rename(columns={0: new_feature})
        df = df.merge(temp, 'left', on=f)

    # df['人均消费'] = df['用户账单当月总费用（元）']/(df['当月通话交往圈人数'] + 1)
    
    # df['交通消费'] = df['当月火车类应用使用次数'] + df['当月飞机类应用使用次数']
    
    df['当月视频播放类应用使用次数'] = np.where(df['当月视频播放类应用使用次数']>30000, 30000, df['当月视频播放类应用使用次数'])
    
    df['当月网购类应用使用次数'] = np.where(df['当月网购类应用使用次数']>10000, 10000, df['当月网购类应用使用次数'])
    
    df['当月金融理财类应用使用总次数'] = np.where(df['当月金融理财类应用使用总次数']>10000, 10000, df['当月金融理财类应用使用总次数'])
    
    df['当月网购类应用使用次数' + '百分比'] = df['当月网购类应用使用次数']/(df['各类应用使用总和'] + 1)
    
    df['用户当月账户余额（元）'] = np.where(df['用户当月账户余额（元）']>2000, 
                                df['用户当月账户余额（元）']/10, df['用户当月账户余额（元）'])
       
    return df

In [4]:
def score(pre, truth):
    return 1 / (MAE(pre, truth) + 1)

def MAE(pre, truth):
    return abs((np.rint(pre) - truth)).mean()

In [5]:
train_df = generate_feature(train_df)
test_df = generate_feature(test_df)

In [6]:
drop_columns = ['用户编码','信用分', '是否大学生客户','各类应用使用总和',
                '用户实名制是否通过核实', '当月是否到过福州山姆会员店', 
                '当月是否逛过福州仓山万达']

X_train = train_df.drop(columns=drop_columns).values
y_train = train_df['信用分'].values
drop_columns.remove('信用分')
X_test = test_df.drop(columns=drop_columns).values

In [25]:
y_train, bins = pd.cut(y_train, 30, retbins=True, labels=False)

In [50]:
param = {'num_leaves': 150,
         'objective':'multiclass',
         "num_class":np.unique(y_train).size,
         'max_depth': -1,
         'learning_rate': 0.01,
         "boosting": "gbdt",
         "feature_fraction": 0.5,
         "bagging_freq": 1,
         "bagging_fraction": 0.5,
         "metric": 'multi_logloss',
         "lambda_l1": 0.1,
         "lambda_l2": 0.1,
         "verbosity": -1}

In [51]:
folds = KFold(n_splits=5, shuffle=True, random_state=2018)
oof = np.zeros(X_train.shape[0])
predictions = np.zeros(X_test.shape[0])

for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train, y_train)):
    print("fold n°{}".format(fold_ + 1))
    trn_data = lgb.Dataset(X_train[trn_idx], y_train[trn_idx])
    val_data = lgb.Dataset(X_train[val_idx], y_train[val_idx])

    num_round = 10000
    clf = lgb.train(param,
                    trn_data,
                    num_round,
                    valid_sets=[trn_data, val_data],
                    verbose_eval=200,
                    early_stopping_rounds=100)
    oof[val_idx] = np.argmax(clf.predict(X_train[val_idx], num_iteration=clf.best_iteration), axis=1) * 9.9 + 421.7

    predictions += (np.argmax(clf.predict(X_test, num_iteration=clf.best_iteration), axis=1) * 9.9 + 421.7) / folds.n_splits


results = np.rint(predictions).astype('int64')

print("score: {:.8f}, MAE: {}".format(score(oof, train_df['信用分']), MAE(oof, train_df['信用分'])))

fold n°1
Training until validation scores don't improve for 100 rounds.
[200]	training's multi_logloss: 1.77099	valid_1's multi_logloss: 2.32413
[400]	training's multi_logloss: 1.36165	valid_1's multi_logloss: 2.21136
[600]	training's multi_logloss: 1.10467	valid_1's multi_logloss: 2.17589
[800]	training's multi_logloss: 0.919312	valid_1's multi_logloss: 2.1692
Early stopping, best iteration is:
[792]	training's multi_logloss: 0.9258	valid_1's multi_logloss: 2.16917
fold n°2
Training until validation scores don't improve for 100 rounds.
[200]	training's multi_logloss: 1.7742	valid_1's multi_logloss: 2.32261
[400]	training's multi_logloss: 1.36513	valid_1's multi_logloss: 2.20433
[600]	training's multi_logloss: 1.10786	valid_1's multi_logloss: 2.16505
[800]	training's multi_logloss: 0.922256	valid_1's multi_logloss: 2.15594
Early stopping, best iteration is:
[826]	training's multi_logloss: 0.901686	valid_1's multi_logloss: 2.15562
fold n°3
Training until validation scores don't improve 

In [47]:
print("score: {:.8f}, MAE: {}".format(score(oof, train_df['信用分']), MAE(oof, train_df['信用分'])))

score: 0.05731040, MAE: 16.44884
