In [1]:
import lightgbm as lgb
from sklearn.model_selection import KFold
from scipy import sparse
import warnings
import time
import sys
import os
import datetime
from sklearn.metrics import mean_squared_error
from sklearn.metrics import log_loss
from model_zoo import my_lgb,my_xgb
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn import linear_model
%matplotlib inline

In [2]:
def score(pre, truth):
    """Turn the prediction error into a score of the form ``1 / (1 + MAE)``.

    Args:
        pre: Predicted values; forwarded unchanged to ``MAE``.
        truth: Reference values; forwarded unchanged to ``MAE``.

    Returns:
        ``1 / (MAE(pre, truth) + 1)`` — equals 1 when the error is zero and
        shrinks towards 0 as the error grows.
    """
    mean_abs_err = MAE(pre, truth)
    return 1 / (mean_abs_err + 1)

def MAE(pre, truth):
    """Mean absolute error between integer-rounded predictions and the truth.

    Args:
        pre: Array-like of predictions. Each value is rounded to the nearest
            integer with ``np.rint`` (halves go to the nearest even integer)
            before it is compared.
        truth: Array-like of reference values, aligned with ``pre``.

    Returns:
        The mean of ``|rint(pre) - truth|``.
    """
    rounded = np.rint(pre)
    deviation = rounded - truth
    return abs(deviation).mean()

In [3]:
# Load the training table (which carries the '信用分' label column used below)
# and the test table from the local dataset/ directory.
train_df = pd.read_csv('dataset/train_dataset.csv')
test_df = pd.read_csv('dataset/test_dataset.csv')

In [4]:
def generate_feature(df):
    """Add derived spending / app-usage columns to ``df`` and cap a few outliers.

    The frame is modified in place and the same object is returned.

    Steps, in order:
      1. Average spend over the five months before the current one, recovered
         from the six-month average and the current bill, plus the current
         bill's difference from that average.
      2. Row-wise total of every source column whose name contains '应用'.
      3. Upper caps on three app-usage counters. The caps are applied after
         the total in step 2 is taken, so on a first pass the total reflects
         the uncapped values.
      4. Share of online-shopping app usage (capped count) in the total, with
         +1 added to numerator and denominator.
      5. Account balances above 2000 are divided by 10.

    The derived total and share columns also contain '应用' in their names;
    they are excluded from step 2 so that calling the function again on an
    already-processed frame does not fold them back into the total.
    Step 5 is still not idempotent: a balance that is above 2000 after one
    pass is divided again on the next.

    Args:
        df: pandas DataFrame holding the raw columns referenced below.

    Returns:
        The same DataFrame object, with the new and adjusted columns.
    """
    six_month_avg = '用户近6个月平均消费值（元）'
    current_bill = '用户账单当月总费用（元）'
    prev_five_avg = '用户前五个月平均消费值（元）'
    app_total = '各类应用使用总和'
    shopping = '当月网购类应用使用次数'
    shopping_share = shopping + '百分比'
    balance = '用户当月账户余额（元）'

    # six_month_avg * 6 is the six-month total; removing the current bill and
    # dividing by 5 leaves the average of the five earlier months.
    df[prev_five_avg] = (df[six_month_avg] * 6 - df[current_bill]) / 5
    df['当月消费值较前五个月平均消费值'] = df[current_bill] - df[prev_five_avg]
    # Disabled experiment (bill / (six-month average + 1)):
    # df['话费稳定性'] = df['用户账单当月总费用（元）'] / (df['用户近6个月平均消费值（元）'] + 1)

    # Only source columns take part in the total; skip the columns this
    # function itself creates, which would otherwise match on a second call.
    derived = {app_total, shopping_share}
    app_col = [col for col in df.columns if '应用' in col and col not in derived]
    df[app_total] = df[app_col].sum(axis=1)

    # Cap extreme usage counts at fixed upper bounds.
    upper_caps = {
        '当月视频播放类应用使用次数': 30000,
        shopping: 10000,
        '当月金融理财类应用使用总次数': 10000,
    }
    for col, cap in upper_caps.items():
        df[col] = np.where(df[col] > cap, cap, df[col])

    df[shopping_share] = (df[shopping] + 1) / (df[app_total] + 1)

    # Balances above 2000 are scaled down by a factor of 10.
    df[balance] = np.where(df[balance] > 2000, df[balance] / 10, df[balance])

    return df

In [5]:
# Keep the label ('信用分') aside, then stack the unlabelled training features
# on top of the test rows so both receive identical feature engineering.
# ignore_index=True renumbers the stacked rows 0..N-1.
target = train_df['信用分']
data = pd.concat(
    [
        train_df.drop(columns=['信用分']),
        test_df,
    ],
    axis=0,
    ignore_index=True,
)

In [6]:
# Add the engineered features to the stacked frame.
# NOTE(review): generate_feature modifies the frame it is given, so running
# this cell again without first re-running the cell that builds `data`
# applies the transformations a second time.
data = generate_feature(data)

# Split the stacked frame back into its two parts at the real number of
# training rows instead of a hard-coded 50000, so a training file of a
# different size is not silently mis-split. `data` was built with
# ignore_index=True, so the positional split matches the previous label-based
# one whenever the training set has 50000 rows.
n_train = len(train_df)
train = data.iloc[:n_train, :]
test = data.iloc[n_train:, :]

* 处理年龄问题
* social属性
* 三个话费的统计特性

In [7]:
# Columns left out of the model input: the user id, several yes/no flags, and
# the app-usage total (which generate_feature only needs as an intermediate).
drop_columns = [
    '用户编码',
    '是否大学生客户',
    '各类应用使用总和',
    '用户实名制是否通过核实',
    '当月是否到过福州山姆会员店',
    '当月是否逛过福州仓山万达',
]

# Plain NumPy matrices / vector handed to the fold trainer.
y_train = target.values
X_train = train.drop(columns=drop_columns).values
X_test = test.drop(columns=drop_columns).values

param = {'num_leaves': 30,
         'objective':'regression',
         'max_depth': 6,
         'learning_rate': 0.008,
         "boosting": "gbdt",
         "feature_fraction": 0.5,
         "bagging_freq": 1,
         "bagging_fraction": 0.5,
         "metric": 'mae',
         "lambda_l1": 0.15,
         "lambda_l2": 0.04,
         "verbosity": -1}

param = {'num_leaves': 35,
         'objective':'regression',
         'max_depth': 6,
         'learning_rate': 0.004,
         "boosting": "gbdt",
         "feature_fraction": 0.5,
         "bagging_freq": 1,
         "bagging_fraction": 0.5,
         "metric": 'mae',
         "lambda_l1": 0.15,
         "lambda_l2": 0.05,
         "verbosity": -1}
 94

阴差阳错我用的都是mse,只是用了不同的随机种子

In [30]:
# LightGBM settings for the L1-objective (MAE) model, seed 2018.
param = {
    'num_leaves': 40,
    'objective': 'regression_l1',
    'max_depth': 6,
    'learning_rate': 0.005,
    'boosting': 'gbdt',
    'feature_fraction': 0.5,
    'bagging_freq': 1,
    'bagging_fraction': 0.5,
    'metric': 'mae',
    'lambda_l1': 0.12,
    'lambda_l2': 0.05,
    'verbosity': -1,
}

# 5-fold run through the project's my_lgb wrapper.
clf_mae = my_lgb(folds=5, seed=2018)
clf_mae.inference_folds(X_train, y_train, X_test, param)
# Read the predictions from clf_mae itself. `clf` is a different model (the
# L2-objective one) that is only created in a later cell, so `clf.oof` would
# raise NameError on a fresh kernel or pick up the wrong model's predictions.
# `oof` is presumably the out-of-fold prediction vector — confirm in model_zoo.
mae_2018 = clf_mae.oof

fold n°1
Training until validation scores don't improve for 200 rounds.
[200]	training's l1: 20.5838	valid_1's l1: 20.8463
[400]	training's l1: 16.482	valid_1's l1: 16.9506
[600]	training's l1: 15.1124	valid_1's l1: 15.7547
[800]	training's l1: 14.5891	valid_1's l1: 15.3359
[1000]	training's l1: 14.3342	valid_1's l1: 15.1605
[1200]	training's l1: 14.1772	valid_1's l1: 15.0761
[1400]	training's l1: 14.0601	valid_1's l1: 15.0269
[1600]	training's l1: 13.9649	valid_1's l1: 14.994
[1800]	training's l1: 13.8809	valid_1's l1: 14.9705
[2000]	training's l1: 13.8037	valid_1's l1: 14.9542
[2200]	training's l1: 13.7346	valid_1's l1: 14.942
[2400]	training's l1: 13.6697	valid_1's l1: 14.9341
[2600]	training's l1: 13.6074	valid_1's l1: 14.9259
[2800]	training's l1: 13.5512	valid_1's l1: 14.9207
[3000]	training's l1: 13.4987	valid_1's l1: 14.9186
[3200]	training's l1: 13.445	valid_1's l1: 14.9156
[3400]	training's l1: 13.3932	valid_1's l1: 14.9134
[3600]	training's l1: 13.3443	valid_1's l1: 14.9111


[4200]	training's l1: 13.3276	valid_1's l1: 14.406
[4400]	training's l1: 13.282	valid_1's l1: 14.4057
[4600]	training's l1: 13.2402	valid_1's l1: 14.4048
[4800]	training's l1: 13.2011	valid_1's l1: 14.4038
[5000]	training's l1: 13.1613	valid_1's l1: 14.4045
Early stopping, best iteration is:
[4884]	training's l1: 13.1826	valid_1's l1: 14.4035
score: 0.06401426, MAE: 14.62152


In [32]:
# Score printed at the end of the seed-2018 MAE training log: 0.06401426
# Presumably writes this model's test predictions as a submission file named
# 'model_V1_mae_2018' — the submit method lives in model_zoo; confirm there.
clf_mae.submit(output_name = 'model_V1_mae_2018')

In [None]:
# LightGBM settings for the second L1-objective (MAE) run. Compared with the
# seed-2018 MAE run, only lambda_l1 (0.13 instead of 0.12) and the seed differ.
param = {
    'num_leaves': 40,
    'objective': 'regression_l1',
    'max_depth': 6,
    'learning_rate': 0.005,
    'boosting': 'gbdt',
    'feature_fraction': 0.5,
    'bagging_freq': 1,
    'bagging_fraction': 0.5,
    'metric': 'mae',
    'lambda_l1': 0.13,
    'lambda_l2': 0.05,
    'verbosity': -1,
}

# 5-fold run with seed 2019; keep this model's `oof` attribute for later use.
clf_mae_ = my_lgb(folds=5, seed=2019)
clf_mae_.inference_folds(X_train, y_train, X_test, param)
mae_2019 = clf_mae_.oof

fold n°1
Training until validation scores don't improve for 200 rounds.
[200]	training's l1: 20.6431	valid_1's l1: 20.7076
[400]	training's l1: 16.5334	valid_1's l1: 16.6949
[600]	training's l1: 15.172	valid_1's l1: 15.4459
[800]	training's l1: 14.6521	valid_1's l1: 15.0219
[1000]	training's l1: 14.3956	valid_1's l1: 14.8587
[1200]	training's l1: 14.2372	valid_1's l1: 14.7838
[1400]	training's l1: 14.1215	valid_1's l1: 14.7446
[1600]	training's l1: 14.0292	valid_1's l1: 14.7178
[1800]	training's l1: 13.9501	valid_1's l1: 14.7001
[2000]	training's l1: 13.8751	valid_1's l1: 14.6864
[2200]	training's l1: 13.8082	valid_1's l1: 14.6774
[2400]	training's l1: 13.7436	valid_1's l1: 14.6691
[2600]	training's l1: 13.6831	valid_1's l1: 14.6618
[2800]	training's l1: 13.6241	valid_1's l1: 14.6562
[3000]	training's l1: 13.5698	valid_1's l1: 14.6525
[3200]	training's l1: 13.5194	valid_1's l1: 14.6494
[3400]	training's l1: 13.469	valid_1's l1: 14.6455
[3600]	training's l1: 13.422	valid_1's l1: 14.6431

In [None]:
# Submission for the seed-2019 MAE model. It is written under its own name so
# it does not overwrite the seed-2018 submission ('model_V1_mae_2018').
# NOTE(review): the score that used to be noted here, 0.06401426, is the one
# printed by the seed-2018 run; the seed-2019 score is not recorded.
clf_mae_.submit(output_name='model_V1_mae_2019')

In [None]:
# LightGBM settings for the L2-objective model, seed 2018. The objective is
# squared error while the monitored metric stays MAE.
param = {
    'num_leaves': 30,
    'objective': 'regression_l2',
    'max_depth': 6,
    'learning_rate': 0.008,
    'boosting': 'gbdt',
    'feature_fraction': 0.5,
    'bagging_freq': 1,
    'bagging_fraction': 0.5,
    'metric': 'mae',
    'lambda_l1': 0.15,
    'lambda_l2': 0.05,
    'verbosity': -1,
}

# 5-fold run; `clf` is reused further down for blending and feature importance.
clf = my_lgb(folds=5, seed=2018)
clf.inference_folds(X_train, y_train, X_test, param)
mse_2018 = clf.oof

In [None]:
# Residuals (prediction minus label) of the MAE model on the training set.
# The name `mae` used here before is never defined in this notebook, so the
# predictions are read from the seed-2018 MAE model directly.
# NOTE(review): assumes the seed-2018 MAE model is the one intended — confirm.
residual = pd.Series(clf_mae.oof - y_train)
# The file name, including its spelling, is kept unchanged so anything that
# already reads this file still finds it.
residual.to_csv('mae_resdiual.csv')

param = {'num_leaves': 35,
         'objective':'regression',
         'max_depth': 6,
         'learning_rate': 0.005,
         "boosting": "gbdt",
         "feature_fraction": 0.5,
         "bagging_freq": 1,
         "bagging_fraction": 0.5,
         "metric": 'mse',
         "lambda_l1": 0.16,
         "lambda_l2": 0.1,
         "verbosity": -1}

In [None]:
# LightGBM settings for the second L2-objective run (seed 2019): more leaves,
# a lower learning rate and stronger regularisation than the seed-2018 L2 run.
param = {
    'num_leaves': 35,
    'objective': 'regression_l2',
    'max_depth': 6,
    'learning_rate': 0.005,
    'boosting': 'gbdt',
    'feature_fraction': 0.5,
    'bagging_freq': 1,
    'bagging_fraction': 0.5,
    'metric': 'mae',
    'lambda_l1': 0.16,
    'lambda_l2': 0.1,
    'verbosity': -1,
}

# 5-fold run with seed 2019; keep this model's `oof` attribute for blending.
clf_mse = my_lgb(folds=5, seed=2019)
clf_mse.inference_folds(X_train, y_train, X_test, param)
mse_2019 = clf_mse.oof

In [None]:
# clf_mse.submit(output_name = 'model_27')
# clf.submit(output_name = 'model_31')

In [22]:
def submit(model_name='default', predictions=None):
    """Write ``predictions`` into a copy of the example submission file.

    The template 'dataset/submit_example.csv' is read, its ' score' column is
    replaced by the predictions rounded to the nearest integer, and the result
    is saved as 'output/<model_name>.csv' without the index column. The
    'output' directory is created if it does not exist yet.

    Args:
        model_name: Base name (without extension) of the file to write.
        predictions: Array-like of numeric predictions, one per template row.
            Must be supplied; the ``None`` default is only a placeholder and
            fails inside ``np.rint``.
    """
    sub_df = pd.read_csv('dataset/submit_example.csv')
    # The leading space in ' score' is kept as-is; presumably it matches the
    # header of the template file — verify against submit_example.csv.
    sub_df[' score'] = np.rint(predictions).astype(int)
    # to_csv does not create missing directories, so make sure it is there.
    os.makedirs('output', exist_ok=True)
    sub_df.to_csv("output/{}.csv".format(model_name), index=False)

In [None]:
# Blend weights for the two L2-objective runs (seed 2018 and seed 2019).
# NOTE(review): the weights add up to 1.002, not 1. The experiment notes in
# this notebook list a "0.502, 0.5" blend, so this looks deliberate — confirm.
p = 0.50
q = 0.502

# Offline check: score of the blended mse_2018 / mse_2019 vectors against the
# training labels.
results = mse_2018 * p + mse_2019 * q
print(score(results, y_train))

# Same weights applied to the two models' `results` attributes (presumably
# their test-set predictions — confirm in model_zoo), then written out.
res = clf.results * p + clf_mse.results * q
submit(model_name='model_V1', predictions=res)

* model_10:用户前五个月平均消费值（元）,六个月平均消费值较五个月平均消费值,num_leaves=40,mae,frac=0.5,l1=0.1线下0.06381835，线上0.06378457000

* model_11:用户前五个月平均消费值（元）,六个月平均消费值较五个月平均消费值,num_leaves=37,mae,frac=0.5,l1=0.1线下0.06384775，线上0.06380417000

* model_12:用户前五个月平均消费值（元）,六个月平均消费值较五个月平均消费值,近似总消费，num_leaves=39,mae,frac=0.4,l1=0.1，线下：0.06383161，线上0.06376708000

* model_13:用户前五个月平均消费值（元）,六个月平均消费值较五个月平均消费值,三个百分比。num_leaves=40,max_depth=6,frac=0.5,l1=0.1,l2=0.1,线下0.06384938;

* model_14:用户前五个月平均消费值（元）,六个月平均消费值较五个月平均消费值,网购次数百分比，丢掉是否大学生用户。num_leaves=33,max_depth=6,frac=0.5,l1=0.1,l2=0.1,线下0.06386781，

* model_15:用户前五个月平均消费值（元）,六个月平均消费值较五个月平均消费值,网购次数百分比，num_leaves=35,max_depth=6,frac=0.5,l1=0.1,l2=0.1,线下0.06388707，线上0.06385591000

* model_16:用户前五个月平均消费值（元）,六个月平均消费值较五个月平均消费值,网购次数百分比,丢掉'当月是否到过福州山姆会员店'，是否大学生用户，num_leaves=35,max_depth=6,frac=0.5,l1=0.1,l2=0.1,线下0.06388365,线上0.06384049000

* model_17:用户前五个月平均消费值（元）,六个月平均消费值较五个月平均消费值,网购次数百分比,丢掉增益小的几个属性。num_leaves=60，max_depth=7,frac=0.5,l1=0.1,l2=0.1,线下0.06391010，线上0.06381682000

* model_18:将余额中超过2000的除以10，其他同model_15。线下0.06389981，线上0.06386208000

* model_19:同18，学习率0.005,0.06389981

* model_20:五折交叉， mae与mse融合

* model_21:0.502, 0.5权重融合

* model_22:单模，mae损失调参后。最终版

* model_23:单模，mse损失调参后。最终版

In [None]:
# Feature names in the same order as the columns of X_train, passed to the
# L2 model's importance_feature helper (defined in model_zoo).
# (An earlier attempt to strip '信用分' from drop_columns via list.remove was
# disabled; list.remove returns None, so assigning its result would have
# replaced the list with None.)
col = train.drop(columns=drop_columns).columns
f = clf.importance_feature(col)

# Bare last expression so the notebook renders the result.
f

In [19]:
# Load two previously written submission files so they can be blended.
# NOTE(review): no visible cell in this notebook writes 'model_25.csv' or
# 'model_V1_mae.csv' (the MAE cells use the name 'model_V1_mae_2018'), so both
# files must already exist in output/ — confirm where they come from.
res1 = pd.read_csv('output/model_25.csv')
res2 = pd.read_csv('output/model_V1_mae.csv')

In [23]:
# Final blend of the two loaded submissions: weights 0.501 and 0.499 on their
# ' score' columns; submit() rounds the result before writing it out.
res = (
    0.501 * res1[' score']
    + 0.499 * res2[' score']
)
submit(model_name='model_V1_finally', predictions=res)