In [1]:
import lightgbm as lgb
from sklearn.model_selection import KFold
from scipy import sparse
import warnings
import time
import sys
import os
import datetime
from sklearn.metrics import mean_squared_error
from sklearn.metrics import log_loss
from model_zoo import my_lgb,my_xgb
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

In [30]:
# Read the training and test tables from the local `dataset/` directory
# (paths are relative to the notebook's working directory).
train_df, test_df = map(
    pd.read_csv,
    ('dataset/train_dataset.csv', 'dataset/test_dataset.csv'),
)

In [31]:
def generate_feature(df):
    """Add the engineered spending and app-usage columns to ``df``.

    The frame is modified in place and also returned, so both
    ``generate_feature(df)`` and ``df = generate_feature(df)`` work.

    Columns written (in this order):
      * '用户前五个月平均消费值（元）': (6-month average * 6 - current-month
        bill) / 5, i.e. the average over the five months before the current one.
      * '当月消费值较前五个月平均消费值': current-month bill minus that average.
      * '各类应用使用总和': row-wise sum of every input column whose name
        contains '应用'.
      * '当月网购类应用使用次数百分比': online-shopping app count divided by
        (total + 1); the +1 avoids division by zero for rows with no app usage.

    The function is idempotent: the derived columns themselves contain '应用'
    in their names, so they are explicitly excluded from the total. Without
    that exclusion, re-running the cell on an already processed frame would
    add the previous total and share into the new total.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain '用户近6个月平均消费值（元）', '用户账单当月总费用（元）'
        and '当月网购类应用使用次数'.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with the columns above added.
    """
    six_month_avg_col = '用户近6个月平均消费值（元）'
    current_bill_col = '用户账单当月总费用（元）'
    prev_five_avg_col = '用户前五个月平均消费值（元）'
    total_col = '各类应用使用总和'
    shopping_col = '当月网购类应用使用次数'
    shopping_share_col = shopping_col + '百分比'

    # Recover the five-month average from the six-month average and the current bill.
    df[prev_five_avg_col] = (df[six_month_avg_col] * 6 - df[current_bill_col]) / 5

    df['当月消费值较前五个月平均消费值'] = df[current_bill_col] - df[prev_five_avg_col]

    # Only sum the input app-usage columns; skip the columns this function
    # creates, which would otherwise match the '应用' filter on a second call.
    derived_cols = {total_col, shopping_share_col}
    app_col = [col for col in df.columns if '应用' in col and col not in derived_cols]
    df[total_col] = df[app_col].sum(axis=1)

    df[shopping_share_col] = df[shopping_col] / (df[total_col] + 1)

    # Disabled experiment (travel-info app share). If it is re-enabled, add the
    # new column name to `derived_cols` so the function stays idempotent.
    # df['当月旅游资讯类应用使用次数' + '百分比'] = df['当月旅游资讯类应用使用次数']/(df['各类应用使用总和'] + 1)

    return df

In [32]:
# Run the same feature engineering on both tables (train first, then test).
train_df, test_df = (generate_feature(frame) for frame in (train_df, test_df))

In [None]:
# 特征无用
# train_df['人平均消费'] = train_df['用户账单当月总费用（元）']/train_df['当月通话交往圈人数']

# test_df['人平均消费'] = test_df['用户账单当月总费用（元）']/test_df['当月通话交往圈人数']

In [None]:
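# 用户开始上网的年龄
# NOTE(review): the disabled formula below mixes units — it divides '用户年龄' by 12
# and then subtracts '用户网龄（月）', which is in months. If age is in years (TODO confirm),
# the intended value is presumably 用户年龄 - 用户网龄（月）/12.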

# train_df['开始上网的年龄'] = train_df['用户年龄']/12 - train_df['用户网龄（月）']

# test_df['开始上网的年龄'] = test_df['用户年龄']/12 - test_df['用户网龄（月）']

In [None]:
# 无提升作用
# train_df['近似总消费'] = train_df['用户近6个月平均消费值（元）']*train_df['用户网龄（月）']/6

# test_df['近似总消费'] = test_df['用户近6个月平均消费值（元）']*test_df['用户网龄（月）']/6

In [None]:
# 特征无用，无提升效果
# train_df['六个月平均消费值较五个月平均消费值'] = train_df['用户近6个月平均消费值（元）'] - train_df['用户前五个月平均消费值（元）']

# test_df['六个月平均消费值较五个月平均消费值'] = test_df['用户近6个月平均消费值（元）'] - test_df['用户前五个月平均消费值（元）']

In [53]:
# Non-feature columns removed from both tables: the user id and two flags.
# The target '信用分' is dropped from the training table only; it is not
# dropped from test_df (presumably the test table has no such column).
# Other drop candidates noted from earlier experiments: '各类应用使用总和',
# '当月是否逛过福州仓山万达', '当月是否到过福州山姆会员店'.
drop_columns = ['用户编码', '是否大学生客户', '用户实名制是否通过核实']

y_train = train_df['信用分'].values
X_train = train_df.drop(columns=['信用分'] + drop_columns).values
X_test = test_df.drop(columns=drop_columns).values

# LightGBM settings: MAE-evaluated regression with a small learning rate,
# 50% row/feature subsampling on every iteration, and light L1/L2 penalties.
param = dict(
    num_leaves=60,
    objective='regression',
    max_depth=6,
    learning_rate=0.005,
    boosting="gbdt",
    feature_fraction=0.5,
    bagging_freq=1,
    bagging_fraction=0.5,
    metric='mae',
    lambda_l1=0.1,
    lambda_l2=0.1,
    verbosity=-1,
)

# `my_lgb` comes from the project-local model_zoo module (not shown here);
# it is configured with 5 folds and seed 2018.
clf = my_lgb(folds=5, seed=2018)
clf.inference_folds(X_train, y_train, X_test, param)

fold n°1
Training until validation scores don't improve for 100 rounds.
[200]	training's l1: 20.4102	valid_1's l1: 20.6825
[400]	training's l1: 16.1023	valid_1's l1: 16.6468
[600]	training's l1: 14.8329	valid_1's l1: 15.5509
[800]	training's l1: 14.3925	valid_1's l1: 15.2177
[1000]	training's l1: 14.1851	valid_1's l1: 15.0911
[1200]	training's l1: 14.0491	valid_1's l1: 15.0303
[1400]	training's l1: 13.9467	valid_1's l1: 15.0003
[1600]	training's l1: 13.8581	valid_1's l1: 14.9845
[1800]	training's l1: 13.7828	valid_1's l1: 14.9706
[2000]	training's l1: 13.713	valid_1's l1: 14.9625
[2200]	training's l1: 13.6451	valid_1's l1: 14.9551
[2400]	training's l1: 13.5791	valid_1's l1: 14.9488
[2600]	training's l1: 13.5177	valid_1's l1: 14.9456
[2800]	training's l1: 13.4561	valid_1's l1: 14.9425
[3000]	training's l1: 13.3997	valid_1's l1: 14.9413
Early stopping, best iteration is:
[2919]	training's l1: 13.4223	valid_1's l1: 14.9405
fold n°2
Training until validation scores don't improve for 100 ro

* model_10:用户前五个月平均消费值（元）,六个月平均消费值较五个月平均消费值,num_leaves=40,mae,frac=0.5,l1=0.1，线下0.06381835，线上0.06378457000

* model_11:用户前五个月平均消费值（元）,六个月平均消费值较五个月平均消费值,num_leaves=37,mae,frac=0.5,l1=0.1，线下0.06384775，线上0.06380417000

* model_12:用户前五个月平均消费值（元）,六个月平均消费值较五个月平均消费值,近似总消费，num_leaves=39,mae,frac=0.4,l1=0.1，线下：0.06383161，线上0.06376708000

* model_13:用户前五个月平均消费值（元）,六个月平均消费值较五个月平均消费值,三个百分比。num_leaves=40,max_depth=6,frac=0.5,l1=0.1,l2=0.1,线下0.06384938;

* model_14:用户前五个月平均消费值（元）,六个月平均消费值较五个月平均消费值,网购次数百分比，丢掉是否大学生用户。num_leaves=33,max_depth=6,frac=0.5,l1=0.1,l2=0.1,线下0.06386781，

* model_15:用户前五个月平均消费值（元）,六个月平均消费值较五个月平均消费值,网购次数百分比，num_leaves=35,max_depth=6,frac=0.5,l1=0.1,l2=0.1,线下0.06388707，

* model_16:用户前五个月平均消费值（元）,六个月平均消费值较五个月平均消费值,网购次数百分比,丢掉'当月是否到过福州山姆会员店'，是否大学生用户，num_leaves=35,max_depth=6,frac=0.5,l1=0.1,l2=0.1,线下0.06388365，

* model_17:用户前五个月平均消费值（元）,六个月平均消费值较五个月平均消费值,网购次数百分比,丢掉增益小的几个属性。num_leaves=60，max_depth=7,frac=0.5,l1=0.1,l2=0.1,线下0.06391010.

In [47]:
# Export the result of the fitted `clf` under the name 'model_17'.
# `submit` is implemented in the project-local model_zoo module (not shown here);
# presumably it writes the submission file — TODO confirm.
# NOTE(review): this cell's execution count (47) is lower than the training cell's (53),
# so on a fresh run make sure the training cell has been executed with the intended
# parameters before submitting.
clf.submit(output_name='model_17')

In [None]:
# Feature names in the same order as the columns of X_train: every training
# column except the dropped ones and the target '信用分'.
# The exclusion list is built without mutating `drop_columns`, so this cell can
# be re-run any number of times and the training cell's `X_test` line still
# works afterwards. dict.fromkeys de-duplicates while preserving order, in
# case `drop_columns` already contains the target.
non_feature_columns = list(dict.fromkeys(drop_columns + ['信用分']))
f = clf.importance_feature(train_df.drop(columns=non_feature_columns).columns)
f