In [1]:
# import lightgbm as lgb
from model_zoo import my_lgb,my_xgb
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

In [2]:
# Read the training and test splits from the local `dataset/` directory.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('dataset/train_dataset.csv', 'dataset/test_dataset.csv')
)

In [46]:
# Bad-data repair for '缴费用户最近一次缴费金额（元）' (last payment amount, yuan)
# and '用户最近一次缴费距今时长（月）' (months since last payment).
def fix1(df):
    """Replace the months-since-last-payment column with a 0/1 flag.

    The flag is 1 where the last-payment amount is strictly greater than 0
    and 0 otherwise (a NaN amount compares False and therefore yields 0).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the column '缴费用户最近一次缴费金额（元）'.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in; the column
        '用户最近一次缴费距今时长（月）' is overwritten in place.
    """
    paid_something = df['缴费用户最近一次缴费金额（元）'] > 0
    df['用户最近一次缴费距今时长（月）'] = np.where(paid_something, 1, 0)
    return df

def age_bins(x):
    """Map an age to an ordinal bracket code between 0 and 5.

    Each bracket has a strict lower bound: ``x > 60`` gives 5, ``x > 50``
    gives 4, ``x > 40`` gives 3, ``x > 30`` gives 2, ``x > 20`` gives 1 and
    anything else gives 0. NaN also gives 0 because every comparison with
    NaN is False.
    """
    # Bounds are tested from the highest down; the first one exceeded wins.
    for bracket, lower_bound in ((5, 60), (4, 50), (3, 40), (2, 30), (1, 20)):
        if x > lower_bound:
            return bracket
    return 0
    
def bins(df):
    """Bucket the age column with ``age_bins`` and one-hot encode the bucket.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the column '用户年龄' (user age).

    Returns
    -------
    pandas.DataFrame
        The frame produced by ``pd.get_dummies`` for the bracket column
        '用户年龄段'. Note that the bracket column is first written onto the
        frame that was passed in.
    """
    bracket_col = '用户年龄段'
    df[bracket_col] = df['用户年龄'].apply(age_bins)
    # Disabled experiments, kept for reference: decile-bin the 6-month average
    # spend and the tenure with pd.qcut, then one-hot encode those bins too.
    #df['用户近六个月消费等级划分'] = pd.qcut(df['用户近6个月平均消费值（元）'], q=10, labels=False)
    #df['用户网龄等级划分'] = pd.qcut(df['用户网龄（月）'], q=10, labels=False)
    #df = pd.get_dummies(df, columns=['用户网龄等级划分'])
    #df = pd.get_dummies(df, columns=['用户近六个月消费等级划分'])
    return pd.get_dummies(df, columns=[bracket_col])

def shopping_encoder(item):
    """Combine the mall-goer flag and the monthly mall visit count into a code.

    ``item`` is any mapping-like row providing '是否经常逛商场的人' (whether
    the user is a frequent mall visitor) and '近三个月月均商场出现次数'
    (average monthly mall appearances over the last three months).

    Returns
    -------
    int
        Flag equal to 0: 0 for fewer than 10 visits, 1 for fewer than 20,
        otherwise 2. Any other flag value: 3 for fewer than 20 visits,
        otherwise 4.
    """
    frequent_flag = item['是否经常逛商场的人']
    monthly_visits = item['近三个月月均商场出现次数']

    if frequent_flag == 0:
        if monthly_visits < 10:
            return 0
        return 1 if monthly_visits < 20 else 2
    return 3 if monthly_visits < 20 else 4

In [73]:
def feature_extract(df):
    """Derive billing, frequency-count and app-usage features.

    The function works on a copy of ``df``, so the caller's frame is never
    modified. (Previously the first few assignments were written onto the
    caller's frame before ``merge`` produced a new one, leaving the input
    half-modified and making a second call on the same input give different
    results.)

    Steps, in order:

    1. Bill ratio / difference features from the current bill, the 6-month
       average spend and the account balance (computed from the raw,
       un-rounded values).
    2. Round the 6-month average spend to the nearest integer and attach a
       ``<column>_count`` frequency feature for tenure and for the rounded
       average spend.
    3. '近似总消费' (approximate total spend): rounded average spend times
       the tenure in months, with tenure capped at 6.
    4. '各类应用使用总和' (total app usage): row-wise sum of every column whose
       name contains '应用'.
    5. Cap three heavy-tailed app-usage counters, compute the online-shopping
       share, and divide balances above 2000 by 10.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the billing, balance, tenure and app-usage columns read
        below.

    Returns
    -------
    pandas.DataFrame
        A new frame with the derived columns added. It comes out of
        ``DataFrame.merge``, so it carries a fresh default integer index.
    """
    # Work on a private copy so none of the writes below leak to the caller.
    df = df.copy()

    df['话费稳定性'] = df['用户账单当月总费用（元）'] / (df['用户近6个月平均消费值（元）'] + 1)
    df['话费波动'] = df['用户账单当月总费用（元）'] - df['用户近6个月平均消费值（元）']
    df['用户余额与当月话费'] = df['用户账单当月总费用（元）'] / (df['用户当月账户余额（元）'] + 1)

    # Count features: round the average spend first so that near-equal values
    # fall into the same group.
    df['用户近6个月平均消费值（元）'] = np.rint(df['用户近6个月平均消费值（元）'])
    for key in ('用户网龄（月）', '用户近6个月平均消费值（元）'):
        counts = df.groupby(key).size().reset_index(name=key + '_count')
        df = df.merge(counts, how='left', on=key)

    # Users with fewer than 6 months of tenure only contribute that many months.
    df['近似总消费'] = df['用户近6个月平均消费值（元）'] * np.where(
        df['用户网龄（月）'] >= 6, 6, df['用户网龄（月）'])

    app_col = [col for col in df.columns if '应用' in col]
    # NOTE(review): the total is taken BEFORE the caps below are applied, so the
    # shopping share uses a capped numerator over an uncapped denominator.
    # This ordering is preserved from the original; confirm it is intended.
    df['各类应用使用总和'] = df[app_col].sum(axis=1)

    usage_caps = {
        '当月视频播放类应用使用次数': 30000,
        '当月网购类应用使用次数': 10000,
        '当月金融理财类应用使用总次数': 10000,
    }
    for usage_col, cap in usage_caps.items():
        df[usage_col] = np.where(df[usage_col] > cap, cap, df[usage_col])

    # +1 on both sides keeps the share defined when total usage is zero.
    df['当月网购类应用使用次数' + '百分比'] = (df['当月网购类应用使用次数'] + 1) / (df['各类应用使用总和'] + 1)

    # Balances above 2000 are scaled down by a factor of 10; others are kept.
    df['用户当月账户余额（元）'] = np.where(df['用户当月账户余额（元）'] > 2000,
                                  df['用户当月账户余额（元）'] / 10,
                                  df['用户当月账户余额（元）'])

    # Disabled experiments, kept for reference:
    # df['交通消费'] = df['当月火车类应用使用次数'] + df['当月飞机类应用使用次数']
    # df['当月网购类应用使用次数' + '百分比'] = (df['当月网购类应用使用次数'])/(df['各类应用使用总和'] + 5)
    # df['各类应用使用总和'] = df[app_col].sum(axis=1)
    # df['商场编码'] = df[['是否经常逛商场的人', '近三个月月均商场出现次数']].apply(shopping_encoder, axis=1)
    # df = pd.get_dummies(df, columns=['商场编码'])

    return df

In [74]:
def process(df):
    """Run the preprocessing pipeline: ``fix1`` followed by ``feature_extract``.

    ``fix1`` overwrites one column of ``df`` in place and returns the same
    object; the value returned here is whatever ``feature_extract`` returns
    for it.
    """
    # The age-bracket step is currently switched off:
    #df = bins(df)
    return feature_extract(fix1(df))

In [75]:
# Keep the label ('信用分', credit score) aside, then stack the training rows
# (label removed) on top of the test rows so both go through the same
# preprocessing in one pass.
target = train_df['信用分']
data = process(
    pd.concat(
        [train_df.drop(columns=['信用分']), test_df],
        axis=0,
        ignore_index=True,
    )
)

In [76]:
# Split the processed frame back into its train and test parts.
# The training rows were stacked first, so the boundary is the number of
# training rows. Deriving it from train_df (instead of a hard-coded 50000)
# keeps `train` aligned with `target` whatever the size of the training CSV,
# and slicing by position does not depend on the index labels.
n_train = len(train_df)
train = data.iloc[:n_train, :]
test = data.iloc[n_train:, :]

In [77]:
# Columns left out of the model matrices: the user id, the helper total of
# app usage, and several yes/no flags.
drop_columns = [
    '用户编码',
    '各类应用使用总和',
    '是否大学生客户',
    '用户实名制是否通过核实',
    '当月是否到过福州山姆会员店',
    '当月是否逛过福州仓山万达',
]

# Plain numpy arrays for the model; train and test share the same column set.
X_train, X_test = (
    frame.drop(columns=drop_columns).values for frame in (train, test)
)
y_train = target.values

In [78]:
# Booster settings handed to the project-local `my_lgb` wrapper
# (presumably forwarded to LightGBM unchanged; confirm in model_zoo).
param = dict(
    num_leaves=40,
    objective='regression',
    max_depth=6,
    learning_rate=0.005,
    boosting="gbdt",
    feature_fraction=0.5,
    bagging_freq=1,
    bagging_fraction=0.5,
    metric='mae',
    lambda_l1=0.22,
    lambda_l2=0.03,
    verbosity=-1,
)

# 5 folds with a fixed seed.
clf = my_lgb(folds=5, seed=2018)
clf.inference_folds(X_train, y_train, X_test, param)
# NOTE(review): despite its name, `mae` receives `clf.oof`, which looks like
# out-of-fold output rather than an error value. Confirm against model_zoo.
mae = clf.oof

fold n°1
Training until validation scores don't improve for 200 rounds.
[200]	training's l1: 20.2654	valid_1's l1: 20.5059
[400]	training's l1: 16.1726	valid_1's l1: 16.6172
[600]	training's l1: 14.9476	valid_1's l1: 15.5623
[800]	training's l1: 14.5147	valid_1's l1: 15.2417
[1000]	training's l1: 14.301	valid_1's l1: 15.1241
[1200]	training's l1: 14.1531	valid_1's l1: 15.0672
[1400]	training's l1: 14.0323	valid_1's l1: 15.0311
[1600]	training's l1: 13.9273	valid_1's l1: 15.0092
[1800]	training's l1: 13.8299	valid_1's l1: 14.9937
[2000]	training's l1: 13.7393	valid_1's l1: 14.982
[2200]	training's l1: 13.6524	valid_1's l1: 14.9718
[2400]	training's l1: 13.5698	valid_1's l1: 14.9652
[2600]	training's l1: 13.4897	valid_1's l1: 14.9596
[2800]	training's l1: 13.4126	valid_1's l1: 14.9559
[3000]	training's l1: 13.3384	valid_1's l1: 14.9525
[3200]	training's l1: 13.2633	valid_1's l1: 14.9493
[3400]	training's l1: 13.1906	valid_1's l1: 14.9494
Early stopping, best iteration is:
[3277]	training

In [59]:
# Names of the columns that were actually fed to the model, in matrix order,
# so that the importance scores can be labelled.
col = train.columns.drop(drop_columns)
f = clf.importance_feature(col)
f

Unnamed: 0,score
用户网龄（月）,10663
用户年龄,9903
当月通话交往圈人数,9461
话费稳定性,9069
话费波动,8489
用户近6个月平均消费值（元）,8137
用户网龄（月）_count,8056
用户账单当月总费用（元）,7624
用户余额与当月话费,7402
当月视频播放类应用使用次数,6925
