# Step 1: 导入函数工具箱

In [None]:
# 基础工具
import os
import numpy as np
import pandas as pd
import warnings
import matplotlib.pyplot as plt
from IPython.display import display

warnings.filterwarnings('ignore')
%matplotlib inline

import lightgbm as lgb
import xgboost as xgb

# 参数搜索和评价的
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV, KFold, StratifiedKFold, train_test_split

# Configure how pandas renders DataFrames so that no column is hidden
_display_options = {
    'display.max_columns': None,   # show every column, never elide
    'display.width': None,         # let pandas pick the output width
    'display.max_colwidth': 50,    # show at most 50 characters per cell
}
for _option_name, _option_value in _display_options.items():
    pd.set_option(_option_name, _option_value)

def get_project_path(*paths):
    """Build a path below the project directory.

    The base directory is the directory of this file when ``__file__`` is
    defined; otherwise (``__file__`` raises ``NameError``, e.g. in an
    interactive session) the current working directory is used.

    Args:
        *paths: Path components appended to the base directory.

    Returns:
        The base directory joined with ``paths`` via ``os.path.join``.
    """
    try:
        base_dir = os.path.dirname(__file__)
    except NameError:
        # No __file__ available: fall back to the working directory.
        base_dir = os.getcwd()
    return os.path.join(base_dir, *paths)

# Step 2: 数据读取

In [None]:
# Load the space-separated train / test CSV files with pandas
train_csv_path = get_project_path('data', 'used_car_train_20200313.csv')
test_csv_path = get_project_path('data', 'used_car_testB_20200421.csv')
Train_data = pd.read_csv(train_csv_path, sep=' ')
Test_data = pd.read_csv(test_csv_path, sep=' ')

# Report the (rows, columns) shape of each table
for shape_label, frame in (('Train data shape:', Train_data), ('Test data shape:', Test_data)):
    print(shape_label, frame.shape)

## 1) 数据简要浏览

In [None]:
# Preview the first rows of the training data with .head()
Train_data.head()

## 2) 数据信息查看

In [None]:
# .info() gives a quick overview of the column names and of missing (NaN) values
Train_data.info()

In [None]:
# List the column names via .columns
Train_data.columns

In [None]:
# Same .info() overview for the test data
Test_data.info()

## 3) 数据统计信息浏览

In [None]:
# .describe() shows summary statistics of the numeric feature columns
Train_data.describe()

In [None]:
# Same .describe() summary for the test data
Test_data.describe()

# Step 3: 特征与标签构建

## 1) 提取数值类型特征列名

In [None]:
# Names of all training columns whose dtype is not 'object'
numerical_cols = Train_data.select_dtypes(exclude='object').columns
print(numerical_cols)

In [None]:
# Names of all training columns whose dtype is 'object'
categorical_cols = Train_data.select_dtypes(include='object').columns
print(categorical_cols)

## 2) 构建训练和测试样本

In [None]:
# Select the feature columns: drop the explicitly excluded columns and every
# column whose name contains 'Type'
excluded_cols = {
    'SaleID', 'name', 'regDate', 'creatDate', 'price',
    'model', 'brand', 'regionCode', 'seller',
}
feature_cols = [
    col for col in Train_data.columns
    if col not in excluded_cols and 'Type' not in col
]

# Extract the feature columns and the label column to build the training and
# test samples
X_data = Train_data[feature_cols]
Y_data = Train_data['price']

X_test = Test_data[feature_cols]

for shape_label, frame in (('X train shape:', X_data), ('X test shape:', X_test)):
    print(shape_label, frame.shape)

In [None]:
# Small statistics helper used to summarise labels and predictions
def Sta_inf(data):
    """Print basic descriptive statistics of ``data``.

    Prints, one per line and in this order: minimum, maximum, mean,
    peak-to-peak range, standard deviation and variance, each computed
    with the corresponding NumPy function and prefixed by its label.

    Args:
        data: Values accepted by the NumPy reductions used below.

    Returns:
        None. The statistics are only printed.
    """
    summaries = (
        ('_min', np.min),
        ('_max', np.max),
        ('_mean', np.mean),
        ('_ptp', np.ptp),
        ('_std', np.std),
        ('_var', np.var),
    )
    for label, reducer in summaries:
        print(label, reducer(data))

## 3) 统计标签的基本分布信息

In [None]:
# Print the basic statistics of the label column
print('Sta of label:')
Sta_inf(Y_data)

In [None]:
# Plot a histogram of the label to inspect its distribution
plt.hist(Y_data)
plt.show()
plt.close()

## 4) 缺省值用 -1 填补

In [None]:
# Replace missing feature values with -1 in both the training and the test features
X_data = X_data.fillna(-1)
X_test = X_test.fillna(-1)

# Step 4: 模型训练与预测

## 1) 利用 XGBoost 进行五折交叉验证查看模型的参数效果

In [None]:
# XGBoost model evaluated with 5-fold cross-validation
xgr = xgb.XGBRegressor(
    n_estimators=120,
    learning_rate=0.1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.9,
    max_depth=7
)

scores_train = []  # MAE on the training part of each fold
scores = []        # MAE on the validation part of each fold

# The label is a regression target, so use a plain shuffled KFold.
# StratifiedKFold is a classification splitter: it treats every distinct
# price as a class (and rejects continuous float targets outright).
kf = KFold(n_splits=5, shuffle=True, random_state=0)
for train_ind, val_ind in kf.split(X_data):
    train_x = X_data.iloc[train_ind].values
    train_y = Y_data.iloc[train_ind]
    val_x = X_data.iloc[val_ind].values
    val_y = Y_data.iloc[val_ind]

    # fit() retrains the regressor from scratch for every fold
    xgr.fit(train_x, train_y)
    pred_train_xgb = xgr.predict(train_x)
    pred_xgb = xgr.predict(val_x)

    scores_train.append(mean_absolute_error(train_y, pred_train_xgb))
    scores.append(mean_absolute_error(val_y, pred_xgb))

# Average the per-fold errors
print('Train mae:', np.mean(scores_train))
print('Val mae:', np.mean(scores))

## 3) 切分数据集 (Train, Val) 进行模型训练，评价和预测

## 2) 定义 xgb 和 lgb 模型函数

In [None]:
def build_model_xgb(x_train, y_train):
    """Fit an XGBoost regressor with fixed hyper-parameters.

    Args:
        x_train: Training features passed to ``XGBRegressor.fit``.
        y_train: Training labels passed to ``XGBRegressor.fit``.

    Returns:
        The fitted ``xgb.XGBRegressor`` instance.
    """
    hyper_params = {
        'n_estimators': 150,
        'learning_rate': 0.1,
        'gamma': 0,
        'subsample': 0.8,
        'colsample_bytree': 0.9,
        'max_depth': 7,
    }
    regressor = xgb.XGBRegressor(**hyper_params)
    regressor.fit(x_train, y_train)

    return regressor

def build_model_lgb(x_train, y_train):
    """Fit a LightGBM regressor, grid-searching the learning rate.

    Args:
        x_train: Training features passed to ``GridSearchCV.fit``.
        y_train: Training labels passed to ``GridSearchCV.fit``.

    Returns:
        The fitted ``GridSearchCV`` object (not the bare LightGBM model);
        callers use its ``predict`` method.
    """
    candidate_rates = [0.01, 0.05, 0.1, 0.2]
    base_model = lgb.LGBMRegressor(num_leaves=127, n_estimators=150)
    search = GridSearchCV(
        estimator=base_model,
        param_grid={'learning_rate': candidate_rates},
    )
    search.fit(x_train, y_train)

    return search

## 3) 切分数据集 (Train, Val) 进行模型训练，评价和预测

In [None]:
# Hold out 30% of the training data as a validation split.
# The seed is fixed so that the validation MAEs, and the ensemble weights
# derived from them, are reproducible between runs.
x_train, x_val, y_train, y_val = train_test_split(X_data, Y_data, test_size=0.3, random_state=0)

In [None]:
# Split data with val
print('Train lgb...')
model_lgb = build_model_lgb(x_train, y_train)
val_lgb = model_lgb.predict(val_x)
MAE_lgb = mean_absolute_error(val_y, val_lgb)
print('MAE of val with lgb:', MAE_lgb)

print('Predict lgb...')
model_lgb_pre = build_model_lgb(X_data, Y_data)
subB_lgb = model_lgb_pre.predict(X_test)
print('Sta of Predict lgb:')
Sta_inf(subB_lgb)

In [None]:
print('Train xgb...')
model_xgb = build_model_xgb(x_train, y_train)
val_xgb = model_xgb.predict(val_x)
MAE_xgb = mean_absolute_error(val_y, val_xgb)
print('MAE of val with xgb:', MAE_xgb)

print('Predict xgb...')
model_xgb_pre = build_model_xgb(X_data, Y_data)
subB_xgb = model_xgb_pre.predict(X_test)
print('Sta of Predict xgb:')
Sta_inf(subB_xgb)

## 4) 进行两模型的结果加权融合

In [None]:
# 这里我们采用了简单的加权融合的方式
val_Weighted = (1 - MAE_lgb / (MAE_xgb + MAE_lgb)) * val_lgb + (1 - MAE_xgb / (MAE_xgb + MAE_lgb)) * val_xgb
val_Weighted[val_Weighted < 0] = 10 # 由于我们发现预测的最小值有负数，而真实情况下，price 为负是不存在的，由此我们进行对应的后修正
print('MAE of val with Weighted ensemble:', mean_absolute_error(val_y, val_Weighted))

In [None]:
# Blend the test-set predictions with the MAE-based weights
sub_Weighted = (1 - MAE_lgb / (MAE_xgb + MAE_lgb)) * subB_lgb + (1 - MAE_xgb / (MAE_xgb + MAE_lgb)) * subB_xgb
# Apply the same post-correction as for the validation blend: a negative
# price cannot occur, so negative predictions are set to 10
sub_Weighted[sub_Weighted < 0] = 10

# Inspect the distribution of the predicted prices (the blended predictions,
# not the training label)
plt.hist(sub_Weighted)
plt.show()
plt.close()

## 5) 输出结果

In [None]:
# Assemble the submission table: one predicted price per test SaleID
sub = pd.DataFrame()
sub['SaleID'] = Test_data.SaleID
sub['price'] = sub_Weighted

# to_csv does not create missing directories, so make sure the output
# directory exists before writing
submission_path = get_project_path('prediction_result', 'submission.csv')
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
sub.to_csv(submission_path, index=False)

In [None]:
# Preview the first rows of the submission table
sub.head()