# 模型训练

1. 异常值分析

In [43]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats # 统计
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [44]:
train_data_file = '../zhengqi_train.txt' # ./代表当前目录
test_data_file = '../zhengqi_test.txt'
train_data = pd.read_csv(train_data_file, sep='\t', encoding='utf-8') # 分隔符为'\t'
test_data = pd.read_csv(test_data_file, sep='\t', encoding='utf-8')


In [45]:
# 如V9是异常值，分别将训练集和测试集中的异常值删除
train_data = train_data[train_data['V9'] > -7.5] # 2888行变成2886行 删除了俩异常值
test_data = test_data[test_data['V9'] > -7.5] # 1925行不变

2. 最大值和最小值的归一化

In [46]:
from sklearn import preprocessing 
# feature_columns = [col for col in train_data.columns if col not in ['target']]
feature_columns = train_data.columns.tolist()[:-1]
# 归一化(不对target进行归一化)
min_max_scaler = preprocessing.MinMaxScaler()  # 创建对象
train_data_scaler = min_max_scaler.fit_transform(train_data[feature_columns]) # 只对train集进行fit
test_data_scaler = min_max_scaler.transform(test_data) # 对test集只进行transform
train_data_scaler = pd.DataFrame(train_data_scaler) # ndarray -> df
train_data_scaler.columns = feature_columns # add columns
train_data_scaler['target'] = train_data['target'] # add 'target'

test_data_scaler = pd.DataFrame(test_data_scaler)
test_data_scaler.columns = feature_columns

# display(train_data_scaler.describe())
# display(test_data_scaler.describe())

3. PCA处理

In [47]:
# PCA保留16个主成分

from sklearn.decomposition import PCA # 主成分分析法

pca = PCA(n_components=16)
new_train_pca_16 = pca.fit_transform(train_data_scaler.iloc[:,:-1])
new_test_pca_16 = pca.transform(test_data_scaler)

new_train_pca_16 = pd.DataFrame(new_train_pca_16)
new_test_pca_16 = pd.DataFrame(new_test_pca_16)

new_train_pca_16['target'] = train_data_scaler['target']


## 回归模型

1. 线性回归模型

1.1 PCA取特征->分割训练集

In [48]:
"""
new_train_pca_16: PCA处理后只剩下16个特征的训练集 (2886, 17)
new_test_pca_16: PCA处理后只剩下16个特征的测试集  (1925, 16)
train: 即X，没有target (2886, 16)
target: 即y (2886, )
"""

from sklearn.model_selection import train_test_split
# display(new_train_pca_16.info()) # 为何target少2个值？
new_train_pca_16 = new_train_pca_16.fillna(0)  # 缺失值填充为0；采用PCA保留16维特征的数据
train = new_train_pca_16[new_test_pca_16.columns] # train 就是X，没有target (2886, 16)
target = new_train_pca_16['target']  # (2886, )

# display(train_data.shape) # (2886, 39)
# display(test_data.shape)  # (1925, 38)
# 切分训练集 训练数据80%，验证数据20%； 测试集始终没动
train_data, test_data, train_target, test_target = train_test_split(
    train, target, test_size=0.2, random_state=0) # 那原来的train_data和test_data变量怎么办
# display(train_data.shape) # (2308, 17) # 特征改为17个
# display(test_data.shape)  # (578, 17) # 此处test属于原训练集

In [49]:
from sklearn.metrics import mean_squared_error # 评价指标
from sklearn.linear_model import LinearRegression # 导入线性回归模型
clf = LinearRegression()
clf.fit(train_data,train_target)
test_pred = clf.predict(test_data)

score = mean_squared_error(test_target, clf.predict(test_data)) # MSE
print('LinearRegression:  ', score)
print('R2_Score_from_clf:     ', clf.score(test_data, test_target))
print('R2_score_from_myself:  ', cal_r2_score(test_data, test_target, clf.predict(test_data)))

LinearRegression:   0.27169987353423153
R2_Score_from_clf:      0.7350903122742058
R2_score_from_myself:   0.7350903122742058


2. K近邻回归模型

In [50]:
from sklearn.neighbors import KNeighborsRegressor # 回归
clf = KNeighborsRegressor(n_neighbors=3) # 最近的3个
clf.fit(train_data, train_target)
test_pred = clf.predict(test_data)

score = mean_squared_error(test_target, clf.predict(test_data))
print('KNeighborsRegressor:  ', score)
print('R2_Score_from_clf:     ', clf.score(test_data, test_target))
print('R2_score_from_myself:  ', cal_r2_score(test_data, test_target, clf.predict(test_data)))

KNeighborsRegressor:   0.26758239023452524
R2_Score_from_clf:      0.7391048935139868
R2_score_from_myself:   0.7391048935139868


3. 决策树回归模型

In [51]:
from sklearn.tree import DecisionTreeRegressor # 决策树回归算法
clf = DecisionTreeRegressor()
clf.fit(train_data, train_target)
test_pred = clf.predict(test_data)

score = mean_squared_error(test_target, clf.predict(test_data))
print('DecisionTreeRegressor:  ', score) # 这准确率不错啊
print('R2_Score_from_clf:     ', clf.score(test_data, test_target))
print('R2_score_from_myself:  ', cal_r2_score(test_data, test_target, clf.predict(test_data)))

DecisionTreeRegressor:   0.5825566211072664
R2_Score_from_clf:      0.4320023392992992
R2_score_from_myself:   0.4320023392992992


4. 集成学习回归模型

4.1 随机森林回归模型

In [52]:
from sklearn.ensemble import RandomForestRegressor
clf = RandomForestRegressor(n_estimators=200) # 200棵树模型
clf.fit(train_data, train_target)
test_pred = clf.predict(test_data)

score = mean_squared_error(test_target, clf.predict(test_data))
print('RandomForestRegressor:  ', score) # 准确率就这？？
print('R2_Score_from_clf:     ', clf.score(test_data, test_target))
print('R2_score_from_myself:  ', cal_r2_score(test_data, test_target, clf.predict(test_data)))

RandomForestRegressor:   0.25260074332651383
R2_Score_from_clf:      0.7537121266804732
R2_score_from_myself:   0.7537121266804732


4.2 LGB回归模型

In [55]:
import lightgbm as lgb                              # LightGBM模型
clf = lgb.LGBMRegressor(
    learning_rate=0.01,
    max_depth=-1,
    n_estimators=5000,
    boosting_type='gbdt',
    random_state=2019,
    objective='regression'
)

clf.fit(X=train_data, y=train_target, eval_metric='MSE', verbose=50)
score = mean_squared_error(test_target, clf.predict(test_data))
print('lightGbm:  ', score)
print('R2_Score_from_clf:     ', clf.score(test_data, test_target))
print('R2_score_from_myself:  ', cal_r2_score(test_data, test_target, clf.predict(test_data)))

lightGbm:   0.2464496224326475
R2_Score_from_clf:      0.7597095218723132
R2_score_from_myself:   0.7597095218723132
