In [1]:
"""
作者：librauee
微信公众号：老肥码码码
日期：2020.12.18
线上得分：3.156927
截至日期排名：2
"""
from sklearn.metrics import accuracy_score
import pandas as pd
from tqdm import tqdm
import warnings
import lightgbm as lgb
import numpy as np
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import mean_squared_error
from pypinyin import lazy_pinyin

warnings.filterwarnings('ignore')

In [2]:
# Read the train and test CSVs, keep the training target aside, and stack both
# frames so that features can be built once over all rows.
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test_noLabel.csv'))
y = train['Label']
# Rows are stacked vertically (concat's default axis); the original row index of
# each frame is kept, so index values repeat between the train and test parts.
data = pd.concat([train, test])

In [3]:
# 特征工程 方位特征、房屋数量、面积等特征构造
def location(x):
    """Map an orientation string to an integer code.

    Only the first whitespace-separated token of ``x`` is used; any further
    tokens are ignored.

    Codes: 东=1, 南=2, 西=3, 北=4, 东南=5, 西南=6, 东北=7, 西北=8.

    Raises:
        IndexError: if ``x`` contains no tokens (empty / whitespace only).
        KeyError: if the first token is not one of the eight known directions.
    """
    directions = ('东', '南', '西', '北', '东南', '西南', '东北', '西北')
    # Codes are 1-based and follow the order of `directions`.
    code_of = {name: code for code, name in enumerate(directions, start=1)}
    first_token = x.split()[0]
    return code_of[first_token]

# Replace the orientation text by its integer code (first listed direction only).
data['房屋朝向'] = data['房屋朝向'].apply(location)

# Per-community counts of distinct subway stations / subway lines.
for new_col, source_col in (('小区地铁数量', '地铁站点'), ('小区地铁线路数量', '地铁线路')):
    data[new_col] = data.groupby('小区名')[source_col].transform('nunique')

# Total number of rooms = bedrooms + living rooms + bathrooms.
total_rooms = data['卧室数量'] + data['厅的数量'] + data['卫的数量']
data['总房间数'] = total_rooms

# Average area per room, then the house area apportioned to each room type in
# proportion to its share of the room count.
house_area = data['房屋面积']
data['房间平均面积'] = house_area / total_rooms
for area_col, count_col in (('卧室面积', '卧室数量'), ('厅的面积', '厅的数量'), ('卫的面积', '卫的数量')):
    data[area_col] = house_area * (data[count_col] / total_rooms)

# Area that is not attributed to bedrooms.
data['除卧室外面积'] = house_area - data['卧室面积']

# Floor relative to the building height (total floors + 1 in the denominator).
data['相对高度'] = data['楼层'] / (data['总楼层'] + 1)

In [4]:
# Split the combined frame back into train / test rows, drop the columns that
# are not features, convert the Chinese column names to pinyin, and cast the
# categorical columns to pandas' category dtype.
def _to_pinyin(name):
    """Join the pieces returned by ``lazy_pinyin(name)`` into a single string."""
    return ''.join(lazy_pinyin(name))


# Training rows are the ones that carry a Label; test rows have it missing.
has_label = data['Label'].notna()
X_train = data[has_label].drop(['ID', 'Label'], axis=1)
# X_test keeps its (missing) Label column; it is never fed to the model because
# `features` below is taken from X_train's columns.
X_test = data[~has_label].drop(['ID'], axis=1)

X_train.columns = [_to_pinyin(col) for col in X_train.columns]
X_test.columns = [_to_pinyin(col) for col in X_test.columns]
features = X_train.columns

# Categorical columns, listed by their original names and converted the same
# way as the frame columns so that they match.
cat_cols = [
    _to_pinyin(col)
    for col in ['位置', '出租方式', '区', '地铁站点', '地铁线路', '小区名', '居住状态', '房屋朝向', '装修情况']
]
for frame in (X_train, X_test):
    frame[cat_cols] = frame[cat_cols].astype('category')

In [5]:
# LightGBM with K-fold cross-validation, repeated for several KFold seeds.
# Test predictions are the plain average over every (seed, fold) model.
predictions_lgb = np.zeros((len(X_test)))
params = {
    'metric': 'mean_squared_error',
    # Upper bound on boosting rounds; early stopping below decides the actual count.
    'num_iterations': 30000,
}
seeds = [2019, 2020, 2021]
# Single source of truth for the fold count: used both to build the splitter
# and to scale the averaged test predictions.
n_splits = 5
for seed in seeds:
    # `seed` only changes how KFold shuffles the rows; no seed is put into `params`.
    # Out-of-fold predictions are reset for every seed so the MSE printed below
    # is the out-of-fold error of that seed's split alone.
    oof_lgb = np.zeros(len(X_train))
    KF = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
    for fold_, (trn_idx, val_idx) in enumerate(KF.split(X_train.values, y.values)):
        print("fold n°{}".format(fold_))
        trn_data = lgb.Dataset(X_train.iloc[trn_idx][features], label=y.iloc[trn_idx])
        val_data = lgb.Dataset(X_train.iloc[val_idx][features], label=y.iloc[val_idx])
        # NOTE(review): the `verbose_eval` / `early_stopping_rounds` keywords are
        # believed to have been removed from lgb.train in LightGBM 4.0 in favour
        # of callbacks — confirm against the installed version before upgrading.
        clf = lgb.train(
            params,
            trn_data,
            valid_sets=[trn_data, val_data],
            verbose_eval=500,
            early_stopping_rounds=200,
            categorical_feature=cat_cols,
        )

        oof_lgb[val_idx] = clf.predict(X_train.iloc[val_idx][features], num_iteration=clf.best_iteration)
        # Each model contributes 1 / (n_splits * len(seeds)) of the final prediction.
        predictions_lgb[:] += clf.predict(X_test[features], num_iteration=clf.best_iteration) / n_splits / len(seeds)
    print("MSE: {}".format(mean_squared_error(y, oof_lgb)))

fold n°0
Training until validation scores don't improve for 200 rounds
[500]	training's l2: 2.06844	valid_1's l2: 3.46719
[1000]	training's l2: 1.53799	valid_1's l2: 2.97422
[1500]	training's l2: 1.2055	valid_1's l2: 2.65272
[2000]	training's l2: 1.01638	valid_1's l2: 2.4873
[2500]	training's l2: 0.891195	valid_1's l2: 2.36498
[3000]	training's l2: 0.79192	valid_1's l2: 2.27656
[3500]	training's l2: 0.713878	valid_1's l2: 2.2024
[4000]	training's l2: 0.650374	valid_1's l2: 2.14163
[4500]	training's l2: 0.597055	valid_1's l2: 2.09618
[5000]	training's l2: 0.548916	valid_1's l2: 2.04985
[5500]	training's l2: 0.512664	valid_1's l2: 2.01904
[6000]	training's l2: 0.479886	valid_1's l2: 1.99834
[6500]	training's l2: 0.452136	valid_1's l2: 1.97841
[7000]	training's l2: 0.426955	valid_1's l2: 1.95711
[7500]	training's l2: 0.406609	valid_1's l2: 1.94059
[8000]	training's l2: 0.388589	valid_1's l2: 1.92467
[8500]	training's l2: 0.372715	valid_1's l2: 1.91367
[9000]	training's l2: 0.359268	valid_

[8500]	training's l2: 0.375661	valid_1's l2: 1.82833
[9000]	training's l2: 0.359958	valid_1's l2: 1.81932
[9500]	training's l2: 0.345659	valid_1's l2: 1.80787
[10000]	training's l2: 0.332621	valid_1's l2: 1.7996
[10500]	training's l2: 0.321835	valid_1's l2: 1.79264
[11000]	training's l2: 0.311232	valid_1's l2: 1.78786
[11500]	training's l2: 0.300863	valid_1's l2: 1.78084
Early stopping, best iteration is:
[11693]	training's l2: 0.29719	valid_1's l2: 1.77906
MSE: 1.6247076801577378
fold n°0
Training until validation scores don't improve for 200 rounds
[500]	training's l2: 2.17127	valid_1's l2: 3.19183
[1000]	training's l2: 1.59589	valid_1's l2: 2.65129
[1500]	training's l2: 1.27743	valid_1's l2: 2.36623
[2000]	training's l2: 1.05722	valid_1's l2: 2.14431
[2500]	training's l2: 0.924952	valid_1's l2: 2.02774
[3000]	training's l2: 0.817353	valid_1's l2: 1.92352
[3500]	training's l2: 0.734011	valid_1's l2: 1.84767
[4000]	training's l2: 0.672519	valid_1's l2: 1.79835
[4500]	training's l2: 0.

[12000]	training's l2: 0.285326	valid_1's l2: 1.6183
[12500]	training's l2: 0.277284	valid_1's l2: 1.6136
[13000]	training's l2: 0.270232	valid_1's l2: 1.60869
[13500]	training's l2: 0.263329	valid_1's l2: 1.60399
[14000]	training's l2: 0.25662	valid_1's l2: 1.60064
[14500]	training's l2: 0.250431	valid_1's l2: 1.59714
[15000]	training's l2: 0.244084	valid_1's l2: 1.59351
[15500]	training's l2: 0.238309	valid_1's l2: 1.59024
[16000]	training's l2: 0.233168	valid_1's l2: 1.58821
[16500]	training's l2: 0.228421	valid_1's l2: 1.58559
[17000]	training's l2: 0.2238	valid_1's l2: 1.58337
[17500]	training's l2: 0.220124	valid_1's l2: 1.5812
[18000]	training's l2: 0.21631	valid_1's l2: 1.57955
[18500]	training's l2: 0.212703	valid_1's l2: 1.57794
[19000]	training's l2: 0.208945	valid_1's l2: 1.57527
[19500]	training's l2: 0.205467	valid_1's l2: 1.57309
Early stopping, best iteration is:
[19721]	training's l2: 0.204081	valid_1's l2: 1.5722
fold n°4
Training until validation scores don't improve

[1500]	training's l2: 1.24286	valid_1's l2: 2.29914
[2000]	training's l2: 1.05637	valid_1's l2: 2.13955
[2500]	training's l2: 0.924634	valid_1's l2: 2.01993
[3000]	training's l2: 0.82602	valid_1's l2: 1.91908
[3500]	training's l2: 0.742311	valid_1's l2: 1.83235
[4000]	training's l2: 0.674345	valid_1's l2: 1.76662
[4500]	training's l2: 0.623387	valid_1's l2: 1.72126
[5000]	training's l2: 0.575589	valid_1's l2: 1.68071
[5500]	training's l2: 0.535648	valid_1's l2: 1.64075
[6000]	training's l2: 0.505216	valid_1's l2: 1.61862
[6500]	training's l2: 0.478233	valid_1's l2: 1.59729
[7000]	training's l2: 0.452807	valid_1's l2: 1.57804
[7500]	training's l2: 0.430044	valid_1's l2: 1.5574
[8000]	training's l2: 0.409266	valid_1's l2: 1.5404
[8500]	training's l2: 0.390962	valid_1's l2: 1.5256
[9000]	training's l2: 0.377188	valid_1's l2: 1.51543
[9500]	training's l2: 0.360784	valid_1's l2: 1.49731
[10000]	training's l2: 0.347144	valid_1's l2: 1.4881
[10500]	training's l2: 0.336298	valid_1's l2: 1.4806

In [7]:
# Submission
# The statistic_pred.csv file used here comes from the
# https://github.com/Daya-Jin/rental-prediction repository.
import math  # unused in this cell; kept in case other cells rely on it
stc = pd.read_csv('statistic_pred.csv')

# Rows are matched to the model predictions by position.
stat_rental = stc['Rental'].to_numpy(dtype=float)
if len(stat_rental) < len(predictions_lgb):
    raise ValueError(
        "statistic_pred.csv has {} rows but {} predictions were produced".format(
            len(stat_rental), len(predictions_lgb)
        )
    )
stat_rental = stat_rental[:len(predictions_lgb)]

# Wherever the statistical file provides a Rental value it overrides the
# LightGBM prediction; rows where it is NaN fall back to the model output.
result = list(np.where(np.isnan(stat_rental), predictions_lgb, stat_rental))

submit = pd.read_csv('submit_example.csv')
submit['Label'] = result
submit.to_csv('submit_seed_stc.csv', index=False)