In [1]:
conda install lightgbm

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.


Note: you may need to restart the kernel to use updated packages.


In [35]:
import pandas as pd
import lightgbm as lgb
import numpy as np
import datetime
import os

from data_processing import Data

In [41]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn import metrics

In [27]:
available_data = Data('train').data
prediction_data = Data('test').data

In [20]:
available_data

Unnamed: 0,id,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,0,2.3859,15.0,3.827160,1.112100,1280.0,2.486989,34.60,-120.12,0.980
1,1,3.7188,17.0,6.013373,1.054217,1504.0,3.813084,38.69,-121.22,0.946
2,2,4.7750,27.0,6.535604,1.103175,1061.0,2.464602,34.71,-120.45,1.576
3,3,2.4138,16.0,3.350203,0.965432,1255.0,2.089286,32.66,-117.09,1.336
4,4,3.7500,52.0,4.284404,1.069246,1793.0,1.604790,37.80,-122.41,4.500
...,...,...,...,...,...,...,...,...,...,...
37132,37132,3.3438,50.0,4.936508,1.079365,1775.0,3.022222,34.19,-118.36,1.856
37133,37133,3.7308,26.0,5.087533,0.966019,1006.0,4.316901,37.32,-121.86,1.588
37134,37134,4.1716,52.0,4.678862,1.101485,1156.0,1.431734,37.75,-122.44,3.387
37135,37135,2.7143,16.0,5.710074,1.068376,584.0,2.803659,38.40,-120.98,1.592


# LightGBM Trial

In [21]:
available_data.isnull().sum()

id             0
MedInc         0
HouseAge       0
AveRooms       0
AveBedrms      0
Population     0
AveOccup       0
Latitude       0
Longitude      0
MedHouseVal    0
dtype: int64

In [38]:
class Data:
    def __init__(self, data_type: str = 'train'):
        self.data_type = data_type
        self.data = pd.read_csv('train.csv', index_col=0) if data_type == 'train' else pd.read_csv('test.csv',
                                                                                                   index_col=0)

    @staticmethod
    def get_train_test_split(test_size: float = 0.3, random_state: int = 42):
        X = pd.read_csv('train.csv', index_col=0).drop('MedHouseVal', axis=1)
        y = pd.read_csv('train.csv', index_col=0)['MedHouseVal']
        x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
        return x_train, x_test, y_train, y_test

In [39]:
data = Data('train')
x_train, x_test, y_train, y_test = data.get_train_test_split()

In [55]:
lgb_train = lgb.Dataset(x_train, y_train)
lgb_eval = lgb.Dataset(x_test, y_test, reference=lgb_train)

params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': {'l2', 'auc'},
    'num_leaves': 30,
    'learning_rate': 0.1,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}

print('开始训练...')

gbm = lgb.train(params,
                lgb_train,
                num_boost_round=30,
                valid_sets=lgb_eval,
                early_stopping_rounds=5)

print('保存模型...')
gbm.save_model('lgb_model.txt')

print('开始预测...')
y_pred = gbm.predict(x_test, num_iteration=gbm.best_iteration)

print('预估结果的rmse为:')
print(mean_squared_error(y_test, y_pred) ** 0.5)

开始训练...
You can set `force_col_wise=true` to remove the overhead.
[1]	valid_0's l2: 1.18907	valid_0's auc: 1
Training until validation scores don't improve for 5 rounds
[2]	valid_0's l2: 1.06805	valid_0's auc: 1
[3]	valid_0's l2: 0.954586	valid_0's auc: 1
[4]	valid_0's l2: 0.872976	valid_0's auc: 1
[5]	valid_0's l2: 0.794956	valid_0's auc: 1
[6]	valid_0's l2: 0.729541	valid_0's auc: 1
Early stopping, best iteration is:
[1]	valid_0's l2: 1.18907	valid_0's auc: 1
保存模型...
开始预测...
预估结果的rmse为:
1.0904448638144022
