In [21]:
# Mount Google Drive at /content/drive (Colab only); the data and submission
# paths used later in this notebook live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# 事前準備

In [22]:
# Base name of the submission file
sub_name = 'pm_lightgbm'

# Columns that are dropped from the frame before it is handed to the model
del_columns = [
    'pm25_mid',
    'id',
    'hemisphere',
    'city_month_mean',
    # 'city_year_month_mean' is intentionally not dropped
    'country_month_mean',
    'country_month_median',
    'country_year_month_mean',
]

# データの読み込みと確認

In [23]:
# ライブラリのインポート
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder

# Fix the global random seeds (stdlib `random` and NumPy) for reproducibility
import random
SEED = 1234
random.seed(SEED)
np.random.seed(SEED)

# Load the preprocessed data and the sample submission (which has no header row)
df = pd.read_csv(
    '/content/drive/MyDrive/pm/data/processed/processed_df.csv'
)
submission = pd.read_csv(
    '/content/drive/MyDrive/pm/data/input/submit_sample.csv',
    header=None,
)

In [24]:
# Display the loaded frame as the cell output for a quick visual check.
df

Unnamed: 0,id,year,month,day,Country,City,lat,lon,co_cnt,co_min,...,dew_max,dew_var,pm25_mid,hemisphere,season,city_month_mean,city_year_month_mean,country_month_mean,country_year_month_mean,country_month_median
0,1,2019,1,1,Australia,Brisbane,-27.46794,153.02809,3.663562,0.559044,...,15.112,13.424,19.901,1,summer,33.808158,23.133500,44.427941,40.310798,28.690
1,2,2019,1,1,Australia,Darwin,-12.46113,130.84185,3.871201,1.279266,...,24.221,2.021,13.741,1,summer,28.573727,22.349633,44.427941,40.310798,28.690
2,3,2019,1,1,Australia,Melbourne,-37.81400,144.96332,2.890372,0.783902,...,15.422,6.355,25.918,1,summer,43.757293,40.569548,44.427941,40.310798,28.690
3,4,2019,1,1,Australia,Newcastle,-32.92953,151.78010,4.158883,1.720263,...,13.344,9.417,174.370,1,summer,71.098849,55.471935,44.427941,40.310798,28.690
4,5,2019,1,1,Australia,Perth,-31.95224,115.86140,3.871201,1.738534,...,12.272,4.109,167.063,1,summer,45.720391,38.078800,44.427941,40.310798,28.690
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
249445,249446,2021,12,31,Turkey,Antakya,36.98615,35.32531,4.584967,1.735894,...,4.512,3.938,,0,winter,112.648742,163.016484,84.940667,83.449246,68.519
249446,249447,2021,12,31,Turkey,İzmit,40.19559,29.06013,4.820282,2.924290,...,6.861,2.145,,0,winter,72.303220,60.006387,84.940667,83.449246,68.519
249447,249448,2021,12,31,Turkey,Samsun,41.00500,39.72694,5.017280,0.120446,...,12.661,13.201,,0,winter,72.379655,70.846774,84.940667,83.449246,68.519
249448,249449,2021,12,31,United Kingdom,Southend-on-Sea,51.50853,-0.12574,3.091042,0.055435,...,12.744,0.822,,0,winter,41.818333,46.862000,38.891258,38.630936,30.916


# 特徴量エンジニアリング

In [25]:
# Collect the names of the object-dtype columns
categories = df.columns[df.dtypes == 'object']
print(categories)

# Label-encode every object-dtype column
for cat in categories:
    print(cat)

    # Assign the filled column back instead of calling fillna(inplace=True) on
    # the df[cat] selection: under pandas Copy-on-Write the in-place call acts
    # on a temporary object and leaves the NaNs in `df` untouched.
    df[cat] = df[cat].fillna('missing')

    # A fresh encoder per column; fit and transform in one step
    df[cat] = LabelEncoder().fit_transform(df[cat])

    # LabelEncoder only maps the labels to integers, so cast the result to
    # 'category' to keep the column marked as categorical
    df[cat] = df[cat].astype('category')

Index(['Country', 'City', 'season'], dtype='object')
Country
City
season


In [26]:
# Split back into train and test: rows with a target value are training rows,
# rows whose target is missing are the rows to predict
has_target = df['pm25_mid'].notna()
train = df[has_target]
test = df[~has_target]

In [27]:
# Print the column index of the training split followed by the column count
print(train.columns, train.shape[1])

Index(['id', 'year', 'month', 'day', 'Country', 'City', 'lat', 'lon', 'co_cnt',
       'co_min', 'co_mid', 'co_max', 'co_var', 'o3_cnt', 'o3_min', 'o3_mid',
       'o3_max', 'o3_var', 'so2_cnt', 'so2_min', 'so2_mid', 'so2_max',
       'so2_var', 'no2_cnt', 'no2_min', 'no2_mid', 'no2_max', 'no2_var',
       'temperature_cnt', 'temperature_min', 'temperature_mid',
       'temperature_max', 'temperature_var', 'humidity_cnt', 'humidity_min',
       'humidity_mid', 'humidity_max', 'humidity_var', 'pressure_cnt',
       'pressure_min', 'pressure_mid', 'pressure_max', 'pressure_var',
       'ws_cnt', 'ws_min', 'ws_mid', 'ws_max', 'ws_var', 'dew_cnt', 'dew_min',
       'dew_mid', 'dew_max', 'dew_var', 'pm25_mid', 'hemisphere', 'season',
       'city_month_mean', 'city_year_month_mean', 'country_month_mean',
       'country_year_month_mean', 'country_month_median'],
      dtype='object') 61


# モデルの構築と評価

In [28]:
# Library imports
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from statistics import mean

# K-fold split
# NOTE(review): KFold does not shuffle by default, so each validation fold is a
# contiguous range of rows of `train` — confirm this is the intended CV scheme.
folds = 30
kf = KFold(n_splits=folds)

# Hyperparameters
params = {
    'objective': 'regression',
    'random_seed': 1234,
}

# Features (everything except the dropped columns) and target
X_train = train.drop(del_columns, axis=1)
Y_train = train['pm25_mid']

# One trained model and one validation RMSE per fold
models = []
rmses = []

for train_index, val_index in kf.split(X_train):
    x_train = X_train.iloc[train_index]
    x_valid = X_train.iloc[val_index]
    y_train = Y_train.iloc[train_index]
    y_valid = Y_train.iloc[val_index]

    lgb_train = lgb.Dataset(x_train, y_train)
    lgb_eval = lgb.Dataset(x_valid, y_valid, reference=lgb_train)

    # Early stopping and periodic logging are passed as callbacks: the
    # `early_stopping_rounds` / `verbose_eval` keyword arguments were removed
    # from lgb.train in LightGBM 4.0 and raise TypeError there.
    model = lgb.train(params,
                      lgb_train,
                      valid_sets=[lgb_eval],
                      num_boost_round=1000,  # upper bound on boosting rounds
                      callbacks=[
                          # stop after 20 rounds without validation improvement
                          lgb.early_stopping(stopping_rounds=20),
                          # log the validation metric every 10 rounds
                          lgb.log_evaluation(period=10),
                      ])

    # Score the held-out fold at the best iteration found by early stopping
    y_pred = model.predict(x_valid, num_iteration=model.best_iteration)
    rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
    print(rmse)

    models.append(model)
    rmses.append(rmse)

# Mean RMSE across all folds (the CV score)
print(mean(rmses))

# Plot gain-based feature importance for every fold model
for model in models:
    lgb.plot_importance(model, importance_type='gain',
                        figsize=(10, 15),
                        max_num_features=len(train.columns))

Output hidden; open in https://colab.research.google.com to view.

CV：19.500382449953143

# テストデータの予測

In [29]:
# Build the test feature matrix by dropping the same columns as for training
X_test = test.drop(columns=del_columns)
print(X_test.columns)

# One prediction vector per fold model
preds = [fold_model.predict(X_test) for fold_model in models]

# Average the per-model predictions element-wise
preds_array = np.array(preds)
preds_mean = preds_array.mean(axis=0)

Index(['year', 'month', 'day', 'Country', 'City', 'lat', 'lon', 'co_cnt',
       'co_min', 'co_mid', 'co_max', 'co_var', 'o3_cnt', 'o3_min', 'o3_mid',
       'o3_max', 'o3_var', 'so2_cnt', 'so2_min', 'so2_mid', 'so2_max',
       'so2_var', 'no2_cnt', 'no2_min', 'no2_mid', 'no2_max', 'no2_var',
       'temperature_cnt', 'temperature_min', 'temperature_mid',
       'temperature_max', 'temperature_var', 'humidity_cnt', 'humidity_min',
       'humidity_mid', 'humidity_max', 'humidity_var', 'pressure_cnt',
       'pressure_min', 'pressure_mid', 'pressure_max', 'pressure_var',
       'ws_cnt', 'ws_min', 'ws_mid', 'ws_max', 'ws_var', 'dew_cnt', 'dew_min',
       'dew_mid', 'dew_max', 'dew_var', 'season', 'city_year_month_mean'],
      dtype='object')


# 提出

In [30]:
# Put the averaged predictions into column 1 of the sample submission
submission[1] = preds_mean

# Write the submission CSV without a header row and without the index
output_path = f'/content/drive/MyDrive/pm/submit/{sub_name}.csv'
submission.to_csv(output_path, header=None, index=False)