In [238]:
# Mount Google Drive so the files under /content/drive are reachable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# データの読み込みと確認

In [239]:
# Base name of the submission file ('.csv' is appended when it is written).
sub_name = 'pm_region'

# Columns dropped from the feature matrices (the target and the row id).
del_columns = ['pm25_mid', 'id']

In [240]:
# Library imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder

# Fix the random seeds (NumPy and the stdlib random module)
import random
np.random.seed(1234)
random.seed(1234)

# Load the training data, the test data and the sample submission.
# The sample submission is read with header=None, i.e. it has no header row.
train = pd.read_csv('/content/drive/MyDrive/pm/data/input/train.csv')
test = pd.read_csv('/content/drive/MyDrive/pm/data/input/test.csv')
submission = pd.read_csv('/content/drive/MyDrive/pm/data/input/submit_sample.csv', header=None)

In [241]:
# Stack the training and test rows into one frame with a fresh 0..n-1 index.
df = pd.concat([train, test], sort=False, ignore_index=True)

# 国と都市の確認用データフレーム

In [242]:
# Work on a copy so the Country/City inspection below does not modify df.
df_cc = df.copy()

In [243]:
# Build a combined "Country - City" label for each row.
df_cc['Country-City'] = df_cc['Country'].str.cat(df_cc['City'], sep=' - ')

In [244]:
# Split back into train and test: rows with a target value are training rows,
# rows whose target is missing are test rows.
train_cc = df_cc.loc[df_cc['pm25_mid'].notnull()]
test_cc = df_cc.loc[df_cc['pm25_mid'].isnull()]

In [245]:
# Inspect the Country-City combinations present in the training data.
train_cc['Country-City'].unique()

array(['Australia - Brisbane', 'Australia - Darwin',
       'Australia - Melbourne', 'Australia - Newcastle',
       'Australia - Perth', 'Australia - Wollongong',
       'Belgium - Antwerpen', 'Belgium - Charleroi', 'Belgium - Liège',
       'Brazil - São Paulo', 'Brazil - Vitória', 'Canada - Vancouver',
       'Chile - Calama', 'Chile - Quilpué', 'Chile - Talca',
       'China - Beijing', 'China - Changsha', 'China - Chengdu',
       'China - Foshan', 'China - Fushun', 'China - Fuzhou',
       'China - Guangzhou', 'China - Guiyang', 'China - Haikou',
       'China - Hangzhou', 'China - Harbin', 'China - Hefei',
       'China - Hegang', 'China - Jieyang', 'China - Jinan',
       'China - Kunming', 'China - Lhasa', 'China - Nanchang',
       'China - Nanjing', 'China - Nanning', 'China - Ningbo',
       'China - Qingdao', 'China - Qinhuangdao', 'China - Qiqihar',
       'China - Shantou', 'China - Shenzhen', 'China - Suzhou',
       'China - Taiyuan', 'China - Xiamen', 'China - Xining'

In [286]:
# Inspect the Country-City combinations present in the test data,
# followed by how many distinct combinations there are.
print(test_cc['Country-City'].unique())
print(test_cc['Country-City'].nunique())

['Australia - Adelaide' 'Australia - Sydney' 'Belgium - Brussels'
 'Bosnia and Herzegovina - Sarajevo' 'Brazil - São José dos Campos'
 'Canada - Surrey' 'Chile - Concepción' 'Chile - Santiago'
 'China - Changchun' 'China - Chongqing' 'China - Dongguan'
 'China - Lanzhou' 'China - Shenyang' 'China - Shijiazhuang'
 'China - Wuhan' 'China - Wuxi' 'China - Xuchang' 'Croatia - Rijeka'
 'France - Paris' 'Hungary - Kecskemét' 'India - Ghāziābād'
 'India - Mysore' 'India - Nagpur' 'Israel - Petaẖ Tiqwa'
 'Japan - Matsuyama' 'Japan - Nagoya' 'Japan - Niigata' 'Japan - Osaka'
 'Mexico - Cuernavaca' 'Mexico - Mexico City' 'Netherlands - Amsterdam'
 'Poland - Płock' 'Russia - Moscow' 'Serbia - Belgrade'
 'South Africa - Port Elizabeth' 'South Korea - Daegu'
 'South Korea - Jeonju' 'South Korea - Sejong' 'South Korea - Suwon'
 'Spain - Burgos' 'Taiwan - Douliu' 'Taiwan - Tainan'
 'Thailand - Samut Prakan' 'Turkey - Bursa' 'Turkey - Trabzon'
 'United Kingdom - London' 'United States - Chicago'
 'Uni

# testのCityを近隣都市に置き換えたときのスコアの推移の確認

In [247]:
# Number of distinct cities before any replacement is applied.
print('ユニークな都市の数: ', df['City'].nunique())

ユニークな都市の数:  302


# イタリア
Parma→Brescia

In [248]:
# Italy: substitute the city with its chosen neighbouring city.
# Assign the result back: replace(inplace=True) on the df['City'] selection is
# a chained in-place update, which pandas deprecates and which leaves df
# unchanged once Copy-on-Write is active.
df['City'] = df['City'].replace({'Parma': 'Brescia'})

# ドイツ
Potsdam→Berlin

In [249]:
# Germany: substitute the city with its chosen neighbouring city.
# Assign the result back: replace(inplace=True) on the df['City'] selection is
# a chained in-place update, which pandas deprecates and which leaves df
# unchanged once Copy-on-Write is active.
df['City'] = df['City'].replace({'Potsdam': 'Berlin'})

# イラン
Karaj→Tehran

In [250]:
# Iran: substitute the city with its chosen neighbouring city.
# Assign the result back: replace(inplace=True) on the df['City'] selection is
# a chained in-place update, which pandas deprecates and which leaves df
# unchanged once Copy-on-Write is active.
df['City'] = df['City'].replace({'Karaj': 'Tehran'})

# ベトナム
Haiphong→Hạ Long

In [251]:
# Vietnam: substitute the city with its chosen neighbouring city.
# Assign the result back: replace(inplace=True) on the df['City'] selection is
# a chained in-place update, which pandas deprecates and which leaves df
# unchanged once Copy-on-Write is active.
df['City'] = df['City'].replace({'Haiphong': 'Hạ Long'})

# アメリカ
Chicago→Jackson  
Fresno→San Jose  
Tucson→Albuquerque  
Charlotte→Raleigh

In [252]:
# United States: substitute each city with its chosen neighbouring city.
# Assign the result back: replace(inplace=True) on the df['City'] selection is
# a chained in-place update, which pandas deprecates and which leaves df
# unchanged once Copy-on-Write is active.
df['City'] = df['City'].replace({
    'Chicago': 'Jackson',
    'Fresno': 'San Jose',
    'Tucson': 'Albuquerque',
    'Charlotte': 'Raleigh',
})

# イギリス
London→Southend-on-Sea

In [253]:
# United Kingdom: substitute the city with its chosen neighbouring city.
# Assign the result back: replace(inplace=True) on the df['City'] selection is
# a chained in-place update, which pandas deprecates and which leaves df
# unchanged once Copy-on-Write is active.
df['City'] = df['City'].replace({'London': 'Southend-on-Sea'})

# トルコ
Bursa→İzmit  
Trabzon→Samsun  
Adana→Antakya

In [254]:
# Turkey: substitute each city with its chosen neighbouring city.
# Assign the result back: replace(inplace=True) on the df['City'] selection is
# a chained in-place update, which pandas deprecates and which leaves df
# unchanged once Copy-on-Write is active.
df['City'] = df['City'].replace({
    'Bursa': 'İzmit',
    'Trabzon': 'Samsun',
    'Adana': 'Antakya',
})

# タイ
Samut Prakan→Bangkok

In [255]:
# Thailand: substitute the city with its chosen neighbouring city.
# Assign the result back: replace(inplace=True) on the df['City'] selection is
# a chained in-place update, which pandas deprecates and which leaves df
# unchanged once Copy-on-Write is active.
df['City'] = df['City'].replace({'Samut Prakan': 'Bangkok'})

# 台湾
Douliu→Taichung  
Tainan→Kaohsiung  

In [256]:
# Taiwan: substitute each city with its chosen neighbouring city.
# Assign the result back: replace(inplace=True) on the df['City'] selection is
# a chained in-place update, which pandas deprecates and which leaves df
# unchanged once Copy-on-Write is active.
df['City'] = df['City'].replace({
    'Douliu': 'Taichung',
    'Tainan': 'Kaohsiung',
})

# スペイン
Burgos→Gasteiz / Vitoria  
Santa Cruz de Tenerife→Las Palmas de Gran Canaria  
Málaga→Granada  

In [257]:
# Spain: substitute each city with its chosen neighbouring city.
# Assign the result back: replace(inplace=True) on the df['City'] selection is
# a chained in-place update, which pandas deprecates and which leaves df
# unchanged once Copy-on-Write is active.
df['City'] = df['City'].replace({
    'Burgos': 'Gasteiz / Vitoria',
    'Santa Cruz de Tenerife': 'Las Palmas de Gran Canaria',
    'Málaga': 'Granada',
})

# 韓国
Daegu→Pohang  
Jeonju→Daejeon  
Sejong→Cheongju-si  
Suwon→Seoul  


In [258]:
# South Korea: substitute each city with its chosen neighbouring city.
# Assign the result back: replace(inplace=True) on the df['City'] selection is
# a chained in-place update, which pandas deprecates and which leaves df
# unchanged once Copy-on-Write is active.
df['City'] = df['City'].replace({
    'Daegu': 'Pohang',
    'Jeonju': 'Daejeon',
    'Sejong': 'Cheongju-si',
    'Suwon': 'Seoul',
})

# 南アフリカ
Port Elizabeth→East London  
Johannesburg→Pretoria  

In [259]:
# South Africa: substitute each city with its chosen neighbouring city.
# Assign the result back: replace(inplace=True) on the df['City'] selection is
# a chained in-place update, which pandas deprecates and which leaves df
# unchanged once Copy-on-Write is active.
df['City'] = df['City'].replace({'Port Elizabeth': 'East London', 'Johannesburg': 'Pretoria'})

# セルビア
Belgrade→Novi Sad

In [260]:
# Serbia: substitute the city with its chosen neighbouring city.
# Assign the result back: replace(inplace=True) on the df['City'] selection is
# a chained in-place update, which pandas deprecates and which leaves df
# unchanged once Copy-on-Write is active.
df['City'] = df['City'].replace({'Belgrade': 'Novi Sad'})

# ロシア
Moscow→Nizhniy Novgorod

In [261]:
# Russia: substitute the city with its chosen neighbouring city.
# Assign the result back: replace(inplace=True) on the df['City'] selection is
# a chained in-place update, which pandas deprecates and which leaves df
# unchanged once Copy-on-Write is active.
df['City'] = df['City'].replace({'Moscow': 'Nizhniy Novgorod'})

# ポーランド
Płock→Łódź  
Katowice→Kraków

In [262]:
# Poland: substitute each city with its chosen neighbouring city.
# Assign the result back: replace(inplace=True) on the df['City'] selection is
# a chained in-place update, which pandas deprecates and which leaves df
# unchanged once Copy-on-Write is active.
df['City'] = df['City'].replace({'Płock': 'Łódź', 'Katowice': 'Kraków'})

# オランダ
Amsterdam→Haarlem

In [263]:
# Netherlands: substitute the city with its chosen neighbouring city.
# Assign the result back: replace(inplace=True) on the df['City'] selection is
# a chained in-place update, which pandas deprecates and which leaves df
# unchanged once Copy-on-Write is active.
df['City'] = df['City'].replace({'Amsterdam': 'Haarlem'})

# メキシコ
Cuernavaca→Toluca  
Mexico City→Toluca  

In [264]:
# Mexico: both cities are substituted with the same neighbouring city.
# Assign the result back: replace(inplace=True) on the df['City'] selection is
# a chained in-place update, which pandas deprecates and which leaves df
# unchanged once Copy-on-Write is active.
df['City'] = df['City'].replace({'Cuernavaca': 'Toluca', 'Mexico City': 'Toluca'})

# イスラエル
Petaẖ Tiqwa→Netanya

In [265]:
# Israel: substitute the city with its chosen neighbouring city.
# Assign the result back: replace(inplace=True) on the df['City'] selection is
# a chained in-place update, which pandas deprecates and which leaves df
# unchanged once Copy-on-Write is active.
df['City'] = df['City'].replace({'Petaẖ Tiqwa': 'Netanya'})

# インド
Ghāziābād→Delhi  
Mysore→Bengaluru  
Nagpur→Bhopal  
Nashik→Mumbai  

In [266]:
# India: substitute each city with its chosen neighbouring city.
# Assign the result back: replace(inplace=True) on the df['City'] selection is
# a chained in-place update, which pandas deprecates and which leaves df
# unchanged once Copy-on-Write is active.
df['City'] = df['City'].replace({
    'Ghāziābād': 'Delhi',
    'Mysore': 'Bengaluru',
    'Nagpur': 'Bhopal',
    'Nashik': 'Mumbai',
})

# ハンガリー
Kecskemét→Budapest

In [267]:
# Hungary: substitute the city with its chosen neighbouring city.
# Assign the result back: replace(inplace=True) on the df['City'] selection is
# a chained in-place update, which pandas deprecates and which leaves df
# unchanged once Copy-on-Write is active.
df['City'] = df['City'].replace({'Kecskemét': 'Budapest'})

# フランス
Paris→Nancy

In [268]:
# France: substitute the city with its chosen neighbouring city.
# Assign the result back: replace(inplace=True) on the df['City'] selection is
# a chained in-place update, which pandas deprecates and which leaves df
# unchanged once Copy-on-Write is active.
df['City'] = df['City'].replace({'Paris': 'Nancy'})

# 中国
Changchun→Fushun  
Chongqing→Chengdu  
Dongguan→Foshan  
Lanzhou→Xining  
Shenyang→Fushun  
Shijiazhuang→Jinan  
Wuhan→Changsha  
Wuxi→Zhuzhou  
Xuchang→Zhengzhou  
Anyang→Zhengzhou

In [269]:
# China: substitute each city with its chosen neighbouring city.
# Assign the result back: replace(inplace=True) on the df['City'] selection is
# a chained in-place update, which pandas deprecates and which leaves df
# unchanged once Copy-on-Write is active.
df['City'] = df['City'].replace({
    'Changchun': 'Fushun',
    'Chongqing': 'Chengdu',
    'Dongguan': 'Foshan',
    'Lanzhou': 'Xining',
    'Shenyang': 'Fushun',
    'Shijiazhuang': 'Jinan',
    'Wuhan': 'Changsha',
    'Wuxi': 'Zhuzhou',
    'Xuchang': 'Zhengzhou',
    'Anyang': 'Zhengzhou',
})

# クロアチア
Rijeka→Zagreb

In [270]:
# Croatia: substitute the city with its chosen neighbouring city.
# Assign the result back: replace(inplace=True) on the df['City'] selection is
# a chained in-place update, which pandas deprecates and which leaves df
# unchanged once Copy-on-Write is active.
df['City'] = df['City'].replace({'Rijeka': 'Zagreb'})

# チリ
Concepción→Talca  
Santiago→Quilpué


In [271]:
# Chile: substitute each city with its chosen neighbouring city.
# Assign the result back: replace(inplace=True) on the df['City'] selection is
# a chained in-place update, which pandas deprecates and which leaves df
# unchanged once Copy-on-Write is active.
df['City'] = df['City'].replace({'Concepción': 'Talca', 'Santiago': 'Quilpué'})

# カナダ
Surrey→Vancouver

In [272]:
# Canada: substitute the city with its chosen neighbouring city.
# Assign the result back: replace(inplace=True) on the df['City'] selection is
# a chained in-place update, which pandas deprecates and which leaves df
# unchanged once Copy-on-Write is active.
df['City'] = df['City'].replace({'Surrey': 'Vancouver'})

# ボスニア・ヘルツェゴビナ
Sarajevo→Zenica

In [273]:
# Bosnia and Herzegovina: substitute the city with its chosen neighbouring city.
# Assign the result back: replace(inplace=True) on the df['City'] selection is
# a chained in-place update, which pandas deprecates and which leaves df
# unchanged once Copy-on-Write is active.
df['City'] = df['City'].replace({'Sarajevo': 'Zenica'})

# ブラジル
São José dos Campos→São Paulo

In [274]:
# Brazil: substitute the city with its chosen neighbouring city.
# Assign the result back: replace(inplace=True) on the df['City'] selection is
# a chained in-place update, which pandas deprecates and which leaves df
# unchanged once Copy-on-Write is active.
df['City'] = df['City'].replace({'São José dos Campos': 'São Paulo'})

# ベルギー
Brussels→Antwerpen

In [275]:
# Belgium: substitute the city with its chosen neighbouring city.
# Assign the result back: replace(inplace=True) on the df['City'] selection is
# a chained in-place update, which pandas deprecates and which leaves df
# unchanged once Copy-on-Write is active.
df['City'] = df['City'].replace({'Brussels': 'Antwerpen'})

# 日本
Nagoya→Gifu-shi  
Matsuyama→Hiroshima  
Niigata→Kanazawa  
Osaka→Kyoto  
Kumamoto→Ōita  
Saitama→Tokyo

In [276]:
# Japan: substitute each city with its chosen neighbouring city.
# Assign the result back: replace(inplace=True) on the df['City'] selection is
# a chained in-place update, which pandas deprecates and which leaves df
# unchanged once Copy-on-Write is active.
df['City'] = df['City'].replace({
    'Matsuyama': 'Hiroshima',
    'Nagoya': 'Gifu-shi',
    'Niigata': 'Kanazawa',
    'Osaka': 'Kyoto',
    'Kumamoto': 'Ōita',
    'Saitama': 'Tokyo',
})

# オーストラリア
Adelaide→Melbourne  
Sydney→Wollongong

In [277]:
# Australia: substitute each city with its chosen neighbouring city.
# Assign the result back: replace(inplace=True) on the df['City'] selection is
# a chained in-place update, which pandas deprecates and which leaves df
# unchanged once Copy-on-Write is active.
df['City'] = df['City'].replace({'Adelaide': 'Melbourne', 'Sydney': 'Wollongong'})

In [278]:
# Show the remaining distinct cities and their count after the replacements.
print(df['City'].unique(),'ユニークな都市の数（変換後）: ',  df['City'].nunique())

['Brisbane' 'Darwin' 'Melbourne' 'Newcastle' 'Perth' 'Wollongong'
 'Antwerpen' 'Charleroi' 'Liège' 'São Paulo' 'Vitória' 'Vancouver'
 'Calama' 'Quilpué' 'Talca' 'Beijing' 'Changsha' 'Chengdu' 'Foshan'
 'Fushun' 'Fuzhou' 'Guangzhou' 'Guiyang' 'Haikou' 'Hangzhou' 'Harbin'
 'Hefei' 'Hegang' 'Jieyang' 'Jinan' 'Kunming' 'Lhasa' 'Nanchang' 'Nanjing'
 'Nanning' 'Ningbo' 'Qingdao' 'Qinhuangdao' 'Qiqihar' 'Shantou' 'Shenzhen'
 'Suzhou' 'Taiyuan' 'Xiamen' 'Xining' 'Xinxiang' 'Xi’an' 'Yinchuan'
 'Yunfu' 'Zhengzhou' 'Zhuzhou' 'Ürümqi' 'Marseille' 'Nancy' 'Nantes'
 'Strasbourg' 'Berlin' 'Hamburg' 'Budapest' 'Győr' 'Miskolc' 'Pécs'
 'Szeged' 'Bengaluru' 'Bhopal' 'Chandigarh' 'Delhi' 'Gandhinagar'
 'Hyderabad' 'Hāpur' 'Jaipur' 'Kolkata' 'Mumbai' 'Muzaffarnagar'
 'Thiruvananthapuram' 'Thrissur' 'Visakhapatnam' 'Yazd' 'Netanya'
 'Tel Aviv' 'Brescia' 'Livorno' 'Milan' 'Naples' 'Rome' 'Akita' 'Chiba'
 'Fukuoka' 'Gifu-shi' 'Hiroshima' 'Kagoshima' 'Kanazawa' 'Kobe' 'Kochi'
 'Kyoto' 'Miyazaki' 'Nagasaki' 'N

In [279]:
# Collect the object-dtype columns; these are the ones that get encoded.
categories = df.columns[df.dtypes == 'object']
print(categories)

# Label-encode each of those columns.
for cat in categories:
    print(cat)

    # Fill missing values by assigning back. fillna(inplace=True) on the
    # df[cat] selection is a chained in-place update, which pandas deprecates
    # and which leaves df unchanged once Copy-on-Write is active.
    df[cat] = df[cat].fillna('missing')

    # Fit the encoder on the column and map it to integer codes in one step.
    le = LabelEncoder()
    df[cat] = le.fit_transform(df[cat])

    # LabelEncoder only produces integer codes, so cast to 'category' last.
    df[cat] = df[cat].astype('category')

Index(['Country', 'City'], dtype='object')
Country
City


In [280]:
# Split back into train and test: rows with a target value are training rows,
# rows whose target is missing are test rows.
train = df.loc[df['pm25_mid'].notnull()]
test = df.loc[df['pm25_mid'].isnull()]

In [281]:
# Show the training frame's column names and how many columns it has.
print(train.columns, len(train.columns))

Index(['id', 'year', 'month', 'day', 'Country', 'City', 'lat', 'lon', 'co_cnt',
       'co_min', 'co_mid', 'co_max', 'co_var', 'o3_cnt', 'o3_min', 'o3_mid',
       'o3_max', 'o3_var', 'so2_cnt', 'so2_min', 'so2_mid', 'so2_max',
       'so2_var', 'no2_cnt', 'no2_min', 'no2_mid', 'no2_max', 'no2_var',
       'temperature_cnt', 'temperature_min', 'temperature_mid',
       'temperature_max', 'temperature_var', 'humidity_cnt', 'humidity_min',
       'humidity_mid', 'humidity_max', 'humidity_var', 'pressure_cnt',
       'pressure_min', 'pressure_mid', 'pressure_max', 'pressure_var',
       'ws_cnt', 'ws_min', 'ws_mid', 'ws_max', 'ws_var', 'dew_cnt', 'dew_min',
       'dew_mid', 'dew_max', 'dew_var', 'pm25_mid'],
      dtype='object') 54


# モデルの構築と評価

In [282]:
# ライブラリのインポート
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from statistics import mean

# Split the training rows into K folds (KFold is used without shuffling).
folds = 20
kf = KFold(n_splits=folds)

# LightGBM hyperparameters.
params = {
    'objective':'regression',
    'random_seed':1234,
}

# Feature matrix (target and id removed) and target vector.
X_train = train.drop(del_columns, axis=1)
Y_train = train['pm25_mid']

# One fitted model and one validation RMSE per fold.
models = []
rmses = []

for train_index, val_index in kf.split(X_train):
    x_train = X_train.iloc[train_index]
    x_valid = X_train.iloc[val_index]
    y_train = Y_train.iloc[train_index]
    y_valid = Y_train.iloc[val_index]

    lgb_train = lgb.Dataset(x_train, y_train)
    lgb_eval = lgb.Dataset(x_valid, y_valid, reference=lgb_train)

    # Early stopping and periodic logging are passed as callbacks: the
    # early_stopping_rounds / verbose_eval keyword arguments of lgb.train were
    # removed in LightGBM 4.0. NOTE: lgb.log_evaluation needs LightGBM >= 3.3.
    model = lgb.train(params,
                      lgb_train,
                      valid_sets=[lgb_eval],
                      num_boost_round=1000,  # upper bound on boosting rounds
                      callbacks=[
                          lgb.early_stopping(stopping_rounds=20),  # stop after 20 rounds without improvement
                          lgb.log_evaluation(period=10),  # log the metric every 10 rounds
                      ])

    # Score the held-out fold at the best iteration found by early stopping.
    y_pred = model.predict(x_valid, num_iteration=model.best_iteration)
    rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
    print(rmse)

    models.append(model)
    rmses.append(rmse)

# Mean validation RMSE across the folds.
print(mean(rmses))

# Plot gain-based feature importance for each fold's model.
for model in models:
    lgb.plot_importance(model, importance_type='gain',
                        figsize=(10, 15),
                        max_num_features=len(train.columns))

Output hidden; open in https://colab.research.google.com to view.

# テストデータの予測

In [283]:
# Build the test feature matrix by dropping the same columns as for training.
X_test = test.drop(columns=del_columns)
print(X_test.columns)

# Predict the test rows with every fold model.
preds = [fold_model.predict(X_test) for fold_model in models]

# Average the per-model predictions row by row.
preds_array = np.array(preds)
preds_mean = preds_array.mean(axis=0)

Index(['year', 'month', 'day', 'Country', 'City', 'lat', 'lon', 'co_cnt',
       'co_min', 'co_mid', 'co_max', 'co_var', 'o3_cnt', 'o3_min', 'o3_mid',
       'o3_max', 'o3_var', 'so2_cnt', 'so2_min', 'so2_mid', 'so2_max',
       'so2_var', 'no2_cnt', 'no2_min', 'no2_mid', 'no2_max', 'no2_var',
       'temperature_cnt', 'temperature_min', 'temperature_mid',
       'temperature_max', 'temperature_var', 'humidity_cnt', 'humidity_min',
       'humidity_mid', 'humidity_max', 'humidity_var', 'pressure_cnt',
       'pressure_min', 'pressure_mid', 'pressure_max', 'pressure_var',
       'ws_cnt', 'ws_min', 'ws_mid', 'ws_max', 'ws_var', 'dew_cnt', 'dew_min',
       'dew_mid', 'dew_max', 'dew_var'],
      dtype='object')


# 提出

In [284]:
# Overwrite column 1 of the sample submission with the averaged predictions.
submission[1] = preds_mean

# Write the submission CSV without a header row or index column.
submission.to_csv(f'/content/drive/MyDrive/pm/submit/{sub_name}.csv', header=None, index=False)