In [19]:
# Mount Google Drive into the Colab runtime at /content/drive so that the
# project files referenced below (under /content/drive/MyDrive/...) can be
# read and written through ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# データの読み込みと確認

In [20]:
# ライブラリのインポート
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel

# Root of the project's data folder on the mounted Google Drive. Defined once
# so that the location can be changed in a single place.
DATA_DIR = '/content/drive/MyDrive/pm/data'

# Competition inputs. The sample submission is read with header=None, so its
# first line is treated as data and the columns get integer labels.
train = pd.read_csv(f'{DATA_DIR}/input/train.csv')
test = pd.read_csv(f'{DATA_DIR}/input/test.csv')
submission = pd.read_csv(f'{DATA_DIR}/input/submit_sample.csv', header=None)

# External per-city table (population / area / density); joined on 'City' below.
external_1 = pd.read_csv(f'{DATA_DIR}/external/external1.csv')

# Stack train and test into one frame so every feature is engineered on both
# at once. ignore_index=True gives the combined frame a fresh 0..n-1 index.
df = pd.concat([train, test], sort=False, ignore_index=True)

# 特徴量エンジニアリング（特徴量追加）

## 人口、面積、人口密度データ（external_1）の統合

In [21]:
# Attach the external per-city attributes to every row. The left join keeps
# all rows of df; a City missing from external_1 gets NaN in the new columns.
# validate='many_to_one' makes pandas raise MergeError if external_1 lists a
# City more than once, instead of silently duplicating rows of df.
df = pd.merge(df, external_1, on=['City'], how='left', validate='many_to_one')

## testのCityを近隣都市に置換

In [22]:
# Relabel cities with the name of a neighbouring city (per the section title,
# these are the cities that appear in the test data). Only the 'City' label is
# rewritten; all other columns of the row are left as they are.
#
# The mapping is applied in one pass and assigned back to the column. The
# previous form, df['City'].replace(..., inplace=True), mutates a temporary
# selection: it emits a chained-assignment warning on pandas 2.2 and no longer
# updates df once Copy-on-Write is enabled. No key below is also a replacement
# value, so a single pass gives the same result as the sequential passes did.
city_replacements = {
    # Italy
    'Parma': 'Brescia',
    # Germany
    'Potsdam': 'Berlin',
    # Iran
    'Karaj': 'Tehran',
    # Vietnam
    'Haiphong': 'Hạ Long',
    # United States
    'Chicago': 'Jackson',
    'Fresno': 'San Jose',
    'Tucson': 'Albuquerque',
    'Charlotte': 'Raleigh',
    # United Kingdom
    'London': 'Southend-on-Sea',
    # Turkey
    'Bursa': 'İzmit',
    'Trabzon': 'Samsun',
    'Adana': 'Antakya',
    # Thailand
    'Samut Prakan': 'Bangkok',
    # Taiwan
    'Douliu': 'Taichung',
    'Tainan': 'Kaohsiung',
    # Spain
    'Burgos': 'Gasteiz / Vitoria',
    'Santa Cruz de Tenerife': 'Las Palmas de Gran Canaria',
    'Málaga': 'Granada',
    # South Korea
    'Daegu': 'Pohang',
    'Jeonju': 'Daejeon',
    'Sejong': 'Cheongju-si',
    'Suwon': 'Seoul',
    # South Africa
    'Port Elizabeth': 'East London',
    'Johannesburg': 'Pretoria',
    # Serbia
    'Belgrade': 'Novi Sad',
    # Russia
    'Moscow': 'Nizhniy Novgorod',
    # Poland
    'Płock': 'Łódź',
    'Katowice': 'Kraków',
    # Netherlands
    'Amsterdam': 'Haarlem',
    # Mexico
    'Cuernavaca': 'Toluca',
    'Mexico City': 'Toluca',
    # Israel
    'Petaẖ Tiqwa': 'Netanya',
    # India
    'Ghāziābād': 'Delhi',
    'Mysore': 'Bengaluru',
    'Nagpur': 'Bhopal',
    'Nashik': 'Mumbai',
    # Hungary
    'Kecskemét': 'Budapest',
    # France
    'Paris': 'Nancy',
    # China
    'Changchun': 'Fushun',
    'Chongqing': 'Chengdu',
    'Dongguan': 'Foshan',
    'Lanzhou': 'Xining',
    'Shenyang': 'Fushun',
    'Shijiazhuang': 'Jinan',
    'Wuhan': 'Changsha',
    'Wuxi': 'Zhuzhou',
    'Xuchang': 'Zhengzhou',
    'Anyang': 'Zhengzhou',
    # Croatia
    'Rijeka': 'Zagreb',
    # Chile
    'Concepción': 'Talca',
    'Santiago': 'Quilpué',
    # Canada
    'Surrey': 'Vancouver',
    # Bosnia and Herzegovina
    'Sarajevo': 'Zenica',
    # Brazil
    'São José dos Campos': 'São Paulo',
    # Belgium
    'Brussels': 'Antwerpen',
    # Japan
    'Matsuyama': 'Hiroshima',
    'Nagoya': 'Gifu-shi',
    'Niigata': 'Kanazawa',
    'Osaka': 'Kyoto',
    'Kumamoto': 'Ōita',
    'Saitama': 'Tokyo',
    # Australia
    'Adelaide': 'Melbourne',
    'Sydney': 'Wollongong',
}

# Exact, whole-value matches only (no regex / substring matching), so e.g.
# 'East London' is not affected by the 'London' entry.
df['City'] = df['City'].replace(city_replacements)

## 北半球/南半球

In [23]:
# Hemisphere flag derived from latitude: 1 = southern (lat <= 0), 0 = northern.
# A missing latitude compares False and therefore ends up as 0.
df['hemisphere'] = (df['lat'] <= 0).astype('int64')

## 季節

In [24]:
# Season label per row, from the calendar month and the hemisphere flag
# (0 = northern, 1 = southern). Southern-hemisphere seasons are the northern
# ones shifted by six months.
month = df['month']
in_north = df['hemisphere'] == 0
in_south = df['hemisphere'] == 1

mar_to_may = month.isin([3, 4, 5])
jun_to_aug = month.isin([6, 7, 8])
sep_to_nov = month.isin([9, 10, 11])
dec_to_feb = month.isin([12, 1, 2])

season_conditions = [
    (in_north & mar_to_may) | (in_south & sep_to_nov),  # spring
    (in_north & jun_to_aug) | (in_south & dec_to_feb),  # summer
    (in_north & sep_to_nov) | (in_south & mar_to_may),  # autumn
    (in_north & dec_to_feb) | (in_south & jun_to_aug),  # winter
]
season_labels = ['spring', 'summer', 'autumn', 'winter']

# The fallback for rows matching no condition (e.g. a missing month) is given
# explicitly. np.select's implicit default is the integer 0, which newer NumPy
# releases refuse to combine with string choices (TypeError: no common dtype).
# '0' is the value older releases produced for such rows, so output is unchanged.
df['season'] = np.select(season_conditions, season_labels, default='0')

## 都市別月平均

In [25]:
# Target encoding: mean pm25_mid per (City, month), computed from the training
# rows only. The built-in .mean() reduction is used instead of passing np.mean
# to agg(), which pandas has deprecated (FutureWarning since 2.1); the result
# is the same.
city_month_mean = (
    train.groupby(['City', 'month'])['pm25_mid']
    .mean()
    .rename('city_month_mean')
    .reset_index()
)

# Attach the encoding to every row of df. (City, month) pairs not seen in
# train get NaN. Any column left over from a previous run of this cell is
# dropped first, so re-running does not produce *_x / *_y duplicates.
df = pd.merge(
    df.drop(columns=['city_month_mean'], errors='ignore'),
    city_month_mean,
    on=['City', 'month'],
    how='left',
)

## 都市別年別月平均

In [26]:
# Target encoding: mean pm25_mid per (City, year, month), computed from the
# training rows only. The built-in .mean() reduction is used instead of
# passing np.mean to agg(), which pandas has deprecated (FutureWarning since
# 2.1); the result is the same.
city_year_month_mean = (
    train.groupby(['City', 'year', 'month'])['pm25_mid']
    .mean()
    .rename('city_year_month_mean')
    .reset_index()
)

# Attach the encoding to every row of df. (City, year, month) combinations not
# seen in train get NaN. Any column left over from a previous run of this cell
# is dropped first, so re-running does not produce *_x / *_y duplicates.
df = pd.merge(
    df.drop(columns=['city_year_month_mean'], errors='ignore'),
    city_year_month_mean,
    on=['City', 'year', 'month'],
    how='left',
)

## 国別月平均

In [27]:
# Target encoding: mean pm25_mid per (Country, month), computed from the
# training rows only. The built-in .mean() reduction is used instead of
# passing np.mean to agg(), which pandas has deprecated (FutureWarning since
# 2.1); the result is the same.
country_month_mean = (
    train.groupby(['Country', 'month'])['pm25_mid']
    .mean()
    .rename('country_month_mean')
    .reset_index()
)

# Attach the encoding to every row of df. (Country, month) pairs not seen in
# train get NaN. Any column left over from a previous run of this cell is
# dropped first, so re-running does not produce *_x / *_y duplicates.
df = pd.merge(
    df.drop(columns=['country_month_mean'], errors='ignore'),
    country_month_mean,
    on=['Country', 'month'],
    how='left',
)

## 国別年別月平均

In [28]:
# Target encoding: mean pm25_mid per (Country, year, month), computed from the
# training rows only. The built-in .mean() reduction is used instead of
# passing np.mean to agg(), which pandas has deprecated (FutureWarning since
# 2.1); the result is the same.
country_year_month_mean = (
    train.groupby(['Country', 'year', 'month'])['pm25_mid']
    .mean()
    .rename('country_year_month_mean')
    .reset_index()
)

# Attach the encoding to every row of df. (Country, year, month) combinations
# not seen in train get NaN. Any column left over from a previous run of this
# cell is dropped first, so re-running does not produce *_x / *_y duplicates.
df = pd.merge(
    df.drop(columns=['country_year_month_mean'], errors='ignore'),
    country_year_month_mean,
    on=['Country', 'year', 'month'],
    how='left',
)

## 国別月中央値

In [29]:
# Target encoding: median pm25_mid per (Country, month), computed from the
# training rows only. The built-in .median() reduction is used instead of
# passing np.median to agg(), which pandas has deprecated (FutureWarning since
# 2.1); the result is the same.
country_month_median = (
    train.groupby(['Country', 'month'])['pm25_mid']
    .median()
    .rename('country_month_median')
    .reset_index()
)

# Attach the encoding to every row of df. (Country, month) pairs not seen in
# train get NaN. Any column left over from a previous run of this cell is
# dropped first, so re-running does not produce *_x / *_y duplicates.
df = pd.merge(
    df.drop(columns=['country_month_median'], errors='ignore'),
    country_month_median,
    on=['Country', 'month'],
    how='left',
)

# CSVファイルの出力

In [30]:
# Show the combined frame (notebook rich display, truncated by pandas) to
# eyeball the engineered columns before the frame is written to disk.
df

Unnamed: 0,id,year,month,day,Country,City,lat,lon,co_cnt,co_min,...,population,area,density,hemisphere,season,city_month_mean,city_year_month_mean,country_month_mean,country_year_month_mean,country_month_median
0,1,2019,1,1,Australia,Brisbane,-27.46794,153.02809,38,0.749,...,2350000,4673.2000,502.867414,1,summer,33.808158,23.133500,44.427941,40.310798,28.690
1,2,2019,1,1,Australia,Darwin,-12.46113,130.84185,47,2.594,...,140068,3163.8000,44.272078,1,summer,28.573727,22.349633,44.427941,40.310798,28.690
2,3,2019,1,1,Australia,Melbourne,-37.81400,144.96332,17,1.190,...,4641000,8806.0000,527.027027,1,summer,43.757293,40.569548,44.427941,40.310798,28.690
3,4,2019,1,1,Australia,Newcastle,-32.92953,151.78010,63,4.586,...,154777,261.8000,591.203209,1,summer,71.098849,55.471935,44.427941,40.310798,28.690
4,5,2019,1,1,Australia,Perth,-31.95224,115.86140,47,4.689,...,1980000,6418.0000,308.507323,1,summer,45.720391,38.078800,44.427941,40.310798,28.690
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
249445,249446,2021,12,31,Turkey,Antakya,36.98615,35.32531,97,4.674,...,1730000,1945.0000,889.460154,0,winter,112.648742,163.016484,84.940667,83.449246,68.519
249446,249447,2021,12,31,Turkey,İzmit,40.19559,29.06013,123,17.621,...,3101833,10422.0000,297.623585,0,winter,72.303220,60.006387,84.940667,83.449246,68.519
249447,249448,2021,12,31,Turkey,Samsun,41.00500,39.72694,150,0.128,...,312060,188.8500,1652.422558,0,winter,72.379655,70.846774,84.940667,83.449246,68.519
249448,249449,2021,12,31,United Kingdom,Southend-on-Sea,51.50853,-0.12574,21,0.057,...,8674000,1572.0000,5517.811705,0,winter,41.818333,46.862000,38.891258,38.630936,30.916


In [31]:
# Column names together with how many there are.
print(df.columns, len(df.columns))
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() only appended a stray "None" line.
df.info()

Index(['id', 'year', 'month', 'day', 'Country', 'City', 'lat', 'lon', 'co_cnt',
       'co_min', 'co_mid', 'co_max', 'co_var', 'o3_cnt', 'o3_min', 'o3_mid',
       'o3_max', 'o3_var', 'so2_cnt', 'so2_min', 'so2_mid', 'so2_max',
       'so2_var', 'no2_cnt', 'no2_min', 'no2_mid', 'no2_max', 'no2_var',
       'temperature_cnt', 'temperature_min', 'temperature_mid',
       'temperature_max', 'temperature_var', 'humidity_cnt', 'humidity_min',
       'humidity_mid', 'humidity_max', 'humidity_var', 'pressure_cnt',
       'pressure_min', 'pressure_mid', 'pressure_max', 'pressure_var',
       'ws_cnt', 'ws_min', 'ws_mid', 'ws_max', 'ws_var', 'dew_cnt', 'dew_min',
       'dew_mid', 'dew_max', 'dew_var', 'pm25_mid', 'population', 'area',
       'density', 'hemisphere', 'season', 'city_month_mean',
       'city_year_month_mean', 'country_month_mean', 'country_year_month_mean',
       'country_month_median'],
      dtype='object') 64
<class 'pandas.core.frame.DataFrame'>
Int64Index: 249450 entries,

In [32]:
# Persist the feature-augmented frame as CSV: header row included, the
# DataFrame index left out.
output_path = '/content/drive/MyDrive/pm/data/processed/add_feature_df.csv'
df.to_csv(output_path, index=False, header=True)