<a href="https://colab.research.google.com/github/kentofujii/green_2025/blob/main/00_creansing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# start

In [None]:
# prompt: mount Google Drive
# Colab-only step: makes the Drive contents available under /content/drive.

from google.colab import drive
drive.mount('/content/drive')


In [None]:
%pip install sktime

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
from sktime.utils.plotting import plot_series
from sktime.transformations.series.impute import Imputer
from sktime.forecasting.compose import make_reduction
from sklearn.linear_model import LinearRegression

In [None]:
from IPython.display import Markdown
import sys
sys.path.append("/content/drive/MyDrive/ドキュメント/コンペ/utils")
from MyFunc import opt_dtype_df, func_time, HistBox

In [None]:
# Date stamp embedded in the intermediate file names below.
today = "20250608"
# Project root on the mounted Google Drive; every other folder is derived from it.
my_folder = "/content/drive/MyDrive/ドキュメント/コンペ/SIGNATE/SMBC_GREEN_2025"
data_folder = os.path.join(my_folder, "data")
input_folder = os.path.join(data_folder, "input")
inter_folder = os.path.join(data_folder, "inter")
# Paths for the intermediate train/test files (presumably parquet; they are not written in the cells shown here).
file_path_inter_train = os.path.join(inter_folder, f"inter_train_{today}.parq")
file_path_inter_test = os.path.join(inter_folder, f"inter_test_{today}.parq")
# Make sure the intermediate folder exists before anything tries to write into it.
os.makedirs(inter_folder, exist_ok=True)
# Last expression: show the files available in the input folder as the cell output.
os.listdir(input_folder)

In [None]:
# Load the training data; the first CSV column becomes the index.
df_train = pd.read_csv(os.path.join(input_folder, "train.csv"), index_col=0)
print("="*30,"train_data","="*30)
print(df_train.shape)
display(df_train.head())

In [None]:
# Load the test data the same way as train (first CSV column becomes the index).
df_test = pd.read_csv(os.path.join(input_folder, "test.csv"), index_col=0)
print("="*30,"test_data","="*30)
print(df_test.shape)
display(df_test.head())

In [None]:
y_name="price_actual"

# 欠損の粗処理

In [None]:
display(df_train.isna().sum().sort_values(ascending=False))

In [None]:
# Drop every column that is entirely missing in train; the same columns are removed from test.
all_na_flags = df_train.isna().all()
remove_cols = all_na_flags[all_na_flags].index.tolist()
print(remove_cols)
df_train.drop(columns=remove_cols, inplace=True)
print("="*30,"test_dataの欠損率","="*30)
# Show the share of missing values those columns have in test before discarding them.
display(df_test[remove_cols].isna().sum() / len(df_test))
df_test.drop(columns=remove_cols, inplace=True)
del remove_cols, all_na_flags

# プライマリキー作成

In [None]:
# Move the current index into a regular column (it is read back as "time" just below).
df_train.reset_index(inplace=True)
df_test.reset_index(inplace=True)
# Re-index both frames with an hourly PeriodIndex built from the "time" column.
df_train.index = pd.PeriodIndex(df_train["time"], freq="h")
df_test.index = pd.PeriodIndex(df_test["time"], freq="h")
# Column(s) treated as the primary key.
pk=["time"]

# yのチェック

In [None]:
HistBox(df_train[y_name])
plot_series(df_train[y_name])
plt.show()

# 各変数のクレンジング

In [None]:
# Columns to drop at the very end (not filled in by any cell shown here).
finally_remove_cols = []
# Untouched snapshots: later cells rename/insert columns in df_train/df_test, while
# disp_markdown() and the one-hot cells keep reading column order and categories from these copies.
df_train_raw = df_train.copy()
df_test_raw = df_test.copy()

In [None]:
@func_time
def disp_markdown(number: int):
  """
  Advance to the next column and render its name as a Markdown heading.

  The column order is read from the notebook-level ``df_train_raw`` frame.

  Parameters
  ----------
  number : int
    Position of the column that was handled previously.

  Returns
  -------
  number : int
    Position of the column now being handled (the input plus one).
  col_name : str
    Name of that column.
  """
  position = number + 1
  col_name = df_train_raw.columns[position]
  display(Markdown(f"# {position} : {col_name}"))
  return position, col_name

In [None]:
@func_time
def make_dummy(dataframe : pd.DataFrame, col_name : str, drop=None):
  """
  Replace a column with its one-hot (dummy) columns at the same position.

  Parameters
  ----------
  dataframe : pd.DataFrame
    Input frame; it is not modified, a new frame is returned.
  col_name : str
    Name of the column to encode.
  drop : bool, optional
    Forwarded to ``pd.get_dummies(drop_first=...)``. When None (default),
    the first level is dropped only if the column has no missing values.

  Returns
  -------
  new_dataframe : pd.DataFrame
    Copy of the frame in which ``col_name`` is replaced by uint8 dummy
    columns named ``<col_name>_<level>``.
  """
  # Vectorised NaN check (avoids iterating the Series element by element).
  exist_na = bool(dataframe[col_name].isna().any())
  col_loc = dataframe.columns.get_loc(col_name)
  if drop is None:
    # With dummy_na=False a NaN row is encoded as all zeros; presumably the first
    # level is kept in that case so it stays distinguishable from NaN.
    drop = not exist_na
  df_dummy = pd.get_dummies(dataframe[col_name], dummy_na=False, drop_first=drop, prefix=col_name).astype("uint8")
  display(df_dummy.head())
  # Splice the dummy columns in where the original column used to be.
  new_dataframe = pd.concat([dataframe.iloc[:, :col_loc], df_dummy, dataframe.iloc[:, col_loc+1:]], axis=1)
  return new_dataframe

In [None]:
def compute_absolute_humidity(df, temp_col='temperature_K', rh_col='relative_humidity'):
    """
    Compute absolute humidity (g/m³) from temperature (K) and relative humidity (%).

    The input frame is not modified; the result is returned as a new Series and
    the caller decides where to store it.

    Parameters:
        df (pd.DataFrame): input frame
        temp_col (str): name of the temperature column, in kelvin
        rh_col (str): name of the relative-humidity column, in percent

    Returns:
        pd.Series: absolute humidity in g/m³, aligned with the index of ``df``
    """
    # Kelvin -> Celsius
    temp_C = df[temp_col] - 273.15

    # Saturation vapour pressure e_s in hPa (Magnus-type approximation)
    e_s = 6.112 * np.exp((17.62 * temp_C) / (temp_C + 243.12))

    # Actual vapour pressure e in hPa
    e = e_s * (df[rh_col] / 100.0)

    # Absolute humidity in g/m³ (uses the temperature in kelvin)
    ah = (216.7 * e) / df[temp_col]

    return ah


In [None]:
# Column cursor. disp_markdown() adds 1 before indexing, so starting from 0 begins the
# walk at column position 1 of df_train_raw.
number = 0
number, col = disp_markdown(number)

In [None]:
# Quick profile of the current column in both splits: top values, NaN counts, dtypes.
banner = "="*30
print(banner,"value_counts",banner)
for frame in (df_train, df_test):
  display(frame[col].value_counts(dropna=False).head())
print(banner,"na",banner)
print("train:",df_train[col].isna().sum())
print("test :",df_test[col].isna().sum())
print(banner,"dtypes",banner)
for frame in (df_train, df_test):
  print(frame[col].dtypes)

In [None]:
# Distribution check of the current column for train, then test.
# HistBox comes from the project's MyFunc module (presumably a histogram + box plot — confirm there).
HistBox(df_train[col])
HistBox(df_test[col])

In [None]:
# Plot the train and test series of the current column together on one time axis.
plot_series(df_train[col], df_test[col])
plt.show()

In [None]:
# Calendar days on which the current column has at least one missing value in train.
na_mask = df_train[col].isna()
na_dates = sorted({ts.strftime("%Y-%m-%d") for ts in df_train.index[na_mask]})
# Forward-fill only: data from after the prediction target date is assumed to be unavailable.
filed = Imputer("ffill").fit_transform(df_train[col])
# For each affected day, overlay the filled series on the original one to inspect the gaps.
for date in na_dates:
  na_index = (f"{date} 00:00",f"{date} 23:00")
  day_start, day_end = na_index
  plot_series(filed.loc[day_start:day_end], df_train.loc[day_start:day_end, col])
  plt.show()


In [None]:
# Commit the forward-fill imputation for the current column; train and test are
# each imputed on their own series.
df_train[col] = Imputer("ffill").fit_transform(df_train[col])
df_test[col] = Imputer("ffill").fit_transform(df_test[col])

In [None]:
number, col = disp_markdown(number)

In [None]:
print("="*30,"value_counts","="*30)
display(df_train[col].value_counts(dropna=False).head())
display(df_test[col].value_counts(dropna=False).head())
print("="*30,"na","="*30)
print("train:",df_train[col].isna().sum())
print("test :",df_test[col].isna().sum())
print("="*30,"dtypes","="*30)
print(df_train[col].dtypes)
print(df_test[col].dtypes)

In [None]:
HistBox(df_train[col])
HistBox(df_test[col])

In [None]:
plot_series(df_train[col], df_test[col])
plt.show()

In [None]:
na_dates = list(set([x.strftime("%Y-%m-%d") for x in df_train.index[df_train[col].isna()]]))
na_dates.sort()
filed = Imputer("ffill").fit_transform(df_train[col])  # 予測対象美の未来のデータは使えないと想定しffill
for date in na_dates:
  na_index = (f"{date} 00:00",f"{date} 23:00")
  plot_series(filed.loc[na_index[0]:na_index[1]], df_train.loc[na_index[0]:na_index[1], col])
  plt.show()


In [None]:
df_train[col] = Imputer("ffill").fit_transform(df_train[col])
df_test[col] = Imputer("ffill").fit_transform(df_test[col])

In [None]:
number, col = disp_markdown(number)

In [None]:
print("="*30,"value_counts","="*30)
display(df_train[col].value_counts(dropna=False).head())
display(df_test[col].value_counts(dropna=False).head())
print("="*30,"na","="*30)
print("train:",df_train[col].isna().sum())
print("test :",df_test[col].isna().sum())
print("="*30,"dtypes","="*30)
print(df_train[col].dtypes)
print(df_test[col].dtypes)

In [None]:
HistBox(df_train[col])
HistBox(df_test[col])

In [None]:
plot_series(df_train[col], df_test[col])
plt.show()

In [None]:
na_dates = list(set([x.strftime("%Y-%m-%d") for x in df_train.index[df_train[col].isna()]]))
na_dates.sort()
filed = Imputer("ffill").fit_transform(df_train[col])  # 予測対象美の未来のデータは使えないと想定しffill
for date in na_dates:
  na_index = (f"{date} 00:00",f"{date} 23:00")
  plot_series(filed.loc[na_index[0]:na_index[1]], df_train.loc[na_index[0]:na_index[1], col])
  plt.show()


In [None]:
df_train[col] = Imputer("ffill").fit_transform(df_train[col])
df_test[col] = Imputer("ffill").fit_transform(df_test[col])

In [None]:
number, col = disp_markdown(number)

In [None]:
print("="*30,"value_counts","="*30)
display(df_train[col].value_counts(dropna=False).head())
display(df_test[col].value_counts(dropna=False).head())
print("="*30,"na","="*30)
print("train:",df_train[col].isna().sum())
print("test :",df_test[col].isna().sum())
print("="*30,"dtypes","="*30)
print(df_train[col].dtypes)
print(df_test[col].dtypes)

In [None]:
HistBox(df_train[col])
HistBox(df_test[col])

In [None]:
plot_series(df_train[col], df_test[col])
plt.show()

In [None]:
na_dates = list(set([x.strftime("%Y-%m-%d") for x in df_train.index[df_train[col].isna()]]))
na_dates.sort()
filed = Imputer("ffill").fit_transform(df_train[col])  # 予測対象美の未来のデータは使えないと想定しffill
for date in na_dates:
  na_index = (f"{date} 00:00",f"{date} 23:00")
  plot_series(filed.loc[na_index[0]:na_index[1]], df_train.loc[na_index[0]:na_index[1], col])
  plt.show()


In [None]:
df_train[col] = Imputer("ffill").fit_transform(df_train[col])
df_test[col] = Imputer("ffill").fit_transform(df_test[col])

In [None]:
number, col = disp_markdown(number)

In [None]:
print("="*30,"value_counts","="*30)
display(df_train[col].value_counts(dropna=False).head())
display(df_test[col].value_counts(dropna=False).head())
print("="*30,"na","="*30)
print("train:",df_train[col].isna().sum())
print("test :",df_test[col].isna().sum())
print("="*30,"dtypes","="*30)
print(df_train[col].dtypes)
print(df_test[col].dtypes)

In [None]:
HistBox(df_train[col])
HistBox(df_test[col])

In [None]:
plot_series(df_train[col], df_test[col])
plt.show()

In [None]:
na_dates = list(set([x.strftime("%Y-%m-%d") for x in df_train.index[df_train[col].isna()]]))
na_dates.sort()
filed = Imputer("ffill").fit_transform(df_train[col])  # 予測対象美の未来のデータは使えないと想定しffill
for date in na_dates:
  na_index = (f"{date} 00:00",f"{date} 23:00")
  plot_series(filed.loc[na_index[0]:na_index[1]], df_train.loc[na_index[0]:na_index[1], col])
  plt.show()


In [None]:
df_train[col] = Imputer("ffill").fit_transform(df_train[col])
df_test[col] = Imputer("ffill").fit_transform(df_test[col])

In [None]:
number, col = disp_markdown(number)

In [None]:
print("="*30,"value_counts","="*30)
display(df_train[col].value_counts(dropna=False).head())
display(df_test[col].value_counts(dropna=False).head())
print("="*30,"na","="*30)
print("train:",df_train[col].isna().sum())
print("test :",df_test[col].isna().sum())
print("="*30,"dtypes","="*30)
print(df_train[col].dtypes)
print(df_test[col].dtypes)

In [None]:
HistBox(df_train[col])
HistBox(df_test[col])

In [None]:
plot_series(df_train[col], df_test[col])
plt.show()

In [None]:
na_dates = list(set([x.strftime("%Y-%m-%d") for x in df_train.index[df_train[col].isna()]]))
na_dates.sort()
filed = Imputer("ffill").fit_transform(df_train[col])  # 予測対象美の未来のデータは使えないと想定しffill
for date in na_dates:
  na_index = (f"{date} 00:00",f"{date} 23:00")
  plot_series(filed.loc[na_index[0]:na_index[1]], df_train.loc[na_index[0]:na_index[1], col])
  plt.show()


In [None]:
df_train[col] = Imputer("ffill").fit_transform(df_train[col])
df_test[col] = Imputer("ffill").fit_transform(df_test[col])

In [None]:
number, col = disp_markdown(number)

In [None]:
print("="*30,"value_counts","="*30)
display(df_train[col].value_counts(dropna=False).head())
display(df_test[col].value_counts(dropna=False).head())
print("="*30,"na","="*30)
print("train:",df_train[col].isna().sum())
print("test :",df_test[col].isna().sum())
print("="*30,"dtypes","="*30)
print(df_train[col].dtypes)
print(df_test[col].dtypes)

In [None]:
HistBox(df_train[col])
HistBox(df_test[col])

In [None]:
plot_series(df_train[col], df_test[col])
plt.show()

In [None]:
na_dates = list(set([x.strftime("%Y-%m-%d") for x in df_train.index[df_train[col].isna()]]))
na_dates.sort()
filed = Imputer("ffill").fit_transform(df_train[col])  # 予測対象美の未来のデータは使えないと想定しffill
for date in na_dates:
  na_index = (f"{date} 00:00",f"{date} 23:00")
  plot_series(filed.loc[na_index[0]:na_index[1]], df_train.loc[na_index[0]:na_index[1], col])
  plt.show()


In [None]:
df_train[col] = Imputer("ffill").fit_transform(df_train[col])
df_test[col] = Imputer("ffill").fit_transform(df_test[col])

In [None]:
number, col = disp_markdown(number)

In [None]:
print("="*30,"value_counts","="*30)
display(df_train[col].value_counts(dropna=False).head())
display(df_test[col].value_counts(dropna=False).head())
print("="*30,"na","="*30)
print("train:",df_train[col].isna().sum())
print("test :",df_test[col].isna().sum())
print("="*30,"dtypes","="*30)
print(df_train[col].dtypes)
print(df_test[col].dtypes)

In [None]:
HistBox(df_train[col])
HistBox(df_test[col])

In [None]:
plot_series(df_train[col], df_test[col])
plt.show()

In [None]:
na_dates = list(set([x.strftime("%Y-%m-%d") for x in df_train.index[df_train[col].isna()]]))
na_dates.sort()
filed = Imputer("ffill").fit_transform(df_train[col])  # 予測対象美の未来のデータは使えないと想定しffill
for date in na_dates:
  na_index = (f"{date} 00:00",f"{date} 23:00")
  plot_series(filed.loc[na_index[0]:na_index[1]], df_train.loc[na_index[0]:na_index[1], col])
  plt.show()


In [None]:
df_train[col] = Imputer("ffill").fit_transform(df_train[col])
df_test[col] = Imputer("ffill").fit_transform(df_test[col])

In [None]:
number, col = disp_markdown(number)

In [None]:
print("="*30,"value_counts","="*30)
display(df_train[col].value_counts(dropna=False).head())
display(df_test[col].value_counts(dropna=False).head())
print("="*30,"na","="*30)
print("train:",df_train[col].isna().sum())
print("test :",df_test[col].isna().sum())
print("="*30,"dtypes","="*30)
print(df_train[col].dtypes)
print(df_test[col].dtypes)

In [None]:
HistBox(df_train[col])
HistBox(df_test[col])

In [None]:
plot_series(df_train[col], df_test[col])
plt.show()

In [None]:
na_dates = list(set([x.strftime("%Y-%m-%d") for x in df_train.index[df_train[col].isna()]]))
na_dates.sort()
filed = Imputer("ffill").fit_transform(df_train[col])  # 予測対象美の未来のデータは使えないと想定しffill
for date in na_dates:
  na_index = (f"{date} 00:00",f"{date} 23:00")
  plot_series(filed.loc[na_index[0]:na_index[1]], df_train.loc[na_index[0]:na_index[1], col])
  plt.show()


In [None]:
df_train[col] = Imputer("ffill").fit_transform(df_train[col])
df_test[col] = Imputer("ffill").fit_transform(df_test[col])

In [None]:
number, col = disp_markdown(number)

In [None]:
print("="*30,"value_counts","="*30)
display(df_train[col].value_counts(dropna=False).head())
display(df_test[col].value_counts(dropna=False).head())
print("="*30,"na","="*30)
print("train:",df_train[col].isna().sum())
print("test :",df_test[col].isna().sum())
print("="*30,"dtypes","="*30)
print(df_train[col].dtypes)
print(df_test[col].dtypes)

In [None]:
HistBox(df_train[col])
HistBox(df_test[col])

In [None]:
plot_series(df_train[col], df_test[col])
plt.show()

In [None]:
na_dates = list(set([x.strftime("%Y-%m-%d") for x in df_train.index[df_train[col].isna()]]))
na_dates.sort()
filed = Imputer("ffill").fit_transform(df_train[col])  # 予測対象美の未来のデータは使えないと想定しffill
for date in na_dates:
  na_index = (f"{date} 00:00",f"{date} 23:00")
  plot_series(filed.loc[na_index[0]:na_index[1]], df_train.loc[na_index[0]:na_index[1], col])
  plt.show()


In [None]:
df_train[col] = Imputer("ffill").fit_transform(df_train[col])
df_test[col] = Imputer("ffill").fit_transform(df_test[col])

In [None]:
number, col = disp_markdown(number)

In [None]:
print("="*30,"value_counts","="*30)
display(df_train[col].value_counts(dropna=False).head())
display(df_test[col].value_counts(dropna=False).head())
print("="*30,"na","="*30)
print("train:",df_train[col].isna().sum())
print("test :",df_test[col].isna().sum())
print("="*30,"dtypes","="*30)
print(df_train[col].dtypes)
print(df_test[col].dtypes)

In [None]:
HistBox(df_train[col])
HistBox(df_test[col])

In [None]:
plot_series(df_train[col], df_test[col])
plt.show()

In [None]:
na_dates = list(set([x.strftime("%Y-%m-%d") for x in df_train.index[df_train[col].isna()]]))
na_dates.sort()
filed = Imputer("ffill").fit_transform(df_train[col])  # 予測対象美の未来のデータは使えないと想定しffill
for date in na_dates:
  na_index = (f"{date} 00:00",f"{date} 23:00")
  plot_series(filed.loc[na_index[0]:na_index[1]], df_train.loc[na_index[0]:na_index[1], col])
  plt.show()


In [None]:
df_train[col] = Imputer("ffill").fit_transform(df_train[col])
df_test[col] = Imputer("ffill").fit_transform(df_test[col])

In [None]:
number, col = disp_markdown(number)

In [None]:
print("="*30,"value_counts","="*30)
display(df_train[col].value_counts(dropna=False).head())
display(df_test[col].value_counts(dropna=False).head())
print("="*30,"na","="*30)
print("train:",df_train[col].isna().sum())
print("test :",df_test[col].isna().sum())
print("="*30,"dtypes","="*30)
print(df_train[col].dtypes)
print(df_test[col].dtypes)

In [None]:
HistBox(df_train[col])
HistBox(df_test[col])

In [None]:
plot_series(df_train[col], df_test[col])
plt.show()

In [None]:
na_dates = list(set([x.strftime("%Y-%m-%d") for x in df_train.index[df_train[col].isna()]]))
na_dates.sort()
filed = Imputer("ffill").fit_transform(df_train[col])  # 予測対象美の未来のデータは使えないと想定しffill
for date in na_dates:
  na_index = (f"{date} 00:00",f"{date} 23:00")
  plot_series(filed.loc[na_index[0]:na_index[1]], df_train.loc[na_index[0]:na_index[1], col])
  plt.show()


In [None]:
df_train[col] = Imputer("ffill").fit_transform(df_train[col])
df_test[col] = Imputer("ffill").fit_transform(df_test[col])

In [None]:
number, col = disp_markdown(number)

In [None]:
print("="*30,"value_counts","="*30)
display(df_train[col].value_counts(dropna=False).head())
display(df_test[col].value_counts(dropna=False).head())
print("="*30,"na","="*30)
print("train:",df_train[col].isna().sum())
print("test :",df_test[col].isna().sum())
print("="*30,"dtypes","="*30)
print(df_train[col].dtypes)
print(df_test[col].dtypes)

In [None]:
HistBox(df_train[col])
HistBox(df_test[col])

In [None]:
plot_series(df_train[col], df_test[col])
plt.show()

In [None]:
na_dates = list(set([x.strftime("%Y-%m-%d") for x in df_train.index[df_train[col].isna()]]))
na_dates.sort()
filed = Imputer("ffill").fit_transform(df_train[col])  # 予測対象美の未来のデータは使えないと想定しffill
for date in na_dates:
  na_index = (f"{date} 00:00",f"{date} 23:00")
  plot_series(filed.loc[na_index[0]:na_index[1]], df_train.loc[na_index[0]:na_index[1], col])
  plt.show()


In [None]:
df_train[col] = Imputer("ffill").fit_transform(df_train[col])
df_test[col] = Imputer("ffill").fit_transform(df_test[col])

In [None]:
number, col = disp_markdown(number)

In [None]:
print("="*30,"value_counts","="*30)
display(df_train[col].value_counts(dropna=False).head())
display(df_test[col].value_counts(dropna=False).head())
print("="*30,"na","="*30)
print("train:",df_train[col].isna().sum())
print("test :",df_test[col].isna().sum())
print("="*30,"dtypes","="*30)
print(df_train[col].dtypes)
print(df_test[col].dtypes)

In [None]:
HistBox(df_train[col])
HistBox(df_test[col])

In [None]:
plot_series(df_train[col], df_test[col])
plt.show()

In [None]:
na_dates = list(set([x.strftime("%Y-%m-%d") for x in df_train.index[df_train[col].isna()]]))
na_dates.sort()
filed = Imputer("ffill").fit_transform(df_train[col])  # 予測対象美の未来のデータは使えないと想定しffill
for date in na_dates:
  na_index = (f"{date} 00:00",f"{date} 23:00")
  plot_series(filed.loc[na_index[0]:na_index[1]], df_train.loc[na_index[0]:na_index[1], col])
  plt.show()


In [None]:
df_train[col] = Imputer("ffill").fit_transform(df_train[col])
df_test[col] = Imputer("ffill").fit_transform(df_test[col])

In [None]:
number, col = disp_markdown(number)

In [None]:
print("="*30,"value_counts","="*30)
display(df_train[col].value_counts(dropna=False).head())
display(df_test[col].value_counts(dropna=False).head())
print("="*30,"na","="*30)
print("train:",df_train[col].isna().sum())
print("test :",df_test[col].isna().sum())
print("="*30,"dtypes","="*30)
print(df_train[col].dtypes)
print(df_test[col].dtypes)

In [None]:
HistBox(df_train[col])
HistBox(df_test[col])

In [None]:
plot_series(df_train[col], df_test[col])
plt.show()

In [None]:
na_dates = list(set([x.strftime("%Y-%m-%d") for x in df_train.index[df_train[col].isna()]]))
na_dates.sort()
filed = Imputer("ffill").fit_transform(df_train[col])  # 予測対象美の未来のデータは使えないと想定しffill
for date in na_dates:
  na_index = (f"{date} 00:00",f"{date} 23:00")
  plot_series(filed.loc[na_index[0]:na_index[1]], df_train.loc[na_index[0]:na_index[1], col])
  plt.show()


In [None]:
df_train[col] = Imputer("ffill").fit_transform(df_train[col])
df_test[col] = Imputer("ffill").fit_transform(df_test[col])

In [None]:
# Row-wise sum of every column whose name starts with "gene", compared with the current
# column both visually and through the correlation matrix.
generation_total = df_train.filter(regex="^gene").sum(axis=1)
plot_series(generation_total, df_train[col]), np.corrcoef(generation_total, df_train[col])

In [None]:
number, col = disp_markdown(number)

In [None]:
# Guard: the walk is expected to be on a column whose name ends with "temp" here.
assert col.endswith("temp")
# Prefix before the first underscore; later cells build f"{area}_temp" from it.
area = col.split("_")[0]

In [None]:
print("="*30,"value_counts","="*30)
display(df_train[col].value_counts(dropna=False).head())
display(df_test[col].value_counts(dropna=False).head())
print("="*30,"na","="*30)
print("train:",df_train[col].isna().sum())
print("test :",df_test[col].isna().sum())
print("="*30,"dtypes","="*30)
print(df_train[col].dtypes)
print(df_test[col].dtypes)

In [None]:
number, col = disp_markdown(number)

In [None]:
print("="*30,"value_counts","="*30)
display(df_train[col].value_counts(dropna=False).head())
display(df_test[col].value_counts(dropna=False).head())
print("="*30,"na","="*30)
print("train:",df_train[col].isna().sum())
print("test :",df_test[col].isna().sum())
print("="*30,"dtypes","="*30)
print(df_train[col].dtypes)
print(df_test[col].dtypes)

In [None]:
number, col = disp_markdown(number)

In [None]:
print("="*30,"value_counts","="*30)
display(df_train[col].value_counts(dropna=False).head())
display(df_test[col].value_counts(dropna=False).head())
print("="*30,"na","="*30)
print("train:",df_train[col].isna().sum())
print("test :",df_test[col].isna().sum())
print("="*30,"dtypes","="*30)
print(df_train[col].dtypes)
print(df_test[col].dtypes)

In [None]:
HistBox(df_train[col])
HistBox(df_test[col])

In [None]:
# Overlay every temperature-related column of the current area, one figure per split.
area_temp_train = df_train.filter(regex=f"^{area}_temp")
area_temp_test = df_test.filter(regex=f"^{area}_temp")
for area_temp in (area_temp_train, area_temp_test):
  fig, ax = plt.subplots(figsize=(15, 5))
  # Dedicated loop name: iterating with `col` would overwrite the notebook-level
  # current column and break any cell that is re-run afterwards.
  for temp_name in area_temp.columns:
    ax.plot(area_temp.index.to_timestamp(), area_temp[temp_name], alpha=0.3, label=temp_name)
  fig.legend()
  plt.show()

In [None]:
number, col = disp_markdown(number)

In [None]:
print("="*30,"value_counts","="*30)
display(df_train[col].value_counts(dropna=False).head())
display(df_test[col].value_counts(dropna=False).head())
print("="*30,"na","="*30)
print("train:",df_train[col].isna().sum())
print("test :",df_test[col].isna().sum())
print("="*30,"dtypes","="*30)
print(df_train[col].dtypes)
print(df_test[col].dtypes)

In [None]:
HistBox(df_train[col])
HistBox(df_test[col])

In [None]:
number, col = disp_markdown(number)

In [None]:
print("="*30,"value_counts","="*30)
display(df_train[col].value_counts(dropna=False).head())
display(df_test[col].value_counts(dropna=False).head())
print("="*30,"na","="*30)
print("train:",df_train[col].isna().sum())
print("test :",df_test[col].isna().sum())
print("="*30,"dtypes","="*30)
print(df_train[col].dtypes)
print(df_test[col].dtypes)

In [None]:
HistBox(df_train[col])
HistBox(df_test[col])

In [None]:
# Derive absolute humidity from the area's temperature column and the current column
# (passed as the relative-humidity input), and insert it right after the current column.
# Note: DataFrame.insert raises if the new column already exists, so this cell is not re-runnable as-is.
temp_col = f"{area}_temp"
df_train.insert(df_train.columns.get_loc(col)+1, col + "_absolute", compute_absolute_humidity(df_train, temp_col, col))
df_test.insert(df_test.columns.get_loc(col)+1, col + "_absolute", compute_absolute_humidity(df_test, temp_col, col))

In [None]:
number, col = disp_markdown(number)

In [None]:
print("="*30,"value_counts","="*30)
display(df_train[col].value_counts(dropna=False).head())
display(df_test[col].value_counts(dropna=False).head())
print("="*30,"na","="*30)
print("train:",df_train[col].isna().sum())
print("test :",df_test[col].isna().sum())
print("="*30,"dtypes","="*30)
print(df_train[col].dtypes)
print(df_test[col].dtypes)

In [None]:
HistBox(df_train[col])
HistBox(df_test[col])

In [None]:
plot_series(df_train[col], df_test[col])
plt.show()

In [None]:
number, col = disp_markdown(number)

In [None]:
print("="*30,"value_counts","="*30)
display(df_train[col].value_counts(dropna=False).head())
display(df_test[col].value_counts(dropna=False).head())
print("="*30,"na","="*30)
print("train:",df_train[col].isna().sum())
print("test :",df_test[col].isna().sum())
print("="*30,"dtypes","="*30)
print(df_train[col].dtypes)
print(df_test[col].dtypes)

In [None]:
HistBox(df_train[col])
HistBox(df_test[col])

In [None]:
# Convert the angle (treated as degrees: value / 360 * 2π) to its sine.
# NOTE(review): sine alone maps distinct directions (e.g. 0° and 180°) to the same value;
# consider whether a cosine component is also needed — confirm with the modelling step.
sin_train = np.sin(df_train[col] / 360 * 2 *np.pi)
sin_test = np.sin(df_test[col] / 360 * 2 *np.pi)

In [None]:
HistBox(sin_train)
HistBox(sin_test)

In [None]:
plot_series(sin_train, sin_test)
plt.show()

In [None]:
# Overwrite the angle column with its sine, then rename it with a "sin_" prefix in both frames.
# df_train_raw keeps the original name, so the disp_markdown() walk is unaffected.
df_train[col] = sin_train
df_test[col] = sin_test
df_train.rename(columns={col:"sin_"+col}, inplace=True)
df_test.rename(columns={col:"sin_"+col}, inplace=True)

In [None]:
number, col = disp_markdown(number)

In [None]:
print("="*30,"value_counts","="*30)
display(df_train[col].value_counts(dropna=False).head())
display(df_test[col].value_counts(dropna=False).head())
print("="*30,"na","="*30)
print("train:",df_train[col].isna().sum())
print("test :",df_test[col].isna().sum())
print("="*30,"dtypes","="*30)
print(df_train[col].dtypes)
print(df_test[col].dtypes)

In [None]:
HistBox(df_train[col])
HistBox(df_test[col])

In [None]:
plot_series(df_train[col], df_test[col])
plt.show()

In [None]:
number, col = disp_markdown(number)

In [None]:
print("="*30,"value_counts","="*30)
display(df_train[col].value_counts(dropna=False).head())
display(df_test[col].value_counts(dropna=False).head())
print("="*30,"na","="*30)
print("train:",df_train[col].isna().sum())
print("test :",df_test[col].isna().sum())
print("="*30,"dtypes","="*30)
print(df_train[col].dtypes)
print(df_test[col].dtypes)

In [None]:
HistBox(df_train[col])
HistBox(df_test[col])

In [None]:
plot_series(df_train[col], df_test[col])
plt.show()

In [None]:
number, col = disp_markdown(number)

In [None]:
print("="*30,"value_counts","="*30)
display(df_train[col].value_counts(dropna=False).head())
display(df_test[col].value_counts(dropna=False).head())
print("="*30,"na","="*30)
print("train:",df_train[col].isna().sum())
print("test :",df_test[col].isna().sum())
print("="*30,"dtypes","="*30)
print(df_train[col].dtypes)
print(df_test[col].dtypes)

In [None]:
HistBox(df_train[col])
HistBox(df_test[col])

In [None]:
plot_series(df_train[col], df_test[col])
plt.show()

In [None]:
number, col = disp_markdown(number)

In [None]:
print("="*30,"value_counts","="*30)
display(df_train[col].value_counts(dropna=False).head())
display(df_test[col].value_counts(dropna=False).head())
print("="*30,"na","="*30)
print("train:",df_train[col].isna().sum())
print("test :",df_test[col].isna().sum())
print("="*30,"dtypes","="*30)
print(df_train[col].dtypes)
print(df_test[col].dtypes)

In [None]:
HistBox(df_train[col])
HistBox(df_test[col])

In [None]:
plot_series(df_train[col], df_test[col])
plt.show()

In [None]:
number, col = disp_markdown(number)

In [None]:
print("="*30,"value_counts","="*30)
display(df_train[col].value_counts(dropna=False).head())
display(df_test[col].value_counts(dropna=False).head())
print("="*30,"na","="*30)
print("train:",df_train[col].isna().sum())
print("test :",df_test[col].isna().sum())
print("="*30,"dtypes","="*30)
print(df_train[col].dtypes)
print(df_test[col].dtypes)

In [None]:
HistBox(df_train[col])
HistBox(df_test[col])

In [None]:
set(df_test[col]) - set(df_train[col])

In [None]:
# One-hot encode the current column. Test gets exactly the levels seen in train, so
# both frames end up with the same dummy columns.
insert_loc = df_test.columns.get_loc(col)
# dropna(): make_dummy() uses dummy_na=False and never creates a NaN column for train,
# so a NaN level must not produce a column in test either (it would also break sort()
# when the other levels are strings).
unique = df_train_raw[col].dropna().unique()
unique.sort()
for i, u in enumerate(unique):
  df_test.insert(insert_loc+i+1, f"{col}_{u}", (df_test[col]==u).astype("uint8"))
df_test.drop(col, axis=1, inplace=True)
# drop=False keeps every level, matching the full set of columns built for test above.
df_train = make_dummy(df_train, col, drop=False)

In [None]:
number, col = disp_markdown(number)

In [None]:
print("="*30,"value_counts","="*30)
display(df_train[col].value_counts(dropna=False).head())
display(df_test[col].value_counts(dropna=False).head())
print("="*30,"na","="*30)
print("train:",df_train[col].isna().sum())
print("test :",df_test[col].isna().sum())
print("="*30,"dtypes","="*30)
print(df_train[col].dtypes)
print(df_test[col].dtypes)

In [None]:
print(df_train[col].nunique(), df_test[col].nunique())
set(df_test[col]) - set(df_train[col])

In [None]:
# One-hot encode the current column. Test gets exactly the levels seen in train, so
# both frames end up with the same dummy columns.
insert_loc = df_test.columns.get_loc(col)
# dropna(): make_dummy() uses dummy_na=False and never creates a NaN column for train,
# so a NaN level must not produce a column in test either (it would also break sort()
# when the other levels are strings).
unique = df_train_raw[col].dropna().unique()
unique.sort()
for i, u in enumerate(unique):
  df_test.insert(insert_loc+i+1, f"{col}_{u}", (df_test[col]==u).astype("uint8"))
df_test.drop(col, axis=1, inplace=True)
# drop=False keeps every level, matching the full set of columns built for test above.
df_train = make_dummy(df_train, col, drop=False)

In [None]:
number, col = disp_markdown(number)

In [None]:
print("="*30,"value_counts","="*30)
display(df_train[col].value_counts(dropna=False).head())
display(df_test[col].value_counts(dropna=False).head())
print("="*30,"na","="*30)
print("train:",df_train[col].isna().sum())
print("test :",df_test[col].isna().sum())
print("="*30,"dtypes","="*30)
print(df_train[col].dtypes)
print(df_test[col].dtypes)

In [None]:
print(df_train[col].nunique(), df_test[col].nunique())
set(df_test[col]) - set(df_train[col])

In [None]:
# One-hot encode the current column. Test gets exactly the levels seen in train, so
# both frames end up with the same dummy columns.
insert_loc = df_test.columns.get_loc(col)
# dropna(): make_dummy() uses dummy_na=False and never creates a NaN column for train,
# so a NaN level must not produce a column in test either (it would also break sort()
# when the other levels are strings).
unique = df_train_raw[col].dropna().unique()
unique.sort()
for i, u in enumerate(unique):
  df_test.insert(insert_loc+i+1, f"{col}_{u}", (df_test[col]==u).astype("uint8"))
df_test.drop(col, axis=1, inplace=True)
# drop=False keeps every level, matching the full set of columns built for test above.
df_train = make_dummy(df_train, col, drop=False)

In [None]:
number, col = disp_markdown(number)

In [None]:
print("="*30,"value_counts","="*30)
display(df_train[col].value_counts(dropna=False).head())
display(df_test[col].value_counts(dropna=False).head())
print("="*30,"na","="*30)
print("train:",df_train[col].isna().sum())
print("test :",df_test[col].isna().sum())
print("="*30,"dtypes","="*30)
print(df_train[col].dtypes)
print(df_test[col].dtypes)

In [None]:
print(df_train[col].nunique(), df_test[col].nunique())
set(df_test[col]) - set(df_train[col])

In [None]:
# One-hot encode the current column. Test gets exactly the levels seen in train, so
# both frames end up with the same dummy columns.
insert_loc = df_test.columns.get_loc(col)
# dropna(): make_dummy() uses dummy_na=False and never creates a NaN column for train,
# so a NaN level must not produce a column in test either (it would also break sort()
# when the other levels are strings).
unique = df_train_raw[col].dropna().unique()
unique.sort()
for i, u in enumerate(unique):
  df_test.insert(insert_loc+i+1, f"{col}_{u}", (df_test[col]==u).astype("uint8"))
df_test.drop(col, axis=1, inplace=True)
# drop=False keeps every level, matching the full set of columns built for test above.
df_train = make_dummy(df_train, col, drop=False)

In [None]:
number, col = disp_markdown(number)

In [None]:
# Guard: the walk is expected to be on a column whose name ends with "temp" here.
assert col.endswith("temp")
# Refresh the area prefix for this block of columns. Later cells build f"{area}_temp"
# from it; without this line they would keep using the previous area's temperature.
area = col.split("_")[0]

In [None]:
print("="*30,"value_counts","="*30)
display(df_train[col].value_counts(dropna=False).head())
display(df_test[col].value_counts(dropna=False).head())
print("="*30,"na","="*30)
print("train:",df_train[col].isna().sum())
print("test :",df_test[col].isna().sum())
print("="*30,"dtypes","="*30)
print(df_train[col].dtypes)
print(df_test[col].dtypes)

In [None]:
number, col = disp_markdown(number)

In [None]:
print("="*30,"value_counts","="*30)
display(df_train[col].value_counts(dropna=False).head())
display(df_test[col].value_counts(dropna=False).head())
print("="*30,"na","="*30)
print("train:",df_train[col].isna().sum())
print("test :",df_test[col].isna().sum())
print("="*30,"dtypes","="*30)
print(df_train[col].dtypes)
print(df_test[col].dtypes)

In [None]:
number, col = disp_markdown(number)

In [None]:
print("="*30,"value_counts","="*30)
display(df_train[col].value_counts(dropna=False).head())
display(df_test[col].value_counts(dropna=False).head())
print("="*30,"na","="*30)
print("train:",df_train[col].isna().sum())
print("test :",df_test[col].isna().sum())
print("="*30,"dtypes","="*30)
print(df_train[col].dtypes)
print(df_test[col].dtypes)

In [None]:
HistBox(df_train[col])
HistBox(df_test[col])

In [None]:
# Overlay the temperature-related columns of the area under review, one figure per split.
# The prefix is taken from the current column so the plot always matches the area
# being reviewed rather than whatever `area` was last set to.
temp_prefix = col.split("_")[0]
area_temp_train = df_train.filter(regex=f"^{temp_prefix}_temp")
area_temp_test = df_test.filter(regex=f"^{temp_prefix}_temp")
for area_temp in (area_temp_train, area_temp_test):
  fig, ax = plt.subplots(figsize=(15, 5))
  # Dedicated loop name: iterating with `col` would overwrite the notebook-level
  # current column and break any cell that is re-run afterwards.
  for temp_name in area_temp.columns:
    ax.plot(area_temp.index.to_timestamp(), area_temp[temp_name], alpha=0.3, label=temp_name)
  fig.legend()
  plt.show()

In [None]:
number, col = disp_markdown(number)

In [None]:
# Quick train/test comparison for the column currently selected in `col`:
# top value counts (NaN included), missing-value totals, and dtypes.
banner = "=" * 30
print(banner, "value_counts", banner)
for split_frame in (df_train, df_test):
  display(split_frame[col].value_counts(dropna=False).head())
print(banner, "na", banner)
for split_label, split_frame in (("train:", df_train), ("test :", df_test)):
  print(split_label, split_frame[col].isna().sum())
print(banner, "dtypes", banner)
for split_frame in (df_train, df_test):
  print(split_frame[col].dtypes)

In [None]:
HistBox(df_train[col])
HistBox(df_test[col])

In [None]:
number, col = disp_markdown(number)

In [None]:
# Quick train/test comparison for the column currently selected in `col`:
# top value counts (NaN included), missing-value totals, and dtypes.
banner = "=" * 30
print(banner, "value_counts", banner)
for split_frame in (df_train, df_test):
  display(split_frame[col].value_counts(dropna=False).head())
print(banner, "na", banner)
for split_label, split_frame in (("train:", df_train), ("test :", df_test)):
  print(split_label, split_frame[col].isna().sum())
print(banner, "dtypes", banner)
for split_frame in (df_train, df_test):
  print(split_frame[col].dtypes)

In [None]:
HistBox(df_train[col])
HistBox(df_test[col])

In [None]:
# Add "<col>_absolute" immediately to the right of `col` in both frames. The
# value comes from compute_absolute_humidity (defined outside this cell), fed
# with the area's temperature column and `col`.
# NOTE(review): presumably `col` is a relative-humidity column here — confirm.
temp_col = f"{area}_temp"
for split_frame in (df_train, df_test):
  target_position = split_frame.columns.get_loc(col) + 1
  split_frame.insert(
    target_position,
    col + "_absolute",
    compute_absolute_humidity(split_frame, temp_col, col),
  )

In [None]:
number, col = disp_markdown(number)

In [None]:
# Quick train/test comparison for the column currently selected in `col`:
# top value counts (NaN included), missing-value totals, and dtypes.
banner = "=" * 30
print(banner, "value_counts", banner)
for split_frame in (df_train, df_test):
  display(split_frame[col].value_counts(dropna=False).head())
print(banner, "na", banner)
for split_label, split_frame in (("train:", df_train), ("test :", df_test)):
  print(split_label, split_frame[col].isna().sum())
print(banner, "dtypes", banner)
for split_frame in (df_train, df_test):
  print(split_frame[col].dtypes)

In [None]:
HistBox(df_train[col])
HistBox(df_test[col])

In [None]:
plot_series(df_train[col], df_test[col])
plt.show()

In [None]:
number, col = disp_markdown(number)

In [None]:
# Quick train/test comparison for the column currently selected in `col`:
# top value counts (NaN included), missing-value totals, and dtypes.
banner = "=" * 30
print(banner, "value_counts", banner)
for split_frame in (df_train, df_test):
  display(split_frame[col].value_counts(dropna=False).head())
print(banner, "na", banner)
for split_label, split_frame in (("train:", df_train), ("test :", df_test)):
  print(split_label, split_frame[col].isna().sum())
print(banner, "dtypes", banner)
for split_frame in (df_train, df_test):
  print(split_frame[col].dtypes)

In [None]:
HistBox(df_train[col])
HistBox(df_test[col])

In [None]:
# Convert the angle column to its sine. The values are treated as degrees:
# divided by 360 and multiplied by 2*pi to get radians before taking sin.
# NOTE(review): sine alone is ambiguous (theta and 180 - theta give the same
# value); consider whether a cosine component is also needed.
sin_train, sin_test = (
  np.sin(split_frame[col].div(360).mul(2).mul(np.pi))
  for split_frame in (df_train, df_test)
)

In [None]:
HistBox(sin_train)
HistBox(sin_test)

In [None]:
plot_series(sin_train, sin_test)
plt.show()

In [None]:
# Overwrite the angle column with its sine and rename it to "sin_<col>" in
# both frames (in place, so the column keeps its position).
# The `col in columns` guard makes the cell safe to re-run: after the first run
# `col` no longer exists, and an unguarded assignment would append a brand-new
# `col` column that the rename then turns into a duplicate "sin_<col>" label.
sin_col = "sin_" + col
for split_frame, sin_values in ((df_train, sin_train), (df_test, sin_test)):
  if col in split_frame.columns:
    split_frame[col] = sin_values
    split_frame.rename(columns={col: sin_col}, inplace=True)

In [None]:
number, col = disp_markdown(number)

In [None]:
print("="*30,"value_counts","="*30)
display(df_train[col].value_counts(dropna=False).head())
display(df_test[col].value_counts(dropna=False).head())
print("="*30,"na","="*30)
print("train:",df_train[col].isna().sum())
print("test :",df_test[col].isna().sum())
print("="*30,"dtypes","="*30)
print(df_train[col].dtypes)
print(df_test[col].dtypes)

In [None]:
HistBox(df_train[col])
HistBox(df_test[col])

In [None]:
plot_series(df_train[col], df_test[col])
plt.show()

In [None]:
number, col = disp_markdown(number)

In [None]:
print("="*30,"value_counts","="*30)
display(df_train[col].value_counts(dropna=False).head())
display(df_test[col].value_counts(dropna=False).head())
print("="*30,"na","="*30)
print("train:",df_train[col].isna().sum())
print("test :",df_test[col].isna().sum())
print("="*30,"dtypes","="*30)
print(df_train[col].dtypes)
print(df_test[col].dtypes)

In [None]:
HistBox(df_train[col])
HistBox(df_test[col])

In [None]:
plot_series(df_train[col], df_test[col])
plt.show()

In [None]:
number, col = disp_markdown(number)

In [None]:
print("="*30,"value_counts","="*30)
display(df_train[col].value_counts(dropna=False).head())
display(df_test[col].value_counts(dropna=False).head())
print("="*30,"na","="*30)
print("train:",df_train[col].isna().sum())
print("test :",df_test[col].isna().sum())
print("="*30,"dtypes","="*30)
print(df_train[col].dtypes)
print(df_test[col].dtypes)

In [None]:
HistBox(df_train[col])
HistBox(df_test[col])

In [None]:
plot_series(df_train[col], df_test[col])
plt.show()

In [None]:
number, col = disp_markdown(number)

In [None]:
print("="*30,"value_counts","="*30)
display(df_train[col].value_counts(dropna=False).head())
display(df_test[col].value_counts(dropna=False).head())
print("="*30,"na","="*30)
print("train:",df_train[col].isna().sum())
print("test :",df_test[col].isna().sum())
print("="*30,"dtypes","="*30)
print(df_train[col].dtypes)
print(df_test[col].dtypes)

In [None]:
HistBox(df_train[col])
HistBox(df_test[col])

In [None]:
plot_series(df_train[col], df_test[col])
plt.show()

In [None]:
number, col = disp_markdown(number)

In [None]:
print("="*30,"value_counts","="*30)
display(df_train[col].value_counts(dropna=False).head())
display(df_test[col].value_counts(dropna=False).head())
print("="*30,"na","="*30)
print("train:",df_train[col].isna().sum())
print("test :",df_test[col].isna().sum())
print("="*30,"dtypes","="*30)
print(df_train[col].dtypes)
print(df_test[col].dtypes)

In [None]:
HistBox(df_train[col])
HistBox(df_test[col])

In [None]:
set(df_test[col]) - set(df_train[col])

In [None]:
# One-hot encode `col` in the test frame using the sorted distinct values of
# the raw training data, so test gets one uint8 indicator per train level,
# placed right after the original column, which is then dropped from test.
insert_loc = df_test.columns.get_loc(col)
unique = np.sort(df_train_raw[col].unique())
for position, u in enumerate(unique, start=1):
  indicator = df_test[col].eq(u).astype("uint8")
  df_test.insert(insert_loc + position, f"{col}_{u}", indicator)
df_test.drop(columns=col, inplace=True)
# make_dummy is defined outside this cell.
# NOTE(review): test drops the original column above while train is encoded
# with drop=False — confirm both frames end up with matching columns.
df_train = make_dummy(df_train, col, drop=False)

In [None]:
number, col = disp_markdown(number)

In [None]:
print("="*30,"value_counts","="*30)
display(df_train[col].value_counts(dropna=False).head())
display(df_test[col].value_counts(dropna=False).head())
print("="*30,"na","="*30)
print("train:",df_train[col].isna().sum())
print("test :",df_test[col].isna().sum())
print("="*30,"dtypes","="*30)
print(df_train[col].dtypes)
print(df_test[col].dtypes)

In [None]:
print(df_train[col].nunique(), df_test[col].nunique())
set(df_test[col]) - set(df_train[col])

In [None]:
# One-hot encode `col` in the test frame using the sorted distinct values of
# the raw training data, so test gets one uint8 indicator per train level,
# placed right after the original column, which is then dropped from test.
insert_loc = df_test.columns.get_loc(col)
unique = np.sort(df_train_raw[col].unique())
for position, u in enumerate(unique, start=1):
  indicator = df_test[col].eq(u).astype("uint8")
  df_test.insert(insert_loc + position, f"{col}_{u}", indicator)
df_test.drop(columns=col, inplace=True)
# make_dummy is defined outside this cell.
# NOTE(review): test drops the original column above while train is encoded
# with drop=False — confirm both frames end up with matching columns.
df_train = make_dummy(df_train, col, drop=False)

In [None]:
number, col = disp_markdown(number)

In [None]:
print("="*30,"value_counts","="*30)
display(df_train[col].value_counts(dropna=False).head())
display(df_test[col].value_counts(dropna=False).head())
print("="*30,"na","="*30)
print("train:",df_train[col].isna().sum())
print("test :",df_test[col].isna().sum())
print("="*30,"dtypes","="*30)
print(df_train[col].dtypes)
print(df_test[col].dtypes)

In [None]:
print(df_train[col].nunique(), df_test[col].nunique())
set(df_test[col]) - set(df_train[col])

In [None]:
# One-hot encode `col` in the test frame using the sorted distinct values of
# the raw training data, so test gets one uint8 indicator per train level,
# placed right after the original column, which is then dropped from test.
insert_loc = df_test.columns.get_loc(col)
unique = np.sort(df_train_raw[col].unique())
for position, u in enumerate(unique, start=1):
  indicator = df_test[col].eq(u).astype("uint8")
  df_test.insert(insert_loc + position, f"{col}_{u}", indicator)
df_test.drop(columns=col, inplace=True)
# make_dummy is defined outside this cell.
# NOTE(review): test drops the original column above while train is encoded
# with drop=False — confirm both frames end up with matching columns.
df_train = make_dummy(df_train, col, drop=False)

In [None]:
number, col = disp_markdown(number)

In [None]:
print("="*30,"value_counts","="*30)
display(df_train[col].value_counts(dropna=False).head())
display(df_test[col].value_counts(dropna=False).head())
print("="*30,"na","="*30)
print("train:",df_train[col].isna().sum())
print("test :",df_test[col].isna().sum())
print("="*30,"dtypes","="*30)
print(df_train[col].dtypes)
print(df_test[col].dtypes)

In [None]:
print(df_train[col].nunique(), df_test[col].nunique())
set(df_test[col]) - set(df_train[col])

In [None]:
# One-hot encode `col` in the test frame using the sorted distinct values of
# the raw training data, so test gets one uint8 indicator per train level,
# placed right after the original column, which is then dropped from test.
insert_loc = df_test.columns.get_loc(col)
unique = np.sort(df_train_raw[col].unique())
for position, u in enumerate(unique, start=1):
  indicator = df_test[col].eq(u).astype("uint8")
  df_test.insert(insert_loc + position, f"{col}_{u}", indicator)
df_test.drop(columns=col, inplace=True)
# make_dummy is defined outside this cell.
# NOTE(review): test drops the original column above while train is encoded
# with drop=False — confirm both frames end up with matching columns.
df_train = make_dummy(df_train, col, drop=False)

In [None]:
number, col = disp_markdown(number)

In [None]:
assert col.endswith("temp")

In [None]:
print("="*30,"value_counts","="*30)
display(df_train[col].value_counts(dropna=False).head())
display(df_test[col].value_counts(dropna=False).head())
print("="*30,"na","="*30)
print("train:",df_train[col].isna().sum())
print("test :",df_test[col].isna().sum())
print("="*30,"dtypes","="*30)
print(df_train[col].dtypes)
print(df_test[col].dtypes)

In [None]:
number, col = disp_markdown(number)

In [None]:
print("="*30,"value_counts","="*30)
display(df_train[col].value_counts(dropna=False).head())
display(df_test[col].value_counts(dropna=False).head())
print("="*30,"na","="*30)
print("train:",df_train[col].isna().sum())
print("test :",df_test[col].isna().sum())
print("="*30,"dtypes","="*30)
print(df_train[col].dtypes)
print(df_test[col].dtypes)

In [None]:
number, col = disp_markdown(number)

In [None]:
print("="*30,"value_counts","="*30)
display(df_train[col].value_counts(dropna=False).head())
display(df_test[col].value_counts(dropna=False).head())
print("="*30,"na","="*30)
print("train:",df_train[col].isna().sum())
print("test :",df_test[col].isna().sum())
print("="*30,"dtypes","="*30)
print(df_train[col].dtypes)
print(df_test[col].dtypes)

In [None]:
HistBox(df_train[col])
HistBox(df_test[col])

In [None]:
# Overlay every column whose name starts with "<area>_temp" as a time series,
# one figure for train and one for test, so the temperature variants can be
# compared visually.
area_temp_train = df_train.filter(regex=f"^{area}_temp")
area_temp_test = df_test.filter(regex=f"^{area}_temp")
# The inner loop variable is deliberately NOT named `col`: notebook cells share
# one namespace, so reusing `col` here would overwrite the column selected by
# the surrounding cells and make re-running them inspect the wrong column.
for split_name, area_temp in (("train", area_temp_train), ("test", area_temp_test)):
  fig, ax = plt.subplots(figsize=(15, 5))
  # Convert the index once per frame instead of once per plotted column.
  timestamps = area_temp.index.to_timestamp()
  for temp_name in area_temp.columns:
    ax.plot(timestamps, area_temp[temp_name], alpha=0.3, label=temp_name)
  ax.set_title(f"{area} temperature columns ({split_name})")
  fig.legend()
  plt.show()

In [None]:
number, col = disp_markdown(number)

In [None]:
print("="*30,"value_counts","="*30)
display(df_train[col].value_counts(dropna=False).head())
display(df_test[col].value_counts(dropna=False).head())
print("="*30,"na","="*30)
print("train:",df_train[col].isna().sum())
print("test :",df_test[col].isna().sum())
print("="*30,"dtypes","="*30)
print(df_train[col].dtypes)
print(df_test[col].dtypes)

In [None]:
HistBox(df_train[col])
HistBox(df_test[col])

In [None]:
number, col = disp_markdown(number)

In [None]:
print("="*30,"value_counts","="*30)
display(df_train[col].value_counts(dropna=False).head())
display(df_test[col].value_counts(dropna=False).head())
print("="*30,"na","="*30)
print("train:",df_train[col].isna().sum())
print("test :",df_test[col].isna().sum())
print("="*30,"dtypes","="*30)
print(df_train[col].dtypes)
print(df_test[col].dtypes)

In [None]:
HistBox(df_train[col])
HistBox(df_test[col])

In [None]:
# Add "<col>_absolute" immediately to the right of `col` in both frames. The
# value comes from compute_absolute_humidity (defined outside this cell), fed
# with the area's temperature column and `col`.
# NOTE(review): presumably `col` is a relative-humidity column here — confirm.
temp_col = f"{area}_temp"
for split_frame in (df_train, df_test):
  target_position = split_frame.columns.get_loc(col) + 1
  split_frame.insert(
    target_position,
    col + "_absolute",
    compute_absolute_humidity(split_frame, temp_col, col),
  )

In [None]:
number, col = disp_markdown(number)

In [None]:
print("="*30,"value_counts","="*30)
display(df_train[col].value_counts(dropna=False).head())
display(df_test[col].value_counts(dropna=False).head())
print("="*30,"na","="*30)
print("train:",df_train[col].isna().sum())
print("test :",df_test[col].isna().sum())
print("="*30,"dtypes","="*30)
print(df_train[col].dtypes)
print(df_test[col].dtypes)

In [None]:
HistBox(df_train[col])
HistBox(df_test[col])

In [None]:
plot_series(df_train[col], df_test[col])
plt.show()

In [None]:
number, col = disp_markdown(number)

In [None]:
print("="*30,"value_counts","="*30)
display(df_train[col].value_counts(dropna=False).head())
display(df_test[col].value_counts(dropna=False).head())
print("="*30,"na","="*30)
print("train:",df_train[col].isna().sum())
print("test :",df_test[col].isna().sum())
print("="*30,"dtypes","="*30)
print(df_train[col].dtypes)
print(df_test[col].dtypes)

In [None]:
HistBox(df_train[col])
HistBox(df_test[col])

In [None]:
# Convert the angle column to its sine. The values are treated as degrees:
# divided by 360 and multiplied by 2*pi to get radians before taking sin.
# NOTE(review): sine alone is ambiguous (theta and 180 - theta give the same
# value); consider whether a cosine component is also needed.
sin_train, sin_test = (
  np.sin(split_frame[col].div(360).mul(2).mul(np.pi))
  for split_frame in (df_train, df_test)
)

In [None]:
HistBox(sin_train)
HistBox(sin_test)

In [None]:
plot_series(sin_train, sin_test)
plt.show()

In [None]:
# Overwrite the angle column with its sine and rename it to "sin_<col>" in
# both frames (in place, so the column keeps its position).
# The `col in columns` guard makes the cell safe to re-run: after the first run
# `col` no longer exists, and an unguarded assignment would append a brand-new
# `col` column that the rename then turns into a duplicate "sin_<col>" label.
sin_col = "sin_" + col
for split_frame, sin_values in ((df_train, sin_train), (df_test, sin_test)):
  if col in split_frame.columns:
    split_frame[col] = sin_values
    split_frame.rename(columns={col: sin_col}, inplace=True)

In [None]:
number, col = disp_markdown(number)

In [None]:
print("="*30,"value_counts","="*30)
display(df_train[col].value_counts(dropna=False).head())
display(df_test[col].value_counts(dropna=False).head())
print("="*30,"na","="*30)
print("train:",df_train[col].isna().sum())
print("test :",df_test[col].isna().sum())
print("="*30,"dtypes","="*30)
print(df_train[col].dtypes)
print(df_test[col].dtypes)

In [None]:
HistBox(df_train[col])
HistBox(df_test[col])

In [None]:
plot_series(df_train[col], df_test[col])
plt.show()

In [None]:
number, col = disp_markdown(number)

In [None]:
print("="*30,"value_counts","="*30)
display(df_train[col].value_counts(dropna=False).head())
display(df_test[col].value_counts(dropna=False).head())
print("="*30,"na","="*30)
print("train:",df_train[col].isna().sum())
print("test :",df_test[col].isna().sum())
print("="*30,"dtypes","="*30)
print(df_train[col].dtypes)
print(df_test[col].dtypes)

In [None]:
HistBox(df_train[col])
HistBox(df_test[col])

In [None]:
plot_series(df_train[col], df_test[col])
plt.show()

In [None]:
number, col = disp_markdown(number)

In [None]:
print("="*30,"value_counts","="*30)
display(df_train[col].value_counts(dropna=False).head())
display(df_test[col].value_counts(dropna=False).head())
print("="*30,"na","="*30)
print("train:",df_train[col].isna().sum())
print("test :",df_test[col].isna().sum())
print("="*30,"dtypes","="*30)
print(df_train[col].dtypes)
print(df_test[col].dtypes)

In [None]:
HistBox(df_train[col])
HistBox(df_test[col])

In [None]:
plot_series(df_train[col], df_test[col])
plt.show()

In [None]:
number, col = disp_markdown(number)

In [None]:
print("="*30,"value_counts","="*30)
display(df_train[col].value_counts(dropna=False).head())
display(df_test[col].value_counts(dropna=False).head())
print("="*30,"na","="*30)
print("train:",df_train[col].isna().sum())
print("test :",df_test[col].isna().sum())
print("="*30,"dtypes","="*30)
print(df_train[col].dtypes)
print(df_test[col].dtypes)

In [None]:
HistBox(df_train[col])
HistBox(df_test[col])

In [None]:
plot_series(df_train[col], df_test[col])
plt.show()

In [None]:
number, col = disp_markdown(number)

In [None]:
print("="*30,"value_counts","="*30)
display(df_train[col].value_counts(dropna=False).head())
display(df_test[col].value_counts(dropna=False).head())
print("="*30,"na","="*30)
print("train:",df_train[col].isna().sum())
print("test :",df_test[col].isna().sum())
print("="*30,"dtypes","="*30)
print(df_train[col].dtypes)
print(df_test[col].dtypes)

In [None]:
HistBox(df_train[col])
HistBox(df_test[col])

In [None]:
set(df_test[col]) - set(df_train[col])

In [None]:
# One-hot encode `col` in the test frame using the sorted distinct values of
# the raw training data, so test gets one uint8 indicator per train level,
# placed right after the original column, which is then dropped from test.
insert_loc = df_test.columns.get_loc(col)
unique = np.sort(df_train_raw[col].unique())
for position, u in enumerate(unique, start=1):
  indicator = df_test[col].eq(u).astype("uint8")
  df_test.insert(insert_loc + position, f"{col}_{u}", indicator)
df_test.drop(columns=col, inplace=True)
# make_dummy is defined outside this cell.
# NOTE(review): test drops the original column above while train is encoded
# with drop=False — confirm both frames end up with matching columns.
df_train = make_dummy(df_train, col, drop=False)

In [None]:
number, col = disp_markdown(number)

In [None]:
print("="*30,"value_counts","="*30)
display(df_train[col].value_counts(dropna=False).head())
display(df_test[col].value_counts(dropna=False).head())
print("="*30,"na","="*30)
print("train:",df_train[col].isna().sum())
print("test :",df_test[col].isna().sum())
print("="*30,"dtypes","="*30)
print(df_train[col].dtypes)
print(df_test[col].dtypes)

In [None]:
print(df_train[col].nunique(), df_test[col].nunique())
set(df_test[col]) - set(df_train[col])

In [None]:
# One-hot encode `col` in the test frame using the sorted distinct values of
# the raw training data, so test gets one uint8 indicator per train level,
# placed right after the original column, which is then dropped from test.
insert_loc = df_test.columns.get_loc(col)
unique = np.sort(df_train_raw[col].unique())
for position, u in enumerate(unique, start=1):
  indicator = df_test[col].eq(u).astype("uint8")
  df_test.insert(insert_loc + position, f"{col}_{u}", indicator)
df_test.drop(columns=col, inplace=True)
# make_dummy is defined outside this cell.
# NOTE(review): test drops the original column above while train is encoded
# with drop=False — confirm both frames end up with matching columns.
df_train = make_dummy(df_train, col, drop=False)

In [None]:
number, col = disp_markdown(number)

In [None]:
print("="*30,"value_counts","="*30)
display(df_train[col].value_counts(dropna=False).head())
display(df_test[col].value_counts(dropna=False).head())
print("="*30,"na","="*30)
print("train:",df_train[col].isna().sum())
print("test :",df_test[col].isna().sum())
print("="*30,"dtypes","="*30)
print(df_train[col].dtypes)
print(df_test[col].dtypes)

In [None]:
print(df_train[col].nunique(), df_test[col].nunique())
set(df_test[col]) - set(df_train[col])

In [None]:
# One-hot encode `col` in the test frame using the sorted distinct values of
# the raw training data, so test gets one uint8 indicator per train level,
# placed right after the original column, which is then dropped from test.
insert_loc = df_test.columns.get_loc(col)
unique = np.sort(df_train_raw[col].unique())
for position, u in enumerate(unique, start=1):
  indicator = df_test[col].eq(u).astype("uint8")
  df_test.insert(insert_loc + position, f"{col}_{u}", indicator)
df_test.drop(columns=col, inplace=True)
# make_dummy is defined outside this cell.
# NOTE(review): test drops the original column above while train is encoded
# with drop=False — confirm both frames end up with matching columns.
df_train = make_dummy(df_train, col, drop=False)

In [None]:
number, col = disp_markdown(number)

In [None]:
print("="*30,"value_counts","="*30)
display(df_train[col].value_counts(dropna=False).head())
display(df_test[col].value_counts(dropna=False).head())
print("="*30,"na","="*30)
print("train:",df_train[col].isna().sum())
print("test :",df_test[col].isna().sum())
print("="*30,"dtypes","="*30)
print(df_train[col].dtypes)
print(df_test[col].dtypes)

In [None]:
print(df_train[col].nunique(), df_test[col].nunique())
set(df_test[col]) - set(df_train[col])

In [None]:
# One-hot encode `col` in the test frame using the sorted distinct values of
# the raw training data, so test gets one uint8 indicator per train level,
# placed right after the original column, which is then dropped from test.
insert_loc = df_test.columns.get_loc(col)
unique = np.sort(df_train_raw[col].unique())
for position, u in enumerate(unique, start=1):
  indicator = df_test[col].eq(u).astype("uint8")
  df_test.insert(insert_loc + position, f"{col}_{u}", indicator)
df_test.drop(columns=col, inplace=True)
# make_dummy is defined outside this cell.
# NOTE(review): test drops the original column above while train is encoded
# with drop=False — confirm both frames end up with matching columns.
df_train = make_dummy(df_train, col, drop=False)

In [None]:
number, col = disp_markdown(number)

In [None]:
assert col.endswith("temp")

In [None]:
print("="*30,"value_counts","="*30)
display(df_train[col].value_counts(dropna=False).head())
display(df_test[col].value_counts(dropna=False).head())
print("="*30,"na","="*30)
print("train:",df_train[col].isna().sum())
print("test :",df_test[col].isna().sum())
print("="*30,"dtypes","="*30)
print(df_train[col].dtypes)
print(df_test[col].dtypes)

In [None]:
number, col = disp_markdown(number)

In [None]:
print("="*30,"value_counts","="*30)
display(df_train[col].value_counts(dropna=False).head())
display(df_test[col].value_counts(dropna=False).head())
print("="*30,"na","="*30)
print("train:",df_train[col].isna().sum())
print("test :",df_test[col].isna().sum())
print("="*30,"dtypes","="*30)
print(df_train[col].dtypes)
print(df_test[col].dtypes)

In [None]:
number, col = disp_markdown(number)

In [None]:
print("="*30,"value_counts","="*30)
display(df_train[col].value_counts(dropna=False).head())
display(df_test[col].value_counts(dropna=False).head())
print("="*30,"na","="*30)
print("train:",df_train[col].isna().sum())
print("test :",df_test[col].isna().sum())
print("="*30,"dtypes","="*30)
print(df_train[col].dtypes)
print(df_test[col].dtypes)

In [None]:
HistBox(df_train[col])
HistBox(df_test[col])

In [None]:
# Overlay every column whose name starts with "<area>_temp" as a time series,
# one figure for train and one for test, so the temperature variants can be
# compared visually.
area_temp_train = df_train.filter(regex=f"^{area}_temp")
area_temp_test = df_test.filter(regex=f"^{area}_temp")
# The inner loop variable is deliberately NOT named `col`: notebook cells share
# one namespace, so reusing `col` here would overwrite the column selected by
# the surrounding cells and make re-running them inspect the wrong column.
for split_name, area_temp in (("train", area_temp_train), ("test", area_temp_test)):
  fig, ax = plt.subplots(figsize=(15, 5))
  # Convert the index once per frame instead of once per plotted column.
  timestamps = area_temp.index.to_timestamp()
  for temp_name in area_temp.columns:
    ax.plot(timestamps, area_temp[temp_name], alpha=0.3, label=temp_name)
  ax.set_title(f"{area} temperature columns ({split_name})")
  fig.legend()
  plt.show()

In [None]:
number, col = disp_markdown(number)

In [None]:
print("="*30,"value_counts","="*30)
display(df_train[col].value_counts(dropna=False).head())
display(df_test[col].value_counts(dropna=False).head())
print("="*30,"na","="*30)
print("train:",df_train[col].isna().sum())
print("test :",df_test[col].isna().sum())
print("="*30,"dtypes","="*30)
print(df_train[col].dtypes)
print(df_test[col].dtypes)

In [None]:
HistBox(df_train[col])
HistBox(df_test[col])

In [None]:
number, col = disp_markdown(number)

In [None]:
print("="*30,"value_counts","="*30)
display(df_train[col].value_counts(dropna=False).head())
display(df_test[col].value_counts(dropna=False).head())
print("="*30,"na","="*30)
print("train:",df_train[col].isna().sum())
print("test :",df_test[col].isna().sum())
print("="*30,"dtypes","="*30)
print(df_train[col].dtypes)
print(df_test[col].dtypes)

In [None]:
HistBox(df_train[col])
HistBox(df_test[col])

In [None]:
# Add "<col>_absolute" immediately to the right of `col` in both frames. The
# value comes from compute_absolute_humidity (defined outside this cell), fed
# with the area's temperature column and `col`.
# NOTE(review): presumably `col` is a relative-humidity column here — confirm.
temp_col = f"{area}_temp"
for split_frame in (df_train, df_test):
  target_position = split_frame.columns.get_loc(col) + 1
  split_frame.insert(
    target_position,
    col + "_absolute",
    compute_absolute_humidity(split_frame, temp_col, col),
  )

In [None]:
number, col = disp_markdown(number)

In [None]:
print("="*30,"value_counts","="*30)
display(df_train[col].value_counts(dropna=False).head())
display(df_test[col].value_counts(dropna=False).head())
print("="*30,"na","="*30)
print("train:",df_train[col].isna().sum())
print("test :",df_test[col].isna().sum())
print("="*30,"dtypes","="*30)
print(df_train[col].dtypes)
print(df_test[col].dtypes)

In [None]:
HistBox(df_train[col])
HistBox(df_test[col])

In [None]:
plot_series(df_train[col], df_test[col])
plt.show()

In [None]:
number, col = disp_markdown(number)

In [None]:
print("="*30,"value_counts","="*30)
display(df_train[col].value_counts(dropna=False).head())
display(df_test[col].value_counts(dropna=False).head())
print("="*30,"na","="*30)
print("train:",df_train[col].isna().sum())
print("test :",df_test[col].isna().sum())
print("="*30,"dtypes","="*30)
print(df_train[col].dtypes)
print(df_test[col].dtypes)

In [None]:
HistBox(df_train[col])
HistBox(df_test[col])

In [None]:
# Convert the angle column to its sine. The values are treated as degrees:
# divided by 360 and multiplied by 2*pi to get radians before taking sin.
# NOTE(review): sine alone is ambiguous (theta and 180 - theta give the same
# value); consider whether a cosine component is also needed.
sin_train, sin_test = (
  np.sin(split_frame[col].div(360).mul(2).mul(np.pi))
  for split_frame in (df_train, df_test)
)

In [None]:
HistBox(sin_train)
HistBox(sin_test)

In [None]:
plot_series(sin_train, sin_test)
plt.show()

In [None]:
# Overwrite the angle column with its sine and rename it to "sin_<col>" in
# both frames (in place, so the column keeps its position).
# The `col in columns` guard makes the cell safe to re-run: after the first run
# `col` no longer exists, and an unguarded assignment would append a brand-new
# `col` column that the rename then turns into a duplicate "sin_<col>" label.
sin_col = "sin_" + col
for split_frame, sin_values in ((df_train, sin_train), (df_test, sin_test)):
  if col in split_frame.columns:
    split_frame[col] = sin_values
    split_frame.rename(columns={col: sin_col}, inplace=True)

In [None]:
number, col = disp_markdown(number)

In [None]:
print("="*30,"value_counts","="*30)
display(df_train[col].value_counts(dropna=False).head())
display(df_test[col].value_counts(dropna=False).head())
print("="*30,"na","="*30)
print("train:",df_train[col].isna().sum())
print("test :",df_test[col].isna().sum())
print("="*30,"dtypes","="*30)
print(df_train[col].dtypes)
print(df_test[col].dtypes)

In [None]:
HistBox(df_train[col])
HistBox(df_test[col])

In [None]:
plot_series(df_train[col], df_test[col])
plt.show()

In [None]:
number, col = disp_markdown(number)

In [None]:
print("="*30,"value_counts","="*30)
display(df_train[col].value_counts(dropna=False).head())
display(df_test[col].value_counts(dropna=False).head())
print("="*30,"na","="*30)
print("train:",df_train[col].isna().sum())
print("test :",df_test[col].isna().sum())
print("="*30,"dtypes","="*30)
print(df_train[col].dtypes)
print(df_test[col].dtypes)

In [None]:
HistBox(df_train[col])
HistBox(df_test[col])

In [None]:
plot_series(df_train[col], df_test[col])
plt.show()

In [None]:
number, col = disp_markdown(number)

In [None]:
print("="*30,"value_counts","="*30)
display(df_train[col].value_counts(dropna=False).head())
display(df_test[col].value_counts(dropna=False).head())
print("="*30,"na","="*30)
print("train:",df_train[col].isna().sum())
print("test :",df_test[col].isna().sum())
print("="*30,"dtypes","="*30)
print(df_train[col].dtypes)
print(df_test[col].dtypes)

In [None]:
HistBox(df_train[col])
HistBox(df_test[col])

In [None]:
plot_series(df_train[col], df_test[col])
plt.show()

In [None]:
number, col = disp_markdown(number)

In [None]:
print("="*30,"value_counts","="*30)
display(df_train[col].value_counts(dropna=False).head())
display(df_test[col].value_counts(dropna=False).head())
print("="*30,"na","="*30)
print("train:",df_train[col].isna().sum())
print("test :",df_test[col].isna().sum())
print("="*30,"dtypes","="*30)
print(df_train[col].dtypes)
print(df_test[col].dtypes)

In [None]:
HistBox(df_train[col])
HistBox(df_test[col])

In [None]:
plot_series(df_train[col], df_test[col])
plt.show()

In [None]:
number, col = disp_markdown(number)

In [None]:
print("="*30,"value_counts","="*30)
display(df_train[col].value_counts(dropna=False).head())
display(df_test[col].value_counts(dropna=False).head())
print("="*30,"na","="*30)
print("train:",df_train[col].isna().sum())
print("test :",df_test[col].isna().sum())
print("="*30,"dtypes","="*30)
print(df_train[col].dtypes)
print(df_test[col].dtypes)

In [None]:
HistBox(df_train[col])
HistBox(df_test[col])

In [None]:
set(df_test[col]) - set(df_train[col])

In [None]:
# One-hot encode `col` in the test frame using the sorted distinct values of
# the raw training data, so test gets one uint8 indicator per train level,
# placed right after the original column, which is then dropped from test.
insert_loc = df_test.columns.get_loc(col)
unique = np.sort(df_train_raw[col].unique())
for position, u in enumerate(unique, start=1):
  indicator = df_test[col].eq(u).astype("uint8")
  df_test.insert(insert_loc + position, f"{col}_{u}", indicator)
df_test.drop(columns=col, inplace=True)
# make_dummy is defined outside this cell.
# NOTE(review): test drops the original column above while train is encoded
# with drop=False — confirm both frames end up with matching columns.
df_train = make_dummy(df_train, col, drop=False)

In [None]:
number, col = disp_markdown(number)

In [None]:
print("="*30,"value_counts","="*30)
display(df_train[col].value_counts(dropna=False).head())
display(df_test[col].value_counts(dropna=False).head())
print("="*30,"na","="*30)
print("train:",df_train[col].isna().sum())
print("test :",df_test[col].isna().sum())
print("="*30,"dtypes","="*30)
print(df_train[col].dtypes)
print(df_test[col].dtypes)

In [None]:
print(df_train[col].nunique(), df_test[col].nunique())
set(df_test[col]) - set(df_train[col])

In [None]:
# One-hot encode `col` in the test frame using the sorted distinct values of
# the raw training data, so test gets one uint8 indicator per train level,
# placed right after the original column, which is then dropped from test.
insert_loc = df_test.columns.get_loc(col)
unique = np.sort(df_train_raw[col].unique())
for position, u in enumerate(unique, start=1):
  indicator = df_test[col].eq(u).astype("uint8")
  df_test.insert(insert_loc + position, f"{col}_{u}", indicator)
df_test.drop(columns=col, inplace=True)
# make_dummy is defined outside this cell.
# NOTE(review): test drops the original column above while train is encoded
# with drop=False — confirm both frames end up with matching columns.
df_train = make_dummy(df_train, col, drop=False)

In [None]:
number, col = disp_markdown(number)

In [None]:
print("="*30,"value_counts","="*30)
display(df_train[col].value_counts(dropna=False).head())
display(df_test[col].value_counts(dropna=False).head())
print("="*30,"na","="*30)
print("train:",df_train[col].isna().sum())
print("test :",df_test[col].isna().sum())
print("="*30,"dtypes","="*30)
print(df_train[col].dtypes)
print(df_test[col].dtypes)

In [None]:
print(df_train[col].nunique(), df_test[col].nunique())
set(df_test[col]) - set(df_train[col])

In [None]:
# One-hot encode `col` in the test frame using the sorted distinct values of
# the raw training data, so test gets one uint8 indicator per train level,
# placed right after the original column, which is then dropped from test.
insert_loc = df_test.columns.get_loc(col)
unique = np.sort(df_train_raw[col].unique())
for position, u in enumerate(unique, start=1):
  indicator = df_test[col].eq(u).astype("uint8")
  df_test.insert(insert_loc + position, f"{col}_{u}", indicator)
df_test.drop(columns=col, inplace=True)
# make_dummy is defined outside this cell.
# NOTE(review): test drops the original column above while train is encoded
# with drop=False — confirm both frames end up with matching columns.
df_train = make_dummy(df_train, col, drop=False)

In [None]:
number, col = disp_markdown(number)

In [None]:
print("="*30,"value_counts","="*30)
display(df_train[col].value_counts(dropna=False).head())
display(df_test[col].value_counts(dropna=False).head())
print("="*30,"na","="*30)
print("train:",df_train[col].isna().sum())
print("test :",df_test[col].isna().sum())
print("="*30,"dtypes","="*30)
print(df_train[col].dtypes)
print(df_test[col].dtypes)

In [None]:
print(df_train[col].nunique(), df_test[col].nunique())
set(df_test[col]) - set(df_train[col])

In [None]:
# One-hot encode `col` in the test frame using the sorted distinct values of
# the raw training data, so test gets one uint8 indicator per train level,
# placed right after the original column, which is then dropped from test.
insert_loc = df_test.columns.get_loc(col)
unique = np.sort(df_train_raw[col].unique())
for position, u in enumerate(unique, start=1):
  indicator = df_test[col].eq(u).astype("uint8")
  df_test.insert(insert_loc + position, f"{col}_{u}", indicator)
df_test.drop(columns=col, inplace=True)
# make_dummy is defined outside this cell.
# NOTE(review): test drops the original column above while train is encoded
# with drop=False — confirm both frames end up with matching columns.
df_train = make_dummy(df_train, col, drop=False)

In [None]:
number, col = disp_markdown(number)

In [None]:
assert col.endswith("temp")

In [None]:
print("="*30,"value_counts","="*30)
display(df_train[col].value_counts(dropna=False).head())
display(df_test[col].value_counts(dropna=False).head())
print("="*30,"na","="*30)
print("train:",df_train[col].isna().sum())
print("test :",df_test[col].isna().sum())
print("="*30,"dtypes","="*30)
print(df_train[col].dtypes)
print(df_test[col].dtypes)

In [None]:
number, col = disp_markdown(number)

In [None]:
print("="*30,"value_counts","="*30)
display(df_train[col].value_counts(dropna=False).head())
display(df_test[col].value_counts(dropna=False).head())
print("="*30,"na","="*30)
print("train:",df_train[col].isna().sum())
print("test :",df_test[col].isna().sum())
print("="*30,"dtypes","="*30)
print(df_train[col].dtypes)
print(df_test[col].dtypes)

In [None]:
number, col = disp_markdown(number)

In [None]:
print("="*30,"value_counts","="*30)
display(df_train[col].value_counts(dropna=False).head())
display(df_test[col].value_counts(dropna=False).head())
print("="*30,"na","="*30)
print("train:",df_train[col].isna().sum())
print("test :",df_test[col].isna().sum())
print("="*30,"dtypes","="*30)
print(df_train[col].dtypes)
print(df_test[col].dtypes)

In [None]:
HistBox(df_train[col])
HistBox(df_test[col])

In [None]:
# Overlay every column whose name starts with "<area>_temp" as a time series,
# one figure for train and one for test, so the temperature variants can be
# compared visually.
area_temp_train = df_train.filter(regex=f"^{area}_temp")
area_temp_test = df_test.filter(regex=f"^{area}_temp")
# The inner loop variable is deliberately NOT named `col`: notebook cells share
# one namespace, so reusing `col` here would overwrite the column selected by
# the surrounding cells and make re-running them inspect the wrong column.
for split_name, area_temp in (("train", area_temp_train), ("test", area_temp_test)):
  fig, ax = plt.subplots(figsize=(15, 5))
  # Convert the index once per frame instead of once per plotted column.
  timestamps = area_temp.index.to_timestamp()
  for temp_name in area_temp.columns:
    ax.plot(timestamps, area_temp[temp_name], alpha=0.3, label=temp_name)
  ax.set_title(f"{area} temperature columns ({split_name})")
  fig.legend()
  plt.show()

In [None]:
number, col = disp_markdown(number)

In [None]:
print("="*30,"value_counts","="*30)
display(df_train[col].value_counts(dropna=False).head())
display(df_test[col].value_counts(dropna=False).head())
print("="*30,"na","="*30)
print("train:",df_train[col].isna().sum())
print("test :",df_test[col].isna().sum())
print("="*30,"dtypes","="*30)
print(df_train[col].dtypes)
print(df_test[col].dtypes)

In [None]:
HistBox(df_train[col])
HistBox(df_test[col])

In [None]:
number, col = disp_markdown(number)

In [None]:
print("="*30,"value_counts","="*30)
display(df_train[col].value_counts(dropna=False).head())
display(df_test[col].value_counts(dropna=False).head())
print("="*30,"na","="*30)
print("train:",df_train[col].isna().sum())
print("test :",df_test[col].isna().sum())
print("="*30,"dtypes","="*30)
print(df_train[col].dtypes)
print(df_test[col].dtypes)

In [None]:
HistBox(df_train[col])
HistBox(df_test[col])

In [None]:
# Add "<col>_absolute" immediately to the right of `col` in both frames. The
# value comes from compute_absolute_humidity (defined outside this cell), fed
# with the area's temperature column and `col`.
# NOTE(review): presumably `col` is a relative-humidity column here — confirm.
temp_col = f"{area}_temp"
for split_frame in (df_train, df_test):
  target_position = split_frame.columns.get_loc(col) + 1
  split_frame.insert(
    target_position,
    col + "_absolute",
    compute_absolute_humidity(split_frame, temp_col, col),
  )

In [None]:
number, col = disp_markdown(number)

In [None]:
print("="*30,"value_counts","="*30)
display(df_train[col].value_counts(dropna=False).head())
display(df_test[col].value_counts(dropna=False).head())
print("="*30,"na","="*30)
print("train:",df_train[col].isna().sum())
print("test :",df_test[col].isna().sum())
print("="*30,"dtypes","="*30)
print(df_train[col].dtypes)
print(df_test[col].dtypes)

In [None]:
HistBox(df_train[col])
HistBox(df_test[col])

In [None]:
plot_series(df_train[col], df_test[col])
plt.show()

In [None]:
number, col = disp_markdown(number)

In [None]:
print("="*30,"value_counts","="*30)
display(df_train[col].value_counts(dropna=False).head())
display(df_test[col].value_counts(dropna=False).head())
print("="*30,"na","="*30)
print("train:",df_train[col].isna().sum())
print("test :",df_test[col].isna().sum())
print("="*30,"dtypes","="*30)
print(df_train[col].dtypes)
print(df_test[col].dtypes)

In [None]:
HistBox(df_train[col])
HistBox(df_test[col])

In [None]:
# Convert the angle column to its sine. The values are treated as degrees:
# divided by 360 and multiplied by 2*pi to get radians before taking sin.
# NOTE(review): sine alone is ambiguous (theta and 180 - theta give the same
# value); consider whether a cosine component is also needed.
sin_train, sin_test = (
  np.sin(split_frame[col].div(360).mul(2).mul(np.pi))
  for split_frame in (df_train, df_test)
)

In [None]:
HistBox(sin_train)
HistBox(sin_test)

In [None]:
plot_series(sin_train, sin_test)
plt.show()

In [None]:
# Overwrite the angle column with its sine and rename it to "sin_<col>" in
# both frames (in place, so the column keeps its position).
# The `col in columns` guard makes the cell safe to re-run: after the first run
# `col` no longer exists, and an unguarded assignment would append a brand-new
# `col` column that the rename then turns into a duplicate "sin_<col>" label.
sin_col = "sin_" + col
for split_frame, sin_values in ((df_train, sin_train), (df_test, sin_test)):
  if col in split_frame.columns:
    split_frame[col] = sin_values
    split_frame.rename(columns={col: sin_col}, inplace=True)

In [None]:
number, col = disp_markdown(number)

In [None]:
print("="*30,"value_counts","="*30)
display(df_train[col].value_counts(dropna=False).head())
display(df_test[col].value_counts(dropna=False).head())
print("="*30,"na","="*30)
print("train:",df_train[col].isna().sum())
print("test :",df_test[col].isna().sum())
print("="*30,"dtypes","="*30)
print(df_train[col].dtypes)
print(df_test[col].dtypes)

In [None]:
HistBox(df_train[col])
HistBox(df_test[col])

In [None]:
# Plot train and test for the current column on one axis; the legend labels make
# the two series distinguishable when the figure is viewed on its own.
plot_series(df_train[col], df_test[col], labels=["train", "test"])
plt.show()

In [None]:
# Advance to the next column: the helper (defined elsewhere in the notebook) returns the
# updated counter and the column label that the following cells read as `col`.
number, col = disp_markdown(number)

In [None]:
# Train/test summary of `col`: most frequent values (NaN included),
# missing-value totals, and dtypes.
print("=" * 30, "value_counts", "=" * 30)
display(*(frame[col].value_counts(dropna=False).head() for frame in (df_train, df_test)))
print("=" * 30, "na", "=" * 30)
print("train:", df_train[col].isna().sum())
print("test :", df_test[col].isna().sum())
print("=" * 30, "dtypes", "=" * 30)
print(df_train[col].dtype, df_test[col].dtype, sep="\n")

In [None]:
# Distribution check for the current column, train first and then test.
# HistBox is imported from the local MyFunc module (presumably histogram + box plot — confirm).
HistBox(df_train[col])
HistBox(df_test[col])

In [None]:
# Plot train and test for the current column on one axis; the legend labels make
# the two series distinguishable when the figure is viewed on its own.
plot_series(df_train[col], df_test[col], labels=["train", "test"])
plt.show()

In [None]:
# Advance to the next column: the helper (defined elsewhere in the notebook) returns the
# updated counter and the column label that the following cells read as `col`.
number, col = disp_markdown(number)

In [None]:
# Train/test summary of `col`: most frequent values (NaN included),
# missing-value totals, and dtypes.
print("=" * 30, "value_counts", "=" * 30)
display(*(frame[col].value_counts(dropna=False).head() for frame in (df_train, df_test)))
print("=" * 30, "na", "=" * 30)
print("train:", df_train[col].isna().sum())
print("test :", df_test[col].isna().sum())
print("=" * 30, "dtypes", "=" * 30)
print(df_train[col].dtype, df_test[col].dtype, sep="\n")

In [None]:
# Distribution check for the current column, train first and then test.
# HistBox is imported from the local MyFunc module (presumably histogram + box plot — confirm).
HistBox(df_train[col])
HistBox(df_test[col])

In [None]:
# Plot train and test for the current column on one axis; the legend labels make
# the two series distinguishable when the figure is viewed on its own.
plot_series(df_train[col], df_test[col], labels=["train", "test"])
plt.show()

In [None]:
# Advance to the next column: the helper (defined elsewhere in the notebook) returns the
# updated counter and the column label that the following cells read as `col`.
number, col = disp_markdown(number)

In [None]:
# Train/test summary of `col`: most frequent values (NaN included),
# missing-value totals, and dtypes.
print("=" * 30, "value_counts", "=" * 30)
display(*(frame[col].value_counts(dropna=False).head() for frame in (df_train, df_test)))
print("=" * 30, "na", "=" * 30)
print("train:", df_train[col].isna().sum())
print("test :", df_test[col].isna().sum())
print("=" * 30, "dtypes", "=" * 30)
print(df_train[col].dtype, df_test[col].dtype, sep="\n")

In [None]:
# Distribution check for the current column, train first and then test.
# HistBox is imported from the local MyFunc module (presumably histogram + box plot — confirm).
HistBox(df_train[col])
HistBox(df_test[col])

In [None]:
# Plot train and test for the current column on one axis; the legend labels make
# the two series distinguishable when the figure is viewed on its own.
plot_series(df_train[col], df_test[col], labels=["train", "test"])
plt.show()

In [None]:
# Advance to the next column: the helper (defined elsewhere in the notebook) returns the
# updated counter and the column label that the following cells read as `col`.
number, col = disp_markdown(number)

In [None]:
# Train/test summary of `col`: most frequent values (NaN included),
# missing-value totals, and dtypes.
print("=" * 30, "value_counts", "=" * 30)
display(*(frame[col].value_counts(dropna=False).head() for frame in (df_train, df_test)))
print("=" * 30, "na", "=" * 30)
print("train:", df_train[col].isna().sum())
print("test :", df_test[col].isna().sum())
print("=" * 30, "dtypes", "=" * 30)
print(df_train[col].dtype, df_test[col].dtype, sep="\n")

In [None]:
# Distribution check for the current column, train first and then test.
# HistBox is imported from the local MyFunc module (presumably histogram + box plot — confirm).
HistBox(df_train[col])
HistBox(df_test[col])

In [None]:
# Values that occur in test but never in train for this column
# (an empty set means train covers every test value).
set(df_test[col]).difference(df_train[col])

In [None]:
# One-hot encode `col` in the test frame using the sorted distinct values of
# df_train_raw[col], so test only gets indicator columns for values seen there.
unique = np.sort(df_train_raw[col].unique())
insert_loc = df_test.columns.get_loc(col)
for i, u in enumerate(unique):
    # Indicators are placed directly after `col`, in sorted category order.
    df_test.insert(insert_loc + i + 1, f"{col}_{u}", df_test[col].eq(u).astype("uint8"))
df_test.drop(columns=col, inplace=True)
# Train is encoded by the notebook's make_dummy helper (defined elsewhere).
df_train = make_dummy(df_train, col, drop=False)

In [None]:
# Advance to the next column: the helper (defined elsewhere in the notebook) returns the
# updated counter and the column label that the following cells read as `col`.
number, col = disp_markdown(number)

In [None]:
# Train/test summary of `col`: most frequent values (NaN included),
# missing-value totals, and dtypes.
print("=" * 30, "value_counts", "=" * 30)
display(*(frame[col].value_counts(dropna=False).head() for frame in (df_train, df_test)))
print("=" * 30, "na", "=" * 30)
print("train:", df_train[col].isna().sum())
print("test :", df_test[col].isna().sum())
print("=" * 30, "dtypes", "=" * 30)
print(df_train[col].dtype, df_test[col].dtype, sep="\n")

In [None]:
# Number of distinct values in train vs test, followed by the values that
# appear only in test.
print(*(frame[col].nunique() for frame in (df_train, df_test)))
set(df_test[col]).difference(df_train[col])

In [None]:
# One-hot encode `col` in the test frame using the sorted distinct values of
# df_train_raw[col], so test only gets indicator columns for values seen there.
unique = np.sort(df_train_raw[col].unique())
insert_loc = df_test.columns.get_loc(col)
for i, u in enumerate(unique):
    # Indicators are placed directly after `col`, in sorted category order.
    df_test.insert(insert_loc + i + 1, f"{col}_{u}", df_test[col].eq(u).astype("uint8"))
df_test.drop(columns=col, inplace=True)
# Train is encoded by the notebook's make_dummy helper (defined elsewhere).
df_train = make_dummy(df_train, col, drop=False)

In [None]:
# Advance to the next column: the helper (defined elsewhere in the notebook) returns the
# updated counter and the column label that the following cells read as `col`.
number, col = disp_markdown(number)

In [None]:
# Train/test summary of `col`: most frequent values (NaN included),
# missing-value totals, and dtypes.
print("=" * 30, "value_counts", "=" * 30)
display(*(frame[col].value_counts(dropna=False).head() for frame in (df_train, df_test)))
print("=" * 30, "na", "=" * 30)
print("train:", df_train[col].isna().sum())
print("test :", df_test[col].isna().sum())
print("=" * 30, "dtypes", "=" * 30)
print(df_train[col].dtype, df_test[col].dtype, sep="\n")

In [None]:
# Number of distinct values in train vs test, followed by the values that
# appear only in test.
print(*(frame[col].nunique() for frame in (df_train, df_test)))
set(df_test[col]).difference(df_train[col])

In [None]:
# One-hot encode `col` in the test frame using the sorted distinct values of
# df_train_raw[col], so test only gets indicator columns for values seen there.
unique = np.sort(df_train_raw[col].unique())
insert_loc = df_test.columns.get_loc(col)
for i, u in enumerate(unique):
    # Indicators are placed directly after `col`, in sorted category order.
    df_test.insert(insert_loc + i + 1, f"{col}_{u}", df_test[col].eq(u).astype("uint8"))
df_test.drop(columns=col, inplace=True)
# Train is encoded by the notebook's make_dummy helper (defined elsewhere).
df_train = make_dummy(df_train, col, drop=False)

In [None]:
# Advance to the next column: the helper (defined elsewhere in the notebook) returns the
# updated counter and the column label that the following cells read as `col`.
number, col = disp_markdown(number)

In [None]:
# Train/test summary of `col`: most frequent values (NaN included),
# missing-value totals, and dtypes.
print("=" * 30, "value_counts", "=" * 30)
display(*(frame[col].value_counts(dropna=False).head() for frame in (df_train, df_test)))
print("=" * 30, "na", "=" * 30)
print("train:", df_train[col].isna().sum())
print("test :", df_test[col].isna().sum())
print("=" * 30, "dtypes", "=" * 30)
print(df_train[col].dtype, df_test[col].dtype, sep="\n")

In [None]:
# Number of distinct values in train vs test, followed by the values that
# appear only in test.
print(*(frame[col].nunique() for frame in (df_train, df_test)))
set(df_test[col]).difference(df_train[col])

In [None]:
# One-hot encode `col` in the test frame using the sorted distinct values of
# df_train_raw[col], so test only gets indicator columns for values seen there.
unique = np.sort(df_train_raw[col].unique())
insert_loc = df_test.columns.get_loc(col)
for i, u in enumerate(unique):
    # Indicators are placed directly after `col`, in sorted category order.
    df_test.insert(insert_loc + i + 1, f"{col}_{u}", df_test[col].eq(u).astype("uint8"))
df_test.drop(columns=col, inplace=True)
# Train is encoded by the notebook's make_dummy helper (defined elsewhere).
df_train = make_dummy(df_train, col, drop=False)

# 出力

In [None]:
# Final export: drop the columns collected in `finally_remove_cols` from both frames,
# pass each frame through opt_dtype_df (MyFunc helper; presumably downcasts dtypes — confirm),
# then write the intermediate parquet files.
if len(finally_remove_cols):
    df_train.drop(columns=finally_remove_cols, inplace=True)
    df_test.drop(columns=finally_remove_cols, inplace=True)
df_train, df_test = opt_dtype_df(df_train), opt_dtype_df(df_test)
df_train.to_parquet(file_path_inter_train)
df_test.to_parquet(file_path_inter_test)

In [None]:
# Show the path of the intermediate train parquet file.
file_path_inter_train

In [None]:
# Preview the first rows of the final train frame.
df_train.head()

In [None]:
# Preview the first rows of the final test frame.
df_test.head()

In [None]:
# Print column dtypes, non-null counts and memory usage of the final train frame.
df_train.info()

In [None]:
# Print column dtypes, non-null counts and memory usage of the final test frame.
df_test.info()

In [None]:
# List only the train columns that still contain missing values at this point.
nas = df_train.isna().sum()
display(nas.loc[nas.gt(0)])

In [None]:
# List only the test columns that still contain missing values at this point.
nas = df_test.isna().sum()
display(nas.loc[nas.gt(0)])