In [0]:
###################################################
# Lab work: baseline preprocessing script
###################################################

In [16]:
# colab用のスクリプト
from google.colab import drive 
drive.mount('/content/drive')
%cd drive/My\ Drive/labowork

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
[Errno 2] No such file or directory: 'drive/My Drive/labowork'
/content/drive/My Drive/labowork


In [0]:
# import
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import pickle

In [0]:
###########################
# Start of train data processing
###########################

In [0]:
# Load the train data; the first CSV column is used as the row index.
df = pd.read_csv("./dat/train.csv", index_col=0)
# Separate the target ("Score") from the feature columns.
y_train = df["Score"].copy()
X_train = df.drop(columns=["Score"])

In [0]:
# Drop constant columns, i.e. columns holding exactly one distinct value
# (a missing value counts as a distinct value here).
distinct_counts = X_train.nunique(dropna=False)
del_columns = distinct_counts.index[distinct_counts == 1]
X_train = X_train.drop(columns=del_columns)

In [0]:
# Split the features by dtype: int64 columns are the categorical candidates,
# every remaining column is treated as numerical.
is_int64 = (X_train.dtypes == np.int64).values
categorical_columns = X_train.columns[is_int64]
numerical_columns = X_train.columns[~X_train.columns.isin(categorical_columns)]

In [0]:
# Pick out the "ordinal-like" categories: integer columns whose distinct values
# do not fill the whole [min, max] range. They are removed from the list of
# columns to one-hot encode.
int_features = X_train[categorical_columns]
value_span = int_features.max() - int_features.min() + 1
distinct_counts = int_features.nunique(dropna=False)
order_columns = categorical_columns[(value_span != distinct_counts).values]
categorical_columns = categorical_columns[~categorical_columns.isin(order_columns)]

In [0]:
# One-hot encode the categorical columns of the train data.
# dtype is pinned to uint8 (the historical pandas default): newer pandas releases
# emit bool dummies, and a frame mixing bool and float columns yields an
# object-dtype `.values` array, which np.save would have to pickle.
X_train_preprocessed = pd.get_dummies(X_train, columns=categorical_columns, dtype=np.uint8)

In [0]:
# Min-max scale the ordinal-like and numerical columns; the scaler is fitted on
# the train data in this step.
scaler = MinMaxScaler()
scaling_columns = order_columns.append(numerical_columns)
scaled_values = scaler.fit_transform(X_train_preprocessed[scaling_columns].values)
X_train_preprocessed[scaling_columns] = scaled_values

In [0]:
# Persist the intermediate artefacts: feature matrix, target, column lists and
# the fitted scaler.
np.save("./temp/preprocessed_train.npy", X_train_preprocessed.values)
np.save("./temp/target.npy", y_train.values)
# Each column list is written as one comma-separated line.
column_files = [
  ("./temp/train_columns.csv", X_train_preprocessed.columns),
  ("./temp/categorical_columns.csv", categorical_columns),
  ("./temp/scaling_columns.csv", scaling_columns),
]
for csv_path, column_index in column_files:
  with open(csv_path, "w") as f:
    f.write(",".join(column_index.values)+"\n")
with open("./temp/scaler.cmp", "wb") as f:
  pickle.dump(scaler, f)

In [0]:
###########################
# Start of test data processing
###########################

In [0]:
# Load the test data and the column lists stored under ./temp.
def _read_column_line(path):
  """Return the first line of `path`, split on commas, as a pandas Index."""
  with open(path, "r") as f:
    return pd.Index(f.readline().replace("\n", "").split(","))

df = pd.read_csv("./dat/test.csv", index_col=0)
all_columns = _read_column_line("./temp/train_columns.csv")
categorical_columns = _read_column_line("./temp/categorical_columns.csv")
scaling_columns = _read_column_line("./temp/scaling_columns.csv")

In [0]:
# One-hot encode the categorical columns of the test data.
# dtype is pinned to uint8 (the historical pandas default): newer pandas releases
# emit bool dummies, and a frame mixing bool and float columns yields an
# object-dtype `.values` array, which np.save would have to pickle.
df = pd.get_dummies(df, columns=categorical_columns, dtype=np.uint8)

In [46]:
# Compare the shape of the encoded test frame with the preprocessed train frame.
(df.shape, X_train_preprocessed.shape)

((13732, 8133), (13731, 8077))

In [0]:
# Align the test columns with the train layout: columns not listed in
# all_columns are dropped, listed columns missing from the test frame are added
# as all-zero columns, and the result follows the order of all_columns.
# reindex does this in a single step with a numeric fill value; the previous
# approach concatenated an all-NaN object-dtype frame and relied on fillna to
# downcast it afterwards.
# The trailing fillna(0) keeps the earlier behaviour of zero-filling NaNs that
# were already present in the test data.
df = df.reindex(columns=all_columns, fill_value=0).fillna(0)

In [0]:
# Apply the stored scaler to the test data. Scaled test values can fall outside
# [0, 1], as the recorded df.max() output of this notebook shows.
# NOTE: pickle.load can execute arbitrary code; only load scaler files that were
# written by this notebook.
with open("./temp/scaler.cmp", "rb") as f:
  scaler = pickle.load(f)
scaled_values = scaler.transform(df[scaling_columns].values)
df[scaling_columns] = scaled_values

In [60]:
# Column-wise maxima of the processed test data (sanity check of the scaling).
df.max(axis=0)

col2          1.019608
col3          0.996503
col4          1.280000
col5          0.986556
col6          0.989324
col7          0.976248
col8          0.988542
col10         1.056506
col14         1.021766
col15         0.972552
col16         1.020973
col17         1.000000
col18         1.015660
col20         1.028781
col21         0.979086
col22         0.653727
col23         0.882234
col24         1.001330
col25         1.018055
col27         1.045776
col28         1.049317
col29         0.945999
col30         1.135388
col31         0.943598
col34         1.051282
col36         1.392449
col38         0.959073
col41         1.167690
col44         0.985696
col46         0.422182
                ...   
col3775_25    1.000000
col3775_26    1.000000
col3775_27    1.000000
col3778_0     1.000000
col3778_1     1.000000
col3779_0     1.000000
col3779_1     1.000000
col3783_0     1.000000
col3783_1     1.000000
col3783_2     1.000000
col3783_3     1.000000
col3783_4     1.000000
col3783_5  

In [0]:
# Save the processed test features together with the row index of the test data.
test_features = df.values
test_index = df.index.values
np.save("./temp/preprocessed_test.npy", test_features)
np.save("./temp/test_index.npy", test_index)