In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the preprocessed, one-hot encoded training data and split off the label
train = pd.read_csv("../data/3_5/train_preprocessed_onehot.csv")
train_y = train["target"]
train_x = train.drop(columns=["target"])

# The test file is read as-is and used directly as the feature matrix
test_x = pd.read_csv("../data/3_5/test_preprocessed_onehot.csv")

# Keep untouched copies so later cells can always start from the same data
train_x_saved, test_x_saved = train_x.copy(), test_x.copy()

In [3]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Return standardized copies of the training and test data
def load_standarized_data():
    """Standardize the saved train/test features and return them.

    Works on copies of the module-level ``train_x_saved`` and
    ``test_x_saved`` frames. The ``StandardScaler`` is fitted on the
    training copy only and then applied to both copies.

    Returns:
        tuple: ``(train, test)``, each scaler output wrapped in a fresh
        ``pd.DataFrame`` (the original column names are not reattached).
    """
    train_df = train_x_saved.copy()
    test_df = test_x_saved.copy()

    # Scaling statistics come from the training data only
    standardizer = StandardScaler().fit(train_df)

    train_scaled = pd.DataFrame(standardizer.transform(train_df))
    test_scaled = pd.DataFrame(standardizer.transform(test_df))
    return train_scaled, test_scaled


# Return min-max scaled copies of the training and test data
def load_minmax_scaled_data():
    """Min-max scale the saved train/test features and return them.

    Works on copies of the module-level ``train_x_saved`` and
    ``test_x_saved`` frames. The ``MinMaxScaler`` is fitted on the training
    and test copies stacked row-wise, then applied to each copy separately.

    Returns:
        tuple: ``(train, test)``, each scaler output wrapped in a fresh
        ``pd.DataFrame`` (the original column names are not reattached).
    """
    train_df = train_x_saved.copy()
    test_df = test_x_saved.copy()

    # Fit on train and test together so the learned min/max cover both sets
    stacked = pd.concat([train_df, test_df], axis=0)
    minmax = MinMaxScaler().fit(stacked)

    return (
        pd.DataFrame(minmax.transform(train_df)),
        pd.DataFrame(minmax.transform(test_df)),
    )

In [4]:
# Inspect the loaded training frame, including the "target" column
train

Unnamed: 0,age,sex,height,weight,product_0,product_1,product_2,product_3,product_4,product_5,...,medical_keyword_8,medical_keyword_9,medical_keyword_10,year,month,day,yearmonth,medical_info_c1_nan,medical_info_c2_nan,target
0,50,1,166.445608,65.016732,0,0,0,0,0,0,...,1,0,0,2015,2,3,24182,False,True,0
1,68,0,164.334615,56.544217,1,0,0,0,0,0,...,1,0,0,2015,5,9,24185,True,True,0
2,77,1,167.462917,54.242267,0,0,1,0,0,0,...,1,0,0,2016,2,13,24194,False,True,1
3,17,1,177.097725,71.147762,0,0,0,1,0,0,...,1,0,0,2015,7,6,24187,False,False,0
4,62,0,158.165788,65.240697,0,1,0,0,0,0,...,1,1,0,2016,9,17,24201,False,True,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,61,1,182.729800,73.393777,0,1,0,0,0,0,...,1,1,0,2015,10,21,24190,False,True,0
9996,33,0,167.701136,75.006529,0,0,0,0,0,0,...,1,1,0,2015,5,28,24185,False,True,0
9997,44,0,145.609998,47.739397,0,0,0,0,0,0,...,1,0,1,2016,2,29,24194,True,True,0
9998,34,0,165.796017,57.567695,0,0,0,0,0,0,...,1,1,0,2016,2,27,24194,False,True,0


## 次元削減手法

### PCA
- 多次元データに対する次元削減の最も代表的な手法
- 多次元データを分散の大きい方向から順に軸を取り直す手法
- 変数間の従属性が大きい（共分散が大きい）場合、より少数の成分で元のデータを表現できる
- 各特徴量が正規分布に従っている状態を仮定している
    - 正規分布から外れた分布から生成された特徴量では不適切
- SVD（特異値分解）とほぼ同じ意味

In [5]:
from sklearn.decomposition import PCA

In [6]:
# Start from freshly standardized copies of the saved train/test features
train_x, test_x = load_standarized_data()

In [7]:
# Reduce the features to 5 principal components.
# random_state is pinned to the same seed used by the TruncatedSVD and NMF
# cells, so the result is reproducible when scikit-learn selects a
# randomized SVD solver.
pca = PCA(n_components=5, random_state=71)
pca.fit(train_x)

# Apply the projection learned from the training data to both sets
train_x = pca.transform(train_x)
test_x = pca.transform(test_x)

In [8]:
# Show the training data after the PCA projection (5 components per row)
train_x

array([[ 2.02854383, -0.39096234, -1.09232565,  1.35436633,  1.69088335],
       [ 0.82944669, -1.36213627, -1.07836709,  1.62582406, -0.98974574],
       [-0.1281039 ,  0.18004558, -1.08277735,  0.66899225, -0.04301818],
       ...,
       [-1.80012356, -1.92623692,  1.92780828,  1.85396994, -0.41959122],
       [-0.77046836, -0.64623822,  1.79942586, -0.99850173, -0.20624952],
       [ 2.11921004,  1.00235445,  0.58853921, -1.25046747, -0.3865942 ]])

### TruncatedSVD
- 特異値分解手法
- 疎行列を扱えるという利点から、PCAよりもTruncatedSVDの方が多用される傾向がある

In [9]:
# Dimensionality reduction with TruncatedSVD
from sklearn.decomposition import TruncatedSVD

# Start again from freshly standardized data
train_x, test_x = load_standarized_data()

# Learn a 5-component decomposition from the training data (seeded)
svd = TruncatedSVD(n_components=5, random_state=71).fit(train_x)

# Project both sets with the fitted decomposition
train_x, test_x = svd.transform(train_x), svd.transform(test_x)

# Display the reduced training data
train_x

array([[ 1.89075808,  0.56417433, -1.13840979,  1.45068067, -1.28003859],
       [ 0.6794261 ,  1.62337297, -0.97901713,  1.76923266,  1.35777081],
       [-0.11459337, -0.12142354, -0.93207289,  0.7748445 , -0.71874784],
       ...,
       [-1.91060851,  1.74590685,  1.90688932,  1.62966471,  0.1845628 ],
       [-0.86128199,  0.49292258,  1.72364079, -1.13753171,  0.15273637],
       [ 2.1747407 , -0.89991037,  0.49876797, -1.16032138,  1.21353394]])

In [10]:
# Reload the features with min-max scaling (fitted on train and test together)
# as the input for the NMF cells that follow
train_x, test_x = load_minmax_scaled_data()


In [11]:
from sklearn.decomposition import NMF

# Fit a 5-component NMF on the training data; init="random" combined with a
# fixed random_state keeps the random initialisation repeatable.
model = NMF(n_components=5, init="random", random_state=71)
# Left as the last expression so the notebook echoes the fitted estimator
model.fit(train_x)

NMF(init='random', n_components=5, random_state=71)

In [12]:
# Map both sets onto the components learned by the fitted NMF model
train_x, test_x = model.transform(train_x), model.transform(test_x)

In [14]:
# Show the test data after the NMF transform
test_x

array([[0.82905647, 0.60530404, 0.        , 0.00124627, 0.59139177],
       [0.        , 0.67276669, 0.37941642, 0.3038591 , 0.60274847],
       [0.11793269, 0.        , 0.52273175, 0.00945966, 0.        ],
       ...,
       [0.72050236, 0.61173093, 0.01239992, 0.30010142, 0.47019949],
       [0.78788243, 0.61327883, 0.02746983, 0.00512393, 0.00308494],
       [0.02046893, 0.        , 0.50988083, 0.0107185 , 0.55966738]])

In [15]:
# Show the training data after the NMF transform
train_x

array([[0.75986403, 0.        , 0.02349616, 0.        , 0.        ],
       [0.71034016, 0.01535425, 0.        , 0.00825695, 0.02167209],
       [0.66389753, 0.64093764, 0.14205363, 0.        , 0.44550723],
       ...,
       [0.67802549, 0.01105262, 0.        , 0.01086126, 0.50310889],
       [0.        , 0.01051369, 0.4818196 , 0.30903171, 0.48223883],
       [0.01433202, 0.62219057, 0.48536788, 0.        , 0.00637701]])