In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the preprocessed, one-hot encoded training table and separate the label
# column ("target") from the feature columns.
train = pd.read_csv("../data/3_5/train_preprocessed_onehot.csv")
train_y = train["target"]
train_x = train.drop(columns=["target"])

# The test table is used as features as-is; no target column is split off here.
test_x = pd.read_csv("../data/3_5/test_preprocessed_onehot.csv")

# Keep untouched copies: later cells overwrite train_x / test_x, and the
# loader helpers rebuild scaled features from these saved frames.
train_x_saved, test_x_saved = train_x.copy(), test_x.copy()

In [3]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

def load_standarized_data():
    """Return standardized copies of the saved train and test features.

    A ``StandardScaler`` is fitted on the training features only and then
    applied to both sets, so the test set is scaled with the statistics
    learned from the training set.

    Returns:
        tuple: ``(train, test)`` as two new ``pd.DataFrame`` objects built
        from the scaled arrays (original column names are not carried over).
    """
    features_train = train_x_saved.copy()
    features_test = test_x_saved.copy()

    # Learn the scaling from the training rows and reuse it for the test rows.
    standardizer = StandardScaler().fit(features_train)
    scaled_train = pd.DataFrame(standardizer.transform(features_train))
    scaled_test = pd.DataFrame(standardizer.transform(features_test))
    return scaled_train, scaled_test


def load_minmax_scaled_data():
    """Return Min-Max scaled copies of the saved train and test features.

    The ``MinMaxScaler`` is fitted on the train and test rows stacked
    together, so each column's minimum and maximum come from both sets. The
    fitted scaler is then applied to each set separately.

    Returns:
        tuple: ``(train, test)`` as two new ``pd.DataFrame`` objects built
        from the scaled arrays (original column names are not carried over).
    """
    features_train = train_x_saved.copy()
    features_test = test_x_saved.copy()

    # Fit on the row-wise concatenation of both sets (not on train alone).
    stacked = pd.concat([features_train, features_test], axis=0)
    minmax = MinMaxScaler().fit(stacked)

    scaled_train = pd.DataFrame(minmax.transform(features_train))
    scaled_test = pd.DataFrame(minmax.transform(features_test))
    return scaled_train, scaled_test

In [4]:
# Display the loaded training frame (feature columns together with "target").
train

Unnamed: 0,age,sex,height,weight,product_0,product_1,product_2,product_3,product_4,product_5,...,medical_keyword_8,medical_keyword_9,medical_keyword_10,year,month,day,yearmonth,medical_info_c1_nan,medical_info_c2_nan,target
0,50,1,166.445608,65.016732,0,0,0,0,0,0,...,1,0,0,2015,2,3,24182,False,True,0
1,68,0,164.334615,56.544217,1,0,0,0,0,0,...,1,0,0,2015,5,9,24185,True,True,0
2,77,1,167.462917,54.242267,0,0,1,0,0,0,...,1,0,0,2016,2,13,24194,False,True,1
3,17,1,177.097725,71.147762,0,0,0,1,0,0,...,1,0,0,2015,7,6,24187,False,False,0
4,62,0,158.165788,65.240697,0,1,0,0,0,0,...,1,1,0,2016,9,17,24201,False,True,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,61,1,182.729800,73.393777,0,1,0,0,0,0,...,1,1,0,2015,10,21,24190,False,True,0
9996,33,0,167.701136,75.006529,0,0,0,0,0,0,...,1,1,0,2015,5,28,24185,False,True,0
9997,44,0,145.609998,47.739397,0,0,0,0,0,0,...,1,0,1,2016,2,29,24194,True,True,0
9998,34,0,165.796017,57.567695,0,0,0,0,0,0,...,1,1,0,2016,2,27,24194,False,True,0


## 次元削減手法

### PCA
- 多次元データに対する次元削減の最も代表的な手法
- 多次元データを分散の大きい方向から順に軸を取り直す手法
- 変数間の従属性が大きい（共分散が大きい）場合、より少数の成分で元のデータを表現できる
- 各特徴量が正規分布に従っている状態を仮定している
    - 正規分布から外れた分布から生成された特徴量では不適切
- SVD（特異値分解）とほぼ同じ意味

In [5]:
from sklearn.decomposition import PCA

In [6]:
# Rebuild standardized features from the saved frames as the input for PCA.
train_x, test_x = load_standarized_data()

In [7]:
# Fit a 5-component PCA on the training features only, then project both sets
# with that fitted model. random_state is pinned to the same seed as the other
# estimators in this notebook so that, whenever a randomized/arpack SVD solver
# is selected, the components are identical on every run.
pca = PCA(n_components=5, random_state=71)
pca.fit(train_x)

# NOTE: train_x / test_x are overwritten with the projected arrays, so reload
# the features with load_standarized_data() before re-running this cell.
train_x = pca.transform(train_x)
test_x = pca.transform(test_x)

In [8]:
# Show the training features after the PCA projection.
train_x

array([[ 2.02725331, -0.39459574, -1.06898039,  1.35778984,  1.72837079],
       [ 0.80265048, -1.36867841, -1.05923169,  1.71558985, -1.20539265],
       [-0.12013858,  0.1639616 , -1.09898592,  0.72494289,  0.58804167],
       ...,
       [-1.82280688, -1.95191103,  2.01903761,  1.76833332, -0.39522184],
       [-0.778437  , -0.60638592,  1.77097016, -1.08687611, -0.31822625],
       [ 2.14187249,  0.9813354 ,  0.59842047, -1.11242361, -0.38350612]])

### TruncatedSVD
- 特異値分解手法
- 疎行列を扱えるという利点からTruncatedSVDの方が多用される傾向がある

In [9]:
# TruncatedSVD
from sklearn.decomposition import TruncatedSVD

# Start again from freshly standardized features.
train_x, test_x = load_standarized_data()

# Fit a 5-component truncated SVD (fixed seed) on the training features only.
svd = TruncatedSVD(n_components=5, random_state=71).fit(train_x)

# Project both sets with the fitted decomposition; the right-hand side is
# evaluated on the un-projected inputs before either name is rebound.
train_x, test_x = svd.transform(train_x), svd.transform(test_x)

train_x

array([[ 1.89075808,  0.56417433, -1.13840979,  1.45068067, -1.28003859],
       [ 0.6794261 ,  1.62337297, -0.97901713,  1.76923266,  1.35777081],
       [-0.11459337, -0.12142354, -0.93207289,  0.7748445 , -0.71874784],
       ...,
       [-1.91060851,  1.74590685,  1.90688932,  1.62966471,  0.1845628 ],
       [-0.86128199,  0.49292258,  1.72364079, -1.13753171,  0.15273637],
       [ 2.1747407 , -0.89991037,  0.49876797, -1.16032138,  1.21353394]])

In [10]:
# Switch to Min-Max scaled features for the NMF section below: the scaler is
# fitted over train and test together, so neither set contains negative values.
train_x, test_x = load_minmax_scaled_data()


### NMF
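- 非負値行列因子分解（Non-negative Matrix Factorization）。非負の行列を、非負の要素からなる2つの行列の積で近似する手法
- 入力の特徴量がすべて非負である必要があるため、ここではMin-Maxスケーリング後のデータを用いる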

In [11]:
from sklearn.decomposition import NMF

# Fit a 5-component NMF with random initialization on the current training
# features; random_state fixes that initialization so the fit is repeatable.
model = NMF(n_components=5, init="random", random_state=71)
model.fit(train_x)

NMF(init='random', n_components=5, random_state=71)

In [12]:
# Replace both feature sets with their representation under the fitted NMF
# model. The inputs are overwritten, so reload them before re-running.
train_x = model.transform(train_x)
test_x = model.transform(test_x)

In [13]:
# Show the NMF-transformed test features.
test_x

array([[0.82905647, 0.60530404, 0.        , 0.00124627, 0.59139177],
       [0.        , 0.67276669, 0.37941642, 0.3038591 , 0.60274847],
       [0.11793269, 0.        , 0.52273175, 0.00945966, 0.        ],
       ...,
       [0.72050236, 0.61173093, 0.01239992, 0.30010142, 0.47019949],
       [0.78788243, 0.61327883, 0.02746983, 0.00512393, 0.00308494],
       [0.02046893, 0.        , 0.50988083, 0.0107185 , 0.55966738]])

In [14]:
# Show the NMF-transformed training features.
train_x

array([[0.75986403, 0.        , 0.02349616, 0.        , 0.        ],
       [0.71034016, 0.01535425, 0.        , 0.00825695, 0.02167209],
       [0.66389753, 0.64093764, 0.14205363, 0.        , 0.44550723],
       ...,
       [0.67802549, 0.01105262, 0.        , 0.01086126, 0.50310889],
       [0.        , 0.01051369, 0.4818196 , 0.30903171, 0.48223883],
       [0.01433202, 0.62219057, 0.48536788, 0.        , 0.00637701]])

### LatentDirichletAllocation

In [15]:
# Reload Min-Max scaled (non-negative) features as the input for
# LatentDirichletAllocation.
train_x, test_x = load_minmax_scaled_data()

In [16]:
from sklearn.decomposition import LatentDirichletAllocation

In [17]:
# Fit a 5-component LatentDirichletAllocation model (fixed seed) on the
# training features only.
model = LatentDirichletAllocation(n_components=5,random_state=71)
model.fit(train_x)

LatentDirichletAllocation(n_components=5, random_state=71)

In [18]:
# Replace both feature sets with the per-row component weights produced by the
# fitted model. The inputs are overwritten, so reload them before re-running.
train_x = model.transform(train_x)
test_x = model.transform(test_x)

In [19]:
# Show the LatentDirichletAllocation output for the training rows.
train_x

array([[0.01574744, 0.0157792 , 0.93683928, 0.01593487, 0.01569921],
       [0.33338023, 0.62220041, 0.01477796, 0.01484837, 0.01479303],
       [0.68220031, 0.01120325, 0.01118684, 0.284196  , 0.01121359],
       ...,
       [0.3701719 , 0.01403994, 0.01406191, 0.5875934 , 0.01413285],
       [0.01317215, 0.17923686, 0.46731043, 0.32705925, 0.01322131],
       [0.01455362, 0.01459521, 0.43380725, 0.01464105, 0.52240287]])

In [20]:
# Show the LatentDirichletAllocation output for the test rows.
test_x

array([[0.50195545, 0.01122961, 0.01126158, 0.01132943, 0.46422393],
       [0.01263978, 0.01264373, 0.94924557, 0.01278383, 0.0126871 ],
       [0.01406492, 0.94370447, 0.01407466, 0.01403559, 0.01412036],
       ...,
       [0.01225082, 0.01226618, 0.01225357, 0.95097413, 0.0122553 ],
       [0.01327285, 0.01323871, 0.01326013, 0.42427396, 0.53595436],
       [0.0129966 , 0.16036495, 0.01299578, 0.01302229, 0.80062039]])

### LinearDiscriminantAnalysis

In [21]:
# Reload standardized features as the input for LinearDiscriminantAnalysis.
train_x, test_x = load_standarized_data()

In [22]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

In [23]:
# Supervised reduction: the model is fitted with the labels in train_y and
# projects the features onto a single discriminant axis (n_components=1).
lda = LDA(n_components=1)
lda.fit(train_x,train_y)

LinearDiscriminantAnalysis(n_components=1)

In [24]:
# Project both sets onto the fitted discriminant axis. The inputs are
# overwritten, so reload them before re-running.
train_x = lda.transform(train_x)
test_x = lda.transform(test_x)

In [25]:
# Show the one-dimensional LDA projection of the training rows.
train_x

array([[-0.23068727],
       [-0.36472287],
       [ 1.28408347],
       ...,
       [-0.85944936],
       [-0.53029345],
       [-0.46604975]])

In [26]:
# Show the one-dimensional LDA projection of the test rows.
test_x

array([[ 2.15722826],
       [-0.72909448],
       [-0.56988393],
       ...,
       [ 0.54299485],
       [-1.12300281],
       [ 3.5641705 ]])

In [28]:
# t-SNE (via the bhtsne package), starting from freshly standardized features
train_x,test_x = load_standarized_data()
import bhtsne

In [29]:
# Embed train and test rows together: both frames are stacked (train first,
# then test) and passed to bhtsne as float64 with a fixed seed to obtain a
# 2-dimensional embedding.
# NOTE(review): presumably the rows of `embedded` keep the stacking order, so
# the first len(train_x) rows would be the training rows; confirm before
# splitting the result back into train/test.
data = pd.concat([train_x, test_x])
embedded = bhtsne.tsne(data.astype(np.float64), dimensions=2, rand_seed=71)

Using random seed: 71
Using no_dims = 2, perplexity = 30.000000, and theta = 0.500000
Computing input similarities...
Building tree...
 - point 0 of 20000
 - point 10000 of 20000
Input similarities computed in 31.13 seconds (sparsity = 0.006445)!
Learning embedding...
Iteration 50: error is 105.472853 (50 iterations in 13.65 seconds)
Iteration 100: error is 102.253786 (50 iterations in 15.06 seconds)
Iteration 150: error is 84.332794 (50 iterations in 14.88 seconds)
Iteration 200: error is 80.191661 (50 iterations in 15.10 seconds)
Iteration 250: error is 4.049825 (50 iterations in 15.26 seconds)
Iteration 300: error is 3.412777 (50 iterations in 15.38 seconds)
Iteration 350: error is 3.057512 (50 iterations in 16.08 seconds)
Iteration 400: error is 2.801079 (50 iterations in 16.26 seconds)
Iteration 450: error is 2.608678 (50 iterations in 16.31 seconds)
Iteration 500: error is 2.463655 (50 iterations in 16.18 seconds)
Iteration 550: error is 2.348479 (50 iterations in 16.12 seconds)


In [30]:
# Show the 2-D t-SNE embedding computed for the stacked train+test rows.
embedded

array([[ 11.75863725, -10.40675   ],
       [-38.20530917,  -0.6750682 ],
       [ 33.60175218,   5.97733285],
       ...,
       [  1.69003871, -26.50351329],
       [ 10.12737455, -32.31341371],
       [  2.15726501,  -0.78303724]])

In [31]:
# Clustering: start again from freshly standardized features
train_x,test_x = load_standarized_data()

In [32]:
from sklearn.cluster import MiniBatchKMeans

In [33]:
# Mini-batch k-means with 10 clusters and a fixed seed.
kmeans = MiniBatchKMeans(n_clusters=10, random_state=71)

In [34]:
# Fit the cluster centers on the training features only.
kmeans.fit(train_x)

MiniBatchKMeans(n_clusters=10, random_state=71)

In [35]:
# Assign every train and test row to a cluster using the fitted centers.
train_cluster = kmeans.predict(train_x)
test_cluster = kmeans.predict(test_x)

In [36]:
# Map every row to cluster-distance space: one distance per fitted cluster
# center, stored under new names so train_x / test_x stay unchanged.
train_distances = kmeans.transform(train_x)
test_distances = kmeans.transform(test_x)

In [39]:
# Show the per-row distances to the cluster centers for the training set.
train_distances

array([[ 7.22375489,  6.60258845,  8.34267445, ...,  8.31059596,
         7.71554795,  8.35024336],
       [ 7.28860904,  8.01608603,  8.48136581, ...,  8.5662797 ,
         8.01397808,  8.502871  ],
       [ 7.60952004,  8.19247266,  8.81986473, ...,  8.75887413,
         8.21306144,  8.69430836],
       ...,
       [ 8.90352655,  8.04684447,  9.54018745, ...,  9.53678396,
         9.013434  ,  9.57640555],
       [ 8.09338653,  8.23533606,  8.6634961 , ...,  8.60150903,
         8.14500752,  8.62718583],
       [ 9.81036997,  9.2594205 , 10.51980413, ..., 10.5641513 ,
        10.13197798, 10.53600106]])