In [113]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [68]:
pd.set_option('display.max_rows', 200)

0. Data processing  
    - histogram에 대한 처리: 대표값으로 대체
        - 평균, 분산
    - NA 값에 대한 처리: columnwise 적용. pos null 30%이상 -> pos median, 나머지 -> 전체의 median 
    - class imbalance: oversampling (SMOTE)
    - feature selection: drop over 40% na, 2장 내용 참고, logistic 회귀
    - feature reduction: PCA
    - columnwise normalizing

- dtype processing

In [119]:
data = pd.read_csv("./Train_data.csv", index_col=0)

# class to numeric; pos: 1, neg: 0
data['class'] = data['class'].replace('neg', 0)
data['class'] = data['class'].replace('pos', 1)

# na, numeric processing
data = data.replace('na', np.nan) # change 'na' ro np.nan
data = data.apply(pd.to_numeric) # change object to int or float

- na: process columnwise. 
    - Drop if null ratio over 0,4; gap btw 25~40%
    - If pos_null_ratio>=0.3, pos median. Else total median; If pos_null_ratio<0.3, many cols also have high neg_null_ratio
    

In [111]:
# check null columns per cats
pos_data = data[data["class"]==1] 
neg_data = data[data["class"]==0] 

pos_df, neg_df = pos_data.isnull().sum().to_frame(name="pos_null_cnt"), neg_data.isnull().sum().to_frame(name="neg_null_cnt")
null_check = pos_df.join(neg_df)
null_check["total_null"] = null_check["pos_null_cnt"] + null_check["neg_null_cnt"]
null_check.insert(1, "pos_cnt", len(pos_data))
null_check.insert(3, "neg_cnt", len(neg_data))

null_check["pos_null_ratio"] = null_check.pos_null_cnt/null_check.pos_cnt
null_check["neg_null_ratio"] = null_check.neg_null_cnt/null_check.neg_cnt
null_check["total_null_ratio"] = (null_check.pos_null_cnt + null_check.neg_null_cnt)/len(data)

# drop cols w/ too many na's
to_drop = null_check[null_check.total_null_ratio>=0.4]
data.drop(to_drop.index, inplace=True, axis=1)

# na replacement
POS_MEDIAN = pos_data.median(axis=0).to_dict()
TOTAL_MEDIAN = data.median(axis=0).to_dict()

with_pos = data.loc[:, null_check.pos_null_ratio>=0.4]
with_pos = with_pos.index
data.fillna(value=POS_MEDIAN, inplace=True)

with_total = data.loc[:, null_check.pos_null_ratio<0.4]
with_total = with_total.index
data.fillna(value=TOTAL_MEDIAN, inplace=True)

# data.isnull().sum().sum()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 57000 entries, 52803 to 37147
Columns: 162 entries, class to eg_000
dtypes: float64(160), int64(2)
memory usage: 70.9 MB


- oversampling; SMOTE

In [116]:
X = data["class"]
y = data.iloc[:, 1:]

scaler = StandardScaler()
scaler.fit_transform(X)
X_scaled = scaler.fit_transform(x) # standardizing if needed
X_scaled = pd.DataFrame(X_scaled, columns=X.columns, index=X.index)
X_scaled

Unnamed: 0,class,aa_000,ac_000,ad_000,ae_000,af_000,ag_000,ag_001,ag_002,ag_003,...,ee_002,ee_003,ee_004,ee_005,ee_006,ee_007,ee_008,ee_009,ef_000,eg_000
52803,-0.138051,-0.085425,-0.432984,-0.004189,-0.040913,-0.052542,-0.011425,-0.02377,-0.056584,-0.118220,...,-0.038607,-0.046620,-0.079493,-0.036307,-0.120807,-0.146024,-0.035743,-0.028997,-0.019516,-0.023049
38189,-0.138051,-0.135892,-0.432983,-0.004161,-0.040913,-0.052542,-0.011425,-0.02377,-0.056584,-0.118220,...,-0.274883,-0.270843,-0.249029,-0.107012,0.015533,-0.083028,-0.284254,-0.171780,-0.019516,-0.023049
23291,-0.138051,0.771980,-0.432984,-0.004185,-0.040913,-0.052542,-0.011425,-0.02377,-0.056584,-0.106674,...,2.528939,3.899144,2.521484,0.469546,0.066942,-0.089824,0.073493,-0.148442,-0.019516,-0.023049
16862,-0.138051,-0.228147,-0.432985,-0.004200,-0.040913,-0.052542,-0.011425,-0.02377,-0.056584,-0.118220,...,-0.338316,-0.333612,-0.312623,-0.292490,-0.272783,-0.192357,-0.280521,-0.178024,-0.019516,-0.023049
14055,-0.138051,-0.253059,-0.432984,-0.004190,-0.040913,-0.052542,-0.011425,-0.02377,-0.056584,-0.118220,...,-0.383088,-0.384354,-0.374823,-0.306748,-0.324421,-0.205869,-0.296509,-0.178067,-0.019516,-0.023049
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43966,-0.138051,-0.256549,-0.432985,-0.004199,-0.040913,-0.052542,-0.011425,-0.02377,-0.056584,-0.118220,...,-0.385703,-0.384798,-0.343541,-0.363642,-0.327039,-0.205910,-0.296313,-0.178067,-0.019516,-0.023049
4128,-0.138051,-0.262870,-0.432985,-0.004202,-0.040913,-0.052542,-0.011425,-0.02377,-0.056584,-0.118220,...,-0.392280,-0.397134,-0.386164,-0.364960,-0.327221,-0.205931,-0.296522,-0.178067,-0.019516,-0.023049
34715,7.243681,2.592208,-0.432984,-0.004185,-0.040913,-0.052542,-0.011425,-0.02377,-0.056584,-0.073150,...,-0.063073,-0.104447,-0.023673,0.305514,0.602990,0.521411,-0.235391,-0.178067,-0.019516,-0.023049
56900,-0.138051,-0.089481,-0.432985,-0.004201,-0.040913,-0.052542,-0.011425,-0.02377,-0.056584,-0.118220,...,-0.118129,-0.085384,-0.087445,-0.133575,-0.109457,-0.112292,0.204012,0.044547,-0.019516,-0.023049
