In [113]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [68]:
pd.set_option('display.max_rows', 200)

0. Data processing  
    - histogram에 대한 처리: 대표값으로 대체
        - 평균, 분산
    - NA 값에 대한 처리: columnwise 적용. pos null 30%이상 -> pos median, 나머지 -> 전체의 median 
    - class imbalance: oversampling (SMOTE)
    - feature selection: drop over 40% na, 2장 내용 참고, logistic 회귀
    - feature reduction: PCA
    - columnwise normalizing

- dtype processing

In [119]:
data = pd.read_csv("./Train_data.csv", index_col=0)

# class to numeric; pos: 1, neg: 0
data['class'] = data['class'].replace('neg', 0)
data['class'] = data['class'].replace('pos', 1)

# na, numeric processing
data = data.replace('na', np.nan) # change 'na' ro np.nan
data = data.apply(pd.to_numeric) # change object to int or float

- na: process columnwise. 
    - Drop if null ratio over 0,4; gap btw 25~40%
    - If pos_null_ratio>=0.3, pos median. Else total median; If pos_null_ratio<0.3, many cols also have high neg_null_ratio
    

In [124]:
# check null columns per cats
pos_data = data[data["class"]==1] 
neg_data = data[data["class"]==0] 

pos_df, neg_df = pos_data.isnull().sum().to_frame(name="pos_null_cnt"), neg_data.isnull().sum().to_frame(name="neg_null_cnt")
null_check = pos_df.join(neg_df)
null_check["total_null"] = null_check["pos_null_cnt"] + null_check["neg_null_cnt"]
null_check.insert(1, "pos_cnt", len(pos_data))
null_check.insert(3, "neg_cnt", len(neg_data))

null_check["pos_null_ratio"] = null_check.pos_null_cnt/null_check.pos_cnt
null_check["neg_null_ratio"] = null_check.neg_null_cnt/null_check.neg_cnt
null_check["total_null_ratio"] = (null_check.pos_null_cnt + null_check.neg_null_cnt)/len(data)

# drop cols w/ too many na's
to_drop = null_check[null_check.total_null_ratio>=0.4]
data.drop(to_drop.index, inplace=True, axis=1)

# na replacement
POS_MEDIAN = pos_data.median(axis=0).to_dict()
TOTAL_MEDIAN = data.median(axis=0).to_dict()

with_pos = data.loc[:, null_check.pos_null_ratio>=0.4]
with_pos = with_pos.index
data.fillna(value=POS_MEDIAN, inplace=True)

with_total = data.loc[:, null_check.pos_null_ratio<0.4]
with_total = with_total.index
data.fillna(value=TOTAL_MEDIAN, inplace=True)

# data.isnull().sum().sum()

- Rowise Standardize

In [128]:
X = data.iloc[:, 1:]
y = data["class"]

scaler = StandardScaler()
scaler.fit_transform(X)
X_scaled = scaler.fit_transform(X) # standardizing if needed
X_scaled = pd.DataFrame(X_scaled, columns=X.columns, index=X.index)
X_scaled.describe()

Unnamed: 0,aa_000,ac_000,ad_000,ae_000,af_000,ag_000,ag_001,ag_002,ag_003,ag_004,...,ee_002,ee_003,ee_004,ee_005,ee_006,ee_007,ee_008,ee_009,ef_000,eg_000
count,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,...,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0
mean,4.3521810000000003e-17,1.023197e-16,-9.134969e-16,1.819474e-15,-2.8731650000000005e-17,1.769193e-15,-5.037638e-15,3.297087e-15,-6.553978e-15,3.8267150000000005e-17,...,-3.82033e-17,-1.233784e-16,-2.4771120000000002e-17,-3.58826e-18,-2.711257e-16,-1.003334e-16,3.149274e-16,1.371934e-15,-1.276193e-15,1.5732910000000002e-17
std,1.000009,1.000009,1.000009,1.000009,1.000009,1.000009,1.000009,1.000009,1.000009,1.000009,...,1.000009,1.000009,1.000009,1.000009,1.000009,1.000009,1.000009,1.000009,1.000009,1.000009
min,-0.2628782,-0.4329847,-0.004202499,-0.04091289,-0.05254166,-0.01142451,-0.02377023,-0.05658439,-0.1182197,-0.1924195,...,-0.3924092,-0.3971752,-0.3861963,-0.364972,-0.3272284,-0.2059314,-0.296522,-0.1780667,-0.01951579,-0.02304906
25%,-0.2591822,-0.4329847,-0.004201331,-0.04091289,-0.05254166,-0.01142451,-0.02377023,-0.05658439,-0.1182197,-0.1922981,...,-0.3899072,-0.3950425,-0.3838522,-0.3616823,-0.3267009,-0.2058613,-0.296522,-0.1780667,-0.01951579,-0.02304906
50%,-0.1306353,-0.4329845,-0.004194489,-0.04091289,-0.05254166,-0.01142451,-0.02377023,-0.05658439,-0.1182197,-0.1909014,...,-0.1972482,-0.1970052,-0.2020596,-0.1994583,-0.2356962,-0.1806732,-0.2869486,-0.1780667,-0.01951579,-0.02304906
75%,-0.05297553,-0.4329836,-0.004184532,-0.04091289,-0.05254166,-0.01142451,-0.02377023,-0.05658439,-0.1182197,-0.1712852,...,-0.02794715,-0.008012186,-0.0009598749,-0.0176209,-0.06975556,-0.1058132,-0.00685279,-0.1375054,-0.01951579,-0.02304906
max,183.8947,2.309554,238.7446,127.7613,97.00465,160.6072,181.7643,102.344,83.06757,87.71011,...,62.54742,65.07378,77.39346,47.68775,37.75423,68.52664,36.28087,98.21978,126.7962,154.3036


- oversampling; SMOTE