In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [6]:
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 200)

### 0. Data processing  
    - histogram에 대한 처리: 대표값으로 대체
        - 평균, 분산
    - NA 값에 대한 처리: columnwise 적용. pos null 30%이상 -> pos median, 나머지 -> 전체의 median 
    - class imbalance: oversampling (SMOTE)
    - feature selection: drop over 40% na, 2장 내용 참고, logistic 회귀
    - feature reduction: PCA
    - columnwise normalizing

#### 1) dtype processing

In [7]:
data = pd.read_csv("./Train_data.csv", index_col=0)

# class to numeric; pos: 1, neg: 0
data['class'] = data['class'].replace('neg', 0)
data['class'] = data['class'].replace('pos', 1)

# na, numeric processing
data = data.replace('na', np.nan) # change 'na' ro np.nan
data = data.apply(pd.to_numeric) # change object to int or float

#### 2) Feature Selection
- Drop if null ratio over 0,4; gap btw 25~40%

#### 3) NA's: replace depending on column characteristic.
- If pos_null_ratio>=0.3, pos median. Else total median; If pos_null_ratio<0.3, many cols also have high neg_null_ratio
    

In [8]:
# check null columns per cats
pos_data = data[data["class"]==1] 
neg_data = data[data["class"]==0] 

pos_df, neg_df = pos_data.isnull().sum().to_frame(name="pos_null_cnt"), neg_data.isnull().sum().to_frame(name="neg_null_cnt")
null_check = pos_df.join(neg_df)
null_check["total_null"] = null_check["pos_null_cnt"] + null_check["neg_null_cnt"]
null_check.insert(1, "pos_cnt", len(pos_data))
null_check.insert(3, "neg_cnt", len(neg_data))

null_check["pos_null_ratio"] = null_check.pos_null_cnt/null_check.pos_cnt
null_check["neg_null_ratio"] = null_check.neg_null_cnt/null_check.neg_cnt
null_check["total_null_ratio"] = (null_check.pos_null_cnt + null_check.neg_null_cnt)/len(data)

# drop cols w/ too many na's
to_drop = null_check[null_check.total_null_ratio>=0.4]
data.drop(to_drop.index, inplace=True, axis=1)

# na replacement
POS_MEDIAN = pos_data.median(axis=0).to_dict()
TOTAL_MEDIAN = data.median(axis=0).to_dict()

with_pos = data.loc[:, null_check.pos_null_ratio>=0.4]
with_pos = with_pos.index
data.fillna(value=POS_MEDIAN, inplace=True)

with_total = data.loc[:, null_check.pos_null_ratio<0.4]
with_total = with_total.index
data.fillna(value=TOTAL_MEDIAN, inplace=True)

# data.isnull().sum().sum()

#### 4)Standardize_ Rowise
`minmax scaling과 성능 차이 시도 가능`

In [9]:
X = data.iloc[:, 1:]
y = data["class"]

scaler = StandardScaler()
scaler.fit_transform(X)
X_scaled = scaler.fit_transform(X) # standardizing if needed
X_scaled = pd.DataFrame(X_scaled, columns=X.columns, index=X.index)
X_scaled.describe().applymap("{0:.2f}".format)

Unnamed: 0,aa_000,ac_000,ad_000,ae_000,af_000,ag_000,ag_001,ag_002,ag_003,ag_004,ag_005,ag_006,ag_007,ag_008,ag_009,ah_000,ai_000,aj_000,ak_000,al_000,am_0,an_000,ao_000,ap_000,aq_000,ar_000,as_000,at_000,au_000,av_000,ax_000,ay_000,ay_001,ay_002,ay_003,ay_004,ay_005,ay_006,ay_007,ay_008,ay_009,az_000,az_001,az_002,az_003,az_004,az_005,az_006,az_007,az_008,az_009,ba_000,ba_001,ba_002,ba_003,ba_004,ba_005,ba_006,ba_007,ba_008,ba_009,bb_000,bc_000,bd_000,be_000,bf_000,bg_000,bh_000,bi_000,bj_000,bk_000,bs_000,bt_000,bu_000,bv_000,bx_000,by_000,bz_000,ca_000,cb_000,cc_000,cd_000,ce_000,cf_000,cg_000,ch_000,ci_000,cj_000,ck_000,cl_000,cm_000,cn_000,cn_001,cn_002,cn_003,cn_004,cn_005,cn_006,cn_007,cn_008,cn_009,co_000,cp_000,cq_000,cs_000,cs_001,cs_002,cs_003,cs_004,cs_005,cs_006,cs_007,cs_008,cs_009,ct_000,cu_000,cv_000,cx_000,cy_000,cz_000,da_000,db_000,dc_000,dd_000,de_000,df_000,dg_000,dh_000,di_000,dj_000,dk_000,dl_000,dm_000,dn_000,do_000,dp_000,dq_000,dr_000,ds_000,dt_000,du_000,dv_000,dx_000,dy_000,dz_000,ea_000,eb_000,ec_00,ed_000,ee_000,ee_001,ee_002,ee_003,ee_004,ee_005,ee_006,ee_007,ee_008,ee_009,ef_000,eg_000
count,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0,57000.0
mean,0.0,0.0,-0.0,0.0,-0.0,0.0,-0.0,0.0,-0.0,0.0,0.0,-0.0,0.0,-0.0,0.0,-0.0,0.0,0.0,-0.0,-0.0,-0.0,-0.0,0.0,-0.0,0.0,-0.0,0.0,-0.0,0.0,0.0,0.0,-0.0,0.0,-0.0,0.0,-0.0,0.0,-0.0,0.0,-0.0,0.0,0.0,-0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0,0.0,0.0,-0.0,0.0,-0.0,-0.0,-0.0,0.0,0.0,-0.0,-0.0,-0.0,0.0,0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,0.0,-0.0,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,0.0,-0.0,-0.0,0.0,0.0,-0.0,-0.0,-0.0,0.0,0.0,-0.0,0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,0.0,-0.0,-0.0,-0.0,0.0,-0.0,0.0,-0.0,0.0,-0.0,0.0,-0.0,0.0,-0.0,-0.0,-0.0,0.0,0.0,-0.0,0.0,-0.0,0.0,-0.0,0.0,-0.0,0.0,0.0,-0.0,-0.0,0.0,0.0,-0.0,-0.0,0.0,0.0,0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,0.0,-0.0,0.0
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
min,-0.26,-0.43,-0.0,-0.04,-0.05,-0.01,-0.02,-0.06,-0.12,-0.19,-0.35,-0.43,-0.35,-0.16,-0.03,-0.44,-0.05,-0.02,-0.01,-0.12,-0.12,-0.45,-0.45,-0.34,-0.36,-0.09,-0.01,-0.04,-0.01,-0.18,-0.27,-0.02,-0.02,-0.03,-0.04,-0.03,-0.08,-0.32,-0.31,-0.27,-0.01,-0.11,-0.14,-0.08,-0.13,-0.36,-0.34,-0.11,-0.06,-0.04,-0.01,-0.38,-0.39,-0.36,-0.38,-0.39,-0.38,-0.34,-0.36,-0.15,-0.1,-0.42,-0.17,-0.23,-0.16,-0.15,-0.44,-0.39,-0.34,-0.3,-1.48,-0.96,-0.27,-0.42,-0.42,-0.46,-0.42,-0.16,-1.14,-1.11,-0.46,0.0,-0.44,-0.0,-0.39,-0.02,-0.42,-0.09,-0.34,-0.08,-0.32,-0.04,-0.08,-0.15,-0.25,-0.39,-0.44,-0.33,-0.16,-0.1,-0.03,-0.0,-0.08,-0.42,-0.53,-0.3,-0.21,-0.33,-0.22,-0.41,-0.48,-0.24,-0.03,-0.0,-0.18,-0.25,-0.81,-0.57,-0.03,-0.07,-0.03,-0.16,-0.82,-0.38,-0.29,-0.01,-0.03,-0.01,-0.08,-0.01,-0.03,-0.02,-0.03,-0.36,-0.5,-0.53,-0.05,-0.14,-0.48,-0.5,-0.37,-0.3,-0.19,-0.12,-0.02,-0.03,-0.26,-0.61,-0.61,-0.31,-0.32,-0.39,-0.4,-0.39,-0.36,-0.33,-0.21,-0.3,-0.18,-0.02,-0.02
25%,-0.26,-0.43,-0.0,-0.04,-0.05,-0.01,-0.02,-0.06,-0.12,-0.19,-0.35,-0.43,-0.35,-0.16,-0.03,-0.43,-0.05,-0.02,-0.01,-0.12,-0.12,-0.44,-0.44,-0.33,-0.36,-0.09,-0.01,-0.04,-0.01,-0.18,-0.26,-0.02,-0.02,-0.03,-0.04,-0.03,-0.08,-0.32,-0.31,-0.27,-0.01,-0.1,-0.14,-0.08,-0.13,-0.36,-0.33,-0.11,-0.06,-0.04,-0.01,-0.37,-0.38,-0.36,-0.38,-0.39,-0.38,-0.34,-0.36,-0.15,-0.1,-0.41,-0.17,-0.22,-0.16,-0.15,-0.43,-0.39,-0.33,-0.29,-0.56,-0.75,-0.26,-0.42,-0.42,-0.45,-0.41,-0.16,-0.92,-0.9,-0.45,0.0,-0.44,-0.0,-0.36,-0.02,-0.42,-0.09,-0.33,-0.08,-0.32,-0.04,-0.08,-0.15,-0.25,-0.38,-0.44,-0.33,-0.16,-0.1,-0.03,-0.0,-0.08,-0.42,-0.42,-0.28,-0.21,-0.33,-0.22,-0.4,-0.47,-0.22,-0.03,-0.0,-0.16,-0.23,-0.8,-0.56,-0.03,-0.07,-0.03,-0.16,-0.8,-0.37,-0.25,-0.01,-0.03,-0.01,-0.08,-0.01,-0.03,-0.02,-0.03,-0.36,-0.5,-0.53,-0.05,-0.14,-0.48,-0.5,-0.37,-0.3,-0.19,-0.12,-0.02,-0.03,-0.26,-0.57,-0.57,-0.3,-0.31,-0.39,-0.4,-0.38,-0.36,-0.33,-0.21,-0.3,-0.18,-0.02,-0.02
50%,-0.13,-0.43,-0.0,-0.04,-0.05,-0.01,-0.02,-0.06,-0.12,-0.19,-0.3,-0.19,-0.27,-0.15,-0.03,-0.22,-0.05,-0.02,-0.01,-0.12,-0.12,-0.22,-0.22,-0.23,-0.23,-0.09,-0.01,-0.04,-0.01,-0.16,-0.22,-0.02,-0.02,-0.03,-0.04,-0.03,-0.08,-0.28,-0.24,-0.24,-0.01,-0.08,-0.12,-0.07,-0.13,-0.34,-0.26,-0.11,-0.06,-0.04,-0.01,-0.21,-0.21,-0.21,-0.21,-0.21,-0.22,-0.23,-0.35,-0.15,-0.1,-0.22,-0.16,-0.21,-0.14,-0.14,-0.22,-0.23,-0.23,-0.22,0.01,-0.35,-0.14,-0.23,-0.23,-0.28,-0.19,-0.16,-0.28,-0.33,-0.28,0.0,-0.42,-0.0,-0.16,-0.02,-0.21,-0.09,-0.23,-0.08,-0.31,-0.04,-0.08,-0.15,-0.23,-0.24,-0.21,-0.25,-0.14,-0.09,-0.03,-0.0,-0.07,-0.23,-0.23,-0.17,-0.19,-0.23,-0.18,-0.2,-0.26,-0.14,-0.02,-0.0,-0.09,-0.17,-0.39,-0.52,-0.03,-0.07,-0.03,-0.16,-0.4,-0.25,-0.19,-0.01,-0.03,-0.01,-0.08,-0.01,-0.03,-0.02,-0.03,-0.22,-0.28,-0.31,-0.05,-0.14,-0.27,-0.28,-0.35,-0.28,-0.19,-0.12,-0.02,-0.03,-0.25,-0.36,-0.34,-0.21,-0.19,-0.2,-0.2,-0.2,-0.2,-0.24,-0.18,-0.29,-0.18,-0.02,-0.02
75%,-0.05,-0.43,-0.0,-0.04,-0.05,-0.01,-0.02,-0.06,-0.12,-0.17,-0.08,0.05,0.07,-0.03,-0.03,-0.08,-0.05,-0.02,-0.01,-0.12,-0.12,-0.07,-0.08,-0.12,-0.08,-0.09,-0.01,-0.04,-0.01,-0.06,-0.05,-0.02,-0.02,-0.03,-0.04,-0.03,-0.05,0.05,-0.05,-0.12,-0.01,-0.06,-0.08,-0.05,-0.06,0.06,-0.07,-0.11,-0.06,-0.04,-0.01,-0.06,-0.06,-0.08,-0.07,-0.04,-0.03,-0.02,0.04,-0.14,-0.1,-0.09,-0.12,-0.1,-0.08,-0.1,-0.08,-0.09,-0.1,-0.12,0.21,0.48,-0.06,-0.09,-0.09,-0.16,-0.05,-0.13,0.65,0.8,-0.16,0.0,0.16,-0.0,0.39,-0.02,-0.08,-0.09,-0.09,-0.07,0.08,-0.04,-0.08,-0.14,-0.14,-0.05,0.04,0.03,-0.08,-0.08,-0.03,-0.0,-0.07,-0.09,0.0,-0.05,-0.13,-0.07,-0.13,-0.06,0.13,0.07,-0.0,-0.0,0.09,0.24,1.18,1.18,-0.03,-0.04,-0.03,0.03,1.12,-0.1,-0.06,-0.01,-0.03,-0.01,-0.08,-0.01,-0.03,-0.02,-0.03,-0.1,0.12,0.1,-0.05,-0.14,-0.05,-0.03,0.03,0.01,-0.19,-0.12,-0.02,-0.03,-0.1,-0.0,0.03,-0.08,-0.06,-0.03,-0.01,-0.0,-0.02,-0.07,-0.11,-0.01,-0.14,-0.02,-0.02
max,183.89,2.31,238.74,127.76,97.0,160.61,181.76,102.34,83.07,87.71,50.27,23.06,43.54,75.47,131.63,14.78,95.71,97.66,125.05,62.68,58.9,16.5,18.77,34.05,20.91,64.63,196.71,88.57,195.44,117.69,79.95,113.17,132.24,69.86,61.74,96.15,91.04,33.56,92.31,73.74,183.32,130.38,125.4,136.49,67.7,30.3,67.49,70.41,133.47,126.08,196.97,56.82,55.46,43.48,45.27,42.59,34.91,28.48,28.1,118.6,114.8,17.72,100.21,63.79,87.71,97.73,14.82,19.03,34.14,31.01,4.82,11.18,182.83,17.69,17.69,38.58,17.08,62.2,2.22,2.17,39.17,0.0,34.25,238.74,60.06,66.21,15.45,47.51,24.1,29.27,55.23,150.22,137.54,71.09,66.03,46.27,36.06,54.64,78.94,48.41,131.69,238.74,69.33,17.69,76.23,152.28,49.32,52.03,66.95,62.52,62.6,211.28,158.2,238.73,180.43,106.12,12.96,24.53,109.03,83.52,120.12,149.38,12.88,40.75,53.92,228.46,110.33,163.16,53.47,164.88,85.21,87.18,75.11,27.42,36.41,35.08,64.96,36.6,19.31,20.17,38.93,57.43,28.21,61.62,144.58,160.93,96.54,25.08,21.1,60.89,36.28,62.55,65.07,77.39,47.69,37.75,68.53,36.28,98.22,126.8,154.3


#### 5) Class imbalance: oversampling using SMOTE

#### 6) Feature dim. reduction: PCA
`feature weight 확인 원한다면 해제 필요`

`return 값: X, y`