<a href="https://colab.research.google.com/github/jwengr/dacon/blob/main/%EC%8B%A0%EC%9A%A9%EC%B9%B4%EB%93%9C%20%EC%82%AC%EC%9A%A9%EC%9E%90%20%EC%97%B0%EC%B2%B4%20%EC%98%88%EC%B8%A1%20AI%20%EA%B2%BD%EC%A7%84%EB%8C%80%ED%9A%8C/Preprocessing1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [232]:
from sklearn.metrics import log_loss
from sklearn.preprocessing import MinMaxScaler, RobustScaler
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.manifold import TSNE

import pandas as pd
import numpy as np
import pickle
import os, sys
import matplotlib.pyplot as plt
import scipy.stats as stats

In [188]:
# Root folder of the project: the training CSV is read from PATH/dataset and
# every preprocessed column is written under PATH/preproc.
# NOTE(review): the relative 'drive/My Drive' prefix presumably requires Google
# Drive to be mounted in the Colab working directory first — confirm.
PATH = 'drive/My Drive/dacon/credit'

In [189]:
# Raw training table; preprocess() slices single columns out of this frame.
train_df = pd.read_csv(PATH+'/dataset/train.csv')

## Define Transform Functions

In [245]:
def log(df):
    """Apply ``log(|x| + 1)`` element-wise to every column of ``df``.

    The absolute value is taken first, so negative inputs are transformed by
    their magnitude and their sign is discarded.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric columns to transform.

    Returns
    -------
    pandas.DataFrame
        Transformed values under the same column labels as ``df``.
    """
    magnitude = np.abs(df)
    transformed = np.log(magnitude + 1)
    return pd.DataFrame(transformed, columns=df.columns)

In [191]:
def minmax(df):
    """Rescale every column of ``df`` with scikit-learn's ``MinMaxScaler``.

    A scaler with default settings is fitted on ``df`` itself and applied
    immediately, so the scaling parameters are not kept for later reuse.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric columns to rescale.

    Returns
    -------
    pandas.DataFrame
        Scaled values with the same column labels and row index as ``df``.
    """
    scaled = MinMaxScaler().fit_transform(df)
    # The scaler output carries no labels here; re-attach the input index so
    # the result stays row-aligned with ``df`` even for non-default indexes.
    return pd.DataFrame(scaled, columns=df.columns, index=df.index)

In [192]:
def t(df,n):
    """Evaluate the Student's t density with ``n`` degrees of freedom at each value.

    Parameters
    ----------
    df : pandas.DataFrame
        Single-column frame. The densities are reshaped into one column, so a
        frame with several columns does not fit the returned labels.
    n : int or float
        Degrees of freedom passed to ``scipy.stats.t``.

    Returns
    -------
    pandas.DataFrame
        Density values with the same column label and row index as ``df``.
    """
    # NOTE(review): values are fed to the density as-is (no centering or
    # scaling), so large-magnitude inputs give densities close to zero —
    # confirm this is intended.
    density = np.reshape(stats.t(n).pdf(df), (-1, 1))
    # Keep the input index so the output stays row-aligned with ``df``.
    return pd.DataFrame(density, columns=df.columns, index=df.index)

In [193]:
def robust(df,i,j):
    """Rescale every column of ``df`` with scikit-learn's ``RobustScaler``.

    The scaler is fitted on ``df`` itself and applied immediately.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric columns to rescale.
    i, j : float
        Lower and upper bounds handed to the scaler as
        ``quantile_range=(i, j)``.

    Returns
    -------
    pandas.DataFrame
        Scaled values with the same column labels and row index as ``df``.
    """
    scaled = RobustScaler(quantile_range=(i, j)).fit_transform(df)
    # The scaler output carries no labels here; re-attach the input index so
    # the result stays row-aligned with ``df`` even for non-default indexes.
    return pd.DataFrame(scaled, columns=df.columns, index=df.index)

In [205]:
def count(df):
    """Replace each value of the first column with its number of occurrences.

    Frequencies are computed in one pass with ``value_counts(dropna=False)``,
    so missing values are counted as a category of their own instead of
    breaking the lookup.

    Parameters
    ----------
    df : pandas.DataFrame
        Single-column frame; only the first column is read.

    Returns
    -------
    pandas.DataFrame
        Occurrence counts with the same column label and row index as ``df``.
    """
    column = df.iloc[:, 0]
    # One pass over the data instead of one full comparison scan per distinct key.
    frequencies = column.value_counts(dropna=False)
    counts = column.map(frequencies)
    return pd.DataFrame(counts.values, columns=df.columns, index=df.index)

In [207]:
def group(df,n):
    """Keep the ``n`` most frequent categories and merge the rest into the n-th one.

    Categories of the first column are ranked by frequency (ties are ordered
    as ``value_counts`` returns them). The top ``n`` stay unchanged; every
    other value, including missing values, is replaced by the n-th most
    frequent category.

    Parameters
    ----------
    df : pandas.DataFrame
        Single-column frame; only the first column is read.
    n : int
        Number of categories to keep. Values larger than the number of
        distinct categories are clamped, which leaves the categories unchanged.

    Returns
    -------
    pandas.DataFrame
        Grouped values with the same column label and row index as ``df``.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1 (there would be no category to merge into).
    """
    if n < 1:
        raise ValueError(f'n must be at least 1, got {n}')
    column = df.iloc[:, 0]
    ranked = column.value_counts().index  # most frequent category first
    if len(ranked) == 0:
        # Nothing to rank (empty or all-missing column): return the values as they are.
        return pd.DataFrame(column.values, columns=df.columns, index=df.index)
    # Clamp so that asking for more categories than exist no longer raises IndexError.
    keep = min(n, len(ranked))
    fallback = ranked[keep - 1]
    merged = column.where(column.isin(ranked[:keep]), fallback)
    return pd.DataFrame(merged.values, columns=df.columns, index=df.index)

In [209]:
def select(df,n):
    """Keep the ``n`` most frequent categories and label everything else 'other'.

    Categories of the first column are ranked by frequency (ties are ordered
    as ``value_counts`` returns them). The top ``n`` stay unchanged; every
    other value, including missing values, becomes the string ``'other'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Single-column frame; only the first column is read.
    n : int
        Number of categories to keep. Values larger than the number of
        distinct categories keep everything; ``n <= 0`` turns every value
        into ``'other'``.

    Returns
    -------
    pandas.DataFrame
        Selected values with the same column label and row index as ``df``.
    """
    column = df.iloc[:, 0]
    ranked = column.value_counts().index  # most frequent category first
    # Slicing clamps an oversized n (no IndexError); max() stops a negative n
    # from being read as "all but the last few".
    kept = ranked[:max(n, 0)]
    selected = column.where(column.isin(kept), 'other')
    return pd.DataFrame(selected.values, columns=df.columns, index=df.index)

In [211]:
def fillnafreq(df,n):
    """Fill missing values of the first column with its n-th most frequent value.

    Parameters
    ----------
    df : pandas.DataFrame
        Single-column frame; only the first column is read.
    n : int
        1-based frequency rank of the fill value (1 = most frequent).
        The rank is looked up as ``index[n - 1]``, so ``n = 0`` wraps around
        to the least frequent value.

    Returns
    -------
    pandas.DataFrame
        Filled values with the same column label and row index as ``df``.

    Raises
    ------
    IndexError
        If ``n`` exceeds the number of distinct non-missing values.
    """
    column = df.iloc[:, 0]
    by_frequency = column.value_counts().index  # most frequent value first
    filled = column.fillna(by_frequency[n - 1])
    # Keep the input index so the output stays row-aligned with ``df``.
    return pd.DataFrame(filled.values, columns=df.columns, index=df.index)

In [219]:
def tostr(df):
    """Turn each value of the first column into the string ``'str_<value>'``.

    The prefix makes the values strings, so later steps see text labels
    rather than numbers.

    Parameters
    ----------
    df : pandas.DataFrame
        Single-column frame; only the first column is read.

    Returns
    -------
    pandas.DataFrame
        String labels with the same column label and row index as ``df``.
    """
    column = df.iloc[:, 0]
    labels = column.apply(lambda x: f'str_{x}')
    # Keep the input index so the output stays row-aligned with ``df``.
    return pd.DataFrame(labels.values, columns=df.columns, index=df.index)

In [225]:
def onehotenc(df):
    """One-hot encode the first column of ``df``.

    Only the first column is read. Each distinct value becomes an indicator
    column named ``<column label>_<value>``.

    Returns
    -------
    pandas.DataFrame
        The indicator columns produced by ``pandas.get_dummies``.
    """
    first_label = df.columns[0]
    first_column = df.iloc[:, 0]
    return pd.get_dummies(first_column, prefix=first_label)

## Build the Preprocessing Pipeline

In [246]:
# Registry that maps the step name used in a pipeline to the transform
# function implementing it. preprocess() looks every step up here and calls
# the function with the current frame followed by the step's extra arguments.
func_dict = dict(
    log=log,
    minmax=minmax,
    t=t,
    robust=robust,
    count=count,
    group=group,
    select=select,
    fillnafreq=fillnafreq,
    tostr=tostr,
    onehotenc=onehotenc,
)

In [238]:
def preprocess(col,pipeline):
    """Run ``pipeline`` on one column of ``train_df`` and save the result as CSV.

    Parameters
    ----------
    col : str
        Name of the column to take from the module-level ``train_df``.
    pipeline : list
        Ordered steps. Each step is a sequence whose first item is a key of
        ``func_dict`` and whose remaining items are passed to that function
        as positional arguments after the current frame. An empty list
        exports the column unchanged.

    Returns
    -------
    pandas.DataFrame
        The frame produced by the last step (or the untouched column).

    Notes
    -----
    The file is written to ``{PATH}/preproc/{col}/{name}.csv`` where ``name``
    is the column name followed by one label per step, joined with ``','``;
    a step label is the step's items joined with ``'-'``.
    """
    step_labels = ['-'.join(str(part) for part in step) for step in pipeline]
    name = ','.join([col] + step_labels)
    result = train_df[[col]]
    for step in pipeline:
        func, *args = step
        result = func_dict[func](result, *args)
    out_dir = f'{PATH}/preproc/{col}'
    # makedirs also creates a missing 'preproc' parent folder and tolerates an
    # existing target, so a fresh run no longer fails on the first column.
    os.makedirs(out_dir, exist_ok=True)
    result.to_csv(f'{out_dir}/{name}.csv')
    return result

In [262]:
# These columns are exported as they are: an empty pipeline applies no
# transform, so preprocess() only writes the raw column to its CSV file.
cols = ['gender','car','reality','work_phone','phone','email']
for col in cols:
    preprocess(col,[])

In [263]:
# child_num: export a log variant and a min-max variant, then robust-scaled
# variants whose upper quantile steps through 100, 95, ..., 75 while the
# lower quantile stays at 0.
col = 'child_num'
pipelines = [[['log']], [['minmax']]]
for pipeline in pipelines:
    preprocess(col, pipeline)
for i in range(100, 74, -5):
    preprocess(col, [['robust', 0, i]])

In [264]:
# income_total: log and min-max variants, a grid of robust-scaled variants
# (lower quantile 0..4, upper quantile 100, 95, ..., 75) and Student's t
# density variants for several degrees of freedom.
col = 'income_total'
pipelines = [[['log']], [['minmax']]]
for pipeline in pipelines:
    preprocess(col, pipeline)
for i in range(5):
    for j in range(100, 74, -5):
        preprocess(col, [['robust', i, j]])
for i in [2, 3, 4, 5, 7, 9, 11, 13, 15, 20, 25, 30]:
    preprocess(col, [['t', i]])

In [265]:
# DAYS_BIRTH: export a log variant, a min-max variant and the raw column
# (the empty pipeline).
col = 'DAYS_BIRTH'
pipelines = [[['log']], [['minmax']], []]
for pipeline in pipelines:
    preprocess(col, pipeline)

In [266]:
# DAYS_EMPLOYED: a min-max variant, a grid of robust-scaled variants
# (lower quantile 0..4, upper quantile 100, 95, ..., 75) and Student's t
# density variants for several degrees of freedom.
col = 'DAYS_EMPLOYED'
pipelines = [[['minmax']]]
for pipeline in pipelines:
    preprocess(col, pipeline)
for i in range(5):
    for j in range(100, 74, -5):
        preprocess(col, [['robust', i, j]])
for i in [2, 3, 4, 5, 7, 9, 11, 13, 15, 20, 25, 30]:
    preprocess(col, [['t', i]])

In [267]:
# income_type: export the raw column, then variants that keep only the
# 4, 3 and 2 most frequent categories via the 'group' step.
col = 'income_type'
pipelines = [[]] + [[['group', top]] for top in (4, 3, 2)]
for pipeline in pipelines:
    preprocess(col, pipeline)

In [268]:
# edu_type: export the raw column, then variants that keep only the
# 4, 3 and 2 most frequent categories via the 'group' step.
col = 'edu_type'
pipelines = [[]] + [[['group', top]] for top in (4, 3, 2)]
for pipeline in pipelines:
    preprocess(col, pipeline)

In [269]:
# family_type: export the raw column, then variants that keep only the
# 4, 3 and 2 most frequent categories via the 'group' step.
col = 'family_type'
pipelines = [[]] + [[['group', top]] for top in (4, 3, 2)]
for pipeline in pipelines:
    preprocess(col, pipeline)

In [270]:
# house_type: export the raw column, then variants that keep only the
# 4, 3 and 2 most frequent categories via the 'group' step.
col = 'house_type'
pipelines = [[]] + [[['group', top]] for top in (4, 3, 2)]
for pipeline in pipelines:
    preprocess(col, pipeline)

In [271]:
# family_size is exported both as a categorical and as a numeric feature:
#  - categorical: values are turned into 'str_<value>' labels, optionally
#    followed by grouping down to the 2..9 most frequent labels;
#  - numeric: robust scaling (lower quantile 0, upper quantile 100, 95, ..., 75)
#    and Student's t density variants for several degrees of freedom.
col = 'family_size'
pipelines = [[['tostr']]] + [[['tostr'], ['group', top]] for top in range(2, 10)]
for pipeline in pipelines:
    preprocess(col, pipeline)
for j in range(100, 74, -5):
    preprocess(col, [['robust', 0, j]])
for i in [2, 3, 4, 5, 7, 9, 11, 13, 15, 20, 25, 30]:
    preprocess(col, [['t', i]])

## Missing Values (occyp_type)

## Seasonal Data (begin_month)