In [8]:
from sklearn.preprocessing import  OneHotEncoder, LabelEncoder
import numpy as np
import pandas as pd

# Column preprocessing for the loan dataset: loads the CSV, coerces the
# numeric columns, separates the target and imputes missing values.
class Preprocessing:
    """Load the training CSV and prepare its feature columns.

    On construction the CSV is read with the tokens in ``na_values`` treated
    as missing, every column named in ``columns`` is coerced to a numeric
    dtype, and the ``'Default'`` column is split off as the target.
    """

    # Placeholder tokens found inside the columns; read as missing values.
    na_values = ['$', '#VALUE!', '##', 'XNA', '@', '#', 'x', '&']
    # Columns holding float data that are nevertheless stored as object.
    columns = ['Client_Income', 'Credit_Amount', 'Loan_Annuity', 'Score_Source_3', 'Population_Region_Relative', 'Age_Days', 'Employed_Days', 'Registration_Days', 'ID_Days']
    # Source that is read when the caller does not supply one.
    default_csv_path = './nbfi_vehicle_loan_repayment_dataset/Train_Dataset.csv'

    def __init__(self, csv_path=default_csv_path):
        """Read the dataset and separate features from the target.

        Args:
            csv_path: Anything ``pandas.read_csv`` accepts as its first
                argument (a path or an open text buffer). Defaults to the
                training CSV location used by the notebook.

        Raises:
            KeyError: If the file lacks one of ``columns`` or ``'Default'``.
        """
        # Load the CSV; anything listed in na_values becomes a missing value.
        frame = pd.read_csv(csv_path, na_values=self.na_values, encoding='utf-8', engine='python')
        # Convert the object-typed columns to numeric; entries that cannot be
        # parsed turn into NaN because of errors='coerce'.
        for column in self.columns:
            frame[column] = pd.to_numeric(frame[column], errors='coerce')
        # Split off the target column so it is not part of the features.
        self.__y_target = frame.pop('Default')
        self.__pay_df = frame

    def missing_value(self, what='mean'):
        """Fill missing values of the numeric feature columns in place.

        Non-numeric (categorical) columns are left untouched.

        Args:
            what: ``'mean'`` or ``'median'`` to fill each numeric column with
                that statistic of the column; ``'None'`` (or ``None``) to
                leave the data unchanged.

        Raises:
            ValueError: If ``what`` is not one of the supported strategies.
        """
        if what is None or what == 'None':
            return
        if what not in ('mean', 'median'):
            raise ValueError(
                f"unsupported missing-value strategy: {what!r} "
                "(expected 'mean', 'median' or 'None')"
            )

        frame = self.__pay_df
        # Select by numeric dtype rather than by "not object" so that only
        # columns for which a mean/median is defined are imputed.
        for column in frame.select_dtypes(include='number').columns:
            series = frame[column]
            fill = series.mean() if what == 'mean' else series.median()
            frame[column] = series.fillna(fill)

    def get_df(self):
        """Return the feature DataFrame (without the target column)."""
        return self.__pay_df

    def get_target_df(self):
        """Return the target column that was separated from the features."""
        return self.__y_target

In [9]:
# Build the preprocessor and impute missing values with the default strategy.
pp = Preprocessing()
pp.missing_value()
# Fetch the processed feature frame and the separated target column.
df, target_df = pp.get_df(), pp.get_target_df()


In [11]:
# Bare expression: shows the feature DataFrame as the cell's output.
df

Unnamed: 0,ID,Client_Income,Car_Owned,Bike_Owned,Active_Loan,House_Own,Child_Count,Credit_Amount,Loan_Annuity,Accompany_Client,...,Application_Process_Hour,Client_Permanent_Match_Tag,Client_Contact_Work_Tag,Type_Organization,Score_Source_1,Score_Source_2,Score_Source_3,Social_Circle_Default,Phone_Change,Credit_Bureau
0,12142509,6750.0,0.0,0.0,1.000000,0.00000,0.0,61190.55,3416.85,Alone,...,17.0000,Yes,Yes,Self-employed,0.568066,0.478787,0.511180,0.018600,63.000000,1.891082
1,12138936,20250.0,1.0,0.0,1.000000,0.69206,0.0,15282.00,1826.55,Alone,...,10.0000,Yes,Yes,Government,0.563360,0.215068,0.511180,0.117428,962.106056,1.891082
2,12181264,18000.0,0.0,0.0,1.000000,0.00000,1.0,59527.35,2788.20,Alone,...,12.0631,Yes,Yes,Self-employed,0.501213,0.552795,0.329655,0.074200,277.000000,0.000000
3,12188929,15750.0,0.0,0.0,1.000000,1.00000,0.0,53870.40,2295.45,Alone,...,15.0000,Yes,Yes,,0.501213,0.135182,0.631355,0.117428,1700.000000,3.000000
4,12133385,33750.0,1.0,0.0,1.000000,0.00000,2.0,133988.40,3547.35,Alone,...,12.0631,Yes,Yes,Business Entity Type 3,0.508199,0.301182,0.355639,0.202100,674.000000,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
121851,12207714,29250.0,0.0,0.0,0.499175,1.00000,0.0,107820.00,3165.30,Relative,...,16.0000,Yes,No,Business Entity Type 2,0.501213,0.173527,0.184116,0.057700,0.000000,1.000000
121852,12173765,15750.0,0.0,1.0,1.000000,0.00000,0.0,104256.00,3388.05,Alone,...,12.0000,Yes,Yes,Self-employed,0.501213,0.371559,0.406617,0.082500,4.000000,0.000000
121853,12103937,8100.0,0.0,1.0,0.000000,1.00000,1.0,55107.90,2989.35,Alone,...,11.0000,No,No,Trade: type 6,0.169049,0.048079,0.511180,0.117428,0.000000,1.891082
121854,12170623,38250.0,1.0,1.0,0.000000,1.00000,0.0,45000.00,2719.35,Alone,...,12.0000,Yes,Yes,Business Entity Type 3,0.182737,0.103538,0.077499,0.097900,0.000000,2.000000


In [10]:
# Count the missing values in each column of df.
df.isna().sum()

ID                                0
Client_Income                     0
Car_Owned                         0
Bike_Owned                        0
Active_Loan                       0
House_Own                         0
Child_Count                       0
Credit_Amount                     0
Loan_Annuity                      0
Accompany_Client               1758
Client_Income_Type             3701
Client_Education               3645
Client_Marital_Status          3473
Client_Gender                  2416
Loan_Contract_Type             3651
Client_Housing_Type            3687
Population_Region_Relative        0
Age_Days                          0
Employed_Days                     0
Registration_Days                 0
ID_Days                           0
Own_House_Age                     0
Mobile_Tag                        0
Homephone_Tag                     0
Workphone_Working                 0
Client_Occupation             41435
Client_Family_Members             0
Cleint_City_Rating          