In [28]:
import numpy as np
import pandas as pd
import random
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE



# 결측치 대체, Label_encoding등 컬럼에 대한 전처리 클래스
class Preprocessing:
    # 컬럼들 안의 이상 값들
    na_values = ['$', '#VALUE!', '##', 'XNA', '@', '#', 'x', '&']
    # float데이터 인데, object로 되어있는 이상 컬럼들
    columns = ['Client_Income', 'Credit_Amount', 'Loan_Annuity', 'Score_Source_3', 'Population_Region_Relative', 'Age_Days', 'Employed_Days', 'Registration_Days', 'ID_Days']

    def __init__(self):
        # csv파일 불러오는 과정 na_values안에 포함된 것들은 결측치로 대체
        self.__pay_df = pd.read_csv('./nbfi_vehicle_loan_repayment_dataset/Train_Dataset.csv', na_values=self.na_values, encoding='utf-8', engine='python')

        # columns에 object로 되어있는 컬럼들을 float으로 변경
        for column in self.columns:
            self.__pay_df[column] = pd.to_numeric(self.__pay_df[column], errors='coerce')
        # target data 분리
        self.__y_target = self.__pay_df['Default']
        del self.__pay_df['Default']
        # 특정 컬럼에 대한 결측치 처리
        self.__pay_df['Client_Occupation'].fillna('Nojob', inplace=True)
        self.__pay_df['Credit_Bureau'].fillna(self.__pay_df['Credit_Bureau'].mean(), inplace=True)


    def drop_columns(self):
        # 삭제할 column들
        drop_columns = ['Own_House_Age', 'Type_Organization', 'Mobile_Tag', 'Score_Source_1', 'Score_Source_3', 'Social_Circle_Default']
        self.__pay_df.drop(columns=drop_columns, axis=1, inplace=True)

    def category_columns_replace(self):
        # object인 column들(카테고리)만 뽑기
        category_columns_object = self.__pay_df.select_dtypes(include='object').columns

        # 결측치 0개 초과 10000개 미만의 데이터를 대상으로 랜덤하게 결측치 대체
        for column in category_columns_object:
            if self.__pay_df[column].isna().sum() > 10000 or self.__pay_df[column].isna().sum() == 0:
                continue
            unique_columns = self.__pay_df[column].loc[self.__pay_df[column].isna()==False].unique()
            self.__pay_df[column] = self.__pay_df[column].apply(lambda x : random.choice(unique_columns) if pd.isna(x) else x)

        # 나머지 범주형 데이터에 대해 one-hot encoding 적용
        df_null_sum = self.__pay_df.isna().sum()
        column = df_null_sum[df_null_sum>0].index
        self.__pay_df = pd.get_dummies(self.__pay_df, columns=column)


    # numerical_columns에 대해서 결측치를 어떻게 대체할 것인지        
    def numerical_columns_replace(self):
        # numerical_column들
        numerical_columns = ['Client_Income', 'Credit_Amount', 'Loan_Annuity', 
                   'Population_Region_Relative', 'Age_Days', 'Employed_Days',
                   'Registration_Days', 'ID_Days',
                   'Score_Source_2', 'Phone_Change']

        for column in numerical_columns:
            self.__pay_df[column] = self.__pay_df[column].fillna(self.__pay_df[column].mean())

    def train_split(self):
        train_x, test_x, train_y, test_y = train_test_split(test_size=0.3, random_state=42, stratify=self.__y_target)
        smote = SMOTE(random_state=42)
        X_train_over, y_train_over = smote.fit_sample(X_train, y_train)

    # data를 리턴하는 함수
    def get_df(self):
        return self.__pay_df
    
    # target_df를 리턴하는 함수
    def get_target_df(self):
        return self.__y_target

In [29]:
preprocessing_db = Preprocessing()
# 결측치 제거
preprocessing_db.drop_columns()
# 연속형 데이터 대체
preprocessing_db.numerical_columns_replace()
# 범주형 데이터 대체ㅐ
preprocessing_db.category_columns_replace()

In [30]:
df = preprocessing_db.get_df()

In [46]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 121856 entries, 0 to 121855
Data columns (total 94 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   Client_Income                  121856 non-null  float64
 1   Credit_Amount                  121856 non-null  float64
 2   Loan_Annuity                   121856 non-null  float64
 3   Accompany_Client               121856 non-null  object 
 4   Client_Income_Type             121856 non-null  object 
 5   Client_Education               121856 non-null  object 
 6   Client_Marital_Status          121856 non-null  object 
 7   Client_Gender                  121856 non-null  object 
 8   Loan_Contract_Type             121856 non-null  object 
 9   Client_Housing_Type            121856 non-null  object 
 10  Population_Region_Relative     121856 non-null  float64
 11  Age_Days                       121856 non-null  float64
 12  Employed_Days                 

In [32]:
df.drop(columns = 'ID', axis = 1, inplace = True)

In [45]:
df

Unnamed: 0,Client_Income,Credit_Amount,Loan_Annuity,Accompany_Client,Client_Income_Type,Client_Education,Client_Marital_Status,Client_Gender,Loan_Contract_Type,Client_Housing_Type,...,Application_Process_Hour_14.0,Application_Process_Hour_15.0,Application_Process_Hour_16.0,Application_Process_Hour_17.0,Application_Process_Hour_18.0,Application_Process_Hour_19.0,Application_Process_Hour_20.0,Application_Process_Hour_21.0,Application_Process_Hour_22.0,Application_Process_Hour_23.0
0,6750.0,61190.55,3416.85,Alone,Commercial,Secondary,M,Male,CL,Home,...,0,0,0,1,0,0,0,0,0,0
1,20250.0,15282.00,1826.55,Alone,Service,Graduation,M,Male,CL,Home,...,0,0,0,0,0,0,0,0,0,0
2,18000.0,59527.35,2788.20,Alone,Service,Graduation dropout,W,Male,CL,Family,...,0,0,0,0,0,0,0,0,0,0
3,15750.0,53870.40,2295.45,Alone,Retired,Secondary,M,Male,CL,Home,...,0,1,0,0,0,0,0,0,0,0
4,33750.0,133988.40,3547.35,Alone,Commercial,Secondary,M,Female,CL,Home,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
121851,29250.0,107820.00,3165.30,Relative,Service,Secondary,M,Female,CL,Home,...,0,0,1,0,0,0,0,0,0,0
121852,15750.0,104256.00,3388.05,Alone,Commercial,Graduation,M,Female,CL,Home,...,0,0,0,0,0,0,0,0,0,0
121853,8100.0,55107.90,2989.35,Alone,Govt Job,Secondary,M,Male,CL,Home,...,0,0,0,0,0,0,0,0,0,0
121854,38250.0,45000.00,2719.35,Alone,Service,Graduation,M,Female,CL,Home,...,0,0,0,0,0,0,0,0,0,0


In [34]:
target_df = preprocessing_db.get_target_df()

In [35]:
target_df

0         0
1         0
2         0
3         0
4         0
         ..
121851    1
121852    0
121853    0
121854    0
121855    0
Name: Default, Length: 121856, dtype: int64

In [36]:
train_x, test_x, train_y, test_y = train_test_split(df, target_df, test_size=0.3, random_state=42, stratify=target_df)  
# smote = SMOTE(random_state=42)
# X_train_over, y_train_over = smote.fit_sample(train_x, train_y)


In [37]:
train_y

51123     0
113910    0
2278      0
18058     0
91556     0
         ..
93313     0
24114     0
14180     0
105783    0
83907     0
Name: Default, Length: 85299, dtype: int64

In [38]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.model_selection import GridSearchCV

In [39]:
model = xgb.XGBClassifier()


x_train = pd.read_pickle('C:/Users/TECH2_25/semi_project/nbfi_vehicle_loan_repayment_dataset/x_train_over.pickle')

y_train = pd.read_pickle('C:/Users/TECH2_25/semi_project/nbfi_vehicle_loan_repayment_dataset/y_train_over.pickle')

x_test = pd.read_pickle('C:/Users/TECH2_25/semi_project/nbfi_vehicle_loan_repayment_dataset/x_test.pickle')

y_test = pd.read_pickle('C:/Users/TECH2_25/semi_project/nbfi_vehicle_loan_repayment_dataset/y_test.pickle')

model.fit(x_train, y_train)




ValueError: y should be a 1d array, got an array of shape (156816, 2) instead.

In [44]:
y_test.shape

(36557, 2)

In [24]:
y_train.shape

(156816, 2)

In [22]:
from sklearn.metrics import accuracy_score

# 모델 예측
y_pred = model.predict(x_test)

# 모델 평가
accuracy = accuracy_score(y_test, y_pred)

mse = mean_squared_error(y_test, y_pred)

# 결과 출력
print("Accuracy: ", accuracy)
print("Mean squared error: ", mse)


Accuracy:  0.917963727877014
Mean squared error:  0.08203627212298602


In [24]:
from sklearn.metrics import recall_score

# 모델 예측
y_pred = model.predict(x_test)

# 모델 평가
recall_micro = recall_score(y_test, y_pred, average='micro')

mse = mean_squared_error(y_test, y_pred)

# 결과 출력
print("Recall (micro): ", recall_micro)
print("Mean squared error: ", mse)


Recall (micro):  0.917963727877014
Mean squared error:  0.08203627212298602
