In [65]:
import numpy as np
import pandas as pd
import random
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE



# 결측치 대체, Label_encoding등 컬럼에 대한 전처리 클래스
class Preprocessing:
    # 컬럼들 안의 이상 값들
    na_values = ['$', '#VALUE!', '##', 'XNA', '@', '#', 'x', '&']
    # float데이터 인데, object로 되어있는 이상 컬럼들
    columns = ['Client_Income', 'Credit_Amount', 'Loan_Annuity', 'Score_Source_3', 'Population_Region_Relative', 'Age_Days', 'Employed_Days', 'Registration_Days', 'ID_Days']

    def __init__(self):
        # csv파일 불러오는 과정 na_values안에 포함된 것들은 결측치로 대체
        self.__pay_df = pd.read_csv('./nbfi_vehicle_loan_repayment_dataset/Train_Dataset.csv', na_values=self.na_values, encoding='utf-8', engine='python')

        # columns에 object로 되어있는 컬럼들을 float으로 변경
        for column in self.columns:
            self.__pay_df[column] = pd.to_numeric(self.__pay_df[column], errors='coerce')
        # target data 분리
        self.__y_target = self.__pay_df['Default']
        del self.__pay_df['Default']

        # 특정 컬럼에 대한 결측치 처리
        self.__pay_df['Client_Occupation'].fillna('Nojob', inplace=True)
        self.__pay_df['Credit_Bureau'].fillna(self.__pay_df['Credit_Bureau'].mean(), inplace=True)

    def drop_columns(self):
        # 삭제할 column들
        drop_columns = ['ID', 'Own_House_Age', 'Type_Organization', 'Mobile_Tag', 'Score_Source_1', 'Score_Source_3', 'Social_Circle_Default', 'Application_Process_Hour', 'Accompany_Client', 'Client_Income']
        self.__pay_df.drop(columns=drop_columns, axis=1, inplace=True)

    def category_columns_replace(self):
        # object인 column들(카테고리)만 뽑기
        category_columns_object = [
            'Client_Income_Type', 'Client_Education', 'Client_Marital_Status', 
            'Client_Gender', 'Loan_Contract_Type', 'Client_Housing_Type', 
            'Client_Occupation']

        # 결측치 0개 초과 10000개 미만의 데이터를 대상으로 랜덤하게 결측치 대체
        for column in category_columns_object:
            if self.__pay_df[column].isna().sum() > 10000 or self.__pay_df[column].isna().sum() == 0:
                continue
            unique_columns = self.__pay_df[column].loc[self.__pay_df[column].isna()==False].unique()
            self.__pay_df[column] = self.__pay_df[column].apply(lambda x : random.choice(unique_columns) if pd.isna(x) else x)

        category_columns = [
            'Car_Owned', 'Bike_Owned', 'Active_Loan', 
            'House_Own', 'Homephone_Tag', 'Workphone_Working', 
            'Cleint_City_Rating', 'Application_Process_Day', 'Client_Permanent_Match_Tag', 
            'Client_Contact_Work_Tag']

        # 나머지 범주형 데이터에 대해 one-hot encoding 적용
        df_null_sum = self.__pay_df.isna().sum()
        column = df_null_sum[df_null_sum>0].index
        self.__pay_df = pd.get_dummies(self.__pay_df, columns=category_columns)

    # numerical_columns에 대해서 결측치를 어떻게 대체할 것인지        
    def numerical_columns_replace(self):
        # numerical_column들
        numerical_columns=[
            'Child_Count', 'Credit_Amount', 'Loan_Annuity', 
            'Population_Region_Relative', 'Age_Days', 'Employed_Days', 
            'Registration_Days', 'ID_Days', 'Client_Family_Members',
            'Score_Source_2', 'Phone_Change', 'Credit_Bureau']

        for column in numerical_columns:
            self.__pay_df[column] = self.__pay_df[column].fillna(self.__pay_df[column].mean())

    # data를 리턴하는 함수
    def get_df(self):
        return self.__pay_df
    
    # target_df를 리턴하는 함수
    def get_target_df(self):
        return self.__y_target

In [66]:
preprocessing_db = Preprocessing()
# 결측치 제거
preprocessing_db.drop_columns()
# 연속형 데이터 대체
preprocessing_db.numerical_columns_replace()
# 범주형 데이터 대체
preprocessing_db.category_columns_replace()

In [67]:
df = preprocessing_db.get_df()

In [68]:
target_df = preprocessing_db.get_target_df()

In [69]:
train_x, test_x, train_y, test_y = train_test_split(df, target_df, test_size=0.3, random_state=42) 

In [70]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error, accuracy_score, accuracy_score
# from sklearn.model_selection import GridSearchCV

In [71]:
y_train

Unnamed: 0,0,1
0,1.0,0.0
1,1.0,0.0
2,1.0,0.0
3,1.0,0.0
4,1.0,0.0
...,...,...
156811,0.0,1.0
156812,0.0,1.0
156813,0.0,1.0
156814,0.0,1.0


In [75]:
model = xgb.XGBClassifier()

x_train = pd.read_csv('C:/Users/TECH2_25/semi_project/nbfi_vehicle_loan_repayment_dataset/x_train_over_df.csv')

y_train = pd.read_csv('C:/Users/TECH2_25/semi_project/nbfi_vehicle_loan_repayment_dataset/y_train_over_df.csv')

x_test = pd.read_csv('C:/Users/TECH2_25/semi_project/nbfi_vehicle_loan_repayment_dataset/test_data_df.csv')

y_test = pd.read_csv('C:/Users/TECH2_25/semi_project/nbfi_vehicle_loan_repayment_dataset/test_label_df.csv')


model.fit(x_train, y_train)



ValueError: y should be a 1d array, got an array of shape (156816, 2) instead.

In [73]:
import xgboost as xgb

# Load the trained XGBClassifier model
model = xgb.XGBClassifier()


# Load the test data
x_test = pd.read_csv('C:/Users/TECH2_25/semi_project/nbfi_vehicle_loan_repayment_dataset/test_data_df.csv')

# Make predictions on the test data
y_pred = model.predict(x_test)

# Print the predicted labels
print(y_pred)


NotFittedError: need to call fit or load_model beforehand

In [19]:
print(df.dtypes)

Child_Count                       float64
Credit_Amount                     float64
Loan_Annuity                      float64
Client_Income_Type                 object
Client_Education                   object
Client_Marital_Status              object
Client_Gender                      object
Loan_Contract_Type                 object
Client_Housing_Type                object
Population_Region_Relative        float64
Age_Days                          float64
Employed_Days                     float64
Registration_Days                 float64
ID_Days                           float64
Client_Occupation                  object
Client_Family_Members             float64
Score_Source_2                    float64
Phone_Change                      float64
Credit_Bureau                     float64
Car_Owned_0.0                       uint8
Car_Owned_1.0                       uint8
Bike_Owned_0.0                      uint8
Bike_Owned_1.0                      uint8
Active_Loan_0.0                   

In [11]:
model = xgb.XGBClassifier()





ValueError: DataFrame.dtypes for data must be int, float, bool or category.  When
categorical type is supplied, DMatrix parameter `enable_categorical` must
be set to `True`. Invalid columns:Client_Income_Type, Client_Education, Client_Marital_Status, Client_Gender, Loan_Contract_Type, Client_Housing_Type, Client_Occupation