In [65]:
# 1. 필요한 라이브러리 불러오기
import sklearn as sk
import pandas as pd
import numpy as np
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
# 연속형 데이터와 범주형 데이터 전처리
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import pickle

In [66]:
# Load the dataset from a CSV file; the path is relative, so it depends on
# the directory the notebook is run from.
letal_car = pd.read_csv('../../datasets/LetalCarOfContractType.csv')
# Preview the first three rows.
letal_car.head(3)

Unnamed: 0,id,type_of_contract,type_of_contract2,channel,datetime,Term,payment_type,product,amount,state,overdue_count,overdue,credit rating,bank,cancellation,age,Mileage
0,66758234,렌탈,Normal,서비스 방문,2019-10-20,60,CMS,K1,96900,계약확정,0,없음,9.0,새마을금고,정상,43.0,1862.0
1,66755948,렌탈,Extension_Rental,서비스 방문,2019-10-20,60,카드이체,K1,102900,계약확정,0,없음,2.0,현대카드,정상,62.0,2532.0
2,66756657,렌탈,Normal,홈쇼핑/방송,2019-10-20,60,CMS,K1,96900,계약확정,0,없음,8.0,우리은행,정상,60.0,2363.0


In [67]:
# Print each column's dtype and non-null count to spot missing values.
letal_car.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51304 entries, 0 to 51303
Data columns (total 17 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 51304 non-null  int64  
 1   type_of_contract   51300 non-null  object 
 2   type_of_contract2  51303 non-null  object 
 3   channel            51304 non-null  object 
 4   datetime           51304 non-null  object 
 5   Term               51304 non-null  int64  
 6   payment_type       51304 non-null  object 
 7   product            51303 non-null  object 
 8   amount             51304 non-null  int64  
 9   state              51304 non-null  object 
 10  overdue_count      51304 non-null  int64  
 11  overdue            51302 non-null  object 
 12  credit rating      42521 non-null  float64
 13  bank               48544 non-null  object 
 14  cancellation       51279 non-null  object 
 15  age                40509 non-null  float64
 16  Mileage            40509 non-null  float64

In [68]:
# Count missing values per column.
missing_check_columns = [
    'id',
    'type_of_contract',
    'type_of_contract2',
    'channel',
    'datetime',
    'Term',
    'payment_type',
    'product',
    'amount',
    'state',
    'overdue_count',
    'overdue',
    'credit rating',
    'bank',
    'cancellation',
    'age',
    'Mileage',
]
letal_car[missing_check_columns].isna().sum()

id                       0
type_of_contract         4
type_of_contract2        1
channel                  0
datetime                 0
Term                     0
payment_type             0
product                  1
amount                   0
state                    0
overdue_count            0
overdue                  2
credit rating         8783
bank                  2760
cancellation            25
age                  10795
Mileage              10795
dtype: int64

In [69]:
# Impute missing values.

# Object (string) columns: fill with the most frequent value (the mode).
for column_name in ('type_of_contract', 'type_of_contract2', 'product',
                    'overdue', 'bank', 'cancellation'):
    letal_car[column_name] = letal_car[column_name].fillna(
        letal_car[column_name].mode()[0]
    )

# Float columns: fill with the column mean.
for column_name in ('age', 'Mileage', 'credit rating'):
    letal_car[column_name] = letal_car[column_name].fillna(
        letal_car[column_name].mean()
    )

# Reminder: fillna returns a new object, so always assign the result
# (back to the column or to a new DataFrame) or pass inplace=True.

In [70]:
# Re-check the per-column missing-value counts after imputation.
letal_car.isnull().sum()

id                   0
type_of_contract     0
type_of_contract2    0
channel              0
datetime             0
Term                 0
payment_type         0
product              0
amount               0
state                0
overdue_count        0
overdue              0
credit rating        0
bank                 0
cancellation         0
age                  0
Mileage              0
dtype: int64

In [71]:
# Wrap letal_car in a new DataFrame object.
# NOTE(review): pd.DataFrame(<DataFrame>) presumably does not deep-copy the
# underlying data by default — if an independent frame is intended, confirm
# and consider letal_car.copy() instead.
df_letal_car = pd.DataFrame(letal_car)

In [72]:
# Split a DataFrame's columns into continuous and categorical groups.
def split_category_columns(df_letal_car, threshold=0.05):
    """Classify every column of ``df_letal_car`` as continuous or categorical.

    A column whose dtype is ``int64`` or ``float64`` is continuous unless its
    ratio of distinct values to row count is below ``threshold``, in which
    case it is treated as a numeric code and reported as categorical. Every
    other dtype is categorical. Other numeric widths (e.g. ``int32``) are
    deliberately left categorical, matching the original behavior.

    Parameters
    ----------
    df_letal_car : pandas.DataFrame
        Frame whose columns are to be classified.
    threshold : float, default 0.05
        Unique-value ratio below which an int64/float64 column is considered
        categorical.

    Returns
    -------
    tuple of (list, list)
        ``(continuous_columns, categorical_columns)``, each holding column
        labels in the frame's column order.
    """
    n_rows = len(df_letal_car)
    continuous_columns = []
    categorical_columns = []

    # .items() yields one (label, Series) pair per column, so frames with
    # duplicated column labels are handled as well.
    for column, values in df_letal_car.items():
        is_numeric = values.dtype in ('int64', 'float64')

        # With zero rows the unique ratio is undefined (it used to raise
        # ZeroDivisionError); fall back to the dtype alone in that case.
        if is_numeric and (n_rows == 0 or values.nunique() / n_rows >= threshold):
            continuous_columns.append(column)
        else:
            categorical_columns.append(column)

    return continuous_columns, categorical_columns

# Split letal_car's column labels into the continuous and categorical lists.
continuous, categorical = split_category_columns(letal_car)

In [73]:
# Build the continuous feature frame: keep only the int64/float64 columns and
# replace any remaining missing values with 0 (the mean could be used instead).
continuous_data = (
    letal_car[continuous]
    .select_dtypes(include=['int64', 'float64'])
    .fillna(0)
)

In [74]:
# Select the columns classified as categorical.
categorical_data = letal_car[categorical]

In [75]:
# Scale the continuous features to zero mean and unit variance.
standardScaler = StandardScaler()
# fit_transform returns the scaled values instead of modifying its input, so
# the result must be kept; it was previously discarded. Wrapping it in a
# DataFrame keeps the column labels and the row index.
continuous_scaled = pd.DataFrame(
    standardScaler.fit_transform(continuous_data),
    columns=continuous_data.columns,
    index=continuous_data.index,
)
# Display the fitted scaler, as the cell did before.
standardScaler