In [65]:
import pandas as pd
import scipy.stats as stats
df_rental = pd.read_csv('/apps/study_machinelearning/datasets/LetalCarOfContractType.csv')
df_rental.head(3)

Unnamed: 0,id,type_of_contract,type_of_contract2,channel,datetime,Term,payment_type,product,amount,state,overdue_count,overdue,credit rating,bank,cancellation,age,Mileage
0,66758234,렌탈,Normal,서비스 방문,2019-10-20,60,CMS,K1,96900,계약확정,0,없음,9.0,새마을금고,정상,43.0,1862.0
1,66755948,렌탈,Extension_Rental,서비스 방문,2019-10-20,60,카드이체,K1,102900,계약확정,0,없음,2.0,현대카드,정상,62.0,2532.0
2,66756657,렌탈,Normal,홈쇼핑/방송,2019-10-20,60,CMS,K1,96900,계약확정,0,없음,8.0,우리은행,정상,60.0,2363.0


In [66]:
df_rental.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51304 entries, 0 to 51303
Data columns (total 17 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 51304 non-null  int64  
 1   type_of_contract   51300 non-null  object 
 2   type_of_contract2  51303 non-null  object 
 3   channel            51304 non-null  object 
 4   datetime           51304 non-null  object 
 5   Term               51304 non-null  int64  
 6   payment_type       51304 non-null  object 
 7   product            51303 non-null  object 
 8   amount             51304 non-null  int64  
 9   state              51304 non-null  object 
 10  overdue_count      51304 non-null  int64  
 11  overdue            51302 non-null  object 
 12  credit rating      42521 non-null  float64
 13  bank               48544 non-null  object 
 14  cancellation       51279 non-null  object 
 15  age                40509 non-null  float64
 16  Mileage            405

In [67]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer

In [68]:
# 범주형 변수를 수치형으로 변환 (LabelEncoder 사용)
label_columns = df_rental.select_dtypes(include=[object]).columns  # 범주형 변수 선택

label_encoder = LabelEncoder()


In [69]:
# 범주형 변수에 LabelEncoder 적용
for col in label_columns:
    df_rental[col] = label_encoder.fit_transform(df_rental[col])


In [70]:
# 수치형 변수만 선택
numerical_cols = df_rental.select_dtypes(include=[np.number]).columns

In [71]:
train_data_age = df_rental[df_rental['age'].notnull()]
X_train_age = train_data_age.drop(columns=['age', 'Mileage'])  # 'age', 'Mileage' 제외
y_train_age = train_data_age['age']

train_data_mileage = df_rental[df_rental['Mileage'].notnull()]
X_train_mileage = train_data_mileage.drop(columns=['age', 'Mileage'])  # 'age', 'Mileage' 제외
y_train_mileage = train_data_mileage['Mileage']


In [72]:
# 모델 학습 (age 예측 모델)
model_age = RandomForestRegressor(random_state=42)
model_age.fit(X_train_age, y_train_age)

# 모델 학습 (Mileage 예측 모델)
model_mileage = RandomForestRegressor(random_state=42)
model_mileage.fit(X_train_mileage, y_train_mileage)



In [73]:
# 'age' 결측치가 있는 데이터 추출
test_data_age = df_rental[df_rental['age'].isnull()]
X_test_age = test_data_age.drop(columns=['age', 'Mileage'])

# 'Mileage' 결측치가 있는 데이터 추출
test_data_mileage = df_rental[df_rental['Mileage'].isnull()]
X_test_mileage = test_data_mileage.drop(columns=['age', 'Mileage'])

In [74]:
# 결측치를 대체하기 위한 SimpleImputer 사용
imputer = SimpleImputer(strategy='mean')

# 결측치가 있는 데이터를 평균으로 대체 (age 데이터)
X_test_age_imputed = imputer.fit_transform(X_test_age)

# 결측치가 있는 데이터를 평균으로 대체 (Mileage 데이터)
X_test_mileage_imputed = imputer.fit_transform(X_test_mileage)


In [75]:
# 모델을 사용하여 결측치 예측 (age 예측)
predicted_age = model_age.predict(X_test_age_imputed)

# 모델을 사용하여 결측치 예측 (Mileage 예측)
predicted_mileage = model_mileage.predict(X_test_mileage_imputed)





In [76]:
# 예측된 값으로 결측치 채우기
df_rental.loc[df_rental['age'].isnull(), 'age'] = predicted_age
df_rental.loc[df_rental['Mileage'].isnull(), 'Mileage'] = predicted_mileage

In [77]:
# 결과 확인
df_rental.head(3)

Unnamed: 0,id,type_of_contract,type_of_contract2,channel,datetime,Term,payment_type,product,amount,state,overdue_count,overdue,credit rating,bank,cancellation,age,Mileage
0,66758234,0,2,11,0,60,0,0,96900,0,0,0,9.0,22,0,43.0,1862.0
1,66755948,0,0,11,0,60,4,0,102900,0,0,0,2.0,45,0,62.0,2532.0
2,66756657,0,2,17,0,60,0,0,96900,0,0,0,8.0,33,0,60.0,2363.0


In [89]:
from sklearn.metrics import r2_score
from sklearn.impute import SimpleImputer

# 'Mileage' 결측치가 없는 데이터 추출
test_data_mileage_no_nan = df_rental[df_rental['Mileage'].notnull()]
X_test_mileage_no_nan = test_data_mileage_no_nan.drop(columns=['age', 'Mileage'])
y_test_mileage_no_nan = test_data_mileage_no_nan['Mileage']

# SimpleImputer를 사용하여 결측치 처리
imputer = SimpleImputer(strategy='mean')

# 'age' 예측용 데이터에서 결측치 처리
X_test_age_no_nan_imputed = imputer.fit_transform(X_test_age_no_nan)

# 'Mileage' 예측용 데이터에서 결측치 처리
X_test_mileage_no_nan_imputed = imputer.fit_transform(X_test_mileage_no_nan)

# 예측값 계산
predicted_age_no_nan = model_age.predict(X_test_age_no_nan_imputed)
predicted_mileage_no_nan = model_mileage.predict(X_test_mileage_no_nan_imputed)

# R² 계산
r2_age = r2_score(y_test_age_no_nan, predicted_age_no_nan)
r2_mileage = r2_score(y_test_mileage_no_nan, predicted_mileage_no_nan)

# 결과 출력
print(f'Age 모델의 R²: {r2_age}')
print(f'Mileage 모델의 R²: {r2_mileage}')




Age 모델의 R²: 0.8471778705297612
Mileage 모델의 R²: 0.8463746892241851


1에 가까울수록 모델이 실제 데이터를 잘 설명한다는 의미

In [90]:
#