In [68]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, BaggingRegressor
from sklearn.metrics import r2_score
from lightgbm import LGBMRegressor

In [69]:
import pandas as pd

# Load the rental-contract dataset and preview its first three rows.
# NOTE(review): the path is absolute and machine-specific; consider a configurable data directory.
df_rental = pd.read_csv(
    '/apps/study_machinelearning/datasets/LetalCarOfContractType.csv'
)
df_rental.head(n=3)

Unnamed: 0,id,type_of_contract,type_of_contract2,channel,datetime,Term,payment_type,product,amount,state,overdue_count,overdue,credit rating,bank,cancellation,age,Mileage
0,66758234,렌탈,Normal,서비스 방문,2019-10-20,60,CMS,K1,96900,계약확정,0,없음,9.0,새마을금고,정상,43.0,1862.0
1,66755948,렌탈,Extension_Rental,서비스 방문,2019-10-20,60,카드이체,K1,102900,계약확정,0,없음,2.0,현대카드,정상,62.0,2532.0
2,66756657,렌탈,Normal,홈쇼핑/방송,2019-10-20,60,CMS,K1,96900,계약확정,0,없음,8.0,우리은행,정상,60.0,2363.0


In [70]:
# Drop every row that contains at least one missing value. Rebinding the name
# (instead of inplace=True) avoids mutating the originally loaded frame object.
df_rental = df_rental.dropna()

In [71]:
# Partition the column names by dtype: numeric columns vs. everything else.
continuous_columns = list(df_rental.select_dtypes(include=['number']).columns)
categorical_columns = list(df_rental.select_dtypes(exclude=['number']).columns)

print(
    f'연속형 컬럼: {continuous_columns}',
    f'범주형 컬럼: {categorical_columns}',
    sep='\n',
)

연속형 컬럼: ['id', 'Term', 'amount', 'overdue_count', 'credit rating', 'age', 'Mileage']
범주형 컬럼: ['type_of_contract', 'type_of_contract2', 'channel', 'datetime', 'payment_type', 'product', 'state', 'overdue', 'bank', 'cancellation']


- 연속형 : age
- 범주형 : product

In [72]:
# Collect the object-dtype columns (the ones passed to one-hot encoding below).
categorical_cols = df_rental.select_dtypes(include='object').columns

In [73]:
# One-hot encode the categorical columns; drop_first=True omits the first
# level of each column so k levels become k-1 dummy columns.
data_encoded = pd.get_dummies(
    df_rental,
    columns=categorical_cols,
    drop_first=True,
)

In [74]:
# Regression target is the 'age' column; every other encoded column is a feature.
label_column = 'age'
y = data_encoded[label_column]
X = data_encoded.drop(columns=label_column)

In [75]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [76]:
from sklearn.preprocessing import StandardScaler

# Standardize the features: fit the scaler on the training split only, then
# apply the same transform to the test split. StandardScaler copies its input
# by default, so X_train / X_test stay untouched without an explicit .copy().
# NOTE(review): scaling was intended for linear regression only, but the
# model-fitting cells visible in this notebook are given the unscaled
# X_train / X_test - confirm whether the scaled arrays should be used.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [77]:
# Candidate regressors keyed by display name (insertion order is preserved).
regression_models = dict([
    ('Linear Regression', LinearRegression()),
    ('Random Forest Regressor', RandomForestRegressor(random_state=42)),
    ('LightGBM Regressor', LGBMRegressor(random_state=42)),
])

In [82]:
def get_best_model(X_train_scaled, X_test_scaled, y_train_scaled, y_test_scaled):
    """Fit three regressors and report which one has the highest test-set R².

    Trains LinearRegression, RandomForestRegressor and LGBMRegressor on the
    training data that is passed in, prints each model's R² on the test data,
    and keeps track of the best-scoring one.

    Despite the ``_scaled`` suffix in the parameter names, the data is used
    exactly as given; no scaling happens inside this function.

    Args:
        X_train_scaled: Training feature matrix.
        X_test_scaled: Test feature matrix.
        y_train_scaled: Training targets.
        y_test_scaled: Test targets.

    Returns:
        A tuple ``(best_model, best_score)`` where ``best_model`` is the
        display name (a string, not the fitted estimator) of the model with
        the highest R², and ``best_score`` is that R² value.
    """
    # Candidate models, paired with the name used in the printed report
    models = [
        ('Linear Regression', LinearRegression()),
        ('Random Forest', RandomForestRegressor(random_state=42)),
        ('LGBM Regressor', LGBMRegressor(random_state=42))
    ]

    best_model = None
    # R² has no lower bound (it is negative for a model worse than predicting
    # the mean), so start from -inf rather than 0.
    best_score = -float('inf')

    for model_name, model in models:
        # Train on the data passed in as arguments, not on notebook-level globals
        model.fit(X_train_scaled, y_train_scaled)

        # Predict on the supplied test split and evaluate
        y_pred = model.predict(X_test_scaled)
        score = r2_score(y_test_scaled, y_pred)

        print(f'{model_name} R² Score: {score}')

        # Keep the best model seen so far
        if score > best_score:
            best_score = score
            best_model = model_name

    return best_model, best_score

# Compare the candidate models on the train/test split and report the winner.
best_model, best_score = get_best_model(
    X_train,
    X_test,
    y_train,
    y_test,
)
print(f'\n가장 성능 좋은 모델: {best_model} R² Score: {best_score}')

Linear Regression R² Score: 0.9471481046278312
Random Forest R² Score: 0.9438036753152009
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001445 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 887
[LightGBM] [Info] Number of data points in the train set: 32384, number of used features: 156
[LightGBM] [Info] Start training from score 46.615613
LGBM Regressor R² Score: 0.9468830668095602

가장 성능 좋은 모델: Linear Regression R² Score: 0.9471481046278312


In [79]:
# Fit a Linear Regression model and report its R² on the held-out split.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)
score = r2_score(y_true=y_test, y_pred=y_pred)
print(f'Linear Regression R² Score: {score}')

Linear Regression R² Score: 0.9471481046278312


In [80]:

# Fit a Random Forest regressor and report its R² on the held-out split.
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
score = r2_score(y_test, y_pred)
# Label corrected: this score belongs to the Random Forest, not Linear Regression.
print(f'Random Forest R² Score: {score}')

Linear Regression R² Score: 0.9438036753152009


In [81]:
# Fit a LightGBM regressor and report its R² on the held-out split.
model = LGBMRegressor(random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
score = r2_score(y_test, y_pred)
# Label corrected: this score belongs to the LGBM regressor, not Linear Regression.
print(f'LGBM Regressor R² Score: {score}')

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001391 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 887
[LightGBM] [Info] Number of data points in the train set: 32384, number of used features: 156
[LightGBM] [Info] Start training from score 46.615613
Linear Regression R² Score: 0.9468830668095602
