<a href="https://colab.research.google.com/github/jsjj10002/KeggelDaconProject/blob/main/RandomizedSearchCV%EB%A5%BC_%EC%9D%B4%EC%9A%A9%ED%95%9C_%EC%9E%90%EB%8F%99%EC%B0%A8_%EB%B3%B4%ED%97%98%EC%82%AC%EA%B8%B0_%ED%83%90%EC%A7%80.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# 개발 환경: Ubuntu 20.04
# 라이브러리 버전:
# pandas: 1.2.4
# numpy: 1.20.3
# scikit-learn: 0.24.2
# imbalanced-learn: 0.8.0
# matplotlib: 3.4.2
# seaborn: 0.11.1

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, classification_report
from imblearn.over_sampling import SMOTE

# 데이터 로드
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
sample_submission = pd.read_csv('sample_submission.csv')

# 결측값 처리 (중앙값으로 대체)
train['claim_est_payout'] = train['claim_est_payout'].fillna(train['claim_est_payout'].median())
train['age_of_vehicle'] = train['age_of_vehicle'].fillna(train['age_of_vehicle'].median())

test['claim_est_payout'] = test['claim_est_payout'].fillna(test['claim_est_payout'].median())
test['age_of_vehicle'] = test['age_of_vehicle'].fillna(test['age_of_vehicle'].median())

# 새로운 특징 생성
train['income_to_claim_ratio'] = train['annual_income'] / (train['claim_est_payout'] + 1)
test['income_to_claim_ratio'] = test['annual_income'] / (test['claim_est_payout'] + 1)

train['driver_age_to_vehicle_age_ratio'] = train['age_of_driver'] / (train['age_of_vehicle'] + 1)
test['driver_age_to_vehicle_age_ratio'] = test['age_of_driver'] / (test['age_of_vehicle'] + 1)

# 'fraud'와 'ID' 열을 제거하여 X_train 생성
X_train = train.drop(['fraud', 'ID'], axis=1)
y = train['fraud']

# 'ID' 열만 제거하여 X_test 생성
X_test = test.drop('ID', axis=1)

# 범주형 데이터를 숫자형으로 인코딩 (원-핫 인코딩 사용)
X_train = pd.get_dummies(X_train)
X_test = pd.get_dummies(X_test)

# 스케일링
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 데이터 불균형 문제 해결 (SMOTE 사용)
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train_scaled, y)

# 학습 데이터와 검증 데이터 분리
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# RandomForestClassifier 모델 학습
model_smote = RandomForestClassifier(n_estimators=100, random_state=42)
model_smote.fit(X_train_split, y_train_split)

# 검증 데이터로 예측
y_val_pred_smote = model_smote.predict(X_val_split)

# 평가 지표 출력
accuracy_smote = accuracy_score(y_val_split, y_val_pred_smote)
recall_smote = recall_score(y_val_split, y_val_pred_smote)
precision_smote = precision_score(y_val_split, y_val_pred_smote)
f1_smote = f1_score(y_val_split, y_val_pred_smote)
classification_rep_smote = classification_report(y_val_split, y_val_pred_smote)

print(f"Accuracy: {accuracy_smote}")
print(f"Recall: {recall_smote}")
print(f"Precision: {precision_smote}")
print(f"F1 Score: {f1_smote}")
print("Classification Report:\n", classification_rep_smote)

Accuracy: 0.9235474006116208
Recall: 0.863932898415657
Precision: 0.9825119236883942
F1 Score: 0.9194148276717083
Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.98      0.93      2105
           1       0.98      0.86      0.92      2146

    accuracy                           0.92      4251
   macro avg       0.93      0.92      0.92      4251
weighted avg       0.93      0.92      0.92      4251



In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, classification_report
from imblearn.over_sampling import SMOTE
from scipy.stats import randint
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# 데이터 로드
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# 결측값 처리 (중앙값으로 대체)
train['claim_est_payout'] = train['claim_est_payout'].fillna(train['claim_est_payout'].median())
train['age_of_vehicle'] = train['age_of_vehicle'].fillna(train['age_of_vehicle'].median())

# 새로운 특징 생성
train['income_to_claim_ratio'] = train['annual_income'] / (train['claim_est_payout'] + 1)
train['driver_age_to_vehicle_age_ratio'] = train['age_of_driver'] / (train['age_of_vehicle'] + 1)

# 'fraud'와 'ID' 열을 제거하여 X_train 생성
X_train = train.drop(['fraud', 'ID'], axis=1)
y = train['fraud']

# 'ID' 열만 제거하여 X_test 생성
test_ids = test['ID']
X_test = test.drop('ID', axis=1)

# 범주형 특징 선택
categorical_features = X_train.select_dtypes(include=['object']).columns.tolist()
numerical_features = X_train.select_dtypes(exclude=['object']).columns.tolist()

# 전처리 파이프라인 생성
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# 학습 데이터 전처리 및 변환
X_train_transformed = preprocessor.fit_transform(X_train)

# PCA 적용 (설명된 분산의 95%를 유지)
pca = PCA(n_components=0.95)
X_train_pca = pca.fit_transform(X_train_transformed)

# 데이터 불균형 문제 해결 (SMOTE 사용)
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train_pca, y)

# 학습 데이터와 검증 데이터 분리
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# 하이퍼파라미터 튜닝 (RandomizedSearchCV 사용)
param_dist = {
    'n_estimators': randint(100, 200),
    'max_depth': [None, 10, 20],
    'min_samples_split': randint(2, 10),
    'min_samples_leaf': randint(1, 4)
}
random_search = RandomizedSearchCV(estimator=RandomForestClassifier(random_state=42),
                                   param_distributions=param_dist,
                                   n_iter=30,  # 시도할 횟수
                                   cv=3,
                                   n_jobs=-1,
                                   verbose=2,
                                   random_state=42)
random_search.fit(X_train_split, y_train_split)

# 최적의 하이퍼파라미터로 모델 학습 및 평가
best_model = random_search.best_estimator_
y_val_pred_best = best_model.predict(X_val_split)

# 평가 지표 출력
accuracy_best = accuracy_score(y_val_split, y_val_pred_best)
recall_best = recall_score(y_val_split, y_val_pred_best)
precision_best = precision_score(y_val_split, y_val_pred_best)
f1_best = f1_score(y_val_split, y_val_pred_best)
classification_rep_best = classification_report(y_val_split, y_val_pred_best)

print(f"Accuracy: {accuracy_best}")
print(f"Recall: {recall_best}")
print(f"Precision: {precision_best}")
print(f"F1 Score: {f1_best}")
print("Classification Report:\n", classification_rep_best)




Fitting 3 folds for each of 30 candidates, totalling 90 fits
Accuracy: 0.9306045636320865
Recall: 0.9277726001863933
Precision: 0.9343031440638198
F1 Score: 0.9310264203881226
Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.93      0.93      2105
           1       0.93      0.93      0.93      2146

    accuracy                           0.93      4251
   macro avg       0.93      0.93      0.93      4251
weighted avg       0.93      0.93      0.93      4251



KeyError: "['income_to_claim_ratio', 'driver_age_to_vehicle_age_ratio'] not in index"

In [None]:
test['claim_est_payout'] = test['claim_est_payout'].fillna(test['claim_est_payout'].median())
test['age_of_vehicle'] = test['age_of_vehicle'].fillna(test['age_of_vehicle'].median())
test['income_to_claim_ratio'] = test['annual_income'] / (test['claim_est_payout'] + 1)
test['driver_age_to_vehicle_age_ratio'] = test['age_of_driver'] / (test['age_of_vehicle'] + 1)

# 테스트 데이터 전처리 (훈련 데이터와 동일하게)
X_test_transformed = preprocessor.transform(X_test)
X_test_pca = pca.transform(X_test_transformed)

# 테스트 데이터에 대한 예측 수행
y_test_pred = best_model.predict(X_test_pca)

# submission 파일 생성
submission = pd.DataFrame({
    'ID': test_ids,
    'fraud': y_test_pred
})

submission.to_csv('/mnt/data/submission.csv', index=False)
print("submission.csv 파일이 생성되었습니다.")


KeyError: "['income_to_claim_ratio', 'driver_age_to_vehicle_age_ratio'] not in index"

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, classification_report
from imblearn.over_sampling import SMOTE
from scipy.stats import randint
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# 데이터 로드
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
sample_submission = pd.read_csv('sample_submission.csv')

# 결측값 처리 (중앙값으로 대체)
train['claim_est_payout'] = train['claim_est_payout'].fillna(train['claim_est_payout'].median())
train['age_of_vehicle'] = train['age_of_vehicle'].fillna(train['age_of_vehicle'].median())
test['claim_est_payout'] = test['claim_est_payout'].fillna(test['claim_est_payout'].median())
test['age_of_vehicle'] = test['age_of_vehicle'].fillna(test['age_of_vehicle'].median())

# 새로운 특징 생성 (학습 데이터와 테스트 데이터 모두에 동일하게 적용)
train['income_to_claim_ratio'] = train['annual_income'] / (train['claim_est_payout'] + 1)
train['driver_age_to_vehicle_age_ratio'] = train['age_of_driver'] / (train['age_of_vehicle'] + 1)
test['income_to_claim_ratio'] = test['annual_income'] / (test['claim_est_payout'] + 1)
test['driver_age_to_vehicle_age_ratio'] = test['age_of_driver'] / (test['age_of_vehicle'] + 1)

# 'fraud'와 'ID' 열을 제거하여 X_train 생성
X_train = train.drop(['fraud', 'ID'], axis=1)
y = train['fraud']

# 'ID' 열만 제거하여 X_test 생성
test_ids = test['ID']
X_test = test.drop('ID', axis=1)

# 범주형 특징 선택
categorical_features = X_train.select_dtypes(include=['object']).columns.tolist()
numerical_features = X_train.select_dtypes(exclude=['object']).columns.tolist()

# 전처리 파이프라인 생성
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# 학습 데이터 전처리 및 변환
X_train_transformed = preprocessor.fit_transform(X_train)

# PCA 적용 (설명된 분산의 95%를 유지)
pca = PCA(n_components=0.95)
X_train_pca = pca.fit_transform(X_train_transformed)

# 데이터 불균형 문제 해결 (SMOTE 사용)
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train_pca, y)

# 학습 데이터와 검증 데이터 분리
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# 하이퍼파라미터 튜닝 (RandomizedSearchCV 사용)
param_dist = {
    'n_estimators': randint(100, 200),
    'max_depth': [None, 10, 20],
    'min_samples_split': randint(2, 10),
    'min_samples_leaf': randint(1, 4)
}
random_search = RandomizedSearchCV(estimator=RandomForestClassifier(random_state=42),
                                   param_distributions=param_dist,
                                   n_iter=30,  # 시도할 횟수
                                   cv=3,
                                   n_jobs=-1,
                                   verbose=2,
                                   random_state=42)
random_search.fit(X_train_split, y_train_split)

# 최적의 하이퍼파라미터로 모델 학습 및 평가
best_model = random_search.best_estimator_
y_val_pred_best = best_model.predict(X_val_split)

# 평가 지표 출력
accuracy_best = accuracy_score(y_val_split, y_val_pred_best)
recall_best = recall_score(y_val_split, y_val_pred_best)
precision_best = precision_score(y_val_split, y_val_pred_best)
f1_best = f1_score(y_val_split, y_val_pred_best)
classification_rep_best = classification_report(y_val_split, y_val_pred_best)

print(f"Accuracy: {accuracy_best}")
print(f"Recall: {recall_best}")
print(f"Precision: {precision_best}")
print(f"F1 Score: {f1_best}")
print("Classification Report:\n", classification_rep_best)



Fitting 3 folds for each of 30 candidates, totalling 90 fits
Accuracy: 0.9306045636320865
Recall: 0.9277726001863933
Precision: 0.9343031440638198
F1 Score: 0.9310264203881226
Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.93      0.93      2105
           1       0.93      0.93      0.93      2146

    accuracy                           0.93      4251
   macro avg       0.93      0.93      0.93      4251
weighted avg       0.93      0.93      0.93      4251



OSError: Cannot save file into a non-existent directory: '/mnt/data'

In [None]:

# 테스트 데이터 전처리 (훈련 데이터와 동일하게)
X_test_transformed = preprocessor.transform(X_test)
X_test_pca = pca.transform(X_test_transformed)

# 테스트 데이터에 대한 예측 수행
y_test_pred = best_model.predict(X_test_pca)

# submission 파일 생성
submission = pd.DataFrame({
    'ID': test_ids,
    'fraud': y_test_pred
})

submission.to_csv('submission.csv', index=False)
print("submission.csv 파일이 생성되었습니다.")

submission.csv 파일이 생성되었습니다.
