# Import  
파이썬에서 다른 모듈이나 패키지의 함수, 클래스 등을 가져옵니다.

In [1]:
!pip install koreanize-matplotlib
import koreanize_matplotlib

Collecting koreanize-matplotlib
  Downloading koreanize_matplotlib-0.1.1-py3-none-any.whl (7.9 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 7.9/7.9 MB 19.8 MB/s eta 0:00:00
Installing collected packages: koreanize-matplotlib
Successfully installed koreanize-matplotlib-0.1.1


In [2]:
import pandas as pd
import numpy as np
import random
import os
from sklearn.preprocessing import RobustScaler, MinMaxScaler,StandardScaler
#from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV

In [3]:
from sklearn.model_selection import train_test_split

#### 시드(seed) 고정  
매번 고정된 결과를 얻기 위해서 사용합니다.  
시드를 고정하지 않는다면 같은 코드라도 매번 다른 결과가 나올 수 있습니다.

In [4]:
def seed_everything(seed):
    """Seed the random number sources used by this notebook.

    Seeds Python's ``random`` module and NumPy's global generator with
    ``seed`` and stores ``str(seed)`` in the ``PYTHONHASHSEED`` environment
    variable.

    Args:
        seed: Value handed to ``random.seed`` and ``np.random.seed``.
    """
    random.seed(seed)
    # NOTE(review): presumably meant for hash reproducibility; confirm that
    # setting it from inside an already running interpreter has the intended
    # effect.
    os.environ.update(PYTHONHASHSEED=str(seed))
    np.random.seed(seed)


# Fix the seed once for the whole notebook.
seed_everything(42)

#### 데이터 읽어오기 및 데이터 확인

In [10]:
# Load the training data from 'train.csv' with pd.read_csv().
train = pd.read_csv('train.csv')

# Show the first rows with head() to inspect the loaded data.
train.head()

Unnamed: 0,ID,월,요일,시간,소관경찰서,소관지역,사건발생거리,강수량(mm),강설량(mm),적설량(cm),풍향,안개,짙은안개,번개,진눈깨비,서리,연기/연무,눈날림,범죄발생지,TARGET
0,TRAIN_00000,9,화요일,10,137,8.0,2.611124,0.0,0.0,0.0,245.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,차도,2
1,TRAIN_00001,11,화요일,6,438,13.0,3.209093,0.0,0.0,0.0,200.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,차도,0
2,TRAIN_00002,8,일요일,6,1729,47.0,1.619597,0.0,0.0,0.0,40.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,인도,1
3,TRAIN_00003,5,월요일,6,2337,53.0,1.921615,11.375,0.0,0.0,225.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,주거지,1
4,TRAIN_00004,9,일요일,11,1439,41.0,1.789721,0.0,0.0,0.0,255.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,주유소,2


#### 독립변수(x_train), 종속변수(y_train)로 분리하기  


In [11]:
# Dependent variable: the TARGET column.
y_train = train['TARGET']
# Independent variables: every column except ID and TARGET.
x_train = train.drop(columns=['ID', 'TARGET'])


In [12]:
# Preview the first rows of the feature frame (ID and TARGET were dropped).
x_train.head()

Unnamed: 0,월,요일,시간,소관경찰서,소관지역,사건발생거리,강수량(mm),강설량(mm),적설량(cm),풍향,안개,짙은안개,번개,진눈깨비,서리,연기/연무,눈날림,범죄발생지
0,9,화요일,10,137,8.0,2.611124,0.0,0.0,0.0,245.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,차도
1,11,화요일,6,438,13.0,3.209093,0.0,0.0,0.0,200.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,차도
2,8,일요일,6,1729,47.0,1.619597,0.0,0.0,0.0,40.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,인도
3,5,월요일,6,2337,53.0,1.921615,11.375,0.0,0.0,225.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,주거지
4,9,일요일,11,1439,41.0,1.789721,0.0,0.0,0.0,255.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,주유소


#### train, validation data 나누기

In [13]:
# Split features and target into training and validation parts:
# test_size=0.3 holds out 30% for validation; random_state=42 is passed so the
# split can be reproduced.
X_tr, X_val, Y_tr, Y_val = train_test_split(x_train, y_train, test_size=0.3, random_state=42)

#### 라벨인코딩(Label Encoding)

In [14]:

# Columns that hold string categories and must be turned into integer codes.
ordinal_features = ['요일', '범죄발생지']

# Work on explicit copies so the column assignments below write to
# independent frames instead of objects derived from x_train by
# train_test_split (avoids pandas' SettingWithCopyWarning).
X_tr = X_tr.copy()
X_val = X_val.copy()

# Keep every fitted encoder, keyed by column name, so the exact same mapping
# can be re-applied to other data later instead of being overwritten on the
# next loop iteration.
label_encoders = {}

for feature in ordinal_features:
    le = LabelEncoder()

    # Learn the label -> integer mapping from the training split only.
    le.fit(X_tr[feature])
    X_tr[feature] = le.transform(X_tr[feature])

    # Only transform the validation split (never re-fit on it, to avoid data
    # leakage).
    # NOTE(review): a label present in X_val but absent from X_tr is expected
    # to make transform() fail — confirm both splits share the same categories.
    X_val[feature] = le.transform(X_val[feature])

    label_encoders[feature] = le



### XGBClassifier

In [15]:
# Baseline model: an XGBClassifier created with only random_state=42 set,
# fitted on the training split.
xgb_model= XGBClassifier(random_state=42)
xgb_model.fit(X_tr, Y_tr)

# Predict the class label of every row in the validation split.
y_pred = xgb_model.predict(X_val)


#### 예측 결과 평가

In [16]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# Evaluate the validation predictions (y_pred) against the true labels (Y_val).
# Precision, recall and F1 are all computed with average='weighted'.

# Accuracy
acc = accuracy_score(Y_val, y_pred)
print(f'정확도 : {acc:.4f}')

# Precision
precision = precision_score(Y_val, y_pred, average='weighted')
# Label corrected: the original printed the typo '정말도' instead of
# '정밀도' (precision).
print(f'정밀도 : {precision:.4f}')

# Recall
recall = recall_score(Y_val, y_pred, average='weighted')
print(f'Recall: {recall:.4f}')

# F1 score
f1 = f1_score(Y_val, y_pred, average='weighted')
print(f'F1 Score: {f1:.4f}')

# Confusion matrix
conf_matrix = confusion_matrix(Y_val, y_pred)
print('Confusion Matrix:')
print(conf_matrix)

정확도 : 0.5500
정말도 : 0.5466
Recall: 0.5500
F1 Score: 0.5393
Confusion Matrix:
[[7975 1365 1579]
 [3004 3414 1224]
 [3172 1051 2538]]


In [None]:
from sklearn.feature_selection import RFE

In [None]:
import time

In [None]:
from sklearn.base import clone

# Sweep the number of features kept by RFE. For every setting, report the
# validation accuracy, the time spent fitting + predicting on the reduced
# feature set, and the names of the selected columns.
# NOTE(review): the upper bound (16 features) is hard-coded — confirm it
# matches the number of columns in X_tr.
for n_features in range(1, 17):
    # Select features with RFE, fitted on the training split only.
    rfe = RFE(estimator=xgb_model, n_features_to_select=n_features)
    X_train_rfe = rfe.fit_transform(X_tr, Y_tr)
    X_test_rfe = rfe.transform(X_val)

    # Fit an unfitted copy with the same hyperparameters instead of the shared
    # xgb_model, so xgb_model stays fitted on the full feature set and keeps
    # working with X_tr / X_val after this loop.
    rfe_model = clone(xgb_model)

    # Measure training (+ prediction) time with a clock meant for intervals.
    start_time = time.perf_counter()

    # Train the model and predict on the reduced validation features.
    rfe_model.fit(X_train_rfe, Y_tr)
    y_pred_xgb = rfe_model.predict(X_test_rfe)

    end_time = time.perf_counter()
    training_time = end_time - start_time

    # Column names kept by RFE (rfe.support_ is the boolean selection mask).
    selected_features = X_tr.columns[rfe.support_]

    # Print the accuracy, the elapsed time and the selected features.
    accuracy_xgb = accuracy_score(Y_val, y_pred_xgb)
    print(f'n_features_to_select={n_features}, Accuracy: {accuracy_xgb:.3f}, Training Time: {training_time:.3f} seconds, Selected Features: {selected_features}')

n_features_to_select=1, Accuracy: 0.527, Training Time: 1.139 seconds, Selected Features: Index(['범죄발생지'], dtype='object')
n_features_to_select=2, Accuracy: 0.554, Training Time: 1.620 seconds, Selected Features: Index(['소관경찰서', '범죄발생지'], dtype='object')
n_features_to_select=3, Accuracy: 0.556, Training Time: 1.696 seconds, Selected Features: Index(['소관경찰서', '소관지역', '범죄발생지'], dtype='object')
n_features_to_select=4, Accuracy: 0.553, Training Time: 1.807 seconds, Selected Features: Index(['소관경찰서', '소관지역', '사건발생거리', '범죄발생지'], dtype='object')
n_features_to_select=5, Accuracy: 0.550, Training Time: 1.913 seconds, Selected Features: Index(['요일', '소관경찰서', '소관지역', '사건발생거리', '범죄발생지'], dtype='object')
n_features_to_select=6, Accuracy: 0.552, Training Time: 1.963 seconds, Selected Features: Index(['월', '요일', '소관경찰서', '소관지역', '사건발생거리', '범죄발생지'], dtype='object')
n_features_to_select=7, Accuracy: 0.550, Training Time: 2.017 seconds, Selected Features: Index(['월', '요일', '소관경찰서', '소관지역', '사건발생거리', '진눈

### GridSearch

In [None]:
# Hyperparameter grid searched exhaustively:
# 2 * 4 * 4 * 4 * 2 = 256 combinations, each evaluated with 3-fold CV.
param_grid = {
    'n_estimators': [100, 150],
    'learning_rate': [0.01, 0.05, 0.1, 0.15],
    'max_depth': [3, 5, 7, 10],
    'gamma': [0, 1, 2, 3],
    'colsample_bytree': [0.8, 0.9],
}

# Build the GridSearchCV object: 3-fold cross-validation scored by accuracy.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=3, scoring='accuracy')

# Run the grid search on the training split.
grid_search.fit(X_tr, Y_tr)

# Print the best hyperparameter combination found.
print("Best Parameters:", grid_search.best_params_)

# Extract the best model.
best_model = grid_search.best_estimator_

# Predict on the validation split (held out from the search).
y_pred = best_model.predict(X_val)

# Print the validation accuracy.
accuracy = accuracy_score(Y_val, y_pred)
print("Accuracy:", accuracy)

Best Parameters: {'colsample_bytree': 0.8, 'gamma': 2, 'learning_rate': 0.05, 'max_depth': 10, 'n_estimators': 150}
Accuracy: 0.5548139957349341
