In [2]:
import pandas as pd
import numpy as np
import random
import os

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score

import xgboost as xgb
from xgboost import XGBClassifier
import optuna
from optuna.samplers import TPESampler

In [3]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42) # Seed 고정

In [4]:
# pd.read_csv() 함수를 사용해서 데이터를 읽어오는 코드입니다.
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

# 데이터를 확인하기 위해 head() 함수를 사용합니다.
train.head(5)

Unnamed: 0,ID,월,요일,시간,소관경찰서,소관지역,사건발생거리,강수량(mm),강설량(mm),적설량(cm),풍향,안개,짙은안개,번개,진눈깨비,서리,연기/연무,눈날림,범죄발생지,TARGET
0,TRAIN_00000,9,화요일,10,137,8.0,2.611124,0.0,0.0,0.0,245.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,차도,2
1,TRAIN_00001,11,화요일,6,438,13.0,3.209093,0.0,0.0,0.0,200.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,차도,0
2,TRAIN_00002,8,일요일,6,1729,47.0,1.619597,0.0,0.0,0.0,40.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,인도,1
3,TRAIN_00003,5,월요일,6,2337,53.0,1.921615,11.375,0.0,0.0,225.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,주거지,1
4,TRAIN_00004,9,일요일,11,1439,41.0,1.789721,0.0,0.0,0.0,255.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,주유소,2


In [5]:
X = train.drop(['ID', 'TARGET'], axis=1)
y = train.TARGET

test = test.drop('ID', axis = 1)

In [6]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2,random_state=42)

In [8]:
ordinal_features = ['요일', '범죄발생지']

for feature in ordinal_features:
    le = LabelEncoder()
    le = le.fit(X_train[feature])
    X_train[feature] = le.transform(X_train[feature])
    
    # X_train데이터에서 존재하지 않았던 값이 x_test 데이터에 존재할 수도 있습니다.
    # 따라서 x_test 데이터를 바로 변형시키지 않고 고윳값을 확인후 x_test 데이터를 변환합니다.

    for label in np.unique(X_val[feature]):
        if label not in le.classes_:
            le.classes_ = np.append(le.classes_, label)

    X_val[feature] = le.transform(X_val[feature])

  if label not in le.classes_:


In [10]:
X_val

Unnamed: 0,월,요일,시간,소관경찰서,소관지역,사건발생거리,강수량(mm),강설량(mm),적설량(cm),풍향,안개,짙은안개,번개,진눈깨비,서리,연기/연무,눈날림,범죄발생지
21701,3,8,1,2436,35.0,1.989238,0.000000,0.00,4.333333,30.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10
55660,8,10,5,36,46.0,2.148711,38.750000,0.00,0.000000,200.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,6
24572,6,13,7,740,20.0,1.895063,0.000000,0.00,0.000000,45.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7
1078,2,13,3,728,17.0,3.722139,14.500000,62.25,350.800000,350.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,5
36360,2,11,10,1546,42.0,4.824061,3.714286,0.00,81.500000,260.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83136,1,11,3,1137,31.0,1.268302,0.000000,0.00,10.000000,245.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10
56497,5,11,3,850,7.0,0.519685,0.625000,0.00,0.000000,200.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7
51530,8,8,4,1346,5.0,0.533557,0.000000,0.00,0.000000,200.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,7
75793,5,13,3,1928,52.0,2.440780,1.000000,0.00,0.000000,40.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7


In [11]:
# 모델 인자에 random_state를 넣음으로써 시드고정의 효과를 얻을 수 있습니다.

dt = DecisionTreeClassifier(random_state = 42)
xgb = xgb.XGBClassifier(n_estimators = 400, 
                              eval_metric='auc', 
                              booster='gbtree', 
                              random_state = 42, 
                              n_jobs=-1)

In [12]:
sampler = TPESampler()

In [14]:
def objective(trial):
    
    param = {
        'lambda': trial.suggest_float('lambda', 1e-3, 0.1),
        'alpha': trial.suggest_float('alpha', 1e-3, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.4, 1),
        'subsample': trial.suggest_float('subsample', 0.4, 1),
        'learning_rate': trial.suggest_float('learning_rate',0.0001, 0.1),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 4,8),
        'min_child_weight': trial.suggest_int('min_child_weight', 2, 50),
    }
    
    model = XGBClassifier(**param)  
    model.fit(X_train, y_train, eval_set = [(X_val, y_val)], early_stopping_rounds=100)
    
    preds = model.predict(X_val)
    
    macro_f1score = f1_score(y_val, preds, average='macro')
    return macro_f1score
          
study_xgb = optuna.create_study(
    direction='maximize',
    study_name = 'Xgboost Optuna', 
    sampler=sampler
)

study_xgb.optimize(objective, n_trials=50)

[32m[I 2023-05-19 19:12:49,492][0m A new study created in memory with name: Xgboost Optuna[0m


[0]	validation_0-mlogloss:1.08879




[1]	validation_0-mlogloss:1.07603
[2]	validation_0-mlogloss:1.06631
[3]	validation_0-mlogloss:1.05570
[4]	validation_0-mlogloss:1.04966
[5]	validation_0-mlogloss:1.04113
[6]	validation_0-mlogloss:1.03429
[7]	validation_0-mlogloss:1.02810
[8]	validation_0-mlogloss:1.02023
[9]	validation_0-mlogloss:1.01342
[10]	validation_0-mlogloss:1.00985
[11]	validation_0-mlogloss:1.00703
[12]	validation_0-mlogloss:1.00237
[13]	validation_0-mlogloss:0.99753
[14]	validation_0-mlogloss:0.99292
[15]	validation_0-mlogloss:0.99009
[16]	validation_0-mlogloss:0.98633
[17]	validation_0-mlogloss:0.98481
[18]	validation_0-mlogloss:0.98267
[19]	validation_0-mlogloss:0.98144
[20]	validation_0-mlogloss:0.98005
[21]	validation_0-mlogloss:0.97733
[22]	validation_0-mlogloss:0.97650
[23]	validation_0-mlogloss:0.97447
[24]	validation_0-mlogloss:0.97289
[25]	validation_0-mlogloss:0.97136
[26]	validation_0-mlogloss:0.97010
[27]	validation_0-mlogloss:0.96869
[28]	validation_0-mlogloss:0.96758
[29]	validation_0-mlogloss:0.

[231]	validation_0-mlogloss:0.95046
[232]	validation_0-mlogloss:0.95043
[233]	validation_0-mlogloss:0.95048
[234]	validation_0-mlogloss:0.95050
[235]	validation_0-mlogloss:0.95049
[236]	validation_0-mlogloss:0.95052
[237]	validation_0-mlogloss:0.95053
[238]	validation_0-mlogloss:0.95053
[239]	validation_0-mlogloss:0.95051
[240]	validation_0-mlogloss:0.95048
[241]	validation_0-mlogloss:0.95051
[242]	validation_0-mlogloss:0.95047
[243]	validation_0-mlogloss:0.95053
[244]	validation_0-mlogloss:0.95058
[245]	validation_0-mlogloss:0.95059
[246]	validation_0-mlogloss:0.95056
[247]	validation_0-mlogloss:0.95054
[248]	validation_0-mlogloss:0.95051
[249]	validation_0-mlogloss:0.95055
[250]	validation_0-mlogloss:0.95058
[251]	validation_0-mlogloss:0.95062
[252]	validation_0-mlogloss:0.95064
[253]	validation_0-mlogloss:0.95065
[254]	validation_0-mlogloss:0.95068
[255]	validation_0-mlogloss:0.95070
[256]	validation_0-mlogloss:0.95075
[257]	validation_0-mlogloss:0.95073
[258]	validation_0-mlogloss:

[32m[I 2023-05-19 19:13:17,554][0m Trial 0 finished with value: 0.526578454241279 and parameters: {'lambda': 0.06334074686696524, 'alpha': 0.4472983631939705, 'colsample_bytree': 0.6515137216226677, 'subsample': 0.5983158169071018, 'learning_rate': 0.08431301424004893, 'n_estimators': 488, 'max_depth': 6, 'min_child_weight': 38}. Best is trial 0 with value: 0.526578454241279.[0m


[0]	validation_0-mlogloss:1.08957
[1]	validation_0-mlogloss:1.07546
[2]	validation_0-mlogloss:1.06532
[3]	validation_0-mlogloss:1.05415
[4]	validation_0-mlogloss:1.04692
[5]	validation_0-mlogloss:1.03872
[6]	validation_0-mlogloss:1.03463
[7]	validation_0-mlogloss:1.02771
[8]	validation_0-mlogloss:1.01958
[9]	validation_0-mlogloss:1.01483
[10]	validation_0-mlogloss:1.01076
[11]	validation_0-mlogloss:1.00784
[12]	validation_0-mlogloss:1.00456
[13]	validation_0-mlogloss:1.00096
[14]	validation_0-mlogloss:0.99523
[15]	validation_0-mlogloss:0.99210
[16]	validation_0-mlogloss:0.98752
[17]	validation_0-mlogloss:0.98616
[18]	validation_0-mlogloss:0.98377
[19]	validation_0-mlogloss:0.98218
[20]	validation_0-mlogloss:0.98060
[21]	validation_0-mlogloss:0.97747
[22]	validation_0-mlogloss:0.97619
[23]	validation_0-mlogloss:0.97470
[24]	validation_0-mlogloss:0.97342
[25]	validation_0-mlogloss:0.97124
[26]	validation_0-mlogloss:0.96981
[27]	validation_0-mlogloss:0.96817
[28]	validation_0-mlogloss:0.9

[32m[I 2023-05-19 19:13:41,664][0m Trial 1 finished with value: 0.5276626134220787 and parameters: {'lambda': 0.06427880959517926, 'alpha': 0.6911718566397123, 'colsample_bytree': 0.507595812602577, 'subsample': 0.5385889905403728, 'learning_rate': 0.08932247968183718, 'n_estimators': 220, 'max_depth': 8, 'min_child_weight': 10}. Best is trial 1 with value: 0.5276626134220787.[0m


[0]	validation_0-mlogloss:1.08680
[1]	validation_0-mlogloss:1.07226
[2]	validation_0-mlogloss:1.06112
[3]	validation_0-mlogloss:1.04935
[4]	validation_0-mlogloss:1.04222
[5]	validation_0-mlogloss:1.03327
[6]	validation_0-mlogloss:1.02567
[7]	validation_0-mlogloss:1.01900
[8]	validation_0-mlogloss:1.01099
[9]	validation_0-mlogloss:1.00408
[10]	validation_0-mlogloss:1.00031
[11]	validation_0-mlogloss:0.99741
[12]	validation_0-mlogloss:0.99278
[13]	validation_0-mlogloss:0.98785
[14]	validation_0-mlogloss:0.98338
[15]	validation_0-mlogloss:0.98022
[16]	validation_0-mlogloss:0.97684
[17]	validation_0-mlogloss:0.97535
[18]	validation_0-mlogloss:0.97343
[19]	validation_0-mlogloss:0.97150
[20]	validation_0-mlogloss:0.97029
[21]	validation_0-mlogloss:0.96808
[22]	validation_0-mlogloss:0.96618
[23]	validation_0-mlogloss:0.96437
[24]	validation_0-mlogloss:0.96345
[25]	validation_0-mlogloss:0.96185
[26]	validation_0-mlogloss:0.96087
[27]	validation_0-mlogloss:0.95972
[28]	validation_0-mlogloss:0.9

[32m[I 2023-05-19 19:14:09,781][0m Trial 2 finished with value: 0.527495279554406 and parameters: {'lambda': 0.07511747619450496, 'alpha': 0.5474850645815792, 'colsample_bytree': 0.6673638998274016, 'subsample': 0.8712639255283133, 'learning_rate': 0.09484803490232159, 'n_estimators': 857, 'max_depth': 8, 'min_child_weight': 25}. Best is trial 1 with value: 0.5276626134220787.[0m


[0]	validation_0-mlogloss:1.09442
[1]	validation_0-mlogloss:1.08844
[2]	validation_0-mlogloss:1.08364
[3]	validation_0-mlogloss:1.07816
[4]	validation_0-mlogloss:1.07490
[5]	validation_0-mlogloss:1.07020
[6]	validation_0-mlogloss:1.06694
[7]	validation_0-mlogloss:1.06316
[8]	validation_0-mlogloss:1.05815
[9]	validation_0-mlogloss:1.05492
[10]	validation_0-mlogloss:1.05215
[11]	validation_0-mlogloss:1.04964
[12]	validation_0-mlogloss:1.04578
[13]	validation_0-mlogloss:1.04151
[14]	validation_0-mlogloss:1.03755
[15]	validation_0-mlogloss:1.03488
[16]	validation_0-mlogloss:1.03122
[17]	validation_0-mlogloss:1.02994
[18]	validation_0-mlogloss:1.02756
[19]	validation_0-mlogloss:1.02574
[20]	validation_0-mlogloss:1.02409
[21]	validation_0-mlogloss:1.02090
[22]	validation_0-mlogloss:1.01952
[23]	validation_0-mlogloss:1.01699
[24]	validation_0-mlogloss:1.01475
[25]	validation_0-mlogloss:1.01244
[26]	validation_0-mlogloss:1.01070
[27]	validation_0-mlogloss:1.00854
[28]	validation_0-mlogloss:1.0

[231]	validation_0-mlogloss:0.95227
[232]	validation_0-mlogloss:0.95224
[233]	validation_0-mlogloss:0.95223
[234]	validation_0-mlogloss:0.95221
[235]	validation_0-mlogloss:0.95220
[236]	validation_0-mlogloss:0.95219
[237]	validation_0-mlogloss:0.95216
[238]	validation_0-mlogloss:0.95216
[239]	validation_0-mlogloss:0.95216
[240]	validation_0-mlogloss:0.95214
[241]	validation_0-mlogloss:0.95212
[242]	validation_0-mlogloss:0.95210
[243]	validation_0-mlogloss:0.95208
[244]	validation_0-mlogloss:0.95210
[245]	validation_0-mlogloss:0.95211
[246]	validation_0-mlogloss:0.95209
[247]	validation_0-mlogloss:0.95205
[248]	validation_0-mlogloss:0.95204
[249]	validation_0-mlogloss:0.95202
[250]	validation_0-mlogloss:0.95199
[251]	validation_0-mlogloss:0.95196
[252]	validation_0-mlogloss:0.95195
[253]	validation_0-mlogloss:0.95192
[254]	validation_0-mlogloss:0.95191
[255]	validation_0-mlogloss:0.95193
[256]	validation_0-mlogloss:0.95190
[257]	validation_0-mlogloss:0.95187
[258]	validation_0-mlogloss:

[32m[I 2023-05-19 19:14:47,722][0m Trial 3 finished with value: 0.5260353569679942 and parameters: {'lambda': 0.08883456155786487, 'alpha': 0.8824272604442192, 'colsample_bytree': 0.5724958351161413, 'subsample': 0.7021665906928218, 'learning_rate': 0.037887919080608075, 'n_estimators': 408, 'max_depth': 5, 'min_child_weight': 7}. Best is trial 1 with value: 0.5276626134220787.[0m


[0]	validation_0-mlogloss:1.09578




[1]	validation_0-mlogloss:1.09235
[2]	validation_0-mlogloss:1.08900
[3]	validation_0-mlogloss:1.08542
[4]	validation_0-mlogloss:1.08351
[5]	validation_0-mlogloss:1.08013
[6]	validation_0-mlogloss:1.07769
[7]	validation_0-mlogloss:1.07529
[8]	validation_0-mlogloss:1.07214
[9]	validation_0-mlogloss:1.06909
[10]	validation_0-mlogloss:1.06649
[11]	validation_0-mlogloss:1.06475
[12]	validation_0-mlogloss:1.06177
[13]	validation_0-mlogloss:1.05906
[14]	validation_0-mlogloss:1.05644
[15]	validation_0-mlogloss:1.05402
[16]	validation_0-mlogloss:1.05155
[17]	validation_0-mlogloss:1.04984
[18]	validation_0-mlogloss:1.04740
[19]	validation_0-mlogloss:1.04516
[20]	validation_0-mlogloss:1.04309
[21]	validation_0-mlogloss:1.04084
[22]	validation_0-mlogloss:1.03868
[23]	validation_0-mlogloss:1.03661
[24]	validation_0-mlogloss:1.03492
[25]	validation_0-mlogloss:1.03293
[26]	validation_0-mlogloss:1.03165
[27]	validation_0-mlogloss:1.02999
[28]	validation_0-mlogloss:1.02819
[29]	validation_0-mlogloss:1.

[232]	validation_0-mlogloss:0.96002
[233]	validation_0-mlogloss:0.95997
[234]	validation_0-mlogloss:0.95993
[235]	validation_0-mlogloss:0.95990
[236]	validation_0-mlogloss:0.95988
[237]	validation_0-mlogloss:0.95986
[238]	validation_0-mlogloss:0.95984
[239]	validation_0-mlogloss:0.95978
[240]	validation_0-mlogloss:0.95972
[241]	validation_0-mlogloss:0.95969
[242]	validation_0-mlogloss:0.95963
[243]	validation_0-mlogloss:0.95960
[244]	validation_0-mlogloss:0.95958
[245]	validation_0-mlogloss:0.95957
[246]	validation_0-mlogloss:0.95953
[247]	validation_0-mlogloss:0.95947
[248]	validation_0-mlogloss:0.95945
[249]	validation_0-mlogloss:0.95941
[250]	validation_0-mlogloss:0.95936
[251]	validation_0-mlogloss:0.95931
[252]	validation_0-mlogloss:0.95931
[253]	validation_0-mlogloss:0.95929
[254]	validation_0-mlogloss:0.95925
[255]	validation_0-mlogloss:0.95926
[256]	validation_0-mlogloss:0.95921
[257]	validation_0-mlogloss:0.95919
[258]	validation_0-mlogloss:0.95917
[259]	validation_0-mlogloss:

[460]	validation_0-mlogloss:0.95496
[461]	validation_0-mlogloss:0.95494
[462]	validation_0-mlogloss:0.95493
[463]	validation_0-mlogloss:0.95493
[464]	validation_0-mlogloss:0.95492
[465]	validation_0-mlogloss:0.95491
[466]	validation_0-mlogloss:0.95489
[467]	validation_0-mlogloss:0.95486
[468]	validation_0-mlogloss:0.95486
[469]	validation_0-mlogloss:0.95486
[470]	validation_0-mlogloss:0.95484
[471]	validation_0-mlogloss:0.95483
[472]	validation_0-mlogloss:0.95482
[473]	validation_0-mlogloss:0.95482
[474]	validation_0-mlogloss:0.95481
[475]	validation_0-mlogloss:0.95479
[476]	validation_0-mlogloss:0.95478
[477]	validation_0-mlogloss:0.95477
[478]	validation_0-mlogloss:0.95477
[479]	validation_0-mlogloss:0.95476
[480]	validation_0-mlogloss:0.95473
[481]	validation_0-mlogloss:0.95473
[482]	validation_0-mlogloss:0.95473
[483]	validation_0-mlogloss:0.95472
[484]	validation_0-mlogloss:0.95470
[485]	validation_0-mlogloss:0.95470
[486]	validation_0-mlogloss:0.95471
[487]	validation_0-mlogloss:

[688]	validation_0-mlogloss:0.95330
[689]	validation_0-mlogloss:0.95329
[690]	validation_0-mlogloss:0.95330
[691]	validation_0-mlogloss:0.95330
[692]	validation_0-mlogloss:0.95331
[693]	validation_0-mlogloss:0.95329
[694]	validation_0-mlogloss:0.95329
[695]	validation_0-mlogloss:0.95328
[696]	validation_0-mlogloss:0.95325
[697]	validation_0-mlogloss:0.95326
[698]	validation_0-mlogloss:0.95325
[699]	validation_0-mlogloss:0.95325
[700]	validation_0-mlogloss:0.95323
[701]	validation_0-mlogloss:0.95322
[702]	validation_0-mlogloss:0.95322
[703]	validation_0-mlogloss:0.95322
[704]	validation_0-mlogloss:0.95322
[705]	validation_0-mlogloss:0.95321
[706]	validation_0-mlogloss:0.95320
[707]	validation_0-mlogloss:0.95322
[708]	validation_0-mlogloss:0.95320
[709]	validation_0-mlogloss:0.95319
[710]	validation_0-mlogloss:0.95319
[711]	validation_0-mlogloss:0.95319
[712]	validation_0-mlogloss:0.95317
[713]	validation_0-mlogloss:0.95317
[714]	validation_0-mlogloss:0.95316
[715]	validation_0-mlogloss:

[33m[W 2023-05-19 19:15:56,256][0m Trial 4 failed with parameters: {'lambda': 0.07945675931814679, 'alpha': 0.295788610481512, 'colsample_bytree': 0.7862420775267355, 'subsample': 0.5644727203725218, 'learning_rate': 0.022261837866600482, 'n_estimators': 908, 'max_depth': 4, 'min_child_weight': 12} because of the following error: KeyboardInterrupt().[0m
Traceback (most recent call last):
  File "c:\python38-64\lib\site-packages\optuna\study\_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\hslio\AppData\Local\Temp\ipykernel_21980\520926400.py", line 15, in objective
    model.fit(X_train, y_train, eval_set = [(X_val, y_val)], early_stopping_rounds=100)
  File "c:\python38-64\lib\site-packages\xgboost\core.py", line 620, in inner_f
    return func(**kwargs)
  File "c:\python38-64\lib\site-packages\xgboost\sklearn.py", line 1490, in fit
    self._Booster = train(
  File "c:\python38-64\lib\site-packages\xgboost\core.py", line 620, in inner_f
   

KeyboardInterrupt: 

In [None]:
# predict() 함수는 독립변수(테스트데이터)를 입력받았을 때 종속변수를 예측합니다.

pred = xgb.predict(test)

In [None]:
# 제출 파일을 읽어옵니다.
submit = pd.read_csv('data/sample_submission.csv')

In [None]:
pred

In [None]:
# 예측한 값을 TARGET 컬럼에 할당합니다.
submit['TARGET'] = pred
# submit.head()
submit.TARGET.value_counts()

In [None]:
# 예측한 결과를 파일로 저장합니다. index 인자의 값을 False로 설정하지 않으면 제출이 정상적으로 진행되지 않습니다.
submit.to_csv('230519_submit_xgb.csv', index = False)