https://dacon.io/competitions/official/236109/codeshare/8416?page=1&dtype=recent

In [None]:
import pandas as pd
import numpy as np
import random
import os
from sklearn.preprocessing import LabelEncoder

#### Fixed Seed

모델링할 때, 모델 간 성능 비교를 위해서 seed 값 고정이 매우 중요함


seed가 고정되어 있어야 내가 제안한 모델의 성능이 좋은지 안 좋은지 확인할 수 있음

In [None]:
def seed_everything(seed):
    """Seed the random number sources used by this notebook.

    Args:
        seed: Value passed to ``random.seed`` and ``np.random.seed``; its
            string form is also stored in the ``PYTHONHASHSEED`` environment
            variable.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


# Fix the seed once, before any data handling or modelling.
seed_everything(42)

#### 데이터 불러오기

In [None]:
# Read the training and test tables from CSV files in the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [None]:
# Preview the first five rows of the training table.
train.head(5)

Unnamed: 0,ID,월,요일,시간,소관경찰서,소관지역,사건발생거리,강수량(mm),강설량(mm),적설량(cm),풍향,안개,짙은안개,번개,진눈깨비,서리,연기/연무,눈날림,범죄발생지,TARGET
0,TRAIN_00000,9,화요일,10,137,8.0,2.611124,0.0,0.0,0.0,245.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,차도,2
1,TRAIN_00001,11,화요일,6,438,13.0,3.209093,0.0,0.0,0.0,200.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,차도,0
2,TRAIN_00002,8,일요일,6,1729,47.0,1.619597,0.0,0.0,0.0,40.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,인도,1
3,TRAIN_00003,5,월요일,6,2337,53.0,1.921615,11.375,0.0,0.0,225.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,주거지,1
4,TRAIN_00004,9,일요일,11,1439,41.0,1.789721,0.0,0.0,0.0,255.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,주유소,2


#### 라벨 인코딩

In [None]:
# Columns whose values are replaced by integer codes.
ordinal_features = ['요일', '범죄발생지']

for feature in ordinal_features:
    # Learn the label -> integer mapping from the training column and encode
    # that column in one step.
    le = LabelEncoder()
    train[feature] = le.fit_transform(train[feature])

    # Labels that appear only in the test column are appended to the encoder's
    # known classes so the transform below accepts them.
    unseen = [
        label for label in np.unique(test[feature])
        if label not in le.classes_
    ]
    if unseen:
        le.classes_ = np.append(le.classes_, unseen)
    test[feature] = le.transform(test[feature])

In [None]:
# Remove the 'ID' column from both tables before modelling.
train, test = (frame.drop(columns='ID') for frame in (train, test))

In [None]:
# Separate the feature columns from the 'TARGET' label column.
X_train = train.drop(columns=['TARGET'])
y_train = train['TARGET']

#### Sample Weight 샘플 가중치

- 타깃 변수 y_train을 기반으로 균형 잡힌 데이터셋을 만들기 위해 샘플 가중치를 계산한다

- CreateBalancedSampleWeights 함수는 y_train과 largest_class_weight_coef 두 가지 인자를 받는다

- 샘플 가중치를 학습에 사용함으로써, 소수 클래스에 더 많은 중요성을 부여하거나 데이터셋의 다른 클래스로의 영향을 균형있게 조절할 수 있다

  https://scikit-learn.org/stable/modules/generated/sklearn.utils.class_weight.compute_sample_weight.html

y_train에서 고유한 클래스를 식별하고 unique 함수를 사용하여 정렬합니다.

np.bincount를 사용하여 각 클래스의 샘플 수를 계산합니다.

전체 샘플 수와 클래스 수를 계산합니다.

각 클래스의 가중치는 전체 샘플 수를 (클래스 수 × 해당 클래스의 샘플 수)로 나눈 값으로 계산합니다. 즉, $w_c = N / (K \cdot n_c)$ 입니다 (N: 전체 샘플 수, K: 클래스 수, n_c: 클래스 c의 샘플 수).

각 클래스와 해당 가중치를 매핑하는 class_weight_dict 사전을 생성합니다.

두 번째 클래스(classes[1])의 가중치에 largest_class_weight_coef를 곱합니다. (인자 이름과 달리, 샘플 수가 가장 많은 클래스가 아니라 정렬 순서상 두 번째 클래스에 적용됩니다.)

class_weight_dict에서 y_train의 각 샘플에 대한 가중치를 조회하여 sample_weights 리스트를 생성합니다.

In [None]:
def CreateBalancedSampleWeights(y_train, largest_class_weight_coef):
    """Return one class-balancing weight per sample in ``y_train``.

    Each class ``c`` receives the weight ``N / (K * n_c)``, where ``N`` is the
    number of samples, ``K`` the number of distinct classes that actually
    occur and ``n_c`` the number of samples of class ``c``.

    Classes and their counts are taken from a single ``np.unique`` call so
    they always line up. Counting with ``np.bincount`` instead would add
    empty bins for label values that never occur (e.g. labels 0 and 2
    without 1), which misaligns the class-to-weight mapping and divides by
    zero.

    Args:
        y_train: Class labels (pandas Series, NumPy array or sequence).
        largest_class_weight_coef: Factor multiplied into the weight of the
            second class in sorted order (``classes[1]``). NOTE: despite the
            parameter name, this is not necessarily the most frequent class.

    Returns:
        A list with one weight per element of ``y_train``, in the same order.
        An empty input yields an empty list.
    """
    labels = np.asarray(y_train)
    if labels.size == 0:
        return []

    # Sorted distinct labels and their counts, index-aligned by construction.
    classes, class_samples = np.unique(labels, return_counts=True)
    total_samples = class_samples.sum()
    n_classes = len(classes)
    weights = total_samples / (n_classes * class_samples.astype(float))
    class_weight_dict = dict(zip(classes, weights))

    # With a single class there is no second class to rescale.
    if n_classes > 1:
        class_weight_dict[classes[1]] *= largest_class_weight_coef

    return [class_weight_dict[label] for label in labels]

# Per-row weights for the training labels; a coefficient of 1 leaves the
# second class's weight unscaled.
train_sample_weight = CreateBalancedSampleWeights(
    y_train=train['TARGET'],
    largest_class_weight_coef=1,
)
train_sample_weight[:10]

[1.2473547319264644,
 0.7718249069578178,
 1.1078211337297057,
 1.1078211337297057,
 1.2473547319264644,
 0.7718249069578178,
 1.1078211337297057,
 0.7718249069578178,
 1.2473547319264644,
 1.2473547319264644]

#### XGBoost(optuna based)

In [None]:
!pip install xgboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import xgboost as xgb
from xgboost import XGBClassifier

# Hyperparameters for the gradient-boosted classifier, kept in one mapping so
# they can be read or changed in a single place.
xgb_params = {
    'random_state': 1004,
    'n_estimators': 8228,
    'max_depth': 12,
    'min_child_weight': 1,
    'gamma': 3,
    'learning_rate': 0.0001,
    'colsample_bytree': 1.0,
    'reg_lambda': 0.767800554,
    'reg_alpha': 0.01232,
    'subsample': 0.6,
}
model = XGBClassifier(**xgb_params)

# Fit on the training features, weighting each row by its sample weight.
model.fit(X_train, y_train, sample_weight=train_sample_weight)

#### Predict

In [None]:
# Predict a TARGET value for every row of the test table with the fitted model.
preds = model.predict(test)

In [None]:
# Start from the sample submission file and fill its TARGET column with the
# model's predictions, then preview the result.
submit = pd.read_csv('/content/sample_submission.csv').assign(TARGET=preds)
submit.head()

In [None]:
# Write the submission to CSV without the DataFrame index column.
submit.to_csv('crime_submit_xgb.csv', index=False)

#### optuna 시각화