# Library Setting

In [1]:
from tqdm import tqdm
tqdm.pandas()

import numpy as np
import pandas as pd
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import rc
rc('font', family='AppleGothic')
plt.rcParams['axes.unicode_minus'] = False

<br></br>

# Data

## Data Load

In [2]:
train_df = pd.read_csv('./data/train.csv')
test_df  = pd.read_csv('./data/test.csv')

In [3]:
train_df.shape, test_df.shape

((84406, 20), (17289, 19))

In [4]:
train_df.head()

Unnamed: 0,ID,월,요일,시간,소관경찰서,소관지역,사건발생거리,강수량(mm),강설량(mm),적설량(cm),풍향,안개,짙은안개,번개,진눈깨비,서리,연기/연무,눈날림,범죄발생지,TARGET
0,TRAIN_00000,9,화요일,10,137,8.0,2.611124,0.0,0.0,0.0,245.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,차도,2
1,TRAIN_00001,11,화요일,6,438,13.0,3.209093,0.0,0.0,0.0,200.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,차도,0
2,TRAIN_00002,8,일요일,6,1729,47.0,1.619597,0.0,0.0,0.0,40.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,인도,1
3,TRAIN_00003,5,월요일,6,2337,53.0,1.921615,11.375,0.0,0.0,225.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,주거지,1
4,TRAIN_00004,9,일요일,11,1439,41.0,1.789721,0.0,0.0,0.0,255.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,주유소,2


<br>

## Resetting Columns Type

In [5]:
def type_resetting(data):
    d = data.copy()
    target_feature = ['TARGET']
    cat_features = ['ID','월','요일','시간','소관경찰서','소관지역','범죄발생지']
    num_features = [col for col in d.columns if col not in cat_features]
    for col in cat_features:
        d[col] = d[col].astype(str)
    for col in num_features:
        d[col] = d[col].astype(float)
    if 'TARGET' in d.columns:
        d['TARGET'] = d['TARGET'].astype(str)
        d = d.rename(columns={'TARGET':'target'})
    return d

In [6]:
unuse_features = ['ID']

train_df2 = type_resetting(train_df)
test_df2  = type_resetting(test_df)

train_df2.drop(columns=unuse_features,inplace=True)
test_df2 .drop(columns=unuse_features,inplace=True)

<br></br>

# New Features

In [7]:
def new_features(data):
    d = data.copy()
    
    # (1) 강수,강설,적설여부 : 강수량이 0이면 강수여부=0, 강수
    d['강수여부'] = np.where(d['강수량(mm)']==0,0,1)
    d['강설여부'] = np.where(d['강설량(mm)']==0,0,1)
    d['적설여부'] = np.where(d['적설량(cm)']==0,0,1)
    
    # (2) 주말여부, 계절
    d['주말여부'] = np.where((d['요일']=='토요일')|(d['요일']=='일요일'),1,0)
    d['계절'] = ['겨울' if month in [12,1,2] else
                '봄'  if month in [3,4,5]  else
                '여름' if month in [6,7,8] else
                '가을' if month in [9,10,11] else
                'Unknown' for month in d['월'].astype(int)]
    
    # (3) 강수 grouping
    # - 참조 : https://namu.wiki/w/%EA%B0%95%EC%9A%B0%EB%9F%89#s-2
    d['강수량_체감'] = ['0'  if rainfall==0   else
                     '1'  if rainfall<=1   else
                     '2'  if rainfall<=2.5 else
                     '3'  if rainfall<=5.0 else
                     '4'  if rainfall<=10  else
                     '5'  if rainfall<=15  else
                     '6'  if rainfall<=20  else
                     '7'  if rainfall<=30  else
                     '8'  if rainfall<=40  else
                     '9'  if rainfall<=50  else
                     '10' if rainfall<=70  else
                     '11' if rainfall<=110 else
                     '12' for rainfall in d['강수량(mm)']]
    
    # (4) 풍향
    d['풍향_방위'] = ['N'  if (direction>=337.5) or  (direction< 22.5) else
                    'NE' if (direction>= 22.5) and (direction< 67.5) else
                    'E'  if (direction>= 67.5) and (direction<112.5) else
                    'SE' if (direction>=112.5) and (direction<157.5) else
                    'S'  if (direction>=157.5) and (direction<202.5) else
                    'SW' if (direction>=202.5) and (direction<247.5) else
                    'W'  if (direction>=247.5) and (direction<292.5) else
                    'NW' if (direction>=292.5) and (direction<337.5) else
                    'Unknown' for direction in d['풍향']]
    
    return d

In [8]:
train_df3 = new_features(train_df2)
test_df3  = new_features(test_df2)

In [9]:
target_feature = 'target'
cat_features = train_df3.columns[train_df3.dtypes==object]
cat_features = list(set(cat_features)-set([target_feature]))
num_features = [col for col in train_df3.columns if col not in cat_features+[target_feature]]

In [10]:
train_df3.head()

Unnamed: 0,월,요일,시간,소관경찰서,소관지역,사건발생거리,강수량(mm),강설량(mm),적설량(cm),풍향,안개,짙은안개,번개,진눈깨비,서리,연기/연무,눈날림,범죄발생지,target,강수여부,강설여부,적설여부,주말여부,계절,강수량_체감,풍향_방위
0,9,화요일,10,137,8.0,2.611124,0.0,0.0,0.0,245.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,차도,2.0,0,0,0,0,가을,0,SW
1,11,화요일,6,438,13.0,3.209093,0.0,0.0,0.0,200.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,차도,0.0,0,0,0,0,가을,0,S
2,8,일요일,6,1729,47.0,1.619597,0.0,0.0,0.0,40.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,인도,1.0,0,0,0,1,여름,0,NE
3,5,월요일,6,2337,53.0,1.921615,11.375,0.0,0.0,225.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,주거지,1.0,1,0,0,0,봄,5,SW
4,9,일요일,11,1439,41.0,1.789721,0.0,0.0,0.0,255.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,주유소,2.0,0,0,0,1,가을,0,W


<br></br>

# EDA

In [13]:
# i=0
# for col in num_features:
#     i+=1
#     print('\n({}/{}) {}'.format(i,len(num_features),col))
#     plt.figure(figsize=(15,7))
#     sns.violinplot(x=train_df3['target'],y=train_df3[col])
#     plt.show()

<br></br>

# Add the Interaction Term

In [14]:
import warnings
from tqdm import trange
def add_interaction_term(data,num_features):
    warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)
    
    d = data.copy()
    for i in range(len(num_features)):
        for j in range(len(num_features)):
            if i>j:
                col_i = num_features[i]
                col_j = num_features[j]
                d[f'{col_i}*{col_j}'] = d[col_i]*d[col_j]
                
    return d

In [15]:
train_df4 = add_interaction_term(train_df3,num_features)
test_df4  = add_interaction_term(test_df3 ,num_features)

In [16]:
target_feature = 'target'
cat_features = train_df4.columns[train_df4.dtypes==object]
cat_features = list(set(cat_features)-set([target_feature]))
num_features = [col for col in train_df4.columns if col not in cat_features+[target_feature]]

<br></br>

# Feature Selection

In [17]:
# import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm

In [19]:
pvalue_list = []
for col in tqdm(num_features):
    d = train_df4[[col,'target']].rename(columns={col:'feature'})
    
    model = ols(f'feature ~ C(target)',data=d).fit()
    pvalue = anova_lm(model).values[0][-1]
    pvalue_list.append([col,pvalue])

100%|██████████| 136/136 [00:42<00:00,  3.21it/s]


In [20]:
pvalue_df = pd.DataFrame(pvalue_list,columns=['feature','pvalue'])\
    .sort_values('pvalue',ascending=False)

pvalue_df[pvalue_df.pvalue>=0.05].round(4)

Unnamed: 0,feature,pvalue
65,연기/연무*풍향,0.9556
88,강수여부*짙은안개,0.9136
32,짙은안개*강수량(mm),0.9084
35,짙은안개*풍향,0.8769
113,적설여부*짙은안개,0.8632
10,연기/연무,0.8591
34,짙은안개*적설량(cm),0.8396
66,연기/연무*안개,0.8337
100,강설여부*짙은안개,0.7973
68,연기/연무*번개,0.7782


<br></br>

# Modeling

In [21]:
from sklearn.model_selection import train_test_split, StratifiedKFold
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import f1_score

In [22]:
n_splits = 10
iterations = 5000
learning_rate = 0.03
random_state = 0

In [23]:
X = train_df4.drop(target_feature,axis=1)
y = train_df4[target_feature]

skf = StratifiedKFold(n_splits=n_splits)
models = []
scores = []

k=0
for train_idx, valid_idx in tqdm(skf.split(X,y),total=n_splits):
    k+=1
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_valid, y_valid = X.iloc[valid_idx], y.iloc[valid_idx]
    
    train_dataset = Pool(X_train,y_train,cat_features=cat_features)
    valid_dataset = Pool(X_valid,y_valid,cat_features=cat_features)

    model = CatBoostClassifier(
        random_state=random_state,
        iterations=iterations,
        learning_rate=learning_rate,
        allow_writing_files=False,
    )
    model.fit(
        train_dataset,
        eval_set=valid_dataset,
        metric_period=int(iterations/5),
        early_stopping_rounds=int(iterations/5),
    )
    #model.save_model(f'./model_checkpoints/kfold_model_{k}.cbm')
    
    y_pred = model.predict(valid_dataset).flatten()
    y_true = y_valid.values
    score = f1_score(y_true=y_true,y_pred=y_pred,average='macro')
    
    print('K-Fold {}, Macro F1 Score: {:.4f}'.format(k,score))
    
    models.append(model)
    scores.append(score)



0:	learn: 1.0912754	test: 1.0913207	best: 1.0913207 (0)	total: 163ms	remaining: 13m 35s
1000:	learn: 0.9332570	test: 0.9553949	best: 0.9553876 (998)	total: 1m 40s	remaining: 6m 40s
2000:	learn: 0.9103208	test: 0.9543703	best: 0.9543382 (1916)	total: 3m 30s	remaining: 5m 14s
3000:	learn: 0.8886717	test: 0.9538221	best: 0.9537743 (2911)	total: 5m 19s	remaining: 3m 32s
4000:	learn: 0.8676297	test: 0.9543466	best: 0.9536965 (3131)	total: 7m 9s	remaining: 1m 47s
Stopped by overfitting detector  (1000 iterations wait)

bestTest = 0.9536964652
bestIteration = 3131

Shrink model to first 3132 iterations.


 10%|█         | 1/10 [07:24<1:06:44, 444.97s/it]

K-Fold 1, Macro F1 Score: 0.5207




0:	learn: 1.0915495	test: 1.0914052	best: 1.0914052 (0)	total: 98.6ms	remaining: 8m 12s
1000:	learn: 0.9338254	test: 0.9544255	best: 0.9544255 (1000)	total: 1m 48s	remaining: 7m 11s
2000:	learn: 0.9104151	test: 0.9537730	best: 0.9537585 (1740)	total: 3m 40s	remaining: 5m 30s
Stopped by overfitting detector  (1000 iterations wait)

bestTest = 0.9537584534
bestIteration = 1740

Shrink model to first 1741 iterations.


 20%|██        | 2/10 [12:28<48:15, 361.91s/it]  

K-Fold 2, Macro F1 Score: 0.5301




0:	learn: 1.0913073	test: 1.0912482	best: 1.0912482 (0)	total: 91.8ms	remaining: 7m 38s
1000:	learn: 0.9332133	test: 0.9550335	best: 0.9550306 (998)	total: 1m 45s	remaining: 7m 2s
2000:	learn: 0.9096015	test: 0.9541865	best: 0.9541626 (1948)	total: 3m 36s	remaining: 5m 24s
3000:	learn: 0.8873045	test: 0.9539336	best: 0.9539104 (2983)	total: 5m 24s	remaining: 3m 36s
4000:	learn: 0.8658252	test: 0.9538708	best: 0.9538156 (3976)	total: 7m 16s	remaining: 1m 49s
Stopped by overfitting detector  (1000 iterations wait)

bestTest = 0.9538156037
bestIteration = 3976

Shrink model to first 3977 iterations.


 30%|███       | 3/10 [21:33<51:56, 445.22s/it]

K-Fold 3, Macro F1 Score: 0.5209




0:	learn: 1.0912834	test: 1.0911736	best: 1.0911736 (0)	total: 98.8ms	remaining: 8m 13s
1000:	learn: 0.9328735	test: 0.9577779	best: 0.9577192 (932)	total: 1m 45s	remaining: 6m 59s
2000:	learn: 0.9090370	test: 0.9565107	best: 0.9564600 (1852)	total: 3m 31s	remaining: 5m 16s
3000:	learn: 0.8867312	test: 0.9565603	best: 0.9563108 (2440)	total: 5m 30s	remaining: 3m 40s
Stopped by overfitting detector  (1000 iterations wait)

bestTest = 0.9563107531
bestIteration = 2440

Shrink model to first 2441 iterations.


 40%|████      | 4/10 [27:51<41:54, 419.02s/it]

K-Fold 4, Macro F1 Score: 0.5283




0:	learn: 1.0912898	test: 1.0911633	best: 1.0911633 (0)	total: 83.5ms	remaining: 6m 57s
1000:	learn: 0.9339203	test: 0.9507823	best: 0.9507764 (998)	total: 1m 45s	remaining: 6m 59s
2000:	learn: 0.9104542	test: 0.9495053	best: 0.9493762 (1932)	total: 3m 35s	remaining: 5m 23s
3000:	learn: 0.8879462	test: 0.9493883	best: 0.9493062 (2871)	total: 5m 28s	remaining: 3m 38s
Stopped by overfitting detector  (1000 iterations wait)

bestTest = 0.9493062071
bestIteration = 2871

Shrink model to first 2872 iterations.


 50%|█████     | 5/10 [34:53<34:59, 419.90s/it]

K-Fold 5, Macro F1 Score: 0.5283




0:	learn: 1.0911912	test: 1.0913705	best: 1.0913705 (0)	total: 90ms	remaining: 7m 29s
1000:	learn: 0.9331821	test: 0.9592079	best: 0.9592079 (1000)	total: 1m 42s	remaining: 6m 49s
2000:	learn: 0.9097428	test: 0.9576721	best: 0.9576688 (1998)	total: 3m 28s	remaining: 5m 12s
3000:	learn: 0.8876152	test: 0.9579893	best: 0.9576685 (2002)	total: 5m 19s	remaining: 3m 33s
Stopped by overfitting detector  (1000 iterations wait)

bestTest = 0.9576685463
bestIteration = 2002

Shrink model to first 2003 iterations.


 60%|██████    | 6/10 [40:14<25:44, 386.21s/it]

K-Fold 6, Macro F1 Score: 0.5209




0:	learn: 1.0913413	test: 1.0911253	best: 1.0911253 (0)	total: 90.9ms	remaining: 7m 34s
1000:	learn: 0.9336059	test: 0.9522815	best: 0.9522709 (996)	total: 1m 42s	remaining: 6m 50s
2000:	learn: 0.9097525	test: 0.9517178	best: 0.9516517 (1737)	total: 3m 27s	remaining: 5m 11s
3000:	learn: 0.8873729	test: 0.9521076	best: 0.9516146 (2176)	total: 5m 12s	remaining: 3m 28s
Stopped by overfitting detector  (1000 iterations wait)

bestTest = 0.9516146402
bestIteration = 2176

Shrink model to first 2177 iterations.


 70%|███████   | 7/10 [45:46<18:25, 368.52s/it]

K-Fold 7, Macro F1 Score: 0.5315




0:	learn: 1.0912654	test: 1.0912577	best: 1.0912577 (0)	total: 93.1ms	remaining: 7m 45s
1000:	learn: 0.9337555	test: 0.9511083	best: 0.9511063 (976)	total: 1m 41s	remaining: 6m 46s
2000:	learn: 0.9102722	test: 0.9502144	best: 0.9501981 (1966)	total: 3m 26s	remaining: 5m 10s
3000:	learn: 0.8881909	test: 0.9500503	best: 0.9498829 (2564)	total: 5m 11s	remaining: 3m 27s
Stopped by overfitting detector  (1000 iterations wait)

bestTest = 0.9498829037
bestIteration = 2564

Shrink model to first 2565 iterations.


 80%|████████  | 8/10 [51:58<12:19, 369.58s/it]

K-Fold 8, Macro F1 Score: 0.5334




0:	learn: 1.0914227	test: 1.0914441	best: 1.0914441 (0)	total: 94.4ms	remaining: 7m 51s
1000:	learn: 0.9339823	test: 0.9549176	best: 0.9549146 (988)	total: 1m 42s	remaining: 6m 47s
2000:	learn: 0.9105646	test: 0.9545786	best: 0.9543960 (1815)	total: 3m 27s	remaining: 5m 10s
Stopped by overfitting detector  (1000 iterations wait)

bestTest = 0.9543959511
bestIteration = 1815

Shrink model to first 1816 iterations.


 90%|█████████ | 9/10 [56:52<05:46, 346.19s/it]

K-Fold 9, Macro F1 Score: 0.5293




0:	learn: 1.0913111	test: 1.0911598	best: 1.0911598 (0)	total: 97.7ms	remaining: 8m 8s
1000:	learn: 0.9344133	test: 0.9533156	best: 0.9532756 (990)	total: 1m 43s	remaining: 6m 53s
2000:	learn: 0.9113436	test: 0.9526017	best: 0.9524942 (1826)	total: 3m 28s	remaining: 5m 13s
Stopped by overfitting detector  (1000 iterations wait)

bestTest = 0.9524942431
bestIteration = 1826

Shrink model to first 1827 iterations.


100%|██████████| 10/10 [1:01:50<00:00, 371.03s/it]

K-Fold 10, Macro F1 Score: 0.5252





In [24]:
import pickle
with open('./model_checkpoints/kfold_models.pkl', 'wb') as f:
	pickle.dump(models, f, protocol=pickle.HIGHEST_PROTOCOL)

<br></br>

# Inference

In [31]:
import pickle
with open('./model_checkpoints/kfold_models_10.pkl', 'rb') as f:
    models = pickle.load(f)

<br>

## Train dataset

In [32]:
pred_dict = {}
i=0
for model in tqdm(models):
    i+=1
    dataset = Pool(train_df4.drop(target_feature,axis=1),cat_features=cat_features)
    pred_dict[str(i)] = model.predict(dataset).flatten()

100%|██████████| 10/10 [00:05<00:00,  1.76it/s]


In [33]:
preds_hard_voting = pd.DataFrame(pred_dict)\
    .progress_apply(lambda x: x.value_counts().sort_values(ascending=False).index.tolist()[0],axis=1)

100%|██████████| 84406/84406 [00:15<00:00, 5356.10it/s]


In [34]:
display(pd.crosstab(preds_hard_voting.values,y,rownames=['pred'],colnames=['true']))
score = f1_score(y_true=train_df4[target_feature],y_pred=preds_hard_voting.values,average='macro')
print('Macro F1 Score: {:.3f}'.format(score))

true,0.0,1.0,2.0
pred,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,26797,9122,10058
1.0,4280,12275,3246
2.0,5376,4000,9252


Macro F1 Score: 0.548


<br>

## Test dataset

In [35]:
pred_dict = {}
i=0
for model in tqdm(models):
    i+=1
    dataset = Pool(test_df4,cat_features=cat_features)
    pred_dict[str(i)] = model.predict(dataset).flatten()

100%|██████████| 10/10 [00:01<00:00,  7.72it/s]


In [36]:
preds_hard_voting = pd.DataFrame(pred_dict)\
    .progress_apply(lambda x: x.value_counts().sort_values(ascending=False).index.tolist()[0],axis=1)

100%|██████████| 17289/17289 [00:03<00:00, 5195.74it/s]


In [37]:
submit = pd.read_csv('./data/sample_submission.csv')
submit['TARGET'] = preds_hard_voting
submit['TARGET'] = submit['TARGET'].astype(float).astype(int)
submit.to_csv('./out/submission_kfold10.csv',index=False)