In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random
import os
from sklearn.preprocessing import LabelEncoder

In [2]:
!sudo apt-get install -y fonts-nanum
!sudo fc-cache -fv
!rm ~/.cache/matplotlib -rf

Reading package lists... Done
Building dependency tree       
Reading state information... Done
fonts-nanum is already the newest version (20180306-3).
0 upgraded, 0 newly installed, 0 to remove and 34 not upgraded.
/usr/share/fonts: caching, new cache contents: 0 fonts, 1 dirs
/usr/share/fonts/truetype: caching, new cache contents: 0 fonts, 3 dirs
/usr/share/fonts/truetype/humor-sans: caching, new cache contents: 1 fonts, 0 dirs
/usr/share/fonts/truetype/liberation: caching, new cache contents: 16 fonts, 0 dirs
/usr/share/fonts/truetype/nanum: caching, new cache contents: 10 fonts, 0 dirs
/usr/local/share/fonts: caching, new cache contents: 0 fonts, 0 dirs
/root/.local/share/fonts: skipping, no such directory
/root/.fonts: skipping, no such directory
/usr/share/fonts/truetype: skipping, looped directory detected
/usr/share/fonts/truetype/humor-sans: skipping, looped directory detected
/usr/share/fonts/truetype/liberation: skipping, looped directory detected
/usr/share/fonts/truetype/n

In [3]:
plt.rc('font', family='NanumBarunGothic') 

In [4]:
def seed_everything(seed):
  random.seed(seed)
  os.environ['PYTHONHASHSEED'] = str(seed)
  np.random.seed(seed)

seed_everything(42)

In [5]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

#### Encoding

요일 > one-hot encoding

범죄 발생지 > Label encoding

In [6]:
train['요일'].unique()
train['요일'].value_counts()

pd.get_dummies(train['요일'])

Unnamed: 0,금요일,목요일,수요일,월요일,일요일,토요일,화요일
0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,1
2,0,0,0,0,1,0,0
3,0,0,0,1,0,0,0
4,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...
84401,0,0,0,0,1,0,0
84402,0,1,0,0,0,0,0
84403,0,0,0,0,1,0,0
84404,0,0,0,0,0,0,1


In [7]:
train = pd.concat([train, pd.get_dummies(train['요일'])], axis=1)
print(train.shape)
train.head(3)

(84406, 27)


Unnamed: 0,ID,월,요일,시간,소관경찰서,소관지역,사건발생거리,강수량(mm),강설량(mm),적설량(cm),...,눈날림,범죄발생지,TARGET,금요일,목요일,수요일,월요일,일요일,토요일,화요일
0,TRAIN_00000,9,화요일,10,137,8.0,2.611124,0.0,0.0,0.0,...,0.0,차도,2,0,0,0,0,0,0,1
1,TRAIN_00001,11,화요일,6,438,13.0,3.209093,0.0,0.0,0.0,...,0.0,차도,0,0,0,0,0,0,0,1
2,TRAIN_00002,8,일요일,6,1729,47.0,1.619597,0.0,0.0,0.0,...,0.0,인도,1,0,0,0,0,1,0,0


In [8]:
test = pd.concat([test, pd.get_dummies(test['요일'])], axis=1)
print(test.shape)
test.head(3)

(17289, 26)


Unnamed: 0,ID,월,요일,시간,소관경찰서,소관지역,사건발생거리,강수량(mm),강설량(mm),적설량(cm),...,연기/연무,눈날림,범죄발생지,금요일,목요일,수요일,월요일,일요일,토요일,화요일
0,TEST_00000,9,금요일,5,927,28.0,1.570654,19.625,0.0,0.0,...,0.0,0.0,차도,1,0,0,0,0,0,0
1,TEST_00001,5,수요일,3,926,28.0,1.712457,21.444444,0.0,0.0,...,1.0,0.0,식당,0,0,1,0,0,0,0
2,TEST_00002,5,월요일,6,1437,33.0,0.447496,25.2,0.0,0.0,...,0.0,0.0,주거지,0,0,0,1,0,0,0


In [9]:
train["범죄발생지"].value_counts()

주거지      36077
차도       25879
인도        6437
편의점       4835
주차장       3262
식당        1806
백화점       1493
주유소       1324
공원         736
학교         728
약국         653
호텔/모텔      591
병원         453
은행         132
Name: 범죄발생지, dtype: int64

오름차순으로 큰 수부터 차례대로 배정

In [10]:
train["범죄발생지"].value_counts().index

Index(['주거지', '차도', '인도', '편의점', '주차장', '식당', '백화점', '주유소', '공원', '학교', '약국',
       '호텔/모텔', '병원', '은행'],
      dtype='object')

In [11]:
dict_place = {'주거지':14, '차도':13, '인도':12, '편의점':11, '주차장':10, '식당':9, '백화점':8,
              '주유소':7, '공원':6, '학교':5, '약국':4, '호텔/모텔':3, '병원':2, '은행':1}

dict_place.get('차도')

13

In [12]:
train.loc[1, "범죄발생지"]

'차도'

In [13]:
dict_place.get(train.loc[1, "범죄발생지"])

13

In [14]:
for loc in range(0, train.shape[0]):
  encd = dict_place.get(train.loc[loc, "범죄발생지"])
  train.loc[loc, '발생지(encode)'] = encd

print(train.shape)
train.head(3)

(84406, 28)


Unnamed: 0,ID,월,요일,시간,소관경찰서,소관지역,사건발생거리,강수량(mm),강설량(mm),적설량(cm),...,범죄발생지,TARGET,금요일,목요일,수요일,월요일,일요일,토요일,화요일,발생지(encode)
0,TRAIN_00000,9,화요일,10,137,8.0,2.611124,0.0,0.0,0.0,...,차도,2,0,0,0,0,0,0,1,13.0
1,TRAIN_00001,11,화요일,6,438,13.0,3.209093,0.0,0.0,0.0,...,차도,0,0,0,0,0,0,0,1,13.0
2,TRAIN_00002,8,일요일,6,1729,47.0,1.619597,0.0,0.0,0.0,...,인도,1,0,0,0,0,1,0,0,12.0


In [15]:
for loc in range(0, test.shape[0]):
  encd = dict_place.get(test.loc[loc, "범죄발생지"])
  test.loc[loc, '발생지(encode)'] = encd

print(test.shape)
test.head(3)

(17289, 27)


Unnamed: 0,ID,월,요일,시간,소관경찰서,소관지역,사건발생거리,강수량(mm),강설량(mm),적설량(cm),...,눈날림,범죄발생지,금요일,목요일,수요일,월요일,일요일,토요일,화요일,발생지(encode)
0,TEST_00000,9,금요일,5,927,28.0,1.570654,19.625,0.0,0.0,...,0.0,차도,1,0,0,0,0,0,0,13.0
1,TEST_00001,5,수요일,3,926,28.0,1.712457,21.444444,0.0,0.0,...,0.0,식당,0,0,1,0,0,0,0,9.0
2,TEST_00002,5,월요일,6,1437,33.0,0.447496,25.2,0.0,0.0,...,0.0,주거지,0,0,0,1,0,0,0,14.0


#### 데이터 나누기

In [20]:
train.columns

Index(['ID', '월', '요일', '시간', '소관경찰서', '소관지역', '사건발생거리', '강수량(mm)', '강설량(mm)',
       '적설량(cm)', '풍향', '안개', '짙은안개', '번개', '진눈깨비', '서리', '연기/연무', '눈날림',
       '범죄발생지', 'TARGET', '금요일', '목요일', '수요일', '월요일', '일요일', '토요일', '화요일',
       '발생지(encode)'],
      dtype='object')

In [21]:
f_names = ['월', '시간', '소관경찰서', '발생지(encode)', '금요일', '목요일', '수요일', '월요일', '일요일', '토요일', '화요일']


In [22]:
X_train = train[f_names]
y_train = train['TARGET']
X_test = test[f_names]

In [23]:
X_train.shape, X_test.shape

((84406, 11), (17289, 11))

#### SMOTE

Oversampling

붓스트래핑이나 KNN 모델 기법을 활용함

In [24]:
from imblearn.over_sampling import RandomOverSampler, SMOTE

ros = RandomOverSampler(random_state=19)
smote = SMOTE(random_state=19)

X_Rover, y_Rover = ros.fit_resample(X_train, y_train)
X_Sover, y_Sover = smote.fit_resample(X_train, y_train)

print(X_Rover.shape, y_Rover.shape)
print(X_Sover.shape, y_Sover.shape)

(109359, 11) (109359,)
(109359, 11) (109359,)


In [25]:
display(y_train.value_counts(),y_Rover.value_counts(),y_Sover.value_counts())

0    36453
1    25397
2    22556
Name: TARGET, dtype: int64

2    36453
0    36453
1    36453
Name: TARGET, dtype: int64

2    36453
0    36453
1    36453
Name: TARGET, dtype: int64

#### CatBoost

In [27]:
!pip install catboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting catboost
  Downloading catboost-1.2-cp310-cp310-manylinux2014_x86_64.whl (98.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.6/98.6 MB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2


In [28]:
from catboost import CatBoostClassifier

cb_clf = CatBoostClassifier(random_state=19)
cb_clf1 = CatBoostClassifier(random_state=19)
cb_clf2 = CatBoostClassifier(random_state=19)

cb_clf.fit(X_train, y_train)
cb_clf1.fit(X_Rover, y_Rover)
cb_clf2.fit(X_Sover, y_Sover)

pred = cb_clf.predict(X_test)
pred1 = cb_clf1.predict(X_test)
pred2 = cb_clf2.predict(X_test)

print(pred, pred1, pred2)

Learning rate set to 0.099213
0:	learn: 1.0793600	total: 117ms	remaining: 1m 56s
1:	learn: 1.0634041	total: 183ms	remaining: 1m 31s
2:	learn: 1.0499438	total: 225ms	remaining: 1m 14s
3:	learn: 1.0393930	total: 260ms	remaining: 1m 4s
4:	learn: 1.0309752	total: 312ms	remaining: 1m 2s
5:	learn: 1.0243325	total: 373ms	remaining: 1m 1s
6:	learn: 1.0183422	total: 435ms	remaining: 1m 1s
7:	learn: 1.0132289	total: 477ms	remaining: 59.1s
8:	learn: 1.0090421	total: 518ms	remaining: 57.1s
9:	learn: 1.0052494	total: 553ms	remaining: 54.7s
10:	learn: 1.0022498	total: 583ms	remaining: 52.4s
11:	learn: 0.9994951	total: 607ms	remaining: 50s
12:	learn: 0.9976582	total: 631ms	remaining: 47.9s
13:	learn: 0.9957498	total: 659ms	remaining: 46.4s
14:	learn: 0.9940821	total: 684ms	remaining: 44.9s
15:	learn: 0.9927552	total: 709ms	remaining: 43.6s
16:	learn: 0.9914309	total: 741ms	remaining: 42.8s
17:	learn: 0.9901301	total: 785ms	remaining: 42.8s
18:	learn: 0.9892469	total: 811ms	remaining: 41.9s
19:	learn:

In [29]:
submission = pd.read_csv("/content/sample_submission.csv")
submission["TARGET"] = pred2
submission.head(5)

Unnamed: 0,ID,TARGET
0,TEST_00000,2
1,TEST_00001,0
2,TEST_00002,0
3,TEST_00003,0
4,TEST_00004,0


In [30]:
submission.to_csv("crime_submit_cat_smote.csv", index=False)