In [1]:
import os, sys, json
from pathlib import Path
from datetime import datetime
from collections import defaultdict

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.metrics import accuracy_score
import joblib

In [2]:
# Ask which position to train for; the name selects the per-position data and model folders.
# Surrounding whitespace is stripped so a stray space does not produce a differently named folder.
pos_name = input('enter position name : ').strip()
if not pos_name:
    # With an empty name both paths collapse to their parents ('../train_data', '../ml_model'),
    # so the recursive data load would pick up the files of every position at once.
    raise ValueError('position name must not be empty')

data_path = Path('../train_data') / pos_name
model_path = Path('../ml_model') / pos_name
# Create both folders up front; existing folders are left as they are.
data_path.mkdir(parents=True, exist_ok=True)
model_path.mkdir(parents=True, exist_ok=True)

In [3]:
# Load every data file under data_path (recursively) into a DataFrame, concatenate them,
# and fill missing values (NaN) with 0.
# The file list is sorted because glob order depends on the filesystem, and both the row order
# and the column order of the concatenated frame follow the order in which files are read.
train_files = sorted(f for f in data_path.glob('**/*') if f.is_file())
if not train_files:
    # Fail with a readable message instead of pd.concat's "No objects to concatenate".
    raise FileNotFoundError(f'no training data files found under {data_path}')
train_dfs = [pd.read_csv(f, encoding='utf-8') for f in train_files]
df_all = pd.concat(train_dfs, ignore_index=True).fillna(0)
# Move the 'rp' column to the end
df_all = df_all[[col for col in df_all.columns if col != 'rp'] + ['rp']]
df_all

Unnamed: 0,88:36:6c:1d:d7:6e,88:36:6c:1d:d7:6c,88:36:6c:1d:ae:8c,88:36:6c:1d:db:e0,88:36:6c:1d:a7:e4,88:36:6c:1d:a5:14,88:36:6c:1d:c2:d0,88:36:6c:1d:ae:8e,88:36:6c:1d:a5:16,88:36:6c:1d:d8:6c,...,88:36:6c:1d:c9:1e,88:36:6c:1d:9e:44,88:36:6c:1d:da:f8,88:36:6c:1d:d1:70,88:36:6c:1d:e0:24,88:36:6c:1d:a1:d0,88:36:6c:1d:cc:b4,88:36:6c:1d:d7:e6,88:36:6c:1d:cf:c4,rp
0,70.0,94.0,84.0,83.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,751D문쪽
1,84.0,90.0,72.0,83.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,751D문쪽
2,80.0,92.0,85.0,60.0,57.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,751D문쪽
3,60.0,0.0,0.0,70.0,0.0,70.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,751D창쪽
4,62.0,88.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,751D창쪽
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,65.0,0.0,0.0,0.0,0.0,0.0,7층휴게실R
70,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,62.0,0.0,0.0,0.0,53.0,0.0,7층휴게실R
71,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7층휴게실R
72,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,67.0,0.0,0.0,0.0,0.0,0.0,7층휴게실R


In [4]:
# Fit a label encoder on the distinct 'rp' values and persist its class list
# next to the models so the integer codes can be mapped back to labels later.
rp_labels = np.unique(df_all['rp'])
rp_encoder = LabelEncoder().fit(rp_labels)
np.save(model_path / 'classes.npy', rp_encoder.classes_)
rp_encoder.classes_

array(['751D문쪽', '751D창쪽', '751세면대', '751앞계단', '751앞복도', '7층엘베', '7층휴게실L',
       '7층휴게실R'], dtype=object)

In [5]:
# Replace the 'rp' labels with their integer codes from rp_encoder.
# The column is overwritten in place, so this cell guards against being re-run: on a second run
# the column already holds codes, and transform() would reject them as unseen labels.
rp_values = df_all['rp']
n_classes = len(rp_encoder.classes_)
if rp_values.isin(rp_encoder.classes_).all():
    # Every value is a known label -> not encoded yet (or encoding is the identity).
    df_all['rp'] = rp_encoder.transform(rp_values)
elif not rp_values.isin(np.arange(n_classes)).all():
    # Neither raw labels nor valid codes: the encoder does not match this data.
    raise ValueError("'rp' contains values that are neither known labels nor encoded class ids")
df_all['rp']

0     0
1     0
2     0
3     1
4     1
     ..
69    7
70    7
71    7
72    7
73    7
Name: rp, Length: 74, dtype: int32

In [6]:
# Split the frame into the feature matrix (every column but the last)
# and the target vector (the last column, 'rp').
features = df_all.iloc[:, :-1]
labels = df_all.iloc[:, -1]
X = features.values
y = labels.values

# Shuffled 5-fold splitter with a fixed seed, shared by the evaluation cells below.
kf = KFold(n_splits=5, shuffle=True, random_state=12321)

In [7]:
# Cross-validate a random forest on the folds produced by kf,
# printing train/test accuracy for each fold.
for fold_n, (train_idx, test_idx) in enumerate(kf.split(X), start=1):
    X_train, y_train = X[train_idx], y[train_idx]
    X_test, y_test = X[test_idx], y[test_idx]

    clf = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42)
    clf.fit(X_train, y_train)

    acc_train = accuracy_score(y_train, clf.predict(X_train))
    acc_test = accuracy_score(y_test, clf.predict(X_test))
    print(f'FOLD #{fold_n} TRAIN ACC: {acc_train:.2f} / TEST ACC: {acc_test:.2f}')

# Refit the same configuration on the full data set and export that model.
clf = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42)
clf.fit(X, y)
joblib.dump(clf, model_path / 'model_rdf.plk')
print(f'random forest model for {pos_name} generated successfully')

FOLD #1 TRAIN ACC: 1.00 / TEST ACC: 0.87
FOLD #2 TRAIN ACC: 1.00 / TEST ACC: 0.80
FOLD #3 TRAIN ACC: 1.00 / TEST ACC: 1.00
FOLD #4 TRAIN ACC: 1.00 / TEST ACC: 1.00
FOLD #5 TRAIN ACC: 1.00 / TEST ACC: 1.00
random forest model for 신관 generated successfully


In [8]:
# Cross-validate an RBF-kernel SVM on the folds produced by kf,
# printing train/test accuracy for each fold.
for fold_n, (train_idx, test_idx) in enumerate(kf.split(X), start=1):
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # Probability estimates are not needed to score accuracy, so they stay off here.
    clf = svm.SVC(kernel='rbf')
    clf.fit(X_train, y_train)

    acc_train = accuracy_score(y_train, clf.predict(X_train))
    acc_test = accuracy_score(y_test, clf.predict(X_test))

    print(f'FOLD #{fold_n} TRAIN ACC: {acc_train:.2f} / TEST ACC: {acc_test:.2f}')


# Refit on the full data set with probability estimates enabled and export that model.
# probability=True makes fitting depend on random data shuffling inside SVC, so the seed is
# fixed (same value as the random forest cell) to make the exported model reproducible.
clf = svm.SVC(kernel='rbf', probability=True, random_state=42)
clf.fit(X, y)
joblib.dump(clf, model_path / 'model_svm.plk')
print(f'svm model for {pos_name} generated successfully')

FOLD #1 TRAIN ACC: 0.95 / TEST ACC: 0.60
FOLD #2 TRAIN ACC: 0.90 / TEST ACC: 0.67
FOLD #3 TRAIN ACC: 0.92 / TEST ACC: 0.80
FOLD #4 TRAIN ACC: 0.90 / TEST ACC: 1.00
FOLD #5 TRAIN ACC: 0.95 / TEST ACC: 0.79
svm model for 신관 generated successfully
