In [1]:
from typing import Tuple, Callable, Dict
import pandas as pd
from sklearn.ensemble import *
from sklearn.svm import SVR
from sklearn.base import RegressorMixin
from sklearn.metrics import mean_squared_error
import pickle
import toothlib

# Project configuration object exposed by toothlib. The functions in this
# notebook index it like a nested mapping, e.g. cfg['ML']['models'] and
# cfg['dataset']['processed'].
cfg = toothlib.config

def train_model(
    train_test_data: Tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series],
    method: Callable,
    params: Dict[str, float]
) -> Tuple[RegressorMixin, float, float]:
    """Fit one regressor and score it on the held-out split.

    Args:
        train_test_data: ``(X_train, X_test, y_train, y_test)``, in that order.
        method: Either an estimator factory (a class or any callable that
            returns an object with ``fit``/``predict``/``score``), or a string
            that evaluates to one in this module's namespace.
        params: Keyword arguments forwarded to the estimator factory.
            ``None`` is treated as "no arguments".

    Returns:
        ``(model, mse, r2)``: the fitted estimator, the mean squared error of
        its predictions on ``X_test``, and the value of
        ``model.score(X_test, y_test)``.

    Raises:
        ValueError: If ``train_test_data`` does not unpack into four items.
    """
    X_train, X_test, y_train, y_test = train_test_data

    if callable(method):
        estimator_factory = method
    else:
        # SECURITY: eval() executes arbitrary code. This is only acceptable
        # while `method` comes from a trusted, locally controlled config file.
        # Do not feed user-supplied strings through here.
        estimator_factory = eval(method)

    model = estimator_factory(**(params or {}))
    # Fit exactly once; a second identical fit only repeats the training cost.
    model.fit(X_train, y_train)

    # Evaluate on the held-out split.
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = model.score(X_test, y_test)
    return model, mse, r2


def train_group(train_test_data: Tuple[pd.DataFrame], group: str):
    """Train, report and persist every model listed in ``cfg['ML']['models']``.

    For each configured model this fits it via ``train_model``, prints its MSE
    and R2 on the test split, prints ``feature_importances_`` when the fitted
    estimator exposes that attribute, and pickles the estimator to the
    configured ``save_path`` with ``_{group}`` inserted before ``.pkl``.

    Args:
        train_test_data: ``(X_train, X_test, y_train, y_test)``, in that order.
        group: Label used as the suffix of each saved model file.

    Returns:
        Dict mapping each model's ``short_name`` to its fitted estimator.

    Raises:
        ValueError: If ``train_test_data`` does not unpack into four items.
    """
    # Unpack once to validate the shape, then reuse the same tuple for every model.
    X_train, X_test, y_train, y_test = train_test_data
    split = (X_train, X_test, y_train, y_test)

    models = {}
    for model_info in cfg['ML']['models'].values():
        short_name = model_info['short_name']
        model, mse, r2 = train_model(
            split, model_info['method'], model_info['params']
        )
        models[short_name] = model
        print(f'[{short_name}]', f'MSE => {mse}  ', f'R2 => {r2}')

        # Not every estimator exposes feature importances (SVR does not), so
        # check for the attribute instead of special-casing one short name.
        if hasattr(model, 'feature_importances_'):
            print(f'[{short_name}]', f'{model.feature_importances_}')

        save_path = model_info['save_path'].replace('.pkl', f'_{group}.pkl')
        # Context manager guarantees the file is flushed and closed.
        with open(save_path, 'wb') as model_file:
            pickle.dump(model, model_file)
    return models

In [2]:
# prepare data

import os
import pandas as pd
import numpy as np

'''Data preparing
'''
# Base directory that load_data() joins with the dataset filename from cfg.
STATIC = 'static'


def load_data(
    group: str, random_state: int = 42
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]:
    """Read one processed dataset and split it into train and test parts.

    The CSV path is ``STATIC`` joined with ``cfg['dataset']['processed'][group]``.
    The last column is used as the target and all preceding columns as
    features. The feature frame is shown with ``display`` before splitting.

    Args:
        group: Key into ``cfg['dataset']['processed']`` selecting the CSV file.
        random_state: Seed for the train/test split and for the optional
            reshuffle of the training rows, so repeated runs match.

    Returns:
        ``(X_train, X_test, y_train, y_test)``.
    """
    from sklearn.model_selection import train_test_split

    df = pd.read_csv(os.path.join(STATIC, cfg['dataset']['processed'][group]))
    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]
    # `display` is not imported in this file; presumably the IPython/Jupyter
    # builtin, so this function is expected to run inside a notebook kernel.
    display(X)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=cfg['ML']['test_ratio'], random_state=random_state
    )

    if cfg['ML']['shuffle']:
        # A seeded local generator keeps the row order reproducible without
        # touching NumPy's global random state. len() gives the row count
        # directly and is safe for a frame with zero columns.
        rng = np.random.default_rng(random_state)
        shuffle_index = rng.permutation(len(X_train))
        X_train, y_train = X_train.iloc[shuffle_index], y_train.iloc[shuffle_index]
    return X_train, X_test, y_train, y_test

# Load the 'depth_csv' dataset and train/save every configured model with the
# 'depth' suffix on the saved file names.
train_group(load_data('depth_csv'), 'depth')

Unnamed: 0,u,v,curv,metric_u,metric_v
0,0.00,0.00,0.000000,0.265557,0.016084
1,0.00,0.01,-177.777702,0.174043,0.123019
2,0.00,0.02,-52.854736,0.082505,0.086040
3,0.00,0.03,-20.190346,0.082505,0.085753
4,0.00,0.04,-3.609616,0.082522,0.085924
...,...,...,...,...,...
129995,0.99,0.95,386.866974,0.077496,0.083006
129996,0.99,0.96,7.138393,0.090812,0.081515
129997,0.99,0.97,6.739918,0.091597,0.081878
129998,0.99,0.98,17.887028,0.092280,0.082249


[rf] MSE => 0.043515986467459425   R2 => 0.7970055284698174
[rf] [0.67803445 0.06788892 0.01038431 0.14049095 0.10320137]
[gbr] MSE => 0.021576371480615805   R2 => 0.8993499979709361
[gbr] [0.57154648 0.09402425 0.02737799 0.16120786 0.14584341]


In [3]:
# Same pipeline for the 'edge_csv' dataset; models are saved with the 'edge' suffix.
train_group(load_data('edge_csv'), 'edge')

Unnamed: 0,v,curv,metric_u
0,0.00,-0.225283,0.296775
1,0.01,43.830525,0.258710
2,0.02,50.418088,0.206845
3,0.03,49.903439,0.184762
4,0.04,46.833693,0.170455
...,...,...,...
1295,0.95,24.669416,0.111785
1296,0.96,14.954427,0.115487
1297,0.97,21.265292,0.121969
1298,0.98,36.729345,0.138567


[rf] MSE => 53.78320446610011   R2 => 0.7146464972941418
[rf] [0.12731994 0.14376601 0.72891405]
[gbr] MSE => 86.08309436684742   R2 => 0.5432754008396476
[gbr] [0.12309873 0.16287669 0.71402458]
