## Treniranje modela

Ovaj notebook korišten je za podjelu skupa podataka u train i test skup i za treniranje modela na train skupu. 
Dobiveni modeli su zatim spremljeni u pickle datoteke i učitavani u glavnom notebook-u. 

Import svih potrebnih biblioteka:

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE, BorderlineSMOTE, ADASYN
from imblearn.pipeline import Pipeline
from sklearn.metrics import recall_score, accuracy_score
from sklearn.svm import SVC
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import pickle

Spremanje imena svih korištenih stupaca iz CTG.xls datoteke (značajke Min i Median ne koristimo radi njihove jake
koreliranosti sa značajkama Max i Mean):

In [30]:
# Names given to the columns read from CTG.xls, in the order they are read.
# The last entry ('NSP') is the column later used as the label.
column_names = [
    'LB', 'AC', 'FM', 'UC', 'DL', 'DS', 'DP',
    'ASTV', 'MSTV', 'ALTV', 'MLTV',
    'Width', 'Max', 'Nmax', 'Nzeros', 'Mode', 'Mean', 'Variance', 'Tendency',
    'NSP',
]

# Columns that are NOT cast to int; every other column holds integer values.
_float_columns = {'AC', 'FM', 'UC', 'DL', 'DS', 'DP', 'MSTV', 'MLTV'}

# Columns holding int values (the cast is only for nicer output).
int_columns = [name for name in column_names if name not in _float_columns]

Učitavanje potrebnih podataka:

In [31]:
# 0-based positions of the sheet columns to read: 10-21, 23-27, 29, 30 and 45.
# Positions 22 and 28 are skipped on purpose (presumably the unused Min and
# Median features mentioned in the notebook text).
selected_columns = list(range(10, 22)) + list(range(23, 28)) + [29, 30, 45]

data = pd.read_excel(
    'CTG.xls',
    sheet_name='Data',
    skiprows=1,
    usecols=selected_columns,
    names=column_names,
)

# Drop every row in which at least one variable is unknown (NaN), then cast
# the integer-valued columns to int in a single call.
data = data.dropna().astype({col: int for col in int_columns})
    

Stratificirana podjela podataka u train i test skupove:

In [32]:
# All columns except the last are features; the last column is the label.
features, target = data.iloc[:, :-1], data.iloc[:, -1]
X = features.values
y = target.values

# Hold out 20% of the rows for testing; stratify on the label so both parts
# keep the same class proportions. The seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

Standardizacija podataka:

In [33]:
# Learn the scaling statistics from the training features only, then apply the
# same fitted scaler to both the training and the test features.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

U slučaju ponovnog treniranja pojedinog modela potrebno je učitati već spremljene test i train skupove:

In [34]:
# Uncomment to reload previously saved train/test splits instead of re-creating them.
# The saved X_*.csv files contain only the 19 feature columns selected when reading
# CTG.xls ('Min' and 'Median' are not loaded), so the name list must not contain them.
#names = ['LB', 'AC', 'FM', 'UC', 'DL', 'DS', 'DP', 'ASTV', 'MSTV', 'ALTV', 'MLTV', 'Width', 'Max', 'Nmax', 'Nzeros', 'Mode', 'Mean', 'Variance', 'Tendency']
#X_train = pd.read_csv('X_train.csv', names = names, header = None).values
#X_test = pd.read_csv('X_test.csv', names = names, header = None).values
# NOTE(review): .values on a single-column frame gives an (n, 1) array, unlike the 1-D
# labels produced by the split; append .ravel() if 1-D labels are needed - confirm.
#y_train = pd.read_csv('y_train.csv', names = ['NSP'], header = None).values
#y_test = pd.read_csv('y_test.csv', names = ['NSP'], header = None).values

U slučaju prvog treniranja modela potrebno je spremiti test i train skupove u datoteke:

In [35]:
# Persist the train/test splits as comma-separated files.
# Paths are relative to the working directory (not a user-specific absolute
# path) so the notebook runs on any machine and writes the files to the same
# place the reload snippet ('X_train.csv', ...) and the pickled models use.
for filename, array in (
    ('X_train.csv', X_train),
    ('X_test.csv', X_test),
    ('y_train.csv', y_train),
    ('y_test.csv', y_test),
):
    np.savetxt(filename, array, delimiter=",")

GridSearchCV za SVC uz SMOTE oversampling i spremanje najboljeg modela u svc_SMOTE.pkl:

In [36]:
# Oversample-then-classify pipeline: SMOTE followed by an SVC.
# Both steps are seeded: SMOTE draws random synthetic samples and
# SVC(probability=True) shuffles data for its probability estimates, so without
# seeds every run could select and pickle a different model.
svc_s_pipeline = Pipeline([
        ('sampling', SMOTE(random_state=2018)),
        ('classification', SVC(probability=True, random_state=2018))
    ])

# Hyper-parameter grid for the SVC step (also reused by the other SVC searches):
# a linear kernel over C, and an RBF kernel over C x gamma.
parameters_svc = [
    {'classification__C':[1, 10, 100, 1000], 'classification__kernel':['linear']},
    {'classification__C':[1, 10, 100, 1000], 'classification__gamma':[0.001, 0.0001], 'classification__kernel':['rbf']}
    ]

# Exhaustive 5-fold cross-validated search with GridSearchCV's default scoring.
svc_s = GridSearchCV(svc_s_pipeline, parameters_svc, cv = 5)
svc_s.fit(X_train, y_train)

# Keep the best pipeline found by the search and store it for the main notebook.
svc_s_best = svc_s.best_estimator_
with open("svc_SMOTE.pkl" , 'wb') as file:
    pickle.dump(svc_s_best, file)

GridSearchCV za SVC uz BorderlineSMOTE oversampling i spremanje najboljeg modela u svc_BorderlineSMOTE.pkl:

In [37]:
# Oversample-then-classify pipeline: BorderlineSMOTE followed by an SVC.
# Both steps are seeded so that re-running the cell yields the same resampled
# data, the same selected hyper-parameters and the same pickled model.
svc_bs_pipeline = Pipeline([
        ('sampling', BorderlineSMOTE(random_state=2018)),
        ('classification', SVC(probability=True, random_state=2018))
    ])

# 5-fold cross-validated search over the SVC grid defined in the SMOTE cell.
svc_bs = GridSearchCV(svc_bs_pipeline, parameters_svc, cv = 5)
svc_bs.fit(X_train, y_train)

# Keep the best pipeline found by the search and store it for the main notebook.
svc_bs_best = svc_bs.best_estimator_
with open("svc_BorderlineSMOTE.pkl" , 'wb') as file:
    pickle.dump(svc_bs_best, file)

GridSearchCV za SVC uz ADASYN oversampling i spremanje najboljeg modela u svc_ADASYN.pkl:

In [38]:
# Oversample-then-classify pipeline: ADASYN followed by an SVC.
# Both steps are seeded so that re-running the cell yields the same resampled
# data, the same selected hyper-parameters and the same pickled model.
svc_a_pipeline = Pipeline([
        ('sampling', ADASYN(random_state=2018)),
        ('classification', SVC(probability=True, random_state=2018))
    ])

# 5-fold cross-validated search over the SVC grid defined in the SMOTE cell.
svc_a = GridSearchCV(svc_a_pipeline, parameters_svc, cv = 5)
svc_a.fit(X_train, y_train)

# Keep the best pipeline found by the search and store it for the main notebook.
svc_a_best = svc_a.best_estimator_
with open("svc_ADASYN.pkl" , 'wb') as file:
    pickle.dump(svc_a_best, file)

GridSearchCV za XGB uz SMOTE oversampling i spremanje najboljeg modela u xgb_SMOTE.pkl:

In [39]:
# Oversample-then-classify pipeline: SMOTE followed by an XGBoost classifier.
# The sampler is seeded so the synthetic samples - and therefore the selected
# model - do not change from run to run.
xgb_s_pipeline = Pipeline([
        ('sampling', SMOTE(random_state=2018)),
        ('classification', xgb.XGBClassifier())
    ])

# Hyper-parameter grid for the XGBoost step (also reused by the other
# oversampled XGB searches).
parameters_xgb = {
        'classification__min_child_weight': [1, 5, 10],
        'classification__gamma': [0.5, 1, 1.5, 2, 5],
        'classification__subsample': [0.6, 0.8, 1.0],
        'classification__colsample_bytree': [0.6, 0.8, 1.0],
        'classification__max_depth': [3, 4, 5]
        }

# Exhaustive 5-fold cross-validated search with GridSearchCV's default scoring.
xgb_s = GridSearchCV(xgb_s_pipeline, parameters_xgb, cv = 5)
# ravel() guarantees 1-D labels (it changes nothing if y_train is already 1-D).
xgb_s.fit(X_train, y_train.ravel())

# Keep the best pipeline found by the search and store it for the main notebook.
xgb_s_best = xgb_s.best_estimator_
with open("xgb_SMOTE.pkl" , 'wb') as file:
    pickle.dump(xgb_s_best, file)

GridSearchCV za XGB uz BorderlineSMOTE oversampling i spremanje najboljeg modela u xgb_BorderlineSMOTE.pkl:

In [40]:
# Oversample-then-classify pipeline: BorderlineSMOTE followed by an XGBoost
# classifier. The sampler is seeded so the synthetic samples - and therefore
# the selected model - do not change from run to run.
xgb_bs_pipeline = Pipeline([
        ('sampling', BorderlineSMOTE(random_state=2018)),
        ('classification', xgb.XGBClassifier())
    ])

# 5-fold cross-validated search over the XGB grid defined in the SMOTE cell.
xgb_bs = GridSearchCV(xgb_bs_pipeline, parameters_xgb, cv = 5)
# ravel() matches the XGB + SMOTE cell: it guarantees 1-D labels and changes
# nothing if y_train is already 1-D.
xgb_bs.fit(X_train, y_train.ravel())

# Keep the best pipeline found by the search and store it for the main notebook.
xgb_bs_best = xgb_bs.best_estimator_
with open("xgb_BorderlineSMOTE.pkl" , 'wb') as file:
    pickle.dump(xgb_bs_best, file)

GridSearchCV za XGB uz ADASYN oversampling i spremanje najboljeg modela u xgb_ADASYN.pkl:

In [41]:
# Oversample-then-classify pipeline: ADASYN followed by an XGBoost classifier.
# The sampler is seeded so the synthetic samples - and therefore the selected
# model - do not change from run to run.
xgb_a_pipeline = Pipeline([
        ('sampling', ADASYN(random_state=2018)),
        ('classification', xgb.XGBClassifier())
    ])

# 5-fold cross-validated search over the XGB grid defined in the SMOTE cell.
xgb_a = GridSearchCV(xgb_a_pipeline, parameters_xgb, cv = 5)
# ravel() matches the XGB + SMOTE cell: it guarantees 1-D labels and changes
# nothing if y_train is already 1-D.
xgb_a.fit(X_train, y_train.ravel())

# Keep the best pipeline found by the search and store it for the main notebook.
xgb_a_best = xgb_a.best_estimator_
with open("xgb_ADASYN.pkl" , 'wb') as file:
    pickle.dump(xgb_a_best, file)

GridSearchCV za RF uz SMOTE oversampling i spremanje najboljeg modela u rf_SMOTE.pkl:

In [42]:
# Oversample-then-classify pipeline: SMOTE followed by a random forest.
# The forest was already seeded; the sampler now uses the same seed, because an
# unseeded SMOTE feeds different synthetic samples to the forest on every run
# and so defeats the classifier's fixed random_state.
rf_s_pipeline = Pipeline([
        ('sampling', SMOTE(random_state=2018)),
        ('classification', RandomForestClassifier(random_state=2018))
    ])

# Hyper-parameter grid for the forest (also reused by the other oversampled RF
# searches): number of trees, and max_features from 1 to 19.
parameters_rf = {
    'classification__n_estimators': [16,32,64,128],
    'classification__max_features': list(range(1,20))
    }

# Exhaustive 5-fold cross-validated search with GridSearchCV's default scoring.
rf_s = GridSearchCV(rf_s_pipeline, parameters_rf, cv = 5)
rf_s.fit(X_train, y_train)

# Keep the best pipeline found by the search and store it for the main notebook.
rf_s_best = rf_s.best_estimator_
with open('rf_SMOTE.pkl' , 'wb') as file:
    pickle.dump(rf_s_best, file)

GridSearchCV za RF uz BorderlineSMOTE oversampling i spremanje najboljeg modela u rf_BorderlineSMOTE.pkl:

In [44]:
# Oversample-then-classify pipeline: BorderlineSMOTE followed by a random
# forest. The sampler uses the same seed as the forest; left unseeded it would
# make the classifier's fixed random_state ineffective for reproducibility.
rf_bs_pipeline = Pipeline([
        ('sampling', BorderlineSMOTE(random_state=2018)),
        ('classification', RandomForestClassifier(random_state=2018))
    ])

# 5-fold cross-validated search over the RF grid defined in the SMOTE cell.
rf_bs = GridSearchCV(rf_bs_pipeline, parameters_rf, cv = 5)
rf_bs.fit(X_train, y_train)

# Keep the best pipeline found by the search and store it for the main notebook.
rf_bs_best = rf_bs.best_estimator_
with open("rf_BorderlineSMOTE.pkl" , 'wb') as file:
    pickle.dump(rf_bs_best, file)

GridSearchCV za RF uz ADASYN oversampling i spremanje najboljeg modela u rf_ADASYN.pkl:

In [46]:
# Oversample-then-classify pipeline: ADASYN followed by a random forest.
# The sampler uses the same seed as the forest; left unseeded it would make the
# classifier's fixed random_state ineffective for reproducibility.
rf_a_pipeline = Pipeline([
        ('sampling', ADASYN(random_state=2018)),
        ('classification', RandomForestClassifier(random_state=2018))
    ])

# 5-fold cross-validated search over the RF grid defined in the SMOTE cell.
rf_a = GridSearchCV(rf_a_pipeline, parameters_rf, cv = 5)
rf_a.fit(X_train, y_train)

# Keep the best pipeline found by the search and store it for the main notebook.
rf_a_best = rf_a.best_estimator_
with open("rf_ADASYN.pkl" , 'wb') as file:
    pickle.dump(rf_a_best, file)

GridSearchCV za SVC bez oversamplinga i spremanje najboljeg modela u svc.pkl:

In [47]:
# SVC without any oversampling step. A single-step pipeline is kept so the
# 'classification__' parameter prefix stays the same as in the oversampled runs.
# The SVC is seeded because probability=True makes its fit stochastic; a fixed
# seed makes the pickled model reproducible.
# NOTE: this cell rebinds svc_s_pipeline / svc_s / svc_s_best, which the SMOTE
# cell also assigns.
svc_s_pipeline = Pipeline([
        ('classification', SVC(probability=True, random_state=2018))
    ])

# Hyper-parameter grid: a linear kernel over C, and an RBF kernel over C x gamma.
parameters_svc = [
    {'classification__C':[1, 10, 100, 1000], 'classification__kernel':['linear']},
    {'classification__C':[1, 10, 100, 1000], 'classification__gamma':[0.001, 0.0001], 'classification__kernel':['rbf']}
    ]

# Exhaustive 5-fold cross-validated search with GridSearchCV's default scoring.
svc_s = GridSearchCV(svc_s_pipeline, parameters_svc, cv = 5)
svc_s.fit(X_train, y_train)

# Keep the best pipeline found by the search and store it for the main notebook.
svc_s_best = svc_s.best_estimator_
with open("svc.pkl" , 'wb') as file:
    pickle.dump(svc_s_best, file)

GridSearchCV za XGB bez oversamplinga i spremanje najboljeg modela u xgb.pkl:

In [48]:
# XGBoost without any oversampling step. A single-step pipeline is kept so the
# 'classification__' parameter prefix stays the same as in the oversampled runs.
# NOTE: this cell rebinds xgb_s_pipeline / xgb_s / xgb_s_best, which the SMOTE
# cell also assigns.
xgb_s_pipeline = Pipeline(steps=[('classification', xgb.XGBClassifier())])

# Hyper-parameter grid for the XGBoost step.
parameters_xgb = dict(
    classification__min_child_weight=[1, 5, 10],
    classification__gamma=[0.5, 1, 1.5, 2, 5],
    classification__subsample=[0.6, 0.8, 1.0],
    classification__colsample_bytree=[0.6, 0.8, 1.0],
    classification__max_depth=[3, 4, 5],
)

# Exhaustive 5-fold cross-validated search; labels are flattened to 1-D.
xgb_s = GridSearchCV(estimator=xgb_s_pipeline, param_grid=parameters_xgb, cv=5)
xgb_s.fit(X_train, y_train.ravel())

# Keep the best pipeline found by the search and store it for the main notebook.
xgb_s_best = xgb_s.best_estimator_
with open("xgb.pkl", 'wb') as file:
    pickle.dump(xgb_s_best, file)

GridSearchCV za RF bez oversamplinga i spremanje najboljeg modela u rf.pkl:

In [49]:
# Random forest without any oversampling step. A single-step pipeline is kept
# so the 'classification__' parameter prefix stays the same as in the
# oversampled runs.
# NOTE: this cell rebinds rf_s_pipeline / rf_s / rf_s_best, which the SMOTE
# cell also assigns.
rf_s_pipeline = Pipeline(
    steps=[('classification', RandomForestClassifier(random_state=2018))]
)

# Hyper-parameter grid: number of trees, and max_features from 1 to 19.
parameters_rf = dict(
    classification__n_estimators=[16, 32, 64, 128],
    classification__max_features=list(range(1, 20)),
)

# Exhaustive 5-fold cross-validated search with GridSearchCV's default scoring.
rf_s = GridSearchCV(estimator=rf_s_pipeline, param_grid=parameters_rf, cv=5)
rf_s.fit(X_train, y_train)

# Keep the best pipeline found by the search and store it for the main notebook.
rf_s_best = rf_s.best_estimator_
with open('rf.pkl', 'wb') as file:
    pickle.dump(rf_s_best, file)