## This script trains a model on the dataset produced by the data_processing.ipynb notebook
**We can choose from multiple models**

# Import pandas and sklearn

In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split

# Load our dataset from csv file

In [3]:
# Read the dataset CSV into a DataFrame; the path is relative to the working directory.
data = pd.read_csv("final_dataset.csv")
# Bare last expression: the frame is rendered as this cell's output.
# NOTE(review): the "Unnamed: 0" / "Unnamed: 0.1" columns visible in the output look like
# index columns written by earlier to_csv calls — confirm in data_processing.ipynb.
data

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,...,21,22,23,24,25,26,27,28,29,class
0,0,517,-1240,-1240,-1231,80,-287,-287,0,0,...,0,0,0,0,0,0,0,0,0,1
1,1,517,-1368,-1368,-457,-457,64,-614,272,-31,...,0,0,0,0,0,0,0,0,0,1
2,2,517,-156,51,-69,68,-38,733,1024,1024,...,0,0,0,0,0,0,0,0,0,1
3,3,463,-1412,-1412,-1412,-305,80,485,-618,-31,...,0,0,0,0,0,0,0,0,0,1
4,4,561,-212,64,-586,46,37,-31,35,871,...,-539,-68,-230,215,-76,39,89,-83,39,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1031090,1031090,517,-1238,-1238,-822,64,1238,442,-179,-62,...,0,0,0,0,0,0,0,0,0,1
1031091,1031091,212,-1330,-1330,-1330,-106,-1330,-589,93,-51,...,0,0,0,0,0,0,0,0,0,1
1031092,1031092,461,-1448,-1448,-621,80,345,-287,-287,-1448,...,-1448,-1448,-1448,-1448,-1448,-1448,-1448,-1448,-517,1
1031093,1031093,552,-133,-41,-1400,-1364,-246,-286,-74,80,...,72,72,76,76,72,72,1460,1309,1460,1


# Choose a model to train
**'rf' for Random Forest Classifier   
'gbt' for Gradient Boosting Classifier   
'hgbt' for Histogram Gradient Boosting Classifier**

In [15]:
# Normalize the answer so that inputs such as " GBT " or "RF" still match the
# lowercase keys of param_grids ('rf', 'gbt', 'hgbt') instead of being rejected.
model_type = input("Which model you want to train?: ").strip().lower()

# Create a parameter grid
**Each model is paired with a grid of hyperparameter values to try**

In [5]:
# Hyperparameter grids per model key. The bare parameter names below are
# prefixed with "estimator__" so that they address the pipeline step named
# "estimator".
param_grids = {
    model_key: {
        f"estimator__{hyperparameter}": candidates
        for hyperparameter, candidates in grid.items()
    }
    for model_key, grid in {
        "rf": {
            "n_estimators": [100, 300, 600],
            "max_depth": [5, 10, 15, None],
            "min_samples_split": [2, 5, 10],
            "min_samples_leaf": [1, 2, 4],
        },
        "gbt": {
            "n_estimators": [100, 300, 600],
            "max_depth": [3, 5, 8],
            "subsample": [0.5, 0.7, 0.9],
        },
        "hgbt": {
            "max_iter": [100, 300, 600],
            "max_leaf_nodes": [15, 20, 31],
        },
    }.items()
}

# Create a function to train our model
**This function takes the model type and the training data as parameters. It uses GridSearchCV to test several hyperparameter combinations and picks the best model**

In [16]:
def train_model(model_type, X_train, y_train, cv=5, n_jobs=3, random_state=None):
  """Grid-search the hyperparameters of the selected classifier and fit it.

  Parameters:
    model_type: key selecting the classifier and its grid in `param_grids`
      ('rf', 'gbt' or 'hgbt').
    X_train: training input features.
    y_train: training target labels.
    cv: cross-validation setting forwarded to GridSearchCV (default 5).
    n_jobs: number of parallel jobs forwarded to GridSearchCV (default 3).
    random_state: seed forwarded to the estimator; the default None keeps
      the estimator unseeded, as before.

  Returns:
    The result of `GridSearchCV.fit`, i.e. the fitted search object
    (refit on the best parameter combination because `refit=True`).

  Raises:
    NotImplementedError: if `model_type` has no entry in `param_grids`.
  """
  if model_type not in param_grids:
    raise NotImplementedError("The desired model was not found")

  # Map keys to classes (not instances) so only the requested estimator is built.
  estimator_classes = {
      "rf": RandomForestClassifier,
      "gbt": GradientBoostingClassifier,
      "hgbt": HistGradientBoostingClassifier,
  }
  estimator = estimator_classes[model_type](random_state=random_state)

  # The step name "estimator" must match the "estimator__" prefix of the grid keys.
  base_pipe = Pipeline([("estimator", estimator)])

  model = GridSearchCV(
      base_pipe,
      param_grids[model_type],
      cv=cv,
      n_jobs=n_jobs,
      verbose=2,
      scoring='accuracy',
      error_score='raise',  # fail fast instead of silently scoring a broken fit
      refit=True,
  )

  return model.fit(X_train, y_train)

# We need to define the input features and target feature we want to classify

In [7]:
# The feature columns are named after their position: '0' through '29'.
input_features = [str(column_index) for column_index in range(30)]
# Column holding the label to predict.
target_feature = "class"

# We will split our dataset into training and testing data 9 to 1 (90% training, 10% testing)

In [8]:
# Fix the seed so that restarting the kernel and re-running the notebook
# yields the same training and testing rows.
X_train, X_test, y_train, y_test = train_test_split(
    data[input_features],
    data[target_feature],
    test_size=0.1,
    random_state=42,
)

# Run the function to train our model

In [17]:
# Run the grid search for the chosen model type and keep what train_model returns.
# NOTE: this can take a very long time — the log below shows roughly 20 minutes
# per single fit for the 'gbt' grid (27 candidates x 5 folds = 135 fits).
my_model = train_model(model_type, X_train, y_train)

Fitting 5 folds for each of 27 candidates, totalling 135 fits
[CV] END estimator__max_depth=3, estimator__n_estimators=100, estimator__subsample=0.5; total time=19.6min
[CV] END estimator__max_depth=3, estimator__n_estimators=100, estimator__subsample=0.5; total time=19.7min
[CV] END estimator__max_depth=3, estimator__n_estimators=100, estimator__subsample=0.5; total time=19.8min
[CV] END estimator__max_depth=3, estimator__n_estimators=100, estimator__subsample=0.5; total time=19.7min
[CV] END estimator__max_depth=3, estimator__n_estimators=100, estimator__subsample=0.5; total time=19.8min


KeyboardInterrupt: 

**See which hyperparameters were chosen as the best**

In [10]:
# Display the hyperparameter combination that the grid search selected as best.
my_model.best_params_

{'estimator__max_iter': 300, 'estimator__max_leaf_nodes': 31}

**See the model's predictions. This cell is not needed by the rest of the notebook; it is here just for fun**

In [11]:
# Predict labels for the held-out test features.
y_pred = my_model.predict(X_test)
# Bare last expression: shows the predicted array as the cell output.
y_pred

array([1, 2, 1, ..., 1, 1, 1], shape=(103110,))

# Score our model using the score() method. The closer to 1.0 the better

In [12]:
# Evaluate the fitted model on the held-out test split.
# NOTE(review): the search was built with scoring='accuracy', so this is presumably
# test-set accuracy — confirm against the GridSearchCV.score documentation.
my_model.score(X_test, y_test)

0.9799631461545922

# Export the model

In [13]:
import pickle

**Export our model using pickle**

In [14]:
# Use a context manager so the file handle is flushed and closed even if
# pickling fails; the original left the handle open.
# NOTE: only unpickle this file from a trusted source — pickle.load can execute
# arbitrary code.
with open("network_classificator.dat", "wb") as model_file:
  pickle.dump(my_model, model_file)