# This notebook trains a model on the dataset produced by the data_processing.ipynb notebook
**We can choose from multiple models**

# Import pandas and sklearn

In [31]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler

# Load our dataset from csv file

In [32]:
# Read the processed dataset; the path is relative to the notebook's working directory.
# NOTE(review): the displayed frame contains an "Unnamed: 0" column, which suggests the CSV
# was written together with its index. It is not used as a feature below — confirm it can
# be dropped at the source.
data = pd.read_csv("agg_processed_with_classes.csv")
# Bare last expression: shows the frame as the cell output.
data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,21,22,23,24,25,26,27,28,29,class
0,727,-99,798,-1380,-1380,-527,-1380,-1380,74,616,...,0,0,0,0,0,0,0,0,0,1
1,517,-1380,-1380,-1380,-446,126,-51,1380,788,-1103,...,0,0,0,0,0,0,0,0,0,1
2,517,-1380,-1380,-1380,-446,126,-51,1380,788,-1103,...,0,0,0,0,0,0,0,0,0,1
3,517,-1380,-1380,-571,80,-287,-287,1021,-1113,-24,...,0,0,0,0,0,0,0,0,0,1
4,517,-1380,-1380,-571,80,-287,-287,1030,-1109,1021,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12338,659,-147,6,45,256,-1398,-1408,-1388,-1408,-516,...,0,0,0,0,0,0,0,0,0,1
12339,457,-1408,6,-1408,-1408,-831,74,220,-1408,-1408,...,-24,80,0,0,0,0,0,0,0,1
12340,517,-99,344,-1420,-1420,-1338,614,-459,-1420,-39,...,0,0,0,0,0,0,0,0,0,1
12341,517,-99,351,-1408,-1408,-1408,-183,711,-103,-349,...,0,0,0,0,0,0,0,0,0,1


# Choose a model to train
**'rf' for Random Forest Classifier   
'gbt' for Gradient Boosting Classifier   
'hgbt' for Histogram Gradient Boosting Classifier**

In [33]:
# Ask which model to train. The answer is normalised (surrounding whitespace removed,
# lower-cased) so that inputs such as " RF" or "HGBT" still match the lower-case keys
# 'rf' / 'gbt' / 'hgbt' used by param_grids instead of failing later during training.
# NOTE: input() waits for the user, so an unattended "Restart & Run All" stops here.
model_type = input("Which model you want to train?: ").strip().lower()

# Create a parameter grid
**For each model, the hyperparameter values whose combinations the grid search will try**

In [34]:
# Hyperparameter search space for every supported model key.
# The pipeline step that holds the classifier is named "estimator", so each grid key
# must carry the "estimator__" prefix; the prefix is added here once instead of being
# repeated on every entry.
param_grids = {
    model_key: {
        f"estimator__{param_name}": candidates
        for param_name, candidates in search_space.items()
    }
    for model_key, search_space in {
        # Random Forest
        "rf": {
            "n_estimators": [100, 300, 600],
            "max_depth": [5, 10, 15, None],
            "min_samples_split": [2, 5, 10],
            "min_samples_leaf": [1, 2, 4],
        },
        # Gradient Boosting
        "gbt": {
            "n_estimators": [100, 300, 600],
            "max_depth": [3, 5, 8],
            "subsample": [0.5, 0.7, 0.9],
        },
        # Histogram Gradient Boosting
        "hgbt": {
            "max_iter": [100, 300, 600],
            "max_leaf_nodes": [15, 20, 31],
        },
    }.items()
}

# Create a function to train our model
**This function takes the model type and the training data as parameters. It uses GridSearchCV to test several hyperparameter combinations and picks the best model**

In [35]:
def train_model(model_type, X_train, y_train):
    """Grid-search the hyperparameters of the selected model and fit it.

    A two-step pipeline (StandardScaler followed by the chosen classifier with its
    default settings) is wrapped in a GridSearchCV that tries every combination in
    ``param_grids[model_type]`` with cv=5, scoring='accuracy' and refit=True.

    Args:
        model_type: Key into ``param_grids``: 'rf', 'gbt' or 'hgbt'.
        X_train: Training features passed to ``GridSearchCV.fit``.
        y_train: Training labels passed to ``GridSearchCV.fit``.

    Returns:
        Whatever ``GridSearchCV.fit`` returns (the fitted search object).

    Raises:
        NotImplementedError: If ``model_type`` is not a key of ``param_grids``.
    """
    if model_type not in param_grids:
        raise NotImplementedError("The desired model was not found")

    # Map each model key to its classifier class; only the selected one is instantiated.
    classifier_classes = {
        "rf": RandomForestClassifier,
        "gbt": GradientBoostingClassifier,
        "hgbt": HistGradientBoostingClassifier,
    }
    pipeline_steps = [
        ("scaler", StandardScaler()),
        # The step name "estimator" is what the "estimator__" grid keys refer to.
        ("estimator", classifier_classes[model_type]()),
    ]

    search = GridSearchCV(
        estimator=Pipeline(pipeline_steps),
        param_grid=param_grids[model_type],
        scoring='accuracy',
        cv=5,
        n_jobs=16,
        refit=True,
        verbose=2,
        # Surface a failing fit immediately instead of recording a placeholder score.
        error_score='raise',
    )
    return search.fit(X_train, y_train)

**Use this function instead if you want to train a single model with specific parameters**

In [36]:
def train_model_quick(model_type, X_train, y_train, args):
    """Fit a single model with explicitly chosen hyperparameters (no grid search).

    The classifier is placed behind a StandardScaler in a two-step pipeline, which is
    then fitted on the training data.

    Args:
        model_type: Key into ``param_grids``: 'rf', 'gbt' or 'hgbt'.
        X_train: Training features passed to ``Pipeline.fit``.
        y_train: Training labels passed to ``Pipeline.fit``.
        args: Dict of hyperparameter values. Only the keys of the selected model
            are read:
              * 'gbt'  -> 'trees', 'depth', 'subsample'
              * 'hgbt' -> 'trees', 'leaf_nodes'
              * 'rf'   -> 'trees', 'depth', 'min_leaf', 'min_split'

    Returns:
        Whatever ``Pipeline.fit`` returns (the fitted pipeline).

    Raises:
        NotImplementedError: If ``model_type`` is not a key of ``param_grids``.
        KeyError: If ``args`` lacks a key required by the selected model.
    """
    if model_type not in param_grids:
        raise NotImplementedError("The desired model was not found")

    # Factories rather than instances: building is deferred until the model is chosen,
    # so args only needs the keys of that model.
    estimator_factories = {
        "gbt": lambda: GradientBoostingClassifier(
            n_estimators=args['trees'],
            max_depth=args['depth'],
            subsample=args['subsample'],
            verbose=1,
        ),
        "hgbt": lambda: HistGradientBoostingClassifier(
            max_iter=args['trees'],
            max_leaf_nodes=args['leaf_nodes'],
            early_stopping=False,
            verbose=1,
        ),
        "rf": lambda: RandomForestClassifier(
            n_estimators=args['trees'],
            max_depth=args['depth'],
            # Look the values up in args; passing the key names themselves as lists
            # is not a valid value for these parameters.
            min_samples_leaf=args['min_leaf'],
            min_samples_split=args['min_split'],
            verbose=1,
        ),
    }

    base_pipe = Pipeline([
        ("scaler", StandardScaler()),
        ("estimator", estimator_factories[model_type]()),
    ])

    return base_pipe.fit(X_train, y_train)

# We need to define the input features and target feature we want to classify

In [37]:
# Feature columns are named after their position, as strings: '0', '1', ..., '29'.
input_features = [str(column_index) for column_index in range(30)]
# Column that holds the label to predict.
target_feature = 'class'

# We will split our dataset into training and testing data 9 to 1 (90% training, 10% testing)

In [38]:
# Hold out 10% of the rows for testing. The fixed random_state makes the split, and
# therefore every score computed from it, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    data[input_features],
    data[target_feature],
    test_size=0.1,
    random_state=42,
)

# Run the function to train our model

**Create model with specific parameters**

In [24]:
# Train one model with a fixed set of hyperparameters (no grid search).
# The dict supplies the values that train_model_quick looks up by key.
# NOTE(review): the recorded run of this cell shows the training loss blowing up from
# the third iteration and ends in a KeyboardInterrupt — revisit these values.
my_model = train_model_quick(
    model_type,
    X_train,
    y_train,
    {
        'trees': 100,
        'depth': 15,
        'min_leaf': 3,
        'min_split': 2,
        'subsample': 0.5,
        'leaf_nodes': 15,
    },
)

      Iter       Train Loss      OOB Improve   Remaining Time 
         1           0.2142          -0.1858           26.74s
         2           0.4009       -2425.5325           51.85s
         3   198628742.8921  -484464745.8378            1.05m
         4   264837609.7088    61792510.9218            1.13m
         5   455776276.4774   189284245.8494            1.18m
         6   387001040.6498   -68775235.8464            1.21m
         7   364352003.4141   -22649037.1839            1.23m
         8   303918211.8450   -60436593.1211            1.24m
         9   484664109.8509   180745898.0233            1.24m
        10   422378769.3656   -62285340.4704            1.24m


KeyboardInterrupt: 

**Use the preconfigured combinations of hyperparameters and choose the best one**

In [39]:
# Run the grid search for the chosen model type on the training split.
# This rebinds my_model, replacing any model produced by the quick-training cell.
my_model = train_model(model_type, X_train, y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits




**See which hyperparameters were chosen as the best**

In [40]:
# Show the winning hyperparameter combination of the grid search.
# This expects my_model to be the object returned by train_model (a fitted GridSearchCV);
# the pipeline returned by train_model_quick is presumably lacking this attribute, so run
# this cell only after the grid-search cell.
my_model.best_params_

{'estimator__max_iter': 100, 'estimator__max_leaf_nodes': 15}

**See the model's predictions. This is not used anywhere else; it is here just for fun**

In [41]:
# Predict a class label for every row of the held-out test features.
y_pred = my_model.predict(X_test)
# Bare last expression: shows the predicted labels as the cell output.
y_pred

array([1, 1, 1, ..., 1, 1, 1], dtype=int64)

# Score our model using the score() method. The closer to 1.0 the better

In [42]:
# Evaluate the model on the held-out test split. train_model configures the search with
# scoring='accuracy', so this is presumably the test-set accuracy.
# NOTE(review): the predictions displayed by the previous cell show only class 1, so
# compare this number against the share of the majority class before relying on it.
my_model.score(X_test, y_test)

0.9149797570850202

# Export the model

In [44]:
import pickle

**Export our model using pickle**

In [45]:
# Serialise the fitted model to disk. The context manager guarantees that the file
# handle is flushed and closed, even if pickling raises.
# NOTE(review): the file name says "hgbt" regardless of model_type — confirm that is intended.
with open("network_classificator_home_hgbt.dat", "wb") as model_file:
    pickle.dump(my_model, model_file)