# Translate class names to numbers

In [2]:
import pandas as pd

**Import our training and testing dataset**

In [3]:
# Read the labelled dataset from a CSV file; the path is relative, so the file
# must sit in the notebook's current working directory.
data = pd.read_csv("final_dataframe_with_classes.csv")
# Bare last expression: the notebook renders the loaded DataFrame.
data

FileNotFoundError: [Errno 2] No such file or directory: 'final_dataframe_with_classes.csv'

**Generate our classes -> Each distinct value in the Service column becomes a new class**

In [None]:
def generate_classes(services=None):
  """Assign a 1-based integer label to every distinct service name.

  Labels are handed out in order of first appearance, so the first distinct
  value gets 1, the second gets 2, and so on.

  Args:
    services: Optional iterable of service names. When omitted, the
      'Service' column of the module-level ``data`` DataFrame is used,
      which keeps the original zero-argument call working.

  Returns:
    dict mapping label number (int, starting at 1) to service name.
    An empty input yields an empty dict.
  """
  if services is None:
    services = data['Service']

  # dict.fromkeys keeps first-seen order while dropping duplicates in one
  # linear pass, instead of a list membership test per row.
  unique_services = dict.fromkeys(services)

  return {annotation: service
          for annotation, service in enumerate(unique_services, start=1)}

# Build the {label number: service name} mapping from the loaded data and display it.
classes = generate_classes()
classes

{1: 'github', 2: 'seznam', 3: 'facebook'}

**Replace class names with their assigned numbers in our dataset**

In [1]:
# `classes` maps number -> name; invert it so each name looks up its number.
name_to_number = {name: number for number, name in classes.items()}

# A single replace call restricted to the 'Service' column: the feature columns
# are never scanned (or accidentally rewritten), and the frame is rebuilt once
# rather than once per class. Re-running the cell is harmless because values
# that are already numbers match no key.
data = data.replace({"Service": name_to_number})

data

NameError: name 'classes' is not defined

# Train our model .. we can choose from several model types

In [None]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split

**'rf' for Random Forest Classifier   
'gbt' for Gradient Boosting Classifier   
'hgbt' for Histogram Gradient Boosting Classifier**

In [None]:
# Normalise the answer: the keys of `param_grids` are lowercase with no padding,
# so "GBT" or "gbt " would otherwise be rejected as an unknown model.
model_type = input("Which model you want to train?: ").strip().lower()

Which model you want to train?: gbt


**Create a parameter grid -> Model with combination of its different hyper parameters**

In [None]:
# Hyper-parameter search spaces, keyed by the short model name the user types.
# Every parameter carries the "estimator__" prefix so that it is routed to the
# pipeline step named "estimator".
param_grids = {
    # Random Forest
    "rf": dict(
        estimator__n_estimators=[100, 300, 600],
        estimator__max_depth=[5, 10, 15, None],
        estimator__min_samples_split=[2, 5, 10],
        estimator__min_samples_leaf=[1, 2, 4],
    ),
    # Gradient Boosting
    "gbt": dict(
        estimator__n_estimators=[100, 300, 600],
        estimator__max_depth=[3, 5, 8],
        estimator__subsample=[0.5, 0.7, 0.9],
    ),
    # Histogram Gradient Boosting
    "hgbt": dict(
        estimator__max_iter=[100, 300, 600],
        estimator__max_leaf_nodes=[15, 20, 31],
    ),
}

**Create a function to train our model -> This function takes the model type and the training data as parameters. It uses GridSearchCV to test several hyperparameter combinations and picks the best model**

In [None]:
def train_model(model_type, X_train, y_train, cv=5, n_jobs=3, random_state=None):
  """Grid-search the hyper-parameters of one classifier and return the fitted search.

  Args:
    model_type: Short model name; must be a key of ``param_grids`` and one of
      'rf', 'gbt' or 'hgbt'.
    X_train: Training feature matrix.
    y_train: Training target labels.
    cv: Number of cross-validation folds (default 5, as before).
    n_jobs: Number of parallel workers for the search (default 3, as before).
    random_state: Seed forwarded to the estimator. The default ``None`` keeps
      the original unseeded behaviour; pass an int for repeatable results.

  Returns:
    The fitted GridSearchCV object; because ``refit=True`` it can be used
    directly for ``predict`` and ``score``.

  Raises:
    NotImplementedError: If ``model_type`` has no parameter grid or no
      matching estimator.
  """
  # Map names to classes (not instances) so only the requested estimator is built.
  estimator_classes = {
      "rf": RandomForestClassifier,
      "gbt": GradientBoostingClassifier,
      "hgbt": HistGradientBoostingClassifier,
  }

  # Check both lookups up front: a grid added without an estimator (or the
  # reverse) should give the same clear error rather than a bare KeyError.
  if model_type not in param_grids or model_type not in estimator_classes:
    raise NotImplementedError("The desired model was not found")

  estimator = estimator_classes[model_type](random_state=random_state)
  # The step name "estimator" must match the "estimator__" prefix in param_grids.
  base_pipe = Pipeline([("estimator", estimator)])

  model = GridSearchCV(
              base_pipe,
              param_grids[model_type],
              cv=cv,
              n_jobs=n_jobs,
              verbose=2,
              scoring='accuracy',
              error_score='raise',  # fail loudly instead of scoring a broken fit as NaN
              refit=True
        )

  return model.fit(X_train, y_train)

**We need to define the input features and target feature we want to classify**

In [None]:
# The feature columns are named with the strings "0" through "29".
input_features = [str(column_index) for column_index in range(30)]
# Column holding the class label to predict.
target_feature = 'Service'

**We will split our dataset into training and testing data 9 to 1 (90% training, 10% testing)**

In [None]:
# Fixed seed so the same rows land in the training and test sets on every run;
# without it the split (and the score computed from it) changes on each execution.
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    data[input_features],
    data[target_feature],
    test_size=0.1,
    random_state=SPLIT_SEED,
)

**Run the function to train our model**

In [None]:
# Run the grid search for the chosen model type on the training split;
# train_model returns the fitted search object.
my_model = train_model(model_type, X_train, y_train)

Fitting 5 folds for each of 27 candidates, totalling 135 fits


**See which hyperparameters were chosen as the best**

In [None]:
# Parameter combination that achieved the best cross-validated score in the search.
my_model.best_params_

{'estimator__max_depth': 5,
 'estimator__n_estimators': 600,
 'estimator__subsample': 0.5}

**See the model's predictions .. not needed for the later steps, it is here just for fun**

In [None]:
# Predict a class label for every row of the held-out test split and display them.
y_pred = my_model.predict(X_test)
y_pred

array([3, 3, 1, 1, 3, 3, 3])

**Score our model using the score() method. The closer to 1.0 the better**

In [None]:
# Score on the held-out test split; the search was configured with
# scoring='accuracy', so this is the fraction of correctly classified rows.
my_model.score(X_test, y_test)

0.8571428571428571

# Export the model

In [None]:
import pickle

**Export our model using pickle**

In [None]:
# Write through a context manager so the file handle is flushed and closed even
# if pickling raises; the bare open() call never closed it explicitly.
with open("my_model.dat", "wb") as model_file:
  pickle.dump(my_model, model_file)