# Validation and Training

## Import Packages and Dataset

In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import math
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix,accuracy_score,hamming_loss
from sklearn.model_selection import ShuffleSplit

from skmultilearn.problem_transform import BinaryRelevance
from skmultilearn.problem_transform import ClassifierChain
from skmultilearn.problem_transform import LabelPowerset

import pickle

import warnings
warnings.filterwarnings('ignore')

In [2]:
data_train_full = pd.read_csv("../Data/Data_train_full_tfidf.csv")
data_train_nostopword = pd.read_csv("../Data/Data_train_nostopword_tfidf.csv")
data_train_nostemstop = pd.read_csv("../Data/Data_train_nostemstop_tfidf.csv")

In [3]:
data_train_trans_full = pd.read_csv("../Data/Data_train_trans_full_tfidf.csv")
data_train_trans_nostopword = pd.read_csv("../Data/Data_train_trans_nostopword_tfidf.csv")
data_train_trans_nostemstop = pd.read_csv("../Data/Data_train_trans_nostemstop_tfidf.csv")

## Features Target Split

In [4]:
X_train_full = data_train_full.drop(['HS','Abusive','HS_Individual','HS_Group','HS_Religion','HS_Race','HS_Physical','HS_Gender','HS_Other','HS_Weak','HS_Moderate','HS_Strong'],axis=1)
y_train_full = data_train_full[['HS','Abusive','HS_Individual','HS_Group','HS_Religion','HS_Race','HS_Physical','HS_Gender','HS_Other','HS_Weak','HS_Moderate','HS_Strong']]

X_train_nostopword = data_train_nostopword.drop(['HS','Abusive','HS_Individual','HS_Group','HS_Religion','HS_Race','HS_Physical','HS_Gender','HS_Other','HS_Weak','HS_Moderate','HS_Strong'],axis=1)
y_train_nostopword = data_train_nostopword[['HS','Abusive','HS_Individual','HS_Group','HS_Religion','HS_Race','HS_Physical','HS_Gender','HS_Other','HS_Weak','HS_Moderate','HS_Strong']]

X_train_nostemstop = data_train_nostemstop.drop(['HS','Abusive','HS_Individual','HS_Group','HS_Religion','HS_Race','HS_Physical','HS_Gender','HS_Other','HS_Weak','HS_Moderate','HS_Strong'],axis=1)
y_train_nostemstop = data_train_nostemstop[['HS','Abusive','HS_Individual','HS_Group','HS_Religion','HS_Race','HS_Physical','HS_Gender','HS_Other','HS_Weak','HS_Moderate','HS_Strong']]

In [5]:
X_train_trans_full = data_train_trans_full.drop(['HS','Abusive','HS_Individual','HS_Group','HS_Religion','HS_Race','HS_Physical','HS_Gender','HS_Other','HS_Weak','HS_Moderate','HS_Strong'],axis=1)
y_train_trans_full = data_train_trans_full[['HS','Abusive','HS_Individual','HS_Group','HS_Religion','HS_Race','HS_Physical','HS_Gender','HS_Other','HS_Weak','HS_Moderate','HS_Strong']]

X_train_trans_nostopword = data_train_trans_nostopword.drop(['HS','Abusive','HS_Individual','HS_Group','HS_Religion','HS_Race','HS_Physical','HS_Gender','HS_Other','HS_Weak','HS_Moderate','HS_Strong'],axis=1)
y_train_trans_nostopword = data_train_trans_nostopword[['HS','Abusive','HS_Individual','HS_Group','HS_Religion','HS_Race','HS_Physical','HS_Gender','HS_Other','HS_Weak','HS_Moderate','HS_Strong']]

X_train_trans_nostemstop = data_train_trans_nostemstop.drop(['HS','Abusive','HS_Individual','HS_Group','HS_Religion','HS_Race','HS_Physical','HS_Gender','HS_Other','HS_Weak','HS_Moderate','HS_Strong'],axis=1)
y_train_trans_nostemstop = data_train_trans_nostemstop[['HS','Abusive','HS_Individual','HS_Group','HS_Religion','HS_Race','HS_Physical','HS_Gender','HS_Other','HS_Weak','HS_Moderate','HS_Strong']]

## Cross-Validation for Hyperparameter Tuning

**1. Classifier Chains**

In [9]:
classifier_CC = ClassifierChain(SVC())

**2. Label Powerset**

In [10]:
classifier_LP = LabelPowerset(SVC())

### Random Search Cross Validation

In [11]:
random_grid = {'classifier__C': [.0001, .001, .01],
                'classifier__kernel': ['linear', 'rbf', 'poly'],
                'classifier__gamma': [.0001, .001, .01, .1, 1, 10, 100],
                'classifier__degree': [1, 2, 3, 4, 5],
                'classifier__probability': [True,False]
            }

**1. Classifier Chains**

In [12]:
cv_sets = ShuffleSplit(n_splits = 1, test_size = 0.10, random_state = 8)

random_search_CC = RandomizedSearchCV(estimator=classifier_CC,
                                   param_distributions=random_grid,
                                   n_iter=10,
                                   scoring=None,
                                   cv=cv_sets, 
                                   verbose=15,
                                   n_jobs = -1)

In [13]:
random_search_CC.fit(X_train_full, y_train_full)

Fitting 1 folds for each of 10 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed: 15.7min
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed: 35.6min
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed: 54.6min
[Parallel(n_jobs=-1)]: Done   4 out of  10 | elapsed: 75.5min remaining: 113.2min
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed: 96.3min remaining: 96.3min
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed: 105.5min remaining: 70.3min
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed: 105.5min remaining: 45.2min
[Parallel(n_jobs=-1)]: Done   8 out of  10 | elapsed: 149.3min remaining: 37.3min
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 174.1min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 174.1min finished


RandomizedSearchCV(cv=ShuffleSplit(n_splits=1, random_state=8, test_size=0.1, train_size=None),
                   error_score='raise-deprecating',
                   estimator=ClassifierChain(classifier=SVC(C=1.0,
                                                            cache_size=200,
                                                            class_weight=None,
                                                            coef0=0.0,
                                                            decision_function_shape='ovr',
                                                            degree=3,
                                                            gamma='auto_deprecated',
                                                            kernel='rbf',
                                                            max_iter=-1,
                                                            probability=False,
                                                            random_state=None,
        

In [14]:
print("Best Params : ",random_search_CC.best_params_)
print()
means = random_search_CC.cv_results_['mean_test_score']
stds = random_search_CC.cv_results_['std_test_score']

for mean, std, params in zip(means, stds, random_search_CC.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))

Best Params :  {'classifier__probability': True, 'classifier__kernel': 'linear', 'classifier__gamma': 0.1, 'classifier__degree': 2, 'classifier__C': 0.01}

0.402 (+/-0.000) for {'classifier__probability': True, 'classifier__kernel': 'linear', 'classifier__gamma': 1, 'classifier__degree': 2, 'classifier__C': 0.0001}
0.419 (+/-0.000) for {'classifier__probability': True, 'classifier__kernel': 'linear', 'classifier__gamma': 0.1, 'classifier__degree': 2, 'classifier__C': 0.01}
0.402 (+/-0.000) for {'classifier__probability': False, 'classifier__kernel': 'poly', 'classifier__gamma': 1, 'classifier__degree': 2, 'classifier__C': 0.01}
0.402 (+/-0.000) for {'classifier__probability': True, 'classifier__kernel': 'rbf', 'classifier__gamma': 0.0001, 'classifier__degree': 3, 'classifier__C': 0.001}
0.402 (+/-0.000) for {'classifier__probability': False, 'classifier__kernel': 'poly', 'classifier__gamma': 0.0001, 'classifier__degree': 5, 'classifier__C': 0.01}
0.402 (+/-0.000) for {'classifier__prob

**2. Label Powerset**

In [15]:
cv_sets = ShuffleSplit(n_splits = 1, test_size = 0.10, random_state = 8)

random_search_LP = RandomizedSearchCV(estimator=classifier_LP,
                                   param_distributions=random_grid,
                                   n_iter=10,
                                   scoring=None,
                                   cv=cv_sets, 
                                   verbose=15,
                                   n_jobs = -1)

In [16]:
random_search_LP.fit(X_train_full, y_train_full)

Fitting 1 folds for each of 10 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:  5.3min
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:  5.4min
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed: 23.9min
[Parallel(n_jobs=-1)]: Done   4 out of  10 | elapsed: 25.6min remaining: 38.4min
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed: 27.8min remaining: 27.8min
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed: 32.8min remaining: 21.9min
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed: 37.6min remaining: 16.1min
[Parallel(n_jobs=-1)]: Done   8 out of  10 | elapsed: 50.4min remaining: 12.6min
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 69.5min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 69.5min finished


RandomizedSearchCV(cv=ShuffleSplit(n_splits=1, random_state=8, test_size=0.1, train_size=None),
                   error_score='raise-deprecating',
                   estimator=LabelPowerset(classifier=SVC(C=1.0, cache_size=200,
                                                          class_weight=None,
                                                          coef0=0.0,
                                                          decision_function_shape='ovr',
                                                          degree=3,
                                                          gamma='auto_deprecated',
                                                          kernel='rbf',
                                                          max_iter=-1,
                                                          probability=False,
                                                          random_state=None,
                                                          shrinking=True...
            

In [17]:
print("Best Params : ",random_search_LP.best_params_)
print()
means = random_search_LP.cv_results_['mean_test_score']
stds = random_search_LP.cv_results_['std_test_score']

for mean, std, params in zip(means, stds, random_search_LP.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))

Best Params :  {'classifier__probability': True, 'classifier__kernel': 'poly', 'classifier__gamma': 10, 'classifier__degree': 3, 'classifier__C': 0.0001}

0.402 (+/-0.000) for {'classifier__probability': False, 'classifier__kernel': 'rbf', 'classifier__gamma': 1, 'classifier__degree': 2, 'classifier__C': 0.0001}
0.402 (+/-0.000) for {'classifier__probability': True, 'classifier__kernel': 'rbf', 'classifier__gamma': 0.01, 'classifier__degree': 2, 'classifier__C': 0.0001}
0.402 (+/-0.000) for {'classifier__probability': False, 'classifier__kernel': 'poly', 'classifier__gamma': 0.0001, 'classifier__degree': 3, 'classifier__C': 0.01}
0.402 (+/-0.000) for {'classifier__probability': True, 'classifier__kernel': 'linear', 'classifier__gamma': 100, 'classifier__degree': 4, 'classifier__C': 0.001}
0.429 (+/-0.000) for {'classifier__probability': True, 'classifier__kernel': 'poly', 'classifier__gamma': 10, 'classifier__degree': 3, 'classifier__C': 0.0001}
0.402 (+/-0.000) for {'classifier__proba

In [None]:
# classifier_BR.get_params().keys()

### Grid Search Cross Validation

In [22]:
param_grid_CC = {'classifier__C': [0.0001, 0.001, 0.01, 0.1, 1],
             'classifier__degree': [2],
             'classifier__gamma': [0.1],
             'classifier__kernel': ['linear'],
             'classifier__probability': [True]
                }

param_grid_LP = {'classifier__C': [0.0001, 0.001, 0.01, 0.1, 1],
             'classifier__degree': [3],
             'classifier__gamma': [10],
             'classifier__kernel': ['poly'],
             'classifier__probability': [True]
                }

**1. Classifier Chains**

In [23]:
cv_sets = ShuffleSplit(n_splits = 1, test_size = .10, random_state = 8)

grid_search_CC = GridSearchCV(estimator=classifier_CC, 
                           param_grid=param_grid_CC,
                           cv=cv_sets,
                           n_jobs = -1,
                           verbose=15)

grid_search_CC.fit(X_train_full,y_train_full)

Fitting 1 folds for each of 5 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed: 56.3min
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed: 73.3min remaining: 109.9min
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed: 94.2min remaining: 62.8min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed: 95.9min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed: 95.9min finished


GridSearchCV(cv=ShuffleSplit(n_splits=1, random_state=8, test_size=0.1, train_size=None),
             error_score='raise-deprecating',
             estimator=ClassifierChain(classifier=SVC(C=1.0, cache_size=200,
                                                      class_weight=None,
                                                      coef0=0.0,
                                                      decision_function_shape='ovr',
                                                      degree=3,
                                                      gamma='auto_deprecated',
                                                      kernel='rbf', max_iter=-1,
                                                      probability=False,
                                                      random_state=None,
                                                      shrinking=True, tol=0.001,
                                                      verbose=False),
                                       orde

In [24]:
print("Best Params : ",grid_search_CC.best_params_)
print()
means = grid_search_CC.cv_results_['mean_test_score']
stds = grid_search_CC.cv_results_['std_test_score']

for mean, std, params in zip(means, stds, grid_search_CC.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))

Best Params :  {'classifier__C': 1, 'classifier__degree': 2, 'classifier__gamma': 0.1, 'classifier__kernel': 'linear', 'classifier__probability': True}

0.402 (+/-0.000) for {'classifier__C': 0.0001, 'classifier__degree': 2, 'classifier__gamma': 0.1, 'classifier__kernel': 'linear', 'classifier__probability': True}
0.402 (+/-0.000) for {'classifier__C': 0.001, 'classifier__degree': 2, 'classifier__gamma': 0.1, 'classifier__kernel': 'linear', 'classifier__probability': True}
0.419 (+/-0.000) for {'classifier__C': 0.01, 'classifier__degree': 2, 'classifier__gamma': 0.1, 'classifier__kernel': 'linear', 'classifier__probability': True}
0.611 (+/-0.000) for {'classifier__C': 0.1, 'classifier__degree': 2, 'classifier__gamma': 0.1, 'classifier__kernel': 'linear', 'classifier__probability': True}
0.661 (+/-0.000) for {'classifier__C': 1, 'classifier__degree': 2, 'classifier__gamma': 0.1, 'classifier__kernel': 'linear', 'classifier__probability': True}


**2. Label Powerset**

In [None]:
cv_sets = ShuffleSplit(n_splits = 1, test_size = .10, random_state = 8)

grid_search_LP = GridSearchCV(estimator=classifier_LP, 
                           param_grid=param_grid_LP,
                           cv=cv_sets,
                           n_jobs = -1,
                           verbose=15)

grid_search_LP.fit(X_train_full,y_train_full)

Fitting 1 folds for each of 5 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed: 67.5min
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed: 74.0min remaining: 111.0min


In [9]:
print("Best Params : ",grid_search_LP.best_params_)
print()
means = grid_search_LP.cv_results_['mean_test_score']
stds = grid_search_LP.cv_results_['std_test_score']

for mean, std, params in zip(means, stds, grid_search_LP.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))

Best Params :  {'classifier__bootstrap': False, 'classifier__max_features': 'auto', 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 10, 'classifier__n_estimators': 800}

0.677 (+/-0.000) for {'classifier__bootstrap': False, 'classifier__max_features': 'auto', 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 10, 'classifier__n_estimators': 600}
0.675 (+/-0.000) for {'classifier__bootstrap': False, 'classifier__max_features': 'auto', 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 10, 'classifier__n_estimators': 700}
0.678 (+/-0.000) for {'classifier__bootstrap': False, 'classifier__max_features': 'auto', 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 10, 'classifier__n_estimators': 800}
0.676 (+/-0.000) for {'classifier__bootstrap': False, 'classifier__max_features': 'auto', 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 10, 'classifier__n_estimators': 900}
0.675 (+/-0.000) for {'classif

## Classification

In [15]:
classifier_CC = grid_search_CC.best_estimator_

In [16]:
classifier_CC

ClassifierChain(classifier=RandomForestClassifier(bootstrap=False,
                                                  class_weight=None,
                                                  criterion='gini',
                                                  max_depth=None,
                                                  max_features='auto',
                                                  max_leaf_nodes=None,
                                                  min_impurity_decrease=0.0,
                                                  min_impurity_split=None,
                                                  min_samples_leaf=1,
                                                  min_samples_split=5,
                                                  min_weight_fraction_leaf=0.0,
                                                  n_estimators=600, n_jobs=-1,
                                                  oob_score=False,
                                                  random_state=None, 

In [10]:
classifier_LP = grid_search_LP.best_estimator_

In [11]:
classifier_LP

LabelPowerset(classifier=RandomForestClassifier(bootstrap=False,
                                                class_weight=None,
                                                criterion='gini',
                                                max_depth=None,
                                                max_features='auto',
                                                max_leaf_nodes=None,
                                                min_impurity_decrease=0.0,
                                                min_impurity_split=None,
                                                min_samples_leaf=1,
                                                min_samples_split=10,
                                                min_weight_fraction_leaf=0.0,
                                                n_estimators=800, n_jobs=-1,
                                                oob_score=False,
                                                random_state=None, verbose=0,
                

**1. Classifier Chains**

In [17]:
classifier_CC.fit(X_train_full,y_train_full)

ClassifierChain(classifier=RandomForestClassifier(bootstrap=False,
                                                  class_weight=None,
                                                  criterion='gini',
                                                  max_depth=None,
                                                  max_features='auto',
                                                  max_leaf_nodes=None,
                                                  min_impurity_decrease=0.0,
                                                  min_impurity_split=None,
                                                  min_samples_leaf=1,
                                                  min_samples_split=5,
                                                  min_weight_fraction_leaf=0.0,
                                                  n_estimators=600, n_jobs=-1,
                                                  oob_score=False,
                                                  random_state=None, 

In [18]:
# save the model to disk
filename = '../Model/CC_model_full.sav'
pickle.dump(classifier_CC, open(filename, 'wb'))

In [19]:
classifier_CC.fit(X_train_nostopword,y_train_nostopword)

ClassifierChain(classifier=RandomForestClassifier(bootstrap=False,
                                                  class_weight=None,
                                                  criterion='gini',
                                                  max_depth=None,
                                                  max_features='auto',
                                                  max_leaf_nodes=None,
                                                  min_impurity_decrease=0.0,
                                                  min_impurity_split=None,
                                                  min_samples_leaf=1,
                                                  min_samples_split=5,
                                                  min_weight_fraction_leaf=0.0,
                                                  n_estimators=600, n_jobs=-1,
                                                  oob_score=False,
                                                  random_state=None, 

In [20]:
# save the model to disk
filename = '../Model/CC_model_nostopword.sav'
pickle.dump(classifier_CC, open(filename, 'wb'))

In [21]:
classifier_CC.fit(X_train_nostemstop,y_train_nostemstop)

ClassifierChain(classifier=RandomForestClassifier(bootstrap=False,
                                                  class_weight=None,
                                                  criterion='gini',
                                                  max_depth=None,
                                                  max_features='auto',
                                                  max_leaf_nodes=None,
                                                  min_impurity_decrease=0.0,
                                                  min_impurity_split=None,
                                                  min_samples_leaf=1,
                                                  min_samples_split=5,
                                                  min_weight_fraction_leaf=0.0,
                                                  n_estimators=600, n_jobs=-1,
                                                  oob_score=False,
                                                  random_state=None, 

In [22]:
# save the model to disk
filename = '../Model/CC_model_nostemstop.sav'
pickle.dump(classifier_CC, open(filename, 'wb'))

In [23]:
classifier_CC.fit(X_train_trans_full,y_train_trans_full)

ClassifierChain(classifier=RandomForestClassifier(bootstrap=False,
                                                  class_weight=None,
                                                  criterion='gini',
                                                  max_depth=None,
                                                  max_features='auto',
                                                  max_leaf_nodes=None,
                                                  min_impurity_decrease=0.0,
                                                  min_impurity_split=None,
                                                  min_samples_leaf=1,
                                                  min_samples_split=5,
                                                  min_weight_fraction_leaf=0.0,
                                                  n_estimators=600, n_jobs=-1,
                                                  oob_score=False,
                                                  random_state=None, 

In [24]:
# save the model to disk
filename = '../Model/CC_model_trans_full.sav'
pickle.dump(classifier_CC, open(filename, 'wb'))

In [25]:
classifier_CC.fit(X_train_trans_nostopword,y_train_trans_nostopword)

ClassifierChain(classifier=RandomForestClassifier(bootstrap=False,
                                                  class_weight=None,
                                                  criterion='gini',
                                                  max_depth=None,
                                                  max_features='auto',
                                                  max_leaf_nodes=None,
                                                  min_impurity_decrease=0.0,
                                                  min_impurity_split=None,
                                                  min_samples_leaf=1,
                                                  min_samples_split=5,
                                                  min_weight_fraction_leaf=0.0,
                                                  n_estimators=600, n_jobs=-1,
                                                  oob_score=False,
                                                  random_state=None, 

In [26]:
# save the model to disk
filename = '../Model/CC_model_trans_nostopword.sav'
pickle.dump(classifier_CC, open(filename, 'wb'))

In [27]:
classifier_CC.fit(X_train_trans_nostemstop,y_train_trans_nostemstop)

ClassifierChain(classifier=RandomForestClassifier(bootstrap=False,
                                                  class_weight=None,
                                                  criterion='gini',
                                                  max_depth=None,
                                                  max_features='auto',
                                                  max_leaf_nodes=None,
                                                  min_impurity_decrease=0.0,
                                                  min_impurity_split=None,
                                                  min_samples_leaf=1,
                                                  min_samples_split=5,
                                                  min_weight_fraction_leaf=0.0,
                                                  n_estimators=600, n_jobs=-1,
                                                  oob_score=False,
                                                  random_state=None, 

In [28]:
# save the model to disk
filename = '../Model/CC_model_trans_nostemstop.sav'
pickle.dump(classifier_CC, open(filename, 'wb'))

**3. Label Powerset**

In [12]:
classifier_LP.fit(X_train_full,y_train_full)

LabelPowerset(classifier=RandomForestClassifier(bootstrap=False,
                                                class_weight=None,
                                                criterion='gini',
                                                max_depth=None,
                                                max_features='auto',
                                                max_leaf_nodes=None,
                                                min_impurity_decrease=0.0,
                                                min_impurity_split=None,
                                                min_samples_leaf=1,
                                                min_samples_split=10,
                                                min_weight_fraction_leaf=0.0,
                                                n_estimators=800, n_jobs=-1,
                                                oob_score=False,
                                                random_state=None, verbose=0,
                

In [13]:
# save the model to disk
filename = '../Model/LP_model_full.sav'
pickle.dump(classifier_LP, open(filename, 'wb'))

In [14]:
classifier_LP.fit(X_train_nostopword,y_train_nostopword)

LabelPowerset(classifier=RandomForestClassifier(bootstrap=False,
                                                class_weight=None,
                                                criterion='gini',
                                                max_depth=None,
                                                max_features='auto',
                                                max_leaf_nodes=None,
                                                min_impurity_decrease=0.0,
                                                min_impurity_split=None,
                                                min_samples_leaf=1,
                                                min_samples_split=10,
                                                min_weight_fraction_leaf=0.0,
                                                n_estimators=800, n_jobs=-1,
                                                oob_score=False,
                                                random_state=None, verbose=0,
                

In [15]:
# save the model to disk
filename = '../Model/LP_model_nostopword.sav'
pickle.dump(classifier_LP, open(filename, 'wb'))

In [16]:
classifier_LP.fit(X_train_nostemstop,y_train_nostemstop)

LabelPowerset(classifier=RandomForestClassifier(bootstrap=False,
                                                class_weight=None,
                                                criterion='gini',
                                                max_depth=None,
                                                max_features='auto',
                                                max_leaf_nodes=None,
                                                min_impurity_decrease=0.0,
                                                min_impurity_split=None,
                                                min_samples_leaf=1,
                                                min_samples_split=10,
                                                min_weight_fraction_leaf=0.0,
                                                n_estimators=800, n_jobs=-1,
                                                oob_score=False,
                                                random_state=None, verbose=0,
                

In [17]:
# save the model to disk
filename = '../Model/LP_model_nostemstop.sav'
pickle.dump(classifier_LP, open(filename, 'wb'))

In [18]:
classifier_LP.fit(X_train_trans_full,y_train_trans_full)

LabelPowerset(classifier=RandomForestClassifier(bootstrap=False,
                                                class_weight=None,
                                                criterion='gini',
                                                max_depth=None,
                                                max_features='auto',
                                                max_leaf_nodes=None,
                                                min_impurity_decrease=0.0,
                                                min_impurity_split=None,
                                                min_samples_leaf=1,
                                                min_samples_split=10,
                                                min_weight_fraction_leaf=0.0,
                                                n_estimators=800, n_jobs=-1,
                                                oob_score=False,
                                                random_state=None, verbose=0,
                

In [19]:
# save the model to disk
filename = '../Model/LP_model_trans_full.sav'
pickle.dump(classifier_LP, open(filename, 'wb'))

In [20]:
classifier_LP.fit(X_train_trans_nostopword,y_train_trans_nostopword)

LabelPowerset(classifier=RandomForestClassifier(bootstrap=False,
                                                class_weight=None,
                                                criterion='gini',
                                                max_depth=None,
                                                max_features='auto',
                                                max_leaf_nodes=None,
                                                min_impurity_decrease=0.0,
                                                min_impurity_split=None,
                                                min_samples_leaf=1,
                                                min_samples_split=10,
                                                min_weight_fraction_leaf=0.0,
                                                n_estimators=800, n_jobs=-1,
                                                oob_score=False,
                                                random_state=None, verbose=0,
                

In [21]:
# save the model to disk
filename = '../Model/LP_model_trans_nostopword.sav'
pickle.dump(classifier_LP, open(filename, 'wb'))

In [22]:
classifier_LP.fit(X_train_trans_nostemstop,y_train_trans_nostemstop)

LabelPowerset(classifier=RandomForestClassifier(bootstrap=False,
                                                class_weight=None,
                                                criterion='gini',
                                                max_depth=None,
                                                max_features='auto',
                                                max_leaf_nodes=None,
                                                min_impurity_decrease=0.0,
                                                min_impurity_split=None,
                                                min_samples_leaf=1,
                                                min_samples_split=10,
                                                min_weight_fraction_leaf=0.0,
                                                n_estimators=800, n_jobs=-1,
                                                oob_score=False,
                                                random_state=None, verbose=0,
                

In [23]:
# save the model to disk
filename = '../Model/LP_model_trans_nostemstop.sav'
pickle.dump(classifier_LP, open(filename, 'wb'))