# Example of classifier performance with hyperparameter tuning:

### Classifiers:
    - Random Forest
    - SGD Classifier
    - SVM

### Parameters:
    -  Tf-idf Vectorization 
    -  Vectorization using (3,5) n-grams of characters.
    -  Variance threshold of 0 to remove constant (zero-variance) features. 
    -  5-fold cross-validation 

### Features:
    - Features were POS tags. 
    - Feature selection of the top 50%.

### Hyper parameters:
    - Randomized Search (RandomizedSearchCV) for all three classifiers.

In [1]:
import os
from sklearn.feature_selection import VarianceThreshold
from sklearn import model_selection
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report, make_scorer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectPercentile, SelectFpr, f_classif, chi2, mutual_info_classif
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import matthews_corrcoef


# Function definitions

def classificator_score(clasificador, entrenamiento, y_true=None):
    """Print the best hyper-parameters of a fitted search and score it.

    Every parameter of ``clasificador.best_estimator_`` is printed (sorted by
    name), followed by the Matthews correlation coefficient and the confusion
    matrix of the predictions against the true labels.

    Args:
        clasificador: Fitted search object exposing ``best_estimator_`` and
            ``predict`` (the callers in this file pass a fitted
            ``RandomizedSearchCV``).
        entrenamiento: Feature matrix to predict on. Despite the name
            ("training"), the callers in this file pass the held-out
            ``X_test`` matrix.
        y_true: True labels aligned with ``entrenamiento``. When omitted, the
            module-level ``y_test`` is used, which is the original behaviour.

    Returns:
        The predictions produced by ``clasificador.predict(entrenamiento)``.
    """
    if y_true is None:
        # Backward-compatible fallback: older call sites rely on the global
        # test labels created by the data-preparation cell.
        y_true = y_test

    print('Best parameters:\n')
    best_parameters = clasificador.best_estimator_.get_params()
    for param in sorted(best_parameters):
        print((param, best_parameters[param]))

    predict = clasificador.predict(entrenamiento)
    print('\nMatthews correlation coefficienr:', matthews_corrcoef(y_true, predict))
    print("Confusion Matrix: \n ",confusion_matrix(y_true, predict),'\n\n')
    return predict
     

# Load the POS-tag sequences, one sample per line. The context managers make
# sure both file handles are closed once their contents have been read.
with open("../Dataset/TablesPOS.txt") as tables_file:
    posTables = tables_file.readlines()
with open("../Dataset/NoTablesPOS.txt") as sentences_file:
    posSentenes = sentences_file.readlines()
posComplete = posTables + posSentenes


## Build the class labels: table samples come first in posComplete,
## followed by the non-table samples.
class_label = ['TABLES'] * len(posTables) + ['NONE TABLES'] * len(posSentenes)


## Split data for train and test the model
print("\n\n#############################################################################################################")
print("\nTraining Classifier...")
# NOTE: no random_state is passed, so the split differs between runs.
X_train, X_test, y_train, y_test = train_test_split(posComplete, class_label, train_size=0.80, test_size=0.20)

# Vectorization: tf-idf over character 3- to 5-grams. The vocabulary is
# learned from the training split only and then applied to the test split.
vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(3, 5))
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

# Variance filter with default settings. It is fitted on the training
# features alone; the original call passed X_test in the `y` position.
selector = VarianceThreshold()
X_train = selector.fit_transform(X_train)
X_test = selector.transform(X_test)

# Keep the top 50% of the features ranked by the ANOVA F-score (f_classif),
# again fitted on the training split only.
percentile_filtter = SelectPercentile(f_classif, percentile=50)
X_train = percentile_filtter.fit_transform(X_train, y_train)
X_test = percentile_filtter.transform(X_test)


# Settings shared by the hyper-parameter searches in the following cells.
jobs = -1          # n_jobs value handed to the searches
paramGrid = []     # placeholder; each search cell assigns its own grid
crossV = 5         # number of cross-validation folds
myScorer = make_scorer(f1_score, average='macro')  # macro-averaged F1




#############################################################################################################

Training Classifier...


In [2]:
print("\n\n\n----------------------------------- Random Forest -----------------------------------------")


# Randomized hyper-parameter search over a random forest.
algorithmName = "RandomForest"
classifier = RandomForestClassifier()
paramGrid = dict(
    n_estimators=[100, 150, 200, 300],
    bootstrap=[True, False],
    criterion=["gini", "entropy"],
    class_weight=['balanced', None],
)

# No n_iter is given, so the library default number of candidates is sampled.
myClassifier = model_selection.RandomizedSearchCV(
    estimator=classifier,
    param_distributions=paramGrid,
    scoring=myScorer,
    cv=crossV,
    n_jobs=jobs,
    verbose=1,
)

# Fit on the training split, then report on the held-out test split.
myClassifier.fit(X_train, y_train)
predict = classificator_score(myClassifier, X_test)
print(classification_report(y_test, predict))




----------------------------------- Random Forest -----------------------------------------
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best parameters:

('bootstrap', False)
('ccp_alpha', 0.0)
('class_weight', 'balanced')
('criterion', 'gini')
('max_depth', None)
('max_features', 'auto')
('max_leaf_nodes', None)
('max_samples', None)
('min_impurity_decrease', 0.0)
('min_impurity_split', None)
('min_samples_leaf', 1)
('min_samples_split', 2)
('min_weight_fraction_leaf', 0.0)
('n_estimators', 200)
('n_jobs', None)
('oob_score', False)
('random_state', None)
('verbose', 0)
('warm_start', False)

Matthews correlation coefficienr: 0.7967037381979432
Confusion Matrix: 
  [[9091    6]
 [  19   48]] 


              precision    recall  f1-score   support

 NONE TABLES       1.00      1.00      1.00      9097
      TABLES       0.89      0.72      0.79        67

    accuracy                           1.00      9164
   macro avg       0.94      0.86      0.90      9164
weighted avg       1.00      1.00      1.00      9164

In [3]:
print("\n\n\n----------------------------------- SGDClassifier  -----------------------------------------")

# Randomized hyper-parameter search over an SGD classifier with the
# logistic ('log') loss.
algorithmName = "SGDClassifier"
classifier = SGDClassifier(loss='log')
paramGrid = dict(
    alpha=[10 ** -exponent for exponent in range(7)],  # 1 down to 1e-06
    penalty=['elasticnet', 'l1', 'l2'],
    l1_ratio=[0.15, 0.25, 0.5, 0.75],
    class_weight=['balanced', None],
)

# 100 sampled candidates, each scored with crossV-fold cross-validation.
myClassifier = model_selection.RandomizedSearchCV(
    estimator=classifier,
    param_distributions=paramGrid,
    n_iter=100,
    scoring=myScorer,
    cv=crossV,
    n_jobs=jobs,
    verbose=1,
)

# Fit on the training split, then report on the held-out test split.
myClassifier.fit(X_train, y_train)
predict = classificator_score(myClassifier, X_test)
print(classification_report(y_test, predict))





----------------------------------- SGDClassifier  -----------------------------------------
Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best parameters:

('alpha', 1e-06)
('average', False)
('class_weight', None)
('early_stopping', False)
('epsilon', 0.1)
('eta0', 0.0)
('fit_intercept', True)
('l1_ratio', 0.15)
('learning_rate', 'optimal')
('loss', 'log')
('max_iter', 1000)
('n_iter_no_change', 5)
('n_jobs', None)
('penalty', 'l1')
('power_t', 0.5)
('random_state', None)
('shuffle', True)
('tol', 0.001)
('validation_fraction', 0.1)
('verbose', 0)
('warm_start', False)

Matthews correlation coefficienr: 0.8496470051632571
Confusion Matrix: 
  [[9087   10]
 [  10   57]] 


              precision    recall  f1-score   support

 NONE TABLES       1.00      1.00      1.00      9097
      TABLES       0.85      0.85      0.85        67

    accuracy                           1.00      9164
   macro avg       0.92      0.92      0.92      9164
weighted avg       1.00      1.00      1.00      9164

In [7]:
print("\n\n\n-------------------- Radial Basis Function Support Vector Machine  --------------------")
# Randomized hyper-parameter search over a support vector classifier; both
# the RBF and the linear kernel are candidates.
classifier = SVC()
paramGrid = {
    'C': np.arange(1, 50, 0.5),
    # 0.1 ... 0.9. The leading 0.0 of the range is dropped: with gamma == 0
    # the RBF kernel exp(-gamma * d^2) equals 1 for every pair of samples,
    # so such a candidate cannot separate the classes and only wastes fits.
    'gamma': np.arange(0, 1, 0.1)[1:],
    'kernel': ['rbf', 'linear'],
    'class_weight': ['balanced', None],
}


# The number of iterations was reduced in this example in order to save
# computational time.
RS = model_selection.RandomizedSearchCV(classifier, paramGrid, n_iter=20, cv=crossV, n_jobs=jobs, scoring=myScorer, verbose=1)
# Fit on the training split, then report on the held-out test split.
RS.fit(X_train, y_train)
predict = classificator_score(RS, X_test)
print(classification_report(y_test, predict))




-------------------- Radial Basis Function Support Vector Machine  --------------------
Best parameters:

('C', 15.5)
('break_ties', False)
('cache_size', 200)
('class_weight', None)
('coef0', 0.0)
('decision_function_shape', 'ovr')
('degree', 3)
('gamma', 0.6000000000000001)
('kernel', 'rbf')
('max_iter', -1)
('probability', False)
('random_state', None)
('shrinking', True)
('tol', 0.001)
('verbose', False)

Matthews correlation coefficienr: 0.8322214781563322
Confusion Matrix: 
  [[9087   10]
 [  12   55]] 


              precision    recall  f1-score   support

 NONE TABLES       1.00      1.00      1.00      9097
      TABLES       0.85      0.82      0.83        67

    accuracy                           1.00      9164
   macro avg       0.92      0.91      0.92      9164
weighted avg       1.00      1.00      1.00      9164



# Example of classifier performance without hyperparameter tuning. 

### Classifiers:
    - Random Forest
    - SGD Classifier
    - SVM

### Parameters:
    -  Tf-idf Vectorization 
    -  Vectorization using (3,5) n-grams of characters.
    -  Variance threshold of 0 to remove constant (zero-variance) features. 
    -  5-fold cross-validation 

### Features:
    - Features were POS tags. 
    - Feature selection of the top 50%.

### Hyper parameters:
    - None: every classifier is fitted with its scikit-learn default settings.

In [8]:

# Load the POS-tag sequences, one sample per line. The context managers make
# sure both file handles are closed once their contents have been read.
with open("../Dataset/TablesPOS.txt") as tables_file:
    posTables = tables_file.readlines()
with open("../Dataset/NoTablesPOS.txt") as sentences_file:
    posSentenes = sentences_file.readlines()
posComplete = posTables + posSentenes


## Build the class labels: table samples come first in posComplete,
## followed by the non-table samples.
class_label = ['TABLES'] * len(posTables) + ['NONE TABLES'] * len(posSentenes)


## Split data for train and test the model
print("\n\n#############################################################################################################")
print("\nTraining Classifier...")
# NOTE: no random_state is passed, so the split differs between runs.
X_train, X_test, y_train, y_test = train_test_split(posComplete, class_label, train_size=0.80, test_size=0.20)

# Vectorization: tf-idf over character 3- to 5-grams. The vocabulary is
# learned from the training split only and then applied to the test split.
vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(3, 5))
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

# Variance filter with default settings. It is fitted on the training
# features alone; the original call passed X_test in the `y` position.
selector = VarianceThreshold()
X_train = selector.fit_transform(X_train)
X_test = selector.transform(X_test)

# Keep the top 50% of the features ranked by the ANOVA F-score (f_classif),
# again fitted on the training split only.
percentile_filtter = SelectPercentile(f_classif, percentile=50)
X_train = percentile_filtter.fit_transform(X_train, y_train)
X_test = percentile_filtter.transform(X_test)





#############################################################################################################

Training Classifier...


In [9]:

print("\n\n\n----------------------------------- Random Forest -----------------------------------------")

# Baseline: a random forest with the library defaults, trained on the
# training split (fit returns the estimator itself).
myClassifier = RandomForestClassifier().fit(X_train, y_train)

# Score the held-out test split.
predict = myClassifier.predict(X_test)
print(
    'Matthews correlation coefficienr:',
    matthews_corrcoef(y_test, predict),
)
print(
    "Confusion Matrix: \n ",
    confusion_matrix(y_test, predict),
    '\n\n',
)
print(classification_report(y_test, predict))




----------------------------------- Random Forest -----------------------------------------
Matthews correlation coefficienr: 0.727245112567473
Confusion Matrix: 
  [[9076    3]
 [  37   48]] 


              precision    recall  f1-score   support

 NONE TABLES       1.00      1.00      1.00      9079
      TABLES       0.94      0.56      0.71        85

    accuracy                           1.00      9164
   macro avg       0.97      0.78      0.85      9164
weighted avg       1.00      1.00      1.00      9164



In [10]:
# The banner used to read "SGDClassifier Grid Search", but this cell fits a
# single classifier with default settings and performs no search.
print("\n\n\n----------------------------------- SGDClassifier -----------------------------------------")

# Baseline: an SGD classifier with the library defaults.
myClassifier = SGDClassifier()


# Train on the training split and score the held-out test split.
myClassifier.fit(X_train, y_train)
predict = myClassifier.predict(X_test)
print('Matthews correlation coefficienr:', matthews_corrcoef(y_test, predict))
print("Confusion Matrix: \n ",confusion_matrix(y_test, predict),'\n\n')
print(classification_report(y_test, predict))




----------------------------------- SGDClassifier Grid Search -----------------------------------------
Matthews correlation coefficienr: 0.45404523650383277
Confusion Matrix: 
  [[9065   14]
 [  58   27]] 


              precision    recall  f1-score   support

 NONE TABLES       0.99      1.00      1.00      9079
      TABLES       0.66      0.32      0.43        85

    accuracy                           0.99      9164
   macro avg       0.83      0.66      0.71      9164
weighted avg       0.99      0.99      0.99      9164



In [11]:
print("\n\n\n-------------------- Radial Basis Function Support Vector Machine  --------------------")

# Baseline: a support vector classifier with the library defaults, trained
# on the training split (fit returns the estimator itself).
myClassifier = SVC().fit(X_train, y_train)

# Score the held-out test split.
predict = myClassifier.predict(X_test)
print(
    'Matthews correlation coefficienr:',
    matthews_corrcoef(y_test, predict),
)
print(
    "Confusion Matrix: \n ",
    confusion_matrix(y_test, predict),
    '\n\n',
)
print(classification_report(y_test, predict))




-------------------- Radial Basis Function Support Vector Machine  --------------------
Matthews correlation coefficienr: 0.7236913652398742
Confusion Matrix: 
  [[9065   14]
 [  29   56]] 


              precision    recall  f1-score   support

 NONE TABLES       1.00      1.00      1.00      9079
      TABLES       0.80      0.66      0.72        85

    accuracy                           1.00      9164
   macro avg       0.90      0.83      0.86      9164
weighted avg       0.99      1.00      1.00      9164

