# In [None]:
import warnings
import numpy as np
import pandas as pd
from sklearn.svm import SVC
import matplotlib.pyplot as plt
warnings.filterwarnings("ignore")
from sklearn import model_selection
from sklearn.metrics import matthews_corrcoef
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score, make_scorer,classification_report



def classificator_score(clasificador, entrenamiento,y_test):
    """Print a fitted search's best hyper-parameters and its Matthews score.

    Args:
        clasificador: Fitted search object exposing ``best_estimator_`` and
            ``predict`` (callers in this file pass a fitted
            ``RandomizedSearchCV``).
        entrenamiento: Feature matrix to predict on. Despite the name
            ("training"), the callers in this file pass the held-out test
            matrix.
        y_test: True labels aligned row-for-row with ``entrenamiento``.

    Returns:
        The labels predicted by ``clasificador`` for ``entrenamiento``.
    """
    print('Best parameters:\n')
    tuned = clasificador.best_estimator_.get_params()
    # Alphabetical order keeps the dump comparable between runs.
    for name in sorted(tuned):
        print((name, tuned[name]))

    predicted = clasificador.predict(entrenamiento)
    print('Matthews correlation coefficienr:', matthews_corrcoef(y_test, predicted))
    return predicted

def preporcessing_data(type_analyzer,ngram_size,feature_chosed, random_state=None):
    """Load the POS corpora, filter tags, split and TF-IDF vectorise them.

    Reads ``../dataset/TablesPOS.txt`` (labelled ``'TABLES'``) and
    ``../dataset/NoTablesPOS.txt`` (labelled ``'NONE TABLES'``), treating each
    line as one document (presumably a whitespace-separated sequence of POS
    tags -- confirm against the dataset).

    Args:
        type_analyzer: Value forwarded to ``TfidfVectorizer(analyzer=...)``.
        ngram_size: ``(min_n, max_n)`` tuple forwarded to
            ``TfidfVectorizer(ngram_range=...)``.
        feature_chosed: List of tags to keep in every document, or exactly
            ``['all']`` to keep the documents untouched.
        random_state: Optional seed forwarded to ``train_test_split``. The
            default ``None`` keeps the original non-deterministic split.

    Returns:
        ``(x_train, x_test, y_train, y_test)``: the TF-IDF matrices for the
        80 % / 20 % split (after ``VarianceThreshold``) and the matching
        label lists.
    """
    # Context managers make sure both file handles are closed.
    with open("../dataset/TablesPOS.txt") as tables_file:
        postables_local = tables_file.readlines()
    with open("../dataset/NoTablesPOS.txt") as sentences_file:
        possentenes_local = sentences_file.readlines()
    poscomplete_local = postables_local + possentenes_local

    if feature_chosed != ['all']:
        # split() (no argument) strips the trailing newline that readlines()
        # keeps; with split(' ') the last tag of every line became e.g. '.\n'
        # and could never match an entry of feature_chosed.
        new_sentence = [
            ' '.join(tag for tag in sentence.split() if tag in feature_chosed)
            for sentence in poscomplete_local
        ]
    else:
        new_sentence = poscomplete_local

    # Documents keep their file order: all table lines first, then the rest.
    class_label_local = (['TABLES'] * len(postables_local)
                         + ['NONE TABLES'] * len(possentenes_local))

    fun_x_train, fun_x_test, fun_y_train, fun_y_test = train_test_split(
        new_sentence, class_label_local,
        train_size=0.80, test_size=0.20, random_state=random_state)

    # Vocabulary and IDF weights are learnt on the training split only.
    tdifvectorizer = TfidfVectorizer(analyzer = type_analyzer,ngram_range=ngram_size)
    x_train_fun = tdifvectorizer.fit_transform(fun_x_train)
    x_test_fun = tdifvectorizer.transform(fun_x_test)

    # VarianceThreshold is unsupervised: fit it on the training matrix alone
    # (the original passed the test matrix in the ignored ``y`` slot).
    variance_selector = VarianceThreshold()
    variance_selector = variance_selector.fit(x_train_fun)
    x_train_fun = variance_selector.transform(x_train_fun)
    x_test_fun = variance_selector.transform(x_test_fun)

    return  x_train_fun, x_test_fun, fun_y_train, fun_y_test

def _search_and_evaluate(title, estimator, param_grid, n_iter,
                         x_train, x_test, y_train, y_test, scorer_fun):
    """Tune one estimator with a randomized search and score it on the test split.

    Reads the module-level ``crossV`` (CV folds) and ``jobs`` (parallel
    workers) settings. Returns ``(f1, matthews)`` for the test predictions.
    """
    print(title)
    search = model_selection.RandomizedSearchCV(
        estimator, param_grid, n_iter=n_iter, cv=crossV, n_jobs=jobs,
        scoring=scorer_fun, verbose=0)
    search.fit(x_train, y_train)
    predicted = classificator_score(search, x_test, y_test)
    f1 = f1_score(y_test, predicted, pos_label='TABLES', average='binary')
    mcc = matthews_corrcoef(y_test, predicted)
    print(classification_report(y_test, predicted))
    return f1, mcc


def run_classifier_grid(x_train, x_test, y_train, y_test ,scorer_fun):
    """Tune and evaluate Random Forest, SGD and RBF-SVM classifiers.

    Each model is tuned with ``RandomizedSearchCV`` on the training data
    (using the module-level ``crossV`` and ``jobs`` settings) and then scored
    on the test data.

    Args:
        x_train: Training feature matrix.
        x_test: Test feature matrix.
        y_train: Training labels.
        y_test: Test labels; must contain the ``'TABLES'`` label used as the
            positive class for the reported F1 score.
        scorer_fun: Scorer passed to ``RandomizedSearchCV(scoring=...)``.

    Returns:
        ``(f1scores, mathewscores)``: two lists with one entry per model, in
        the order Random Forest, SGDClassifier, RBF SVM.
    """
    # (banner, estimator, search space, number of sampled configurations)
    candidates = [
        ("\n----------------------------------- Random Forest -----------------------------------------",
         RandomForestClassifier(),
         {'n_estimators': [100, 150,200,300],
          'bootstrap': [True, False],
          'criterion': ["gini", "entropy"],
          'class_weight': ['balanced', None]},
         10),  # 10 is RandomizedSearchCV's default n_iter, as used originally
        ("\n----------------------------------- SGDClassifier  -----------------------------------------",
         SGDClassifier(loss = 'log_loss'),
         {'alpha' : [10**(-x) for x in range(7)],
          'penalty' : ['elasticnet', 'l1', 'l2'],
          'l1_ratio' : [0.15, 0.25, 0.5, 0.75],
          'class_weight': ['balanced', None],},
         50),
        # The number of iterations was reduced in this example in order to
        # save computational time.
        ("\n-------------------- Radial Basis Function Support Vector Machine  --------------------",
         SVC(),
         {'C': np.arange(1,50,0.5),
          'gamma': np.arange(0.01,1.01,0.1),
          'kernel': ['rbf'], 'class_weight': ['balanced', None],},
         70),
    ]

    f1scores = []
    mathewscores = []
    for title, estimator, param_grid, n_iter in candidates:
        score, matw = _search_and_evaluate(
            title, estimator, param_grid, n_iter,
            x_train, x_test, y_train, y_test, scorer_fun)
        f1scores.append(score)
        mathewscores.append(matw)
    return f1scores, mathewscores

# Experiment configuration. `jobs` and `crossV` are also read as globals by
# run_classifier_grid, so they must stay defined at module level.
run = 0
jobs = -1
crossV = 5
all_matew = {}
all_scores = {}

vectorization = ['word','char']
feature_selection =  ['None',70,90]
type_scorer = ['weighted', 'binary']
n_gram_range = {'word':[(1,1),(2,2),(3,3),(1,2),(2,3),(1,3)],'char':[(2,3),(2,4),(2,5),(3,4),(3,5)]}
feature_type = {'all_pos':['all'],
                'symbols': [',', '.', ':', 'LRB', 'RRB', 'LCB', 'RCB', 'SYM', 'HYPH', 'NFP'],
                'numbers':['CD'],
                'symbols_numbers': [',', '.', ':', 'LRB', 'RRB', 'LCB', 'RCB', 'SYM', 'HYPH', 'NFP','CD']}


# Full sweep: analyzer x n-gram range x scorer x tag subset x feature selection,
# plus an extra oversampled run whenever the binary scorer is used.
for analyzer in vectorization:
    for ngram in n_gram_range[analyzer]:
        for scorer in type_scorer:

            if scorer == 'weighted':
                run_scorer = make_scorer(f1_score, average='weighted')
            else:
                run_scorer = make_scorer(f1_score, labels=['NONE TABLES','TABLES'],
                                         average = 'binary',pos_label = 'TABLES')

            for feature, which_features in feature_type.items():
                x_train_ds, x_test_ds, y_train_labels, y_test_labels = preporcessing_data(analyzer,ngram,which_features)

                for selection in feature_selection:
                    run += 1

                    # Always start from the untouched matrices. Overwriting
                    # x_train_ds/x_test_ds here made the 90 % selection run on
                    # top of the 70 % one.
                    x_train_sel, x_test_sel = x_train_ds, x_test_ds
                    # NOTE(review): for tag subsets other than 'all_pos' no
                    # selection is applied, so the 70/90 iterations just repeat
                    # the search on identical data -- confirm this is intended.
                    if (selection != 'None') and (feature == 'all_pos'):
                        selector = SelectPercentile(chi2,percentile=selection)
                        x_train_sel = selector.fit_transform(x_train_ds,y_train_labels)
                        x_test_sel = selector.transform(x_test_ds)

                    score_f1,score_mathew = run_classifier_grid(x_train_sel, x_test_sel, y_train_labels, y_test_labels, run_scorer)
                    print(f'Run #{run}')
                    print(f'F1 Score: {score_f1}')
                    print(f'Matthews Correlation: {score_mathew}')
                    print('Vectorization: ',analyzer,'\nN-gram range: ',ngram,'\nScorer: ',scorer,'\nSelected features',feature,'\nFeature selection',selection,'\nOversampling: None')
                    print('########################################################\n\n')
                    all_scores[f'Run{run}_f1'] = score_f1
                    all_matew[f'Run{run}_Matthew'] = score_mathew

                    if scorer != 'weighted':
                        run += 1
                        oversampling = RandomOverSampler()
                        # Keep the resampled data in its own variables so the
                        # following "Oversampling: None" runs still train on
                        # the original, non-resampled split.
                        x_train_over,y_train_over = oversampling.fit_resample(x_train_sel,y_train_labels)
                        score_f1,score_mathew = run_classifier_grid(x_train_over, x_test_sel, y_train_over, y_test_labels, run_scorer)
                        # Same summary as the non-oversampled run, for consistency.
                        print(f'Run #{run}')
                        print(f'F1 Score: {score_f1}')
                        print(f'Matthews Correlation: {score_mathew}')
                        print('Vectorization: ',analyzer,'\nN-gram range: ',ngram,'\nScorer: ',scorer,'\nSelected features',feature,'\nFeature selection',selection,'\nOversampling: Yes')
                        print('########################################################\n\n')
                        all_scores[f'Run{run}_f1'] = score_f1
                        all_matew[f'Run{run}_Matthew'] = score_mathew


# One column per run, one row per classifier (Random Forest, SGD, RBF SVM).
mathew = pd.DataFrame.from_dict(all_matew)
f1 = pd.DataFrame.from_dict(all_scores)

f1.to_csv('../f1_table')
mathew.to_csv('../matthew_table')