In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from data_preprocessing import get_matrix

In [None]:
def grid_search_cv(model, parameters, dataset, X, test_size = 0.2, n_splits = 5, random_state = None):
    """Tune `model` with a grid search and score the winner on a hold-out split.

    The samples are first split into a training part and a hold-out test part.
    The grid search runs on the training part only, using `n_splits` random
    shuffle splits as cross-validation folds and accuracy as the metric. The
    fitted search object is then used to predict the hold-out part.

    Parameters
    ----------
    model : estimator
        Unfitted estimator handed to `GridSearchCV`.
    parameters : dict or list of dict
        Parameter grid handed to `GridSearchCV`.
    dataset : object with a `label` attribute
        Source of the target values (`dataset.label.values`).
    X : array-like or sparse matrix
        Feature matrix; its rows are assumed to line up with the labels in
        `dataset` (TODO confirm against `get_matrix`).
    test_size : float, default 0.2
        Fraction used both for the hold-out split and for each validation
        split inside the search.
    n_splits : int, default 5
        Number of shuffle splits used as cross-validation folds.
    random_state : int or None, default None
        Seed forwarded to the hold-out split and the CV splitter. `None`
        keeps the original unseeded behaviour; pass an int to make the splits
        repeatable (the estimator's own randomness is not affected).

    Returns
    -------
    tuple
        `(best_params, validation_accuracy, test_accuracy)`: the winning
        parameter combination, its mean cross-validated accuracy and the
        accuracy on the hold-out split, both rounded to three decimals.
    """
    Y = dataset.label.values

    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, test_size = test_size, shuffle = True, random_state = random_state)

    # Random shuffle splits of the training part act as the validation folds.
    index = ShuffleSplit(n_splits = n_splits, test_size = test_size, random_state = random_state)

    clf = GridSearchCV(model, parameters, cv = index, scoring = 'accuracy')
    clf.fit(X_train, y_train)

    # Despite the name used by callers, this is the parameter dict, not a fitted model.
    best_model = clf.best_params_
    best_score_validation = clf.best_score_

    # The hold-out part was never seen by the search, so this is the test score.
    Y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, Y_pred)

    return best_model, round(best_score_validation, 3), round(accuracy, 3)

In [None]:
# Load the merged Spanish dataset from the repository's data folder
# (path is relative to the notebook's working directory).
dataset_path = '../data/Merged/spanish_dataset.csv'
dataset = pd.read_csv(dataset_path)
# Leave the frame as the last expression so the notebook renders it.
dataset

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

#### Step 1: compare the BoW and tf-idf representations (5,000-term vocabulary, no stemming, stop words kept)

In [None]:
# Options shared by both Step 1 representations; only `representation` differs.
step_1_options = dict(data = dataset,
                      vocabulary_length = 5000,
                      stemming = False,
                      remove_stopwords = False,
                      language = 'spanish')

# `df` is rebound by the second call; later cells use the frame returned with tf-idf.
matrix_1, df = get_matrix(representation = 'BoW', **step_1_options)
matrix_2, df = get_matrix(representation = 'tf-idf', **step_1_options)

# Print both shapes, BoW first.
for step_1_matrix in (matrix_1, matrix_2):
    print(step_1_matrix.shape)

In [None]:
# Tree ensembles compared in Step 1.
# NOTE(review): neither estimator is given a random_state, so scores may differ between runs.
model_RF, model_GBT = RandomForestClassifier(), GradientBoostingClassifier()

# One grid is shared by both ensembles: number of trees x features tried per split.
parameters_RF_GBT = dict(
    n_estimators = [50, 100, 200, 300, 500],
    max_features = [50, 100, 200, 300],
)

In [None]:
# Support vector classifier compared in Step 1.
model_SVC = SVC()
# The grid is split per kernel: `gamma` is a coefficient of the 'rbf' kernel and
# has no effect on 'linear', so crossing it with 'linear' would only refit the
# same linear model once per gamma value. GridSearchCV accepts a list of grids.
parameters_SVC = [{'kernel': ['linear'],
                   'C': [1e3, 1, 0.001]},
                  {'kernel': ['rbf'],
                   'C': [1e3, 1, 0.001],
                   'gamma': [0.1, 1]}]

In [None]:
# Multi-layer perceptron compared in Step 1 (ReLU activations, Adam optimiser).
model_MLPC = MLPClassifier(activation = 'relu', solver = 'adam')
# Each entry is a tuple with one width per hidden layer. A single-layer entry
# needs the trailing comma: `(10)` is just the int 10, not a 1-tuple.
parameters_MLPC = {'hidden_layer_sizes': [(10,), (50,), (10, 10), (50, 50), (10, 10, 10), (50, 50, 50)],
                   'max_iter': [1000, 1500]}

`matrix_1` is the BoW representation; `matrix_2` is the tf-idf representation.

In [None]:
# Step 1 experiment table: (printed label, estimator, parameter grid).
step_1_runs = [
    ('RF', model_RF, parameters_RF_GBT),
    ('GBT', model_GBT, parameters_RF_GBT),
    ('SVC', model_SVC, parameters_SVC),
    ('MLPC', model_MLPC, parameters_MLPC),
]
# Representations are tried in this order for every estimator.
step_1_matrices = [('BoW', matrix_1), ('tf-idf', matrix_2)]

for step_1_label, step_1_model, step_1_grid in step_1_runs:
    print(step_1_label)
    for step_1_representation, step_1_features in step_1_matrices:
        # Prints (best parameters, validation accuracy, hold-out accuracy).
        print(step_1_representation,
              grid_search_cv(step_1_model, step_1_grid, df, step_1_features))

#### Step 2: sweep vocabulary size, stemming and stop-word removal for each model

In [None]:
def execute_step_2(model, parameters, representation,
                   vocabulary_lengths = (10000, 20000, 30000, 40000),
                   data = None, language = 'spanish'):
    """Run the grid search for every preprocessing combination and print the scores.

    For each vocabulary size, with and without stemming, and with and without
    stop-word removal, the feature matrix is rebuilt with `get_matrix` and the
    model is tuned and scored with `grid_search_cv`. Each combination prints
    the matrix shape, then the settings followed by the validation accuracy
    and the hold-out accuracy.

    Parameters
    ----------
    model : estimator
        Estimator forwarded to `grid_search_cv`.
    parameters : dict or list of dict
        Parameter grid forwarded to `grid_search_cv`.
    representation : str
        Text representation forwarded to `get_matrix` (this notebook uses
        'BoW' and 'tf-idf').
    vocabulary_lengths : iterable of int, default (10000, 20000, 30000, 40000)
        Vocabulary sizes to sweep.
    data : optional
        Dataset handed to `get_matrix`. When omitted, the notebook-level
        `dataset` variable is used.
    language : str, default 'spanish'
        Language forwarded to `get_matrix`.

    Returns
    -------
    None
        Results are only printed.
    """
    if data is None:
        # Fall back to the notebook-level frame loaded earlier in the notebook.
        data = dataset

    for vocabulary_length in vocabulary_lengths:
        for use_stemming in (False, True):
            for drop_stop_words in (False, True):
                matrix, df = get_matrix(data, representation = representation,
                                        vocabulary_length = vocabulary_length,
                                        stemming = use_stemming,
                                        remove_stopwords = drop_stop_words,
                                        language = language)
                print(matrix.shape)
                # The winning parameter dict is not reported in this step.
                _, best_score_validation, accuracy = grid_search_cv(model, parameters, df, matrix)
                print(vocabulary_length, 'Stemming: ' + str(use_stemming),
                      'Remove StopWords: ' + str(drop_stop_words), best_score_validation, accuracy)

In [None]:
# Step 2 grids: fill in every list according to the Step 1 results above.
# NOTE(review): all lists are empty placeholders; the Step 2 searches presumably
# cannot run until they are filled in.
# NOTE(review): `parameters_SVC` and `parameters_MLPC` rebind the Step 1 grids of
# the same name, so the Step 1 cells see these values if re-run afterwards.
ensemble_keys = ('n_estimators', 'max_features')
svc_keys = ('kernel', 'C', 'gamma')
mlpc_keys = ('hidden_layer_sizes', 'max_iter')

# Each comprehension builds a fresh dict with its own independent empty lists.
parameters_RF = {key: [] for key in ensemble_keys}
parameters_GBT = {key: [] for key in ensemble_keys}
parameters_SVC = {key: [] for key in svc_keys}
parameters_MLPC = {key: [] for key in mlpc_keys}

In [None]:
# `execute_step_2` requires the text representation as its third argument;
# calling it without one raises a TypeError.
# 'BoW' is a starting value: switch to 'tf-idf' if that representation scored
# better in Step 1.
step_2_representation = 'BoW'

print('RF')
execute_step_2(model_RF, parameters_RF, step_2_representation)
print('GBT')
execute_step_2(model_GBT, parameters_GBT, step_2_representation)
print('SVC')
execute_step_2(model_SVC, parameters_SVC, step_2_representation)
print('MLPC')
execute_step_2(model_MLPC, parameters_MLPC, step_2_representation)