In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from data_preprocessing import get_matrix

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Kevin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Kevin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [16]:
def grid_search_cv(model, parameters, dataset, X, test_size = 0.2,
                   validation_size = 0.2, n_splits = 5, random_state = None):
    """Grid-search `model` on a training split and score the winner on a hold-out split.

    The rows of `X` are paired with `dataset.label.values`, split once into a
    training part and a hold-out part, and the grid search (accuracy scoring,
    shuffle-split cross-validation) is run on the training part only.

    Parameters
    ----------
    model : estimator handed to `GridSearchCV`.
    parameters : parameter grid handed to `GridSearchCV`.
    dataset : object exposing a `label` attribute with a `.values` array
        (a DataFrame with a `label` column in this notebook).
    X : feature matrix with one row per entry of `dataset.label`.
    test_size : fraction of the data reserved for the final hold-out score.
    validation_size : fraction of the training part used as validation fold
        in each shuffle-split iteration.
    n_splits : number of shuffle-split iterations.
    random_state : seed forwarded to both `train_test_split` and `ShuffleSplit`.
        The default `None` keeps the original unseeded behaviour; pass an int
        to make the splits repeatable.

    Returns
    -------
    tuple
        `(best_params, best_validation_score, holdout_accuracy)`.
    """
    Y = dataset.label.values

    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, test_size = test_size, shuffle = True, random_state = random_state)

    # Cross-validation folds are drawn from the training part only, so the
    # hold-out rows never influence hyper-parameter selection.
    index = ShuffleSplit(n_splits = n_splits, test_size = validation_size,
                         random_state = random_state)

    clf = GridSearchCV(model, parameters, cv = index, scoring = 'accuracy')
    clf.fit(X_train, y_train)

    # `best_params_` is the winning hyper-parameter dict, not a fitted model.
    best_params = clf.best_params_
    best_score_validation = clf.best_score_

    # With GridSearchCV's default refit, predict() uses the best estimator
    # retrained on the whole training part.
    Y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, Y_pred)

    return best_params, best_score_validation, accuracy

In [3]:
# Load the merged Spanish dataset; the path is relative to the notebook's
# working directory. The frame carries a `text` and a `label` column.
dataset = pd.read_csv('../data/Merged/spanish_dataset.csv')
# Bare last expression so the notebook renders the frame preview.
dataset

Unnamed: 0,text,label
0,"RAE INCLUIRÁ LA PALABRA ""LADY"" EN EL DICCIONAR...",1
1,"La palabra ""haiga"", aceptada por la RAE La Rea...",1
2,YORDI ROSADO ESCRIBIRÁ Y DISEÑARÁ LOS NUEVOS L...,1
3,UNAM capacitará a maestros para aprobar prueba...,0
4,Alerta: pretenden aprobar libros escolares con...,1
...,...,...
2566,"Recuperamos la historia de Aleixandra, la jove...",0
2567,"Reproches, tensión y sinceridad: la comida en ...",0
2568,"RT @ElMundoOpinion: ""PSOE, PP, Ciudadanos y Vo...",0
2569,Rusia cita al embajador español por unas decla...,0


In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

#### Step 1: compare classifiers on BoW and tf-idf features (vocabulary of 5000, no stemming, stop words kept)

In [6]:
# Build both Step 1 feature matrices from the same raw frame with identical
# preprocessing options; only the text representation differs.
matrix_1, df = get_matrix(
    data = dataset,
    representation = 'BoW',
    vocabulary_length = 5000,
    stemming = False,
    remove_stopwords = False,
    language = 'spanish',
)
# `df` is rebound here to the frame returned by the second call.
matrix_2, df = get_matrix(
    data = dataset,
    representation = 'tf-idf',
    vocabulary_length = 5000,
    stemming = False,
    remove_stopwords = False,
    language = 'spanish',
)
# Sanity check: both matrices should report the same shape.
print(matrix_1.shape)
print(matrix_2.shape)

(2571, 5000)
(2571, 5000)


In [14]:
# Tree ensembles under comparison, both left at their library defaults.
model_RF, model_GBT = RandomForestClassifier(), GradientBoostingClassifier()

# One grid shared by both ensembles: 4 x 4 = 16 candidate settings.
parameters_RF_GBT = dict(
    n_estimators = [50, 100, 200, 300],
    max_features = [50, 100, 200, 300],
)

In [25]:
# Support vector classifier with library defaults; kernel, C and gamma are
# all chosen by the grid search.
model_SVC = SVC()

# NOTE(review): scikit-learn documents `gamma` as the coefficient for the
# 'rbf', 'poly' and 'sigmoid' kernels, so the gamma axis presumably only
# duplicates work for the 'linear' candidates - confirm before trimming.
parameters_SVC = dict(
    kernel = ['linear', 'rbf'],
    C = [1e3, 1, 0.001],
    gamma = [0.1, 1],
)

In [26]:
# Multi-layer perceptron with a fixed activation and solver; the grid varies
# the network shape and the iteration budget.
model_MLPC = MLPClassifier(activation = 'relu', solver = 'adam')

# Each entry lists the width of every hidden layer. Single-layer shapes need
# the trailing comma: `(10)` is just the int 10, whereas `(10,)` is a
# one-element tuple like the other entries.
parameters_MLPC = {'hidden_layer_sizes': [(10,), (50,), (10, 10), (50, 50), (10, 10, 10), (50, 50, 50)],
                   'max_iter': [1000, 1500]}

Legend: `matrix_1` holds the BoW representation and `matrix_2` holds the tf-idf representation.

In [None]:
# Step 1 driver: every classifier is tuned on both representations. Each
# result line is the tuple returned by grid_search_cv.
step_1_experiments = [
    ('RF', model_RF, parameters_RF_GBT),
    ('GBT', model_GBT, parameters_RF_GBT),
    ('SVC', model_SVC, parameters_SVC),
    ('MLPC', model_MLPC, parameters_MLPC),
]
step_1_representations = [('BoW', matrix_1), ('tf-idf', matrix_2)]

for clf_label, clf_estimator, clf_grid in step_1_experiments:
    print(clf_label)
    for rep_label, rep_matrix in step_1_representations:
        print(rep_label, grid_search_cv(clf_estimator, clf_grid, df, rep_matrix))

#### Step 2: sweep vocabulary size, stemming and stop-word removal for a chosen model and representation

In [20]:
def execute_step_2(model, parameters, representation):
    """Run the grid search for every Step 2 preprocessing configuration.

    Iterates over 4 vocabulary sizes x stemming on/off x stop-word removal
    on/off (16 configurations). For each one it rebuilds the feature matrix
    from the notebook-level `dataset` with `get_matrix` and prints the tuple
    returned by `grid_search_cv`, preceded by a line naming the configuration.

    Parameters
    ----------
    model : estimator forwarded to `grid_search_cv`.
    parameters : parameter grid forwarded to `grid_search_cv`.
    representation : representation name forwarded to `get_matrix`
        ('BoW' and 'tf-idf' are the values used elsewhere in this notebook).

    Returns
    -------
    None
        Results are only printed.
    """
    # Integer sizes, matching the int 5000 used in Step 1; literals such as
    # 10e3 are floats.
    vocabulary_sizes = [10000, 20000, 30000, 40000]
    on_off = (False, True)

    for vocabulary_length in vocabulary_sizes:
        for use_stemming in on_off:
            for drop_stopwords in on_off:
                matrix, df = get_matrix(dataset, representation = representation,
                                        vocabulary_length = vocabulary_length,
                                        stemming = use_stemming,
                                        remove_stopwords = drop_stopwords,
                                        language = 'spanish')
                # Label each result so the 16 printed tuples can be told apart.
                print('vocabulary_length = {}, stemming = {}, remove_stopwords = {}'.format(
                    vocabulary_length, use_stemming, drop_stopwords))
                print(grid_search_cv(model, parameters, df, matrix))