In [4]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from data_preprocessing import get_matrix

[nltk_data] Downloading package punkt to /Users/peliculas/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/peliculas/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
def grid_search_cv(model, parameters, dataset, X):
    
    Y = dataset.label.values
    
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, shuffle = True)
    
    iterations = 5
    index = ShuffleSplit(n_splits = iterations, test_size = 0.2)
    
    clf = GridSearchCV(model, parameters, cv = index, scoring = 'accuracy')
    clf.fit(X_train, y_train)
    
    best_model = clf.best_params_
    best_score_validation = clf.best_score_

    Y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, Y_pred)
    
    return best_model, round(best_score_validation, 3), round(accuracy, 3)

In [6]:
dataset = pd.read_csv('../data/Merged/spanish_dataset.csv')
dataset

Unnamed: 0,text,label
0,"RAE INCLUIRÁ LA PALABRA ""LADY"" EN EL DICCIONAR...",1
1,"La palabra ""haiga"", aceptada por la RAE La Rea...",1
2,YORDI ROSADO ESCRIBIRÁ Y DISEÑARÁ LOS NUEVOS L...,1
3,UNAM capacitará a maestros para aprobar prueba...,0
4,Alerta: pretenden aprobar libros escolares con...,1
5,Un paso más cerca de hacer los exámenes 'onlin...,0
6,UNAM REALIZARÁ PRUEBAS ANTIDOPING A ESTUDIANTE...,1
7,Niño de *NUMBER* años se prepara para entrar a...,0
8,*NUMBER* palabras que creíamos inaceptables y ...,0
9,LIMITARÁN EL TIEMPO DE EGRESO EN FILOSOFÍA Y S...,1


In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

#### Step 1

In [8]:
matrix_1, df = get_matrix(data = dataset, representation = 'BoW', vocabulary_length = 5000, stemming = False, remove_stopwords = False, language = 'spanish')
matrix_2, df = get_matrix(data = dataset, representation = 'tf-idf', vocabulary_length = 5000, stemming = False, remove_stopwords = False, language = 'spanish')
print(matrix_1.shape)
print(matrix_2.shape)

(2571, 5000)
(2571, 5000)


In [9]:
model_RF = RandomForestClassifier()
model_GBT = GradientBoostingClassifier()
parameters_RF_GBT = {'n_estimators': [50, 100, 200, 300, 500], 
                     'max_features': [50, 100, 200, 300]}

In [10]:
model_SVC = SVC()
parameters_SVC = {'kernel': ['linear', 'rbf'],
                  'C': [1e3, 1, 0.001],
                  'gamma': [0.1, 1]}

In [11]:
model_MLPC = MLPClassifier(activation = 'relu', solver = 'adam')
parameters_MLPC = {'hidden_layer_sizes': [(10),(50),(10,10),(50,50),(10,10,10),(50,50,50)],
                   'max_iter': [1000,1500]}

matrix_1 = BoW; matrix_2 = tf-idf

In [9]:
print('RF')
print('BoW', grid_search_cv(model_RF, parameters_RF_GBT, df, matrix_1))
print('tf-idf', grid_search_cv(model_RF, parameters_RF_GBT, df, matrix_2))
print('GBT')
print('BoW', grid_search_cv(model_GBT, parameters_RF_GBT, df, matrix_1))
print('tf-idf', grid_search_cv(model_GBT, parameters_RF_GBT, df, matrix_2))
print('SVC')
print('BoW', grid_search_cv(model_SVC, parameters_SVC, df, matrix_1))
print('tf-idf', grid_search_cv(model_SVC, parameters_SVC, df, matrix_2))
print('MLPC')
print('BoW', grid_search_cv(model_MLPC, parameters_MLPC, df, matrix_1))
print('tf-idf', grid_search_cv(model_MLPC, parameters_MLPC, df, matrix_2))

RF
BoW ({'max_features': 100, 'n_estimators': 500}, 0.75, 0.788)
tf-idf ({'max_features': 50, 'n_estimators': 200}, 0.748, 0.75)
GBT
BoW ({'max_features': 50, 'n_estimators': 500}, 0.755, 0.753)
tf-idf ({'max_features': 100, 'n_estimators': 300}, 0.733, 0.775)
SVC
BoW ({'C': 1, 'gamma': 0.1, 'kernel': 'linear'}, 0.72, 0.732)
tf-idf ({'C': 1000.0, 'gamma': 1, 'kernel': 'rbf'}, 0.76, 0.769)
MLPC
BoW ({'hidden_layer_sizes': (10, 10, 10), 'max_iter': 1500}, 0.735, 0.713)
tf-idf ({'hidden_layer_sizes': (50, 50, 50), 'max_iter': 1500}, 0.729, 0.757)


#### Step 2

In [12]:
def execute_step_2(model, parameters, representation):
    dictionary = [10000, 20000, 30000, 40000]
    stemming = [False, True]
    stop_words = [False, True]
    for i in range(np.size(dictionary)):
        for j in range(2):
            for k in range(2):
                matrix, df = get_matrix(dataset, representation = representation, 
                                        vocabulary_length = dictionary[i], stemming = stemming[j], remove_stopwords = stop_words[k],  language = 'spanish')
                print(matrix.shape)
                best_model, best_score_validation, accuracy = grid_search_cv(model, parameters, df, matrix)
                print(dictionary[i], 'Stemming: ' + str(stemming[j]), 'Remove_StopWords: ' + str(stop_words[k]), best_score_validation, accuracy)

In [13]:
# Complete according with the above results
parameters_RF = {'n_estimators': [500], 
                     'max_features': [100]}
parameters_GBT = {'n_estimators': [300], 
                     'max_features': [100]}
parameters_SVC = {'kernel': ['rbf'],
                  'C': [1e3],
                  'gamma': [1]}
parameters_MLPC = {'hidden_layer_sizes': [(50,50,50)],
                   'max_iter': [1500]}

In [14]:
print('RF')
execute_step_2(model_RF, parameters_RF, 'BoW')
print('GBT')
execute_step_2(model_GBT, parameters_GBT, 'tf-idf')
print('SVC')
execute_step_2(model_SVC, parameters_SVC, 'tf-idf')
print('MLPC')
execute_step_2(model_MLPC, parameters_MLPC, 'tf-idf')

RF
(2571, 10000)
10000 Stemming: False Remove_StopWords: False 0.76 0.744
(2571, 10000)
10000 Stemming: False Remove_StopWords: True 0.764 0.715
(2571, 10000)
10000 Stemming: True Remove_StopWords: False 0.781 0.765
(2571, 10000)
10000 Stemming: True Remove_StopWords: True 0.755 0.761
(2571, 20000)
20000 Stemming: False Remove_StopWords: False 0.766 0.79
(2571, 20000)
20000 Stemming: False Remove_StopWords: True 0.76 0.792
(2571, 18194)
20000 Stemming: True Remove_StopWords: False 0.78 0.812
(2571, 18013)
20000 Stemming: True Remove_StopWords: True 0.762 0.771
(2571, 30000)
30000 Stemming: False Remove_StopWords: False 0.786 0.794
(2571, 30000)
30000 Stemming: False Remove_StopWords: True 0.756 0.767
(2571, 18194)
30000 Stemming: True Remove_StopWords: False 0.781 0.761
(2571, 18013)
30000 Stemming: True Remove_StopWords: True 0.767 0.765
(2571, 35336)
40000 Stemming: False Remove_StopWords: False 0.764 0.783
(2571, 35172)
40000 Stemming: False Remove_StopWords: True 0.743 0.804
(2571,