In [31]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

from data_preprocessing import get_matrix

In [32]:
def grid_search_cv(model, parameters, train_matrix, Y_train, test_matrix, Y_test,
                   n_splits=5, train_size=.75, random_state=None, verbose=12):
    """Grid-search ``model`` on the training data and score the winner on the test data.

    Every candidate in ``parameters`` is evaluated by accuracy on ``n_splits``
    random train/validation splits of ``train_matrix`` produced by
    ``ShuffleSplit``. The fitted search object is then used to predict
    ``test_matrix`` and the predictions are compared with ``Y_test``.

    Parameters
    ----------
    model : estimator
        Estimator handed to ``GridSearchCV``.
    parameters : dict or list of dict
        Parameter grid handed to ``GridSearchCV``.
    train_matrix, Y_train
        Features and labels used for the search.
    test_matrix, Y_test
        Features and labels used only for the final accuracy.
    n_splits : int, default 5
        Number of shuffle-split iterations.
    train_size : float, default 0.75
        Fraction of the training data used for fitting in each split.
    random_state : int or None, default None
        Seed for ``ShuffleSplit``. Pass an integer to get the same splits on
        every run; ``None`` keeps the splits random from run to run.
    verbose : int, default 12
        Verbosity forwarded to ``GridSearchCV``; lower it to avoid one log
        line per fit in the notebook output.

    Returns
    -------
    tuple
        ``(best_params, best_score_validation, accuracy)``: the search's
        ``best_params_``, its ``best_score_``, and the accuracy of its
        predictions on the test data.
    """
    shuffle_split = ShuffleSplit(n_splits=n_splits, train_size=train_size,
                                 random_state=random_state)
    # The splits are generated once here so every candidate is scored on the
    # same train/validation partitions.
    splits = shuffle_split.split(train_matrix, Y_train)

    clf = GridSearchCV(model, parameters, cv=splits, scoring='accuracy', verbose=verbose)
    clf.fit(train_matrix, Y_train)

    best_params = clf.best_params_
    best_score_validation = clf.best_score_

    # Predict with the fitted search object and score against the held-out labels.
    Y_pred = clf.predict(test_matrix)
    accuracy = accuracy_score(Y_test, Y_pred)

    return best_params, best_score_validation, accuracy

In [5]:
# Read the merged training split from disk and show it as the cell output.
# NOTE(review): the preview contains an "Unnamed: 0" column, which looks like
# a saved index; it is kept because the schema get_matrix expects is not
# visible from this notebook.
train_csv = '../data/Merged/train.csv'
train = pd.read_csv(train_csv)
train

Unnamed: 0,text,label
0,"RAE INCLUIRÁ LA PALABRA ""LADY"" EN EL DICCIONAR...",1
1,"La palabra ""haiga"", aceptada por la RAE La Rea...",1
2,YORDI ROSADO ESCRIBIRÁ Y DISEÑARÁ LOS NUEVOS L...,1
3,UNAM capacitará a maestros para aprobar prueba...,0
4,Alerta: pretenden aprobar libros escolares con...,1
...,...,...
2271,"Recuperamos la historia de Aleixandra, la jove...",0
2272,"Reproches, tensión y sinceridad: la comida en ...",0
2273,"RT @ElMundoOpinion: ""PSOE, PP, Ciudadanos y Vo...",0
2274,Rusia cita al embajador español por unas decla...,0


In [6]:
# Read the merged test split from disk and show it as the cell output.
# NOTE(review): the preview contains an "Unnamed: 0" column, which looks like
# a saved index; it is kept because the schema get_matrix expects is not
# visible from this notebook.
test_csv = '../data/Merged/test.csv'
test = pd.read_csv(test_csv)
test

Unnamed: 0,text,label
0,MAESTRA DE *NUMBER* AÑOS QUE TUVO RELACIONES C...,1
1,Oxford lanza sus propios exámenes de certifica...,0
2,La RAE estudia incluir «machirulo» en el Dicci...,0
3,Malala Yousafzai anuncia que estudiará en Oxfo...,0
4,Nombran a Ricardo Arjona nuevo miembro de la R...,1
...,...,...
290,Meryl Streep disfrutó unos premios Oscar tan m...,0
291,EL PLAGIO DE LANA DEL REY A RADIOHEAD FUE ACOR...,1
292,Ricardo Arjona lanza una serie documental por ...,0
293,Raúl Araiza sorprende a Andrea Legarreta con b...,0


In [34]:
# Build the 'BoW' feature matrix for the training frame with get_matrix and
# print its shape. Presumably stop_words_flag=True drops Spanish stop words —
# confirm in data_preprocessing.
bow_options = dict(vocabulary_length=3000, stop_words_flag=True, language='spanish')
train_matrix, df_train = get_matrix(train, 'BoW', **bow_options)
print(train_matrix.shape)

(2276, 3000)


In [35]:
# Build the 'BoW' feature matrix for the test frame and print its shape.
# The returned frame is bound to df_test (not df_train) so the frame returned
# for the training data stays available under its own name.
# NOTE(review): get_matrix is called separately for train and test. If it
# builds its vocabulary from the frame it receives, the 3000 columns here may
# not correspond to the columns of train_matrix, which would make test
# predictions meaningless — confirm in data_preprocessing.
test_matrix, df_test = get_matrix(test, 'BoW', vocabulary_length = 3000, stop_words_flag = True, language = 'spanish')
print(test_matrix.shape)

(295, 3000)


In [22]:
# Extract the label column of each split as a plain Python list, then print
# the number of labels in the training split followed by the test split.
Y_train = train['label'].tolist()
Y_test = test['label'].tolist()
for labels in (Y_train, Y_test):
    print(np.size(labels))

2276
295


### Random Forest

In [36]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with default settings; the grid varies the number of trees
# (n_estimators) and the number of features tried per split (max_features).
model = RandomForestClassifier()
parameters = dict(
    n_estimators=[5, 10, 20, 50, 100],
    max_features=[5, 10, 50, 100],
)

In [37]:
# Run the grid search for the random forest. The cell output is the tuple
# (best parameters, best validation score, accuracy on the test split).
grid_search_cv(
    model=model,
    parameters=parameters,
    train_matrix=train_matrix,
    Y_train=Y_train,
    test_matrix=test_matrix,
    Y_test=Y_test,
)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV] max_features=5, n_estimators=5 ..................................
[CV] ...... max_features=5, n_estimators=5, score=0.610, total=   0.1s
[CV] max_features=5, n_estimators=5 ..................................
[CV] ...... max_features=5, n_estimators=5, score=0.654, total=   0.1s
[CV] max_features=5, n_estimators=5 ..................................
[CV] ...... max_features=5, n_estimators=5, score=0.591, total=   0.1s

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.1s remaining:    0.0s



[CV] max_features=5, n_estimators=5 ..................................
[CV] ...... max_features=5, n_estimators=5, score=0.636, total=   0.1s
[CV] max_features=5, n_estimators=5 ..................................
[CV] ...... max_features=5, n_estimators=5, score=0.634, total=   0.1s
[CV] max_features=5, n_estimators=10 .................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    0.4s remaining:    0.0s


[CV] ..... max_features=5, n_estimators=10, score=0.640, total=   0.1s
[CV] max_features=5, n_estimators=10 .................................
[CV] ..... max_features=5, n_estimators=10, score=0.666, total=   0.1s
[CV] max_features=5, n_estimators=10 .................................
[CV] ..... max_features=5, n_estimators=10, score=0.659, total=   0.1s

[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    0.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    0.5s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    0.6s remaining:    0.0s



[CV] max_features=5, n_estimators=10 .................................
[CV] ..... max_features=5, n_estimators=10, score=0.668, total=   0.1s
[CV] max_features=5, n_estimators=10 .................................
[CV] ..... max_features=5, n_estimators=10, score=0.638, total=   0.1s
[CV] max_features=5, n_estimators=20 .................................


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.7s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  11 out of  11 | elapsed:    0.9s remaining:    0.0s


[CV] ..... max_features=5, n_estimators=20, score=0.703, total=   0.2s
[CV] max_features=5, n_estimators=20 .................................
[CV] ..... max_features=5, n_estimators=20, score=0.670, total=   0.2s
[CV] max_features=5, n_estimators=20 .................................
[CV] ..... max_features=5, n_estimators=20, score=0.671, total=   0.2s
[CV] max_features=5, n_estimators=20 .................................
[CV] ..... max_features=5, n_estimators=20, score=0.677, total=   0.2s
[CV] max_features=5, n_estimators=20 .................................
[CV] ..... max_features=5, n_estimators=20, score=0.663, total=   0.3s
[CV] max_features=5, n_estimators=50 .................................
[CV] ..... max_features=5, n_estimators=50, score=0.724, total=   0.5s
[CV] max_features=5, n_estimators=50 .................................
[CV] ..... max_features=5, n_estimators=50, score=0.740, total=   0.5s
[CV] max_features=5, n_estimators=50 .................................
[CV] .

[CV] .... max_features=50, n_estimators=50, score=0.733, total=   0.6s
[CV] max_features=50, n_estimators=50 ................................
[CV] .... max_features=50, n_estimators=50, score=0.701, total=   0.5s
[CV] max_features=50, n_estimators=100 ...............................
[CV] ... max_features=50, n_estimators=100, score=0.757, total=   1.1s
[CV] max_features=50, n_estimators=100 ...............................
[CV] ... max_features=50, n_estimators=100, score=0.736, total=   1.1s
[CV] max_features=50, n_estimators=100 ...............................
[CV] ... max_features=50, n_estimators=100, score=0.731, total=   1.1s
[CV] max_features=50, n_estimators=100 ...............................
[CV] ... max_features=50, n_estimators=100, score=0.763, total=   1.1s
[CV] max_features=50, n_estimators=100 ...............................
[CV] ... max_features=50, n_estimators=100, score=0.731, total=   1.5s
[CV] max_features=100, n_estimators=5 ................................
[CV] .

[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:   45.0s finished


({'max_features': 50, 'n_estimators': 100},
 0.7437609841827768,
 0.5457627118644067)

Model