In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, ShuffleSplit, GridSearchCV
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.decomposition import PCA

from data_preprocessing import get_matrix

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

import plot_utils

#### Load Dataset

In [None]:
# Read the Spanish dataset CSV from the data/Merged folder; its `label` column
# supplies the classification targets further down.
dataset = pd.read_csv('data/Merged/spanish_dataset.csv')
dataset

#### Set Up Data Representation

In [None]:
max_vocab_length = 10000 # handed to get_matrix as `vocabulary_length`
language = 'spanish' # handed to get_matrix as `language`

In [None]:
# Build the feature matrix X and the preprocessed frame df from the raw dataset,
# with stemming and stopword removal switched on.
X, df = get_matrix(data = dataset, vocabulary_length = max_vocab_length, 
                        stemming = True, remove_stopwords = True, language = language)
print(X.shape)

In [None]:
df # frame returned by get_matrix: normalized news text, stopwords removed, stemming applied

In [None]:
print(X[0]) # first row of X: the tf-idf representation of one sample

#### Data Split and Grid Search

In [None]:
random_seed = 2 # set random seed for consistency and reproducibility of results
test_size = 0.1 # fraction of all samples held out as the test set
dev_size = 0.1 # fraction of the training samples held out as development (validation) set in each split
iterations = 5 # number of ShuffleSplit re-shuffles (n_splits) used for cross-validation

In [None]:
# NOTE(review): the labels come from the raw `dataset`, not from the `df` returned by
# get_matrix -- confirm get_matrix keeps the row order and row count of its input.
Y = dataset.label.values

# Hold out `test_size` of the samples as the final test set; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = test_size, random_state = random_seed) # train/test
print('train/test:', X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)

In [None]:
def execute_grid(model, parameters, pca, X_transformed_train):
    """Grid-search `parameters` for `model` and summarise the cross-validation results.

    Reads the notebook-level `X_train`, `Y_train`, `iterations`, `dev_size`
    and `random_seed`.

    Parameters
    ----------
    model : estimator handed to GridSearchCV.
    parameters : parameter grid handed to GridSearchCV.
    pca : when truthy the search is fitted on `X_transformed_train`,
        otherwise on the notebook-level `X_train`.
    X_transformed_train : training features used when `pca` is truthy
        (ignored otherwise).

    Returns
    -------
    tuple
        (DataFrame holding the 'params', 'mean_train_acc', 'mean_test_acc' and
        'std_test_acc' columns of the search results, the refitted best
        estimator, the best score rounded to 3 decimals).
    """
    # Feature matrix to search on: the caller-supplied projection or the original features.
    features = X_transformed_train if pca else X_train

    # Repeated random train/dev splits of the training set.
    splitter = ShuffleSplit(n_splits = iterations, test_size = dev_size, random_state = random_seed)

    search = GridSearchCV(model, parameters,
                          scoring = {'acc': make_scorer(accuracy_score)},
                          cv = splitter, return_train_score = True,
                          refit = 'acc', verbose = 1)
    search.fit(features, Y_train)

    wanted_columns = ['params', 'mean_train_acc', 'mean_test_acc', 'std_test_acc']
    summary = pd.DataFrame(search.cv_results_)[wanted_columns]

    return summary, search.best_estimator_, round(search.best_score_, 3)

#### Models

In [None]:
# Candidate classifiers and the hyper-parameter grid searched for each of them.
# The stochastic models are seeded with `random_seed` so that reruns of the
# notebook give the same scores.
RF = RandomForestClassifier(random_state = random_seed)
parameters_RF = {'n_estimators': [100, 200, 300, 500],
                 'max_features': [50, 100, 150]}

# Bound to `SVM`, not `SVC`: `SVC = SVC()` replaces the imported class with an
# instance, so running this cell a second time raises a TypeError.
SVM = SVC()
parameters_SVC = {'kernel': ['linear', 'rbf'],
                  'C': [1e3, 1, 0.001],
                  'gamma': [0.1, 1]}

MLPC = MLPClassifier(activation = 'relu', solver = 'adam', random_state = random_seed)
# `(10)` is just the int 10, so single-layer sizes are spelled as explicit 1-tuples.
parameters_MLPC = {'hidden_layer_sizes': [(10,),(50,),(10,10),(50,50),(10,10,10),(50,50,50)],
                   'max_iter': [1000,1500]}

# Estimator instance -> its parameter grid; iterated by the experiment loops.
models = {RF: parameters_RF,
          SVM: parameters_SVC,
          MLPC: parameters_MLPC}
models

#### Experiments

In [None]:
results = {} # best estimator found for each model -> its best cross-validation score

for model, parameters in models.items():
    print(model)
    # pca = False: the grid search is fitted on the original training features.
    outcomes, best_estimator, best_score = execute_grid(model, parameters, pca = False, X_transformed_train = None)
    print(outcomes.to_string()) # whole results table as text
    results[best_estimator] = best_score
    print(best_estimator, best_score)
    print('')

print('Best estimator per model:')
results

In [None]:
# Rank the fitted estimators by cross-validation score (best first) and keep the top two.
# The sorted key list is sliced directly; rebuilding a dict from it added nothing and
# only preserved the ranking on Python versions with ordered dicts.
best_two = sorted(results, key=results.get, reverse=True)[:2]
best_two

In [None]:
# Reported performance for the two best models: accuracy on the held-out test set

print('Reported performance for the two best models\n')
for model in best_two:
    Y_pred = model.predict(X_test) # X_test was not part of the data the grid search was fitted on
    acc = accuracy_score(Y_test, Y_pred)
    print(str(model) + ' : ' + str(round(acc, 3)))

Learning Curves

In [None]:
# Splitter for the learning curves, configured with the same settings execute_grid uses.
cv = ShuffleSplit(n_splits = iterations, test_size = dev_size, random_state = random_seed)

In [None]:
# Learning curve for the top-ranked estimator on the training data.
# The second argument is the estimator's class name (presumably used as the plot title -- confirm in plot_utils).
estimator = best_two[0]
plot_utils.plot_learning_curve(estimator, estimator.__class__.__name__, X_train, Y_train, 
                   cv = cv, scoring = make_scorer(accuracy_score), ylim = (0,1.1))

In [None]:
# Learning curve for the second-ranked estimator on the training data.
# The second argument is the estimator's class name (presumably used as the plot title -- confirm in plot_utils).
estimator = best_two[1]
plot_utils.plot_learning_curve(estimator, estimator.__class__.__name__, X_train, Y_train, 
                   cv = cv, scoring = make_scorer(accuracy_score), ylim = (0,1.1))

#### Experiments with PCA

Cumulative Variance

In [None]:
# Shape of the training matrix that PCA will receive. Converting to a dense array
# does not change the shape, so it is read directly instead of densifying first.
X_train.shape

In [None]:
cumulative_var_perc = 0.95 # target share of the variance the kept components must explain

try:
    pca = PCA().fit(X_train.toarray()) # kept all components to evaluate cumulative explained variance
    cumulative_variance = np.cumsum(pca.explained_variance_ratio_)

    # Positions whose running total reaches the target. Comparing with >= avoids
    # depending on some rounded value being exactly equal to the target, which
    # raised ValueError whenever no such value existed.
    reached = np.flatnonzero(cumulative_variance >= cumulative_var_perc)
    if reached.size:
        # Position i of the cumulative sum covers i + 1 components.
        n_components = int(reached[0]) + 1
    else:
        # Target never reached (e.g. floating-point shortfall): keep every component.
        n_components = len(cumulative_variance)
    print('Number of components for %.2f cumulative explained variance:'%cumulative_var_perc, n_components)
except np.linalg.LinAlgError:
    # n_components stays undefined in this case; rerun this cell before continuing.
    print('PCA did not converge. Try again')

Experiments applying PCA

In [None]:
# Reduce the train and test features to `n_components` principal components.
pca = PCA(n_components = n_components)
pca.fit(X_train.toarray()) # Fit with only training data
X_transformed_train = pca.transform(X_train.toarray())
X_transformed_test = pca.transform(X_test.toarray()) # test data goes through the projection fitted on the training data

In [None]:
results_pca = {} # best estimator found for each model on the PCA features -> its best cross-validation score

for model, parameters in models.items():
    print(model)
    # pca = True: the grid search is fitted on the PCA-transformed training features.
    outcomes, best_estimator, best_score = execute_grid(model, parameters, pca = True, X_transformed_train = X_transformed_train)
    print(outcomes.to_string()) # whole results table as text
    results_pca[best_estimator] = best_score
    print(best_estimator, best_score)
    print('')

print('Best estimator per model:\n')
results_pca

In [None]:
# Rank the estimators fitted on the PCA features by cross-validation score (best first)
# and keep the top two. The sorted key list is sliced directly; rebuilding a dict from it
# added nothing and only preserved the ranking on Python versions with ordered dicts.
best_two_pca = sorted(results_pca, key=results_pca.get, reverse=True)[:2]
best_two_pca

In [None]:
# Reported performance for the two best models with PCA: accuracy on the held-out test set

print('Reported performance for the two best models after applying PCA\n')
for model in best_two_pca:
    Y_pred = model.predict(X_transformed_test) # predict on transformed test data
    acc = accuracy_score(Y_test, Y_pred)
    print(str(model) + ' : ' + str(round(acc, 3)))

Learning Curves

In [None]:
# Splitter for the PCA learning curves, configured with the same settings execute_grid uses.
cv = ShuffleSplit(n_splits = iterations, test_size = dev_size, random_state = random_seed)

In [None]:
# Learning curve for the top-ranked PCA estimator on the PCA-transformed training data.
# The second argument is the estimator's class name (presumably used as the plot title -- confirm in plot_utils).
estimator = best_two_pca[0]
plot_utils.plot_learning_curve(estimator, estimator.__class__.__name__, X_transformed_train, Y_train, 
                   cv = cv, scoring = make_scorer(accuracy_score), ylim = (0,1.1))

In [None]:
# Learning curve for the second-ranked PCA estimator on the PCA-transformed training data.
# The second argument is the estimator's class name (presumably used as the plot title -- confirm in plot_utils).
estimator = best_two_pca[1]
plot_utils.plot_learning_curve(estimator, estimator.__class__.__name__, X_transformed_train, Y_train, 
                   cv = cv, scoring = make_scorer(accuracy_score), ylim = (0,1.1))