In [1]:
%matplotlib inline 
import re
import nltk
import numpy as np
import pandas as pd
from time import time
from sklearn import metrics
from nltk.corpus import stopwords
from textblob import TextBlob, Word
from sklearn.model_selection import KFold
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split as tts
from sklearn.feature_extraction.text import TfidfVectorizer

def clean_corpus(corpus):
    """Normalise every document in ``corpus`` to lowercase ASCII letters.

    Each document is lowercased, every character outside ``a-z``/``A-Z`` is
    replaced by a space, and runs of whitespace are collapsed to a single
    space (leading and trailing whitespace is dropped).

    Parameters
    ----------
    corpus : pandas.Series or any sized iterable of str
        Raw documents. The input is left unmodified.

    Returns
    -------
    numpy.ndarray
        1-D object array of cleaned strings, in the same order as the input.
    """
    # Iterate over the values instead of indexing with ``corpus[i]``: that is a
    # label-based lookup and fails on a Series whose index is not 0..n-1 (for
    # example after filtering rows). It also avoids ``Series.get_values()``,
    # which was removed in pandas 1.0, and avoids writing into the caller's data.
    cleaned = [
        ' '.join(re.sub("[^a-zA-Z]", " ", text.lower()).split())
        for text in corpus
    ]
    return np.array(cleaned, dtype=object)

# English Snowball stemmer instance used by tokenize().
snowball = SnowballStemmer('english')
# Regex tokenizer: every token is a run of ASCII letters and/or apostrophes.
tokenizer = RegexpTokenizer(r'[a-zA-Z\']+')

def tokenize(text):
    """Lowercase ``text``, split it with ``tokenizer`` and stem every token.

    Returns the list of stems produced by ``snowball``, in order of appearance.
    """
    stems = []
    for token in tokenizer.tokenize(text.lower()):
        stems.append(snowball.stem(token))
    return stems

def split_into_lemmas(text):
    """Return the lemmas of the lowercased words in ``text``.

    Parameters
    ----------
    text : str or bytes
        Document to process. ``bytes`` input is decoded as UTF-8; text that
        is already decoded is used as is.

    Returns
    -------
    list
        ``word.lemmatize()`` for each entry of ``TextBlob(text).words``.
    """
    # The previous ``unicode(text, 'utf-8')`` call only exists on Python 2 and
    # raises NameError on Python 3 (and TypeError on Python 2 when handed text
    # that is already decoded). Decoding only byte strings works on both.
    if isinstance(text, bytes):
        text = text.decode('utf-8')
    words = TextBlob(text.lower()).words
    return [word.lemmatize() for word in words]

# Load the tab-separated review data set. quoting=3 is csv.QUOTE_NONE, so
# quote characters inside a review are kept as ordinary text.
# NOTE(review): absolute, machine-specific path - adjust before re-running elsewhere.
DATA_PATH = "/home/jluis2/datosbi.csv"
data = pd.read_csv(DATA_PATH, header=0, delimiter="\t", quoting=3)
x = data['review']
y = data.label
corpus = clean_corpus(x)

# TF-IDF features over unigrams and bigrams of the stems returned by tokenize().
vectorizer = TfidfVectorizer(tokenizer=tokenize, sublinear_tf=True, norm='l2', ngram_range=(1, 2),
                             max_features=30000, min_df=5, stop_words='english', use_idf=True)
X = vectorizer.fit_transform(corpus)

# The cell used to end with an indented, body-less
# ``if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):`` which is a
# syntax error (and ``np.float`` has been removed from NumPy). The same dtype
# check is kept as an explicit sanity assertion instead.
assert np.issubdtype(X.dtype, np.floating), "TF-IDF matrix should be floating point"


In [2]:
def benchmark(i, clf, n_splits=10, random_state=None):
    """Cross-validate ``clf`` on the module-level ``X``/``y`` and print a summary.

    Every fold is fitted exactly once; the estimator's own ``score`` and the
    negated mean squared error are both evaluated on that same fitted model,
    so the two reported metrics always refer to identical splits.

    Parameters
    ----------
    i : int
        Identifier of the run, echoed in the printed summary.
    clf : estimator
        Unfitted scikit-learn estimator to evaluate.
    n_splits : int, default 10
        Number of shuffled K-fold splits.
    random_state : int or None, default None
        Seed for the fold shuffling; pass an int for reproducible splits.

    Returns
    -------
    dict
        Keys ``id``, ``avg_score``, ``min_score``, ``max_score``,
        ``train_time`` (seconds spent in cross-validation) and ``avg_mse``.
    """
    # Imported here so this cell does not depend on an edit to the import cell.
    from sklearn.model_selection import cross_validate

    def _default_score(estimator, X_fold, y_fold):
        # Same metric cross_val_score uses when no ``scoring`` is given.
        return estimator.score(X_fold, y_fold)

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    t0 = time()
    # One pass instead of two cross_val_score calls: the second call used to
    # retrain every fold (doubling the run time) on differently shuffled splits.
    cv_results = cross_validate(
        clf, X, y, cv=kf,
        scoring={'score': _default_score, 'neg_mse': 'neg_mean_squared_error'})
    train_time = time() - t0

    scores = cv_results['test_score']
    avg_score = np.mean(scores)
    minv = scores.min()
    maxv = scores.max()
    # The scorer returns the negated MSE; flip the sign back for reporting.
    avg_mse = -np.mean(cv_results['test_neg_mse'])

    print('=' * 80)
    print(clf)
    print("{0}. AVG: {1:.4f}, MIN: {2:.4f}, MAX: {3:.4f}, TIME: {4:.3f}, AVGMSE: {5:.4f}".format(
        i, avg_score, minv, maxv, train_time, avg_mse))

    # Return the metrics so that ``results.append(benchmark(...))`` stores
    # something useful instead of None.
    return {
        'id': i,
        'avg_score': avg_score,
        'min_score': minv,
        'max_score': maxv,
        'train_time': train_time,
        'avg_mse': avg_mse,
    }

# Collected return values of the benchmark() calls in the cells below.
results = []

In [4]:
# First batch of MLP configurations; they are benchmarked in this order as runs 1-7.
first_batch = [
    MLPClassifier(activation='relu', solver='adam', learning_rate_init=0.001, max_iter=50),
    MLPClassifier(activation='relu', solver='adam', learning_rate_init=0.001, max_iter=100),
    MLPClassifier(activation='relu', solver='adam', learning_rate_init=0.05, max_iter=50),
    MLPClassifier(activation='tanh', solver='lbfgs'),
    MLPClassifier(solver='adam', learning_rate_init=0.001, hidden_layer_sizes=(100,100)),
    MLPClassifier(solver='sgd', learning_rate_init=0.0003, momentum=0.98),
    MLPClassifier(solver='adam', learning_rate_init=0.0001),
]
for run_id, model in enumerate(first_batch, start=1):
    results.append(benchmark(run_id, model))

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=50, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)
1. AVG: 0.8256, MIN: 0.8151, MAX: 0.8377, TIME: 8329.008, AVGMSE: 2.7989
MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=100, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)
2. AVG: 0.8256, MIN: 0.8201, MAX: 0.8298, TIME: 8617.799, AV



MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.0003, max_iter=200, momentum=0.98,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='sgd', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)
6. AVG: 0.8716, MIN: 0.8663, MAX: 0.8761, TIME: 52515.334, AVGMSE: 2.0568
MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.0001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)
7. AVG: 0.8273, MIN: 0.8208, MAX: 0.8326, TIME: 24592.46

In [None]:
# Run 8: SGD solver with two hidden layers of 100 units. The line previously
# read ``rprint chkesults.append(...)`` (stray text typed into ``results``),
# which is a syntax error.
results.append(benchmark(8, MLPClassifier(solver='sgd', learning_rate_init=0.0001, hidden_layer_sizes=(100,100))))


MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100, 100), learning_rate='constant',
       learning_rate_init=0.0001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='sgd', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)
8. AVG: 0.6000, MIN: 0.5939, MAX: 0.6031, TIME: 2572.391, AVGMSE: 6.4000


In [None]:
# Third batch of MLP configurations; they are benchmarked in this order as runs 9-11.
third_batch = [
    MLPClassifier(activation='relu', max_iter=25, batch_size=5),
    MLPClassifier(solver='adam', alpha=0.001, hidden_layer_sizes=(100,100), random_state=1,
                  activation='logistic', early_stopping=True),
    MLPClassifier(solver='adam', alpha=0.001, hidden_layer_sizes=(100,100), activation='logistic'),
]
for run_id, model in enumerate(third_batch, start=9):
    results.append(benchmark(run_id, model))

