In [1]:
import importlib

import numpy as np

from preprocess import preprocess_treebank
from features_extract import features_extract
import results

Using TensorFlow backend.


In [2]:
# Load the preprocessed treebank data. Only the first four entries of the
# returned sequence are used, in the order (X_train, y_train, X_test, y_test).
data = preprocess_treebank()
X_train, y_train, X_test, y_test = data[0], data[1], data[2], data[3]

In [12]:
# Build the four bag-of-words feature sets: {binary, tf-idf} x two n-gram ranges.
# NOTE(review): the "bigram" sets use ngram_range=(1, 2); presumably that means
# unigrams *and* bigrams together -- confirm against features_extract.
UNIGRAM_RANGE = (1, 1)
BIGRAM_RANGE = (1, 2)

# unigram, binary indicators
X_train_bin_uni, X_test_bin_uni = features_extract(
    X_train, X_test, type="binary", ngram_range=UNIGRAM_RANGE
)
# unigram, tf-idf weights
X_train_tfidf_uni, X_test_tfidf_uni = features_extract(
    X_train, X_test, type="tf-idf", ngram_range=UNIGRAM_RANGE
)
# bigram, binary indicators
X_train_bin_bi, X_test_bin_bi = features_extract(
    X_train, X_test, type="binary", ngram_range=BIGRAM_RANGE
)
# bigram, tf-idf weights
X_train_tfidf_bi, X_test_tfidf_bi = features_extract(
    X_train, X_test, type="tf-idf", ngram_range=BIGRAM_RANGE
)

In [13]:
# Equally weighted GloVe sentence embeddings, precomputed and stored as
# comma-separated files; the train split is read first, then the test split.
X_train_avg_embed_glove, X_test_avg_embed_glove = (
    np.genfromtxt(csv_path, delimiter=",")
    for csv_path in (
        '../data/X_treebank_train_embedded_avg_glove.csv',
        '../data/X_treebank_test_embedded_avg_glove.csv',
    )
)

In [18]:
# tf-idf weighted GloVe sentence embeddings, precomputed and stored as
# comma-separated files; the train split is read first, then the test split.
X_train_tfidf_embed_glove, X_test_tfidf_embed_glove = (
    np.genfromtxt(csv_path, delimiter=",")
    for csv_path in (
        '../data/X_treebank_train_embedded_tfidf_glove.csv',
        '../data/X_treebank_test_embedded_tfidf_glove.csv',
    )
)

In [22]:
# Equally weighted spaCy sentence embeddings, precomputed and stored as
# comma-separated files; the train split is read first, then the test split.
X_train_avg_embed_spacy, X_test_avg_embed_spacy = (
    np.genfromtxt(csv_path, delimiter=",")
    for csv_path in (
        '../data/X_treebank_train_embedded_avg_spacy.csv',
        '../data/X_treebank_test_embedded_avg_spacy.csv',
    )
)

In [23]:
# tf-idf weighted spaCy sentence embeddings, precomputed and stored as
# comma-separated files; the train split is read first, then the test split.
X_train_tfidf_embed_spacy, X_test_tfidf_embed_spacy = (
    np.genfromtxt(csv_path, delimiter=",")
    for csv_path in (
        '../data/X_treebank_train_embedded_tfidf_spacy.csv',
        '../data/X_treebank_test_embedded_tfidf_spacy.csv',
    )
)

In [14]:
# Run the model suite (tune, fit, score on the test split) on the unigram
# binary features; the returned value is kept for later inspection.
models_bin_uni = results.results(
    X_train_bin_uni,
    y_train,
    X_test_bin_uni,
    y_test,
    features="binary",
)


  > Logistic Regression: 
  > Evaluating...
    Best parameters: {'C': 1, 'penalty': 'l2'} with score: 0.7884741479000011
  > Training...
  > Testing...
    Test accuracy: 0.8132894014277869

  > Linear SVM: 
  > Evaluating...
    Best parameters: {'C': 1} with score: 0.7813330790047806
  > Training...
  > Testing...
    Test accuracy: 0.7907742998352554

  > Bernoulli Naive Bayes SVM: 
  > Evaluating...
    Best parameters: {'C': 1, 'beta': 0.5} with score: 0.7905842801111098
  > Training...
  > Testing...
    Test accuracy: 0.8204283360790774

  > Multinomial Naive Bayes: 
  > Evaluating...
    Best parameters: {'class_prior': (0.5, 0.5)} with score: 0.7820006914557528
  > Training...
  > Testing...
    Test accuracy: 0.8220757825370676


In [15]:
# Run the model suite (tune, fit, score on the test split) on the binary
# features extracted with ngram_range=(1, 2).
models_bin_bi = results.results(
    X_train_bin_bi,
    y_train,
    X_test_bin_bi,
    y_test,
    features="binary",
)


  > Logistic Regression: 
  > Evaluating...
    Best parameters: {'C': 1, 'penalty': 'l2'} with score: 0.7820960646630345
  > Training...
  > Testing...
    Test accuracy: 0.7990115321252059

  > Linear SVM: 
  > Evaluating...
    Best parameters: {'C': 1} with score: 0.7754199401533124
  > Training...
  > Testing...
    Test accuracy: 0.7742998352553542

  > Bernoulli Naive Bayes SVM: 
  > Evaluating...
    Best parameters: {'C': 100, 'beta': 0.75} with score: 0.7806893098556288
  > Training...
  > Testing...
    Test accuracy: 0.8012081274025261

  > Multinomial Naive Bayes: 
  > Evaluating...
    Best parameters: {'class_prior': (0.5, 0.5)} with score: 0.7727971769530645
  > Training...
  > Testing...
    Test accuracy: 0.8116419549697969


In [16]:
# Run the model suite (tune, fit, score on the test split) on the unigram
# tf-idf features. Note the feature-set label here is spelled "tf_idf"
# (underscore), unlike the "tf-idf" type passed to features_extract.
models_tfidf_uni = results.results(
    X_train_tfidf_uni,
    y_train,
    X_test_tfidf_uni,
    y_test,
    features="tf_idf",
)


  > Logistic Regression: 
  > Evaluating...
    Best parameters: {'C': 1, 'penalty': 'l2'} with score: 0.7885814427581932
  > Training...
  > Testing...
    Test accuracy: 0.8209774848984075

  > Linear SVM: 
  > Evaluating...
    Best parameters: {'C': 1} with score: 0.7808204480156412
  > Training...
  > Testing...
    Test accuracy: 0.8023064250411862

  > Multinomial Naive Bayes: 
  > Evaluating...
    Best parameters: {'class_prior': (0.5, 0.5)} with score: 0.7847784361178336
  > Training...
  > Testing...
    Test accuracy: 0.8121911037891268


In [17]:
# Run the model suite (tune, fit, score on the test split) on the tf-idf
# features extracted with ngram_range=(1, 2).
models_tfidf_bi = results.results(
    X_train_tfidf_bi,
    y_train,
    X_test_tfidf_bi,
    y_test,
    features="tf_idf",
)


  > Logistic Regression: 
  > Evaluating...
    Best parameters: {'C': 1, 'penalty': 'l2'} with score: 0.780152835564669
  > Training...
  > Testing...
    Test accuracy: 0.8110928061504667

  > Linear SVM: 
  > Evaluating...
    Best parameters: {'C': 1} with score: 0.774907309164173
  > Training...
  > Testing...
    Test accuracy: 0.7929708951125755

  > Multinomial Naive Bayes: 
  > Evaluating...
    Best parameters: {'class_prior': None} with score: 0.7764094371788606
  > Training...
  > Testing...
    Test accuracy: 0.8110928061504667


In [19]:
# Run the model suite on the equally weighted GloVe sentence embeddings.
# NOTE(review): D_in=200 is presumably the embedding width expected by the
# feed-forward network -- confirm in results.results.
models_avg_embed_glove = results.results(
    X_train_avg_embed_glove,
    y_train,
    X_test_avg_embed_glove,
    y_test,
    features="sentence_embed",
    D_in=200,
)


  > Logistic Regression: 
  > Evaluating...
    Best parameters: {'C': 1, 'penalty': 'l2'} with score: 0.7883310880890786
  > Training...
  > Testing...
    Test accuracy: 0.7671609006040637

  > Linear SVM: 
  > Evaluating...
    Best parameters: {'C': 100} with score: 0.7877946137981188
  > Training...
  > Testing...
    Test accuracy: 0.7688083470620538

  > Feedforward NN:
  > Training...
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
  > Testing...
    Test accuracy: 0.7699066445370548

  > Gaussian Naive Bayes: 
  > Evaluating...
    Best parameters: {'priors': None} with score: 0.6125701887197339
  > Training...
  > Testing...
    Test accuracy: 0.5403624382207578


In [20]:
# Run the model suite on the tf-idf weighted GloVe sentence embeddings.
# NOTE(review): the output recorded for this cell matches the equally weighted
# GloVe run digit-for-digit for Logistic Regression, Linear SVM and Gaussian
# Naive Bayes -- check that the tf-idf weighted CSV files really differ from
# the averaged ones.
# NOTE(review): D_in=200 is presumably the embedding width expected by the
# feed-forward network -- confirm in results.results.
models_tfidf_embed_glove = results.results(
    X_train_tfidf_embed_glove,
    y_train,
    X_test_tfidf_embed_glove,
    y_test,
    features="sentence_embed",
    D_in=200,
)


  > Logistic Regression: 
  > Evaluating...
    Best parameters: {'C': 1, 'penalty': 'l2'} with score: 0.7883310880890786
  > Training...
  > Testing...
    Test accuracy: 0.7671609006040637

  > Linear SVM: 
  > Evaluating...
    Best parameters: {'C': 100} with score: 0.7877946137981188
  > Training...
  > Testing...
    Test accuracy: 0.7688083470620538

  > Feedforward NN:
  > Training...
  > Testing...
    Test accuracy: 0.7764964300744289

  > Gaussian Naive Bayes: 
  > Evaluating...
    Best parameters: {'priors': None} with score: 0.6125701887197339
  > Training...
  > Testing...
    Test accuracy: 0.5403624382207578


In [24]:
# Run the model suite on the equally weighted spaCy sentence embeddings.
# NOTE(review): D_in=300 is presumably the embedding width expected by the
# feed-forward network -- confirm in results.results.
models_avg_embed_spacy = results.results(
    X_train_avg_embed_spacy,
    y_train,
    X_test_avg_embed_spacy,
    y_test,
    features="sentence_embed",
    D_in=300,
)


  > Logistic Regression: 
  > Evaluating...
    Best parameters: {'C': 1, 'penalty': 'l1'} with score: 0.8259319750599063
  > Training...
  > Testing...
    Test accuracy: 0.8094453596924767

  > Linear SVM: 
  > Evaluating...
    Best parameters: {'C': 0.01} with score: 0.827064531896377
  > Training...
  > Testing...
    Test accuracy: 0.8088962108731467

  > Feedforward NN:
  > Training...
  > Testing...
    Test accuracy: 0.8110928065759805

  > Gaussian Naive Bayes: 
  > Evaluating...
    Best parameters: {'priors': (0.25, 0.75)} with score: 0.6677555107831332
  > Training...
  > Testing...
    Test accuracy: 0.7660626029654036


In [25]:
# Run the model suite on the tf-idf weighted spaCy sentence embeddings.
# NOTE(review): D_in=300 is presumably the embedding width expected by the
# feed-forward network -- confirm in results.results.
models_tfidf_embed_spacy = results.results(
    X_train_tfidf_embed_spacy,
    y_train,
    X_test_tfidf_embed_spacy,
    y_test,
    features="sentence_embed",
    D_in=300,
)


  > Logistic Regression: 
  > Evaluating...
    Best parameters: {'C': 1, 'penalty': 'l1'} with score: 0.8235595665287729
  > Training...
  > Testing...
    Test accuracy: 0.7913234486545854

  > Linear SVM: 
  > Evaluating...
    Best parameters: {'C': 0.01} with score: 0.8240245109142714
  > Training...
  > Testing...
    Test accuracy: 0.7907742998352554

  > Feedforward NN:
  > Training...
  > Testing...
    Test accuracy: 0.7984623834368032

  > Gaussian Naive Bayes: 
  > Evaluating...
    Best parameters: {'priors': (0.25, 0.75)} with score: 0.7170276939950644
  > Training...
  > Testing...
    Test accuracy: 0.7468423942888522


In [None]:
# Development helper: re-import the local `results` module so that edits to
# results.py take effect without restarting the kernel. `importlib` is imported
# in the notebook's first cell together with the other imports.
# Objects created before the reload (e.g. the models_* variables above) are not
# refreshed; re-run the corresponding cells to rebuild them with the new code.
importlib.reload(results)