In [2]:
import numpy as np
from preprocess import preprocess_twitter
from features_extract import features_extract
import results

Using TensorFlow backend.


In [3]:
# Run preprocess_twitter (with preprocess=True) and unpack the first four
# entries of its result, in order, into X_train, y_train, X_test, y_test.
data = preprocess_twitter(preprocess=True)
X_train, y_train, X_test, y_test = data[0], data[1], data[2], data[3]

In [19]:
# Build four feature representations of the same train/test split.
# Each call returns a (train, test) pair.
# NOTE(review): the "bigram" variants pass ngram_range=(1, 2); presumably that
# means unigrams *and* bigrams -- confirm against features_extract.

# binary features, ngram_range (1, 1)
X_train_bin_uni, X_test_bin_uni = features_extract(
    X_train, X_test, type="binary", ngram_range=(1, 1)
)

# tf-idf features, ngram_range (1, 1)
X_train_tfidf_uni, X_test_tfidf_uni = features_extract(
    X_train, X_test, type="tf-idf", ngram_range=(1, 1)
)

# binary features, ngram_range (1, 2)
X_train_bin_bi, X_test_bin_bi = features_extract(
    X_train, X_test, type="binary", ngram_range=(1, 2)
)

# tf-idf features, ngram_range (1, 2)
X_train_tfidf_bi, X_test_tfidf_bi = features_extract(
    X_train, X_test, type="tf-idf", ngram_range=(1, 2)
)

In [4]:
# Read the precomputed, equally weighted (averaged) GloVe embeddings for the
# train and test splits from comma-separated files.
X_train_avg_embed_glove = np.genfromtxt(
    '../data/X_twitter_train_embedded_avg_glove.csv',
    delimiter=",",
)
X_test_avg_embed_glove = np.genfromtxt(
    '../data/X_twitter_test_embedded_avg_glove.csv',
    delimiter=",",
)

In [5]:
# Read the precomputed, tf-idf weighted GloVe embeddings for the train and
# test splits from comma-separated files.
X_train_tfidf_embed_glove = np.genfromtxt(
    '../data/X_twitter_train_embedded_tfidf_glove.csv',
    delimiter=",",
)
X_test_tfidf_embed_glove = np.genfromtxt(
    '../data/X_twitter_test_embedded_tfidf_glove.csv',
    delimiter=",",
)

In [4]:
# Read the precomputed, equally weighted (averaged) spaCy embeddings for the
# train and test splits from comma-separated files.
X_train_avg_embed_spacy = np.genfromtxt(
    '../data/X_twitter_train_embedded_avg_spacy.csv',
    delimiter=",",
)
X_test_avg_embed_spacy = np.genfromtxt(
    '../data/X_twitter_test_embedded_avg_spacy.csv',
    delimiter=",",
)

In [7]:
# Read the precomputed, tf-idf weighted spaCy embeddings for the train and
# test splits from comma-separated files.
X_train_tfidf_embed_spacy = np.genfromtxt(
    '../data/X_twitter_train_embedded_tfidf_spacy.csv',
    delimiter=",",
)
X_test_tfidf_embed_spacy = np.genfromtxt(
    '../data/X_twitter_test_embedded_tfidf_spacy.csv',
    delimiter=",",
)

In [20]:
# Run the results.results pipeline on the unigram binary features and keep
# whatever it returns in models_bin_uni.
models_bin_uni = results.results(
    X_train_bin_uni,
    y_train,
    X_test_bin_uni,
    y_test,
    features="binary",
)


  > Logistic Regression: 
  > Evaluating...
    Best parameters: {'C': 1, 'penalty': 'l1'} with score: 0.7741
  > Training...
  > Testing...
    Test accuracy: 0.7966573816155988

  > Linear SVM: 
  > Evaluating...
    Best parameters: {'C': 0.01} with score: 0.774675
  > Training...
  > Testing...
    Test accuracy: 0.7771587743732591

  > Bernoulli Naive Bayes SVM: 
  > Evaluating...
    Best parameters: {'C': 0.01, 'beta': 0.5} with score: 0.77775
  > Training...
  > Testing...
    Test accuracy: 0.7827298050139275

  > Multinomial Naive Bayes: 
  > Evaluating...
    Best parameters: {'class_prior': None} with score: 0.7647875
  > Training...
  > Testing...
    Test accuracy: 0.7799442896935933


In [21]:
# Run the results.results pipeline on the binary features built with
# ngram_range=(1, 2) and keep whatever it returns in models_bin_bi.
models_bin_bi = results.results(
    X_train_bin_bi,
    y_train,
    X_test_bin_bi,
    y_test,
    features="binary",
)


  > Logistic Regression: 
  > Evaluating...
    Best parameters: {'C': 1, 'penalty': 'l1'} with score: 0.7777
  > Training...
  > Testing...
    Test accuracy: 0.7938718662952646

  > Linear SVM: 
  > Evaluating...
    Best parameters: {'C': 0.01} with score: 0.7785
  > Training...
  > Testing...
    Test accuracy: 0.7883008356545961

  > Bernoulli Naive Bayes SVM: 
  > Evaluating...
    Best parameters: {'C': 0.01, 'beta': 0.5} with score: 0.78045
  > Training...
  > Testing...
    Test accuracy: 0.7966573816155988

  > Multinomial Naive Bayes: 
  > Evaluating...
    Best parameters: {'class_prior': None} with score: 0.7662
  > Training...
  > Testing...
    Test accuracy: 0.7855153203342619


In [22]:
# Run the results.results pipeline on the unigram tf-idf features and keep
# whatever it returns in models_tfidf_uni.
models_tfidf_uni = results.results(
    X_train_tfidf_uni,
    y_train,
    X_test_tfidf_uni,
    y_test,
    features="tf_idf",
)


  > Logistic Regression: 
  > Evaluating...
    Best parameters: {'C': 1, 'penalty': 'l2'} with score: 0.7777125
  > Training...
  > Testing...
    Test accuracy: 0.7855153203342619

  > Linear SVM: 
  > Evaluating...
    Best parameters: {'C': 1} with score: 0.76955
  > Training...
  > Testing...
    Test accuracy: 0.7855153203342619

  > Multinomial Naive Bayes: 
  > Evaluating...
    Best parameters: {'class_prior': None} with score: 0.763575
  > Training...
  > Testing...
    Test accuracy: 0.7883008356545961


In [23]:
# Run the results.results pipeline on the tf-idf features built with
# ngram_range=(1, 2) and keep whatever it returns in models_tfidf_bi.
models_tfidf_bi = results.results(
    X_train_tfidf_bi,
    y_train,
    X_test_tfidf_bi,
    y_test,
    features="tf_idf",
)


  > Logistic Regression: 
  > Evaluating...
    Best parameters: {'C': 1, 'penalty': 'l1'} with score: 0.7824625
  > Training...
  > Testing...
    Test accuracy: 0.7827298050139275

  > Linear SVM: 
  > Evaluating...
    Best parameters: {'C': 1} with score: 0.7761375
  > Training...
  > Testing...
    Test accuracy: 0.7994428969359332

  > Multinomial Naive Bayes: 
  > Evaluating...
    Best parameters: {'class_prior': None} with score: 0.769225
  > Training...
  > Testing...
    Test accuracy: 0.8050139275766016


In [24]:
# Run the results.results pipeline on the equally weighted GloVe sentence
# embeddings.
# D_in=200 is presumably the embedding dimensionality expected by the
# feedforward network -- confirm against results.results.
models_avg_embed_glove = results.results(
    X_train_avg_embed_glove,
    y_train,
    X_test_avg_embed_glove,
    y_test,
    features="sentence_embed",
    D_in=200,
)


  > Logistic Regression: 
  > Evaluating...
    Best parameters: {'C': 100, 'penalty': 'l2'} with score: 0.7459
  > Training...
  > Testing...
    Test accuracy: 0.7604456824512534

  > Linear SVM: 
  > Evaluating...
    Best parameters: {'C': 100} with score: 0.7461
  > Training...
  > Testing...
    Test accuracy: 0.7688022284122563

  > Feedforward NN:
  > Training...
Instructions for updating:
Use tf.cast instead.
  > Testing...
    Test accuracy: 0.7715877365933155

  > Gaussian Naive Bayes: 
  > Evaluating...
    Best parameters: {'priors': (0.25, 0.75)} with score: 0.67975
  > Training...
  > Testing...
    Test accuracy: 0.6991643454038997


In [25]:
# Run the results.results pipeline on the tf-idf weighted GloVe sentence
# embeddings.
# D_in=200 is presumably the embedding dimensionality expected by the
# feedforward network -- confirm against results.results.
# NOTE(review): the output recorded for this cell reports exactly the same
# Logistic Regression, Linear SVM and Gaussian Naive Bayes numbers as the
# equally weighted GloVe run -- check that the tf-idf weighted CSV files
# really differ from the averaged ones.
models_tfidf_embed_glove = results.results(
    X_train_tfidf_embed_glove,
    y_train,
    X_test_tfidf_embed_glove,
    y_test,
    features="sentence_embed",
    D_in=200,
)


  > Logistic Regression: 
  > Evaluating...
    Best parameters: {'C': 100, 'penalty': 'l2'} with score: 0.7459
  > Training...
  > Testing...
    Test accuracy: 0.7604456824512534

  > Linear SVM: 
  > Evaluating...
    Best parameters: {'C': 100} with score: 0.7461
  > Training...
  > Testing...
    Test accuracy: 0.7688022284122563

  > Feedforward NN:
  > Training...
  > Testing...
    Test accuracy: 0.7660167192350191

  > Gaussian Naive Bayes: 
  > Evaluating...
    Best parameters: {'priors': (0.25, 0.75)} with score: 0.67975
  > Training...
  > Testing...
    Test accuracy: 0.6991643454038997


In [26]:
# Run the results.results pipeline on the equally weighted spaCy sentence
# embeddings.
# D_in=300 is presumably the embedding dimensionality expected by the
# feedforward network -- confirm against results.results.
models_avg_embed_spacy = results.results(
    X_train_avg_embed_spacy,
    y_train,
    X_test_avg_embed_spacy,
    y_test,
    features="sentence_embed",
    D_in=300,
)


  > Logistic Regression: 
  > Evaluating...
    Best parameters: {'C': 100, 'penalty': 'l1'} with score: 0.761475
  > Training...
  > Testing...
    Test accuracy: 0.8022284122562674

  > Linear SVM: 
  > Evaluating...
    Best parameters: {'C': 100} with score: 0.761075
  > Training...
  > Testing...
    Test accuracy: 0.8022284122562674

  > Feedforward NN:
  > Training...
  > Testing...
    Test accuracy: 0.8245125353170305

  > Gaussian Naive Bayes: 
  > Evaluating...
    Best parameters: {'priors': (0.25, 0.75)} with score: 0.6693125
  > Training...
  > Testing...
    Test accuracy: 0.6350974930362117


In [27]:
# Run the results.results pipeline on the tf-idf weighted spaCy sentence
# embeddings.
# D_in=300 is presumably the embedding dimensionality expected by the
# feedforward network -- confirm against results.results.
models_tfidf_embed_spacy = results.results(
    X_train_tfidf_embed_spacy,
    y_train,
    X_test_tfidf_embed_spacy,
    y_test,
    features="sentence_embed",
    D_in=300,
)


  > Logistic Regression: 
  > Evaluating...
    Best parameters: {'C': 1, 'penalty': 'l1'} with score: 0.7421
  > Training...
  > Testing...
    Test accuracy: 0.7520891364902507

  > Linear SVM: 
  > Evaluating...
    Best parameters: {'C': 1} with score: 0.7416875
  > Training...
  > Testing...
    Test accuracy: 0.7520891364902507

  > Feedforward NN:
  > Training...
  > Testing...
    Test accuracy: 0.7688022289103452

  > Gaussian Naive Bayes: 
  > Evaluating...
    Best parameters: {'priors': (0.25, 0.75)} with score: 0.6272625
  > Training...
  > Testing...
    Test accuracy: 0.6796657381615598


In [46]:
# Development helper: re-execute the already-imported `results` module so that
# edits to its source are picked up without restarting the kernel.
# NOTE(review): the output recorded for this cell shows the `feedforward_NN`
# module, not `results`, so the saved output looks stale -- re-run to confirm.
# NOTE(review): this cell sits after the cells that call results.results, so a
# top-to-bottom run does not benefit from it; consider removing it or moving
# it (with the import) to the top of the notebook.
import importlib
importlib.reload(results)

<module 'feedforward_NN' from 'C:\\Users\\Ayoub Elhanchi\\Desktop\\Studies\\Winter 2019\\COMP 551\\Projects\\Project 4\\AML-FINAL\\src\\feedforward_NN.py'>