In [1]:
# imports
# -------
#import re
import pandas as pd
import numpy as np

#import matplotlib.pyplot as plt
#import seaborn as sns
#%matplotlib inline

#import string
from collections import Counter

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
nltk.download('wordnet')
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import *

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer

from keras.preprocessing.text import one_hot
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import LSTM
from keras import backend as K

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

[nltk_data] Downloading package stopwords to /home/jmlp/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jmlp/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# load data
# ---------
df_train_BASE  = pd.read_csv('train_E6oV3lV.csv')
df_test_BASE = pd.read_csv('test_tweets_anuFYb8.csv')

# check training labels
print df_train_BASE['label'].value_counts()

# ### CHECKPOINT START ###
print("   = TRAIN =")
print df_train_BASE.info()
print df_train_BASE.shape
print(df_train_BASE.head())
print("   = TEST =")
print df_test_BASE.info()
print df_test_BASE.shape
print(df_test_BASE.tail())
# ### CHECKPOINT STOP ###

0    29720
1     2242
Name: label, dtype: int64
   = TRAIN =
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31962 entries, 0 to 31961
Data columns (total 3 columns):
id       31962 non-null int64
label    31962 non-null int64
tweet    31962 non-null object
dtypes: int64(2), object(1)
memory usage: 749.2+ KB
None
(31962, 3)
   id  label                                              tweet
0   1      0   @user when a father is dysfunctional and is s...
1   2      0  @user @user thanks for #lyft credit i can't us...
2   3      0                                bihday your majesty
3   4      0  #model   i love u take with u all the time in ...
4   5      0             factsguide: society now    #motivation
   = TEST =
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17197 entries, 0 to 17196
Data columns (total 2 columns):
id       17197 non-null int64
tweet    17197 non-null object
dtypes: int64(1), object(1)
memory usage: 268.8+ KB
None
(17197, 2)
          id                           

In [3]:
# combine training and testing sets, for easier data cleaning
# -----------------------------------------------------------
df_train_test = df_train_BASE.append(df_test_BASE, ignore_index=True)
#df_train_test = df_train_BASE.append(df_test_BASE, ignore_index=True).head(200)

#print(df_train_test) # sanity check


# data cleaning
# -------------
print("   = before data cleaning =")
print(df_train_test.head(10))


# remove usernames and hashtags --> this information may be relevant for a deeper analysis, but it will be ignored for now
df_train_test['tweet'] = df_train_test['tweet'].str.replace("@[A-Za-z0-9_]+","") # usernames
df_train_test['tweet'] = df_train_test['tweet'].str.replace("#[A-Za-z0-9_]+","") # hashtags

# remove special characters, numbers, punctuations
df_train_test['tweet'] = df_train_test['tweet'].str.replace("[^a-zA-Z#]", " ")

# force lowercase --> reduce the feature space
df_train_test['tweet'] = df_train_test['tweet'].apply(lambda x: ' '.join([word.lower() for word in x.split()]))


# remove/correct misspelled words (ex: desparately), slang (ex: cuz, u), and others (ex: juuuuuust) --> reduce the feature space
# ### NOT ENOUGH TIME TO IMPLEMENT ###

# remove short words --> this is not the best approach, as it indiscriminately removes potentially useful words --> remove stopwords instead
df_train_test['tweet'] = df_train_test['tweet'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>2]))

# remove stopwords, in order to keep only sentiment-relevant words --> this still removes some relevant words (ex: not), meaning that a more careful analysis is required
stopwords_list = stopwords.words("english")
df_train_test['tweet'] = df_train_test['tweet'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stopwords_list)]))


# perform stemming --> stemmer operates on a single word without knowledge of the context --> perform lemmatization instead
#stemmer = PorterStemmer()
#df_train_test['tweet'] = df_train_test['tweet'].apply(lambda x: ' '.join([stemmer.stem(word) for word in x.split()]))

# perform lemmatization: group together the inflected forms of a word --> reduce the feature space
lemmatizer = WordNetLemmatizer()
df_train_test['tweet'] = df_train_test['tweet'].apply(lambda x: ' '.join([lemmatizer.lemmatize(word,'v') for word in x.split()]))

# perform Part-of-Speech (PoS) tagging --> ### NOT ENOUGH TIME TO CHECK IF IT CAN BE USEFUL ###
#df_train_test['PoS_tweet'] = df_train_test['tweet'].apply(lambda x: nltk.pos_tag(nltk.word_tokenize(x)))


print("   = after data cleaning =")
print(df_train_test.head(10))

   = before data cleaning =
   id  label                                              tweet
0   1    0.0   @user when a father is dysfunctional and is s...
1   2    0.0  @user @user thanks for #lyft credit i can't us...
2   3    0.0                                bihday your majesty
3   4    0.0  #model   i love u take with u all the time in ...
4   5    0.0             factsguide: society now    #motivation
5   6    0.0  [2/2] huge fan fare and big talking before the...
6   7    0.0   @user camping tomorrow @user @user @user @use...
7   8    0.0  the next school year is the year for exams.ð...
8   9    0.0  we won!!! love the land!!! #allin #cavs #champ...
9  10    0.0   @user @user welcome here !  i'm   it's so #gr...
   = after data cleaning =
   id  label                                              tweet
0   1    0.0  father dysfunctional selfish drag kid dysfunction
1   2    0.0   thank credit use cause offer wheelchair vans pdx
2   3    0.0                                     

In [4]:
# feature extraction
# ------------------
max_features = 1000

# obtain Bag-of-Words (BoW), so it can be used to compute term frequencies --> generates a very large feature space, which must be kept small artificially
#count_vectorizer = CountVectorizer()
count_vectorizer = CountVectorizer(max_df=0.90, min_df=2, max_features=max_features, stop_words='english')

count_vectorizer__train_test = count_vectorizer.fit_transform(df_train_test['tweet'])
BoW_count__train_test = count_vectorizer.get_feature_names()
termfreq__train_test = count_vectorizer__train_test.toarray()

print BoW_count__train_test
print termfreq__train_test
print "Non-zero elements:", np.count_nonzero(termfreq__train_test)

# compute tf-idf frequencies: reflects how important a word is to a document in a collection --> still generates a very large feature space, which must be kept small artificially
#tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer = TfidfVectorizer(max_df=0.90, min_df=2, max_features=max_features, stop_words='english')

tfidf_vectorizer__train_test = tfidf_vectorizer.fit_transform(df_train_test['tweet'])
BoW_tfidf__train_test = tfidf_vectorizer.get_feature_names()
tfidf__train_test = tfidf_vectorizer__train_test.toarray()

print BoW_tfidf__train_test
print tfidf__train_test
print "Non-zero elements:", np.count_nonzero(tfidf__train_test)

# compute hashed frequencies: performs feature hashing ('hashing trick') to control the size of the feature space naturally
hashfreq_vectorizer = HashingVectorizer(n_features=max_features)

hashfreq_vectorizer__train_test = hashfreq_vectorizer.transform(df_train_test['tweet'])
hashfreq__train_test = hashfreq_vectorizer__train_test.toarray()

print hashfreq__train_test
print "Non-zero elements:", np.count_nonzero(hashfreq__train_test)


# compute N-grams (N=2): probabilistic language model for predicting the next word in a word sequence (ex: tweet)
# ### NOT ENOUGH TIME TO IMPLEMENT ###


# compute one-hot encoding on the tweets, as a preparation for computing word embeddings (using word2vec)
#   integer encode the tweets
vocab_size = 5000
onehot__train_test = [one_hot(tweet, vocab_size) for tweet in df_train_test['tweet']]

#   truncate and pad the tweets: normalization required for computing the word embeddings
max_tweet_length = 20
onehot__train_test = sequence.pad_sequences(onehot__train_test, maxlen=max_tweet_length)

print onehot__train_test[0:5]


[u'able', u'absolutely', u'abt', u'accept', u'account', u'act', u'action', u'actually', u'adapt', u'add', u'adventure', u'affect', u'afternoon', u'age', u'ago', u'agree', u'ahead', u'aicle', u'air', u'album', u'alive', u'alligator', u'allow', u'amaze', u'america', u'american', u'americans', u'amp', u'angry', u'animals', u'anniversary', u'announce', u'answer', u'anti', u'anymore', u'app', u'apparently', u'appreciate', u'arrive', u'ask', u'ass', u'attack', u'attend', u'attention', u'august', u'available', u'award', u'away', u'awesome', u'aww', u'baby', u'bad', u'bag', u'balance', u'ball', u'ban', u'band', u'bank', u'bar', u'base', u'bday', u'beach', u'bear', u'beat', u'beautiful', u'beauty', u'bed', u'beer', u'begin', u'believe', u'benefit', u'best', u'bet', u'better', u'big', u'biggest', u'bihday', u'bike', u'bing', u'bird', u'bitch', u'bite', u'black', u'blame', u'bless', u'block', u'blog', u'blow', u'blue', u'board', u'boat', u'body', u'bong', u'book', u'bore', u'boss', u'bottle', u'b

[u'able', u'absolutely', u'abt', u'accept', u'account', u'act', u'action', u'actually', u'adapt', u'add', u'adventure', u'affect', u'afternoon', u'age', u'ago', u'agree', u'ahead', u'aicle', u'air', u'album', u'alive', u'alligator', u'allow', u'amaze', u'america', u'american', u'americans', u'amp', u'angry', u'animals', u'anniversary', u'announce', u'answer', u'anti', u'anymore', u'app', u'apparently', u'appreciate', u'arrive', u'ask', u'ass', u'attack', u'attend', u'attention', u'august', u'available', u'award', u'away', u'awesome', u'aww', u'baby', u'bad', u'bag', u'balance', u'ball', u'ban', u'band', u'bank', u'bar', u'base', u'bday', u'beach', u'bear', u'beat', u'beautiful', u'beauty', u'bed', u'beer', u'begin', u'believe', u'benefit', u'best', u'bet', u'better', u'big', u'biggest', u'bihday', u'bike', u'bing', u'bird', u'bitch', u'bite', u'black', u'blame', u'bless', u'block', u'blog', u'blow', u'blue', u'board', u'boat', u'body', u'bong', u'book', u'bore', u'boss', u'bottle', u'b

Non-zero elements: 145459
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
Non-zero elements: 244307
[[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
  3818 4384 2013 1755 2143 4700]
 [   0    0    0    0    0    0    0    0    0    0    0    0  541 4251
  1142 4685 1509 1445   39 2756]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0 3863   70]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0  856 4473 4203]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0  821 2670]]


In [5]:
# split training and testing feature samples, according to the original separation
# --------------------------------------------------------------------------------
termfreq_train = termfreq__train_test[:31962,:]
termfreq_test = termfreq__train_test[31962:,:]

tfidf_train = tfidf__train_test[:31962,:]
tfidf_test = tfidf__train_test[31962:,:]

hashfreq_train = hashfreq__train_test[:31962,:]
hashfreq_test = hashfreq__train_test[31962:,:]

onehot_train = onehot__train_test[:31962,:]
onehot_test = onehot__train_test[31962:,:]

# split training data into training and validation sets
# -----------------------------------------------------
x_termfreq_train, x_termfreq_valid, y_termfreq_train, y_termfreq_valid = train_test_split(termfreq_train, df_train_BASE['label'], random_state=42, test_size=0.3)
x_tfidf_train, x_tfidf_valid, y_tfidf_train, y_tfidf_valid = train_test_split(tfidf_train, df_train_BASE['label'], random_state=42, test_size=0.3)
x_hashfreq_train, x_hashfreq_valid, y_hashfreq_train, y_hashfreq_valid = train_test_split(hashfreq_train, df_train_BASE['label'], random_state=42, test_size=0.3)
x_onehot_train, x_onehot_valid, y_onehot_train, y_onehot_valid = train_test_split(onehot_train, df_train_BASE['label'], random_state=42, test_size=0.3)

print df_train_BASE['label'].value_counts()
print "termfreq:", Counter(y_termfreq_train), Counter(y_termfreq_valid)
print "tfidf:", Counter(y_tfidf_train), Counter(y_tfidf_valid)
print "hashfreq:", Counter(y_hashfreq_train), Counter(y_hashfreq_valid)
print "onehot:", Counter(y_onehot_train), Counter(y_onehot_valid)

0    29720
1     2242
Name: label, dtype: int64
termfreq: Counter({0: 20815, 1: 1558}) Counter({0: 8905, 1: 684})
tfidf: Counter({0: 20815, 1: 1558}) Counter({0: 8905, 1: 684})
hashfreq: Counter({0: 20815, 1: 1558}) Counter({0: 8905, 1: 684})
onehot: Counter({0: 20815, 1: 1558}) Counter({0: 8905, 1: 684})


In [6]:
# data modelling and evaluation - Logistic Regression
# ---------------------------------------------------
print "### Logistic Regression ###"

def LogisticRegression_prediction(x_train, y_train, x_valid, y_valid, x_test, threshold):
    logistic_regression = LogisticRegression()
    
    # training the model
    logistic_regression.fit(x_train, y_train)
    
    # computing probability predictions
    pred_train = logistic_regression.predict_proba(x_train)
    pred_valid = logistic_regression.predict_proba(x_valid)
    pred_test = logistic_regression.predict_proba(x_test)
    
    # compute binary predictions from probabilities
    pred_int_train = (pred_train[:,1] >= threshold).astype(np.int)
    pred_int_valid = (pred_valid[:,1] >= threshold).astype(np.int)
    pred_int_test = (pred_test[:,1] >= threshold).astype(np.int)
    
    # compute f-measure
    print "f-measure TRAIN:", f1_score(y_train, pred_int_train)
    print "f-measure VALID:", f1_score(y_valid, pred_int_valid)
    
    return pred_int_test

# parameters --> should be chosen through grid search
threshold = 0.20

print "   = term frequency ="
(pred_LR_termfreq_test) = LogisticRegression_prediction(x_termfreq_train, y_termfreq_train, x_termfreq_valid, y_termfreq_valid, termfreq_test, threshold)

print "   = tfidf ="
(pred_LR_tfidf_test) = LogisticRegression_prediction(x_tfidf_train, y_tfidf_train, x_tfidf_valid, y_tfidf_valid, tfidf_test, threshold)

print "   = hashed frequencies ="
(pred_LR_hashfreq_test) = LogisticRegression_prediction(x_hashfreq_train, y_hashfreq_train, x_hashfreq_valid, y_hashfreq_valid, hashfreq_test, threshold)

### Logistic Regression ###
   = term frequency =
f-measure TRAIN: 0.5742251223491027
f-measure VALID: 0.44981132075471697
   = tfidf =
f-measure TRAIN: 0.5533442088091354
f-measure VALID: 0.463768115942029
   = hashed frequencies =
f-measure TRAIN: 0.5254569190600522
f-measure VALID: 0.42997728993186973


In [17]:
# data modelling and evaluation - Support Vector Machine
# ------------------------------------------------------
print "### Support Vector Machine ###"

def SupportVectorMachine_prediction(x_train, y_train, x_valid, y_valid, x_test, C, threshold):
    SVM = SVC(C=C, kernel='linear', probability=True)
    
    # training the model
    SVM.fit(x_train, y_train);
    
    # computing probability predictions
    pred_prob_train = SVM.predict_proba(x_train);
    pred_prob_valid = SVM.predict_proba(x_valid);
    pred_prob_test = SVM.predict_proba(x_test);
    
    # compute binary predictions from probabilities
    pred_int_train = (pred_prob_train[:,1] >= threshold).astype(np.int)
    pred_int_valid = (pred_prob_valid[:,1] >= threshold).astype(np.int)
    pred_int_test = (pred_prob_test[:,1] >= threshold).astype(np.int)
    
    # compute f-measure
    print "f-measure TRAIN:", f1_score(y_train, pred_int_train)
    print "f-measure VALID:", f1_score(y_valid, pred_int_valid)
    
    return pred_int_test

# parameters --> should be chosen through grid search
C = 1
threshold = 0.2

# ### DO NOT EXECUTE, IT TAKES A VERY LONG TIME TO RUN <-- TOO MANY FEATURES ###

print "   = term frequency ="
(pred_SVM_termfreq_test) = SupportVectorMachine_prediction(x_termfreq_train, y_termfreq_train, x_termfreq_valid, y_termfreq_valid, termfreq_test, C, threshold)

print "   = tfidf ="
(pred_SVM_tfidf_test) = SupportVectorMachine_prediction(x_tfidf_train, y_tfidf_train, x_tfidf_valid, y_tfidf_valid, tfidf_test, C, threshold)

print "   = hashed frequencies ="
(pred_SVM_hashfreq_test) = SupportVectorMachine_prediction(x_hashfreq_train, y_hashfreq_train, x_hashfreq_valid, y_hashfreq_valid, hashfreq_test, C, threshold)


### Support Vector Machine ###
   = term frequency =
f-measure TRAIN: 0.5194805194805195
f-measure VALID: 0.42923219241443106
   = tfidf =
f-measure TRAIN: 0.4782608695652173
f-measure VALID: 0.41505791505791506
   = hashed frequencies =
f-measure TRAIN: 0.5292479108635098
f-measure VALID: 0.3951048951048951


In [8]:
# data modelling and evaluation - Decision Tree
# ---------------------------------------------
print "### Decision Tree ###"

def DecisionTree_prediction(x_train, y_train, x_valid, y_valid, x_test, max_depth, min_samples_leaf):
    decision_tree = DecisionTreeClassifier(max_depth=max_depth, min_samples_leaf=min_samples_leaf)
    
    # training the model
    decision_tree.fit(x_train, y_train);
    
    # compute binary predictions
    pred_train = decision_tree.predict(x_train);
    pred_valid = decision_tree.predict(x_valid);
    pred_test = decision_tree.predict(x_test);
    
    # compute f-measure
    print "f-measure TRAIN:", f1_score(y_train, pred_train)
    print "f-measure VALID:", f1_score(y_valid, pred_valid)
    
    return pred_test

# parameters --> should be chosen through grid search
max_depth = 1000
min_samples_leaf = 50

print "   = term frequency ="
(pred_DT_termfreq_test) = DecisionTree_prediction(x_termfreq_train, y_termfreq_train, x_termfreq_valid, y_termfreq_valid, termfreq_test, max_depth, min_samples_leaf)

print "   = tfidf ="
(pred_DT_tfidf_test) = DecisionTree_prediction(x_tfidf_train, y_tfidf_train, x_tfidf_valid, y_tfidf_valid, tfidf_test, max_depth, min_samples_leaf)

print "   = hashed frequencies ="
(pred_DT_hashfreq_test) = DecisionTree_prediction(x_hashfreq_train, y_hashfreq_train, x_hashfreq_valid, y_hashfreq_valid, hashfreq_test, max_depth, min_samples_leaf)


### Decision Tree ###
   = term frequency =
f-measure TRAIN: 0.2980030721966206
f-measure VALID: 0.2942528735632184
   = tfidf =
f-measure TRAIN: 0.33984375
f-measure VALID: 0.3179255918827509
   = hashed frequencies =
f-measure TRAIN: 0.2679671457905544
f-measure VALID: 0.24470588235294116


In [9]:
# data modelling and evaluation - Naive Bayes
# -------------------------------------------
print "### Naive Bayes ###"

def NaiveBayes_prediction(x_train, y_train, x_valid, y_valid, x_test, threshold):
    naive_bayes = GaussianNB()
    
    # training the model
    naive_bayes.fit(x_train, y_train);
    
    # compute probability predictions
    pred_train = naive_bayes.predict(x_train);
    pred_valid = naive_bayes.predict(x_valid);
    pred_test = naive_bayes.predict(x_test);
    
    # compute f-measure
    print "f-measure TRAIN:", f1_score(y_train, pred_train)
    print "f-measure VALID:", f1_score(y_valid, pred_valid)
    
    return pred_test

# parameters --> should be chosen through grid search
threshold = 0.2

print "   = term frequency ="
(pred_DT_termfreq_test) = NaiveBayes_prediction(x_termfreq_train, y_termfreq_train, x_termfreq_valid, y_termfreq_valid, termfreq_test, threshold)

print "   = tfidf ="
(pred_DT_tfidf_test) = NaiveBayes_prediction(x_tfidf_train, y_tfidf_train, x_tfidf_valid, y_tfidf_valid, tfidf_test, threshold)

print "   = hashed frequencies ="
(pred_DT_hashfreq_test) = NaiveBayes_prediction(x_hashfreq_train, y_hashfreq_train, x_hashfreq_valid, y_hashfreq_valid, hashfreq_test, threshold)

### Naive Bayes ###
   = term frequency =
f-measure TRAIN: 0.19450686641697876
f-measure VALID: 0.17882836587872558
   = tfidf =
f-measure TRAIN: 0.201019289078124
f-measure VALID: 0.1833358628016391
   = hashed frequencies =
f-measure TRAIN: 0.17500862961684502
f-measure VALID: 0.15498056039683603


In [10]:
# define metric 'F-measure' (no longer available in Keras) for evaluating Deep Learning models
def fmeasure(y_true, y_pred):
    def recall(y_true, y_pred):
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        
        recall = true_positives / (possible_positives + K.epsilon()) # epsilon: avoids 'division by 0'
        
        return recall

    def precision(y_true, y_pred):
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        
        precision = true_positives / (predicted_positives + K.epsilon()) # epsilon: avoids 'division by 0'
        
        return precision
    
    precision = precision(y_true, y_pred)
    recall = recall(y_true, y_pred)
    
    return 2*((precision*recall)/(precision+recall+K.epsilon())) # epsilon: avoids 'division by 0'

In [11]:
# data modelling and evaluation - MLP + term frequencies
# ------------------------------------------------------
# build the model
num_units = 64

MLP_model = Sequential()

MLP_model.add(Dense(num_units, input_dim=max_features, activation='relu'))
MLP_model.add(Dense(1, activation='sigmoid'))

MLP_model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=[fmeasure, 'accuracy'])

print MLP_model.summary()

# train the model
MLP_model.fit(x_termfreq_train, y_termfreq_train, validation_data=(x_termfreq_valid, y_termfreq_valid), epochs=10, batch_size=64)

# evaluate the model
metrics_MLP_train = MLP_model.evaluate(x_termfreq_train, y_termfreq_train, verbose=0)
metrics_MLP_valid = MLP_model.evaluate(x_termfreq_valid, y_termfreq_valid, verbose=0)

print "f-measure TRAIN:", metrics_MLP_train[1]
print "f-measure VALID:", metrics_MLP_valid[1]

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 64)                64064     
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 65        
Total params: 64,129
Trainable params: 64,129
Non-trainable params: 0
_________________________________________________________________
None
Train on 22373 samples, validate on 9589 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
f-measure TRAIN: 0.3745053441220901
f-measure VALID: 0.2888379033830917


In [14]:
# data modelling and evaluation - LSTM + word2vec
# -----------------------------------------------
# build the model
embedding_vector_length = 8
num_memory_units = 5

LSTM_model = Sequential()

LSTM_model.add(Embedding(vocab_size, embedding_vector_length, input_length=max_tweet_length)) # word embedding layer
LSTM_model.add(LSTM(num_memory_units)) # LSTM layer
LSTM_model.add(Dense(1, activation='sigmoid')) # output layer

LSTM_model.compile(loss='binary_crossentropy',optimizer='adam', metrics=[fmeasure, 'accuracy'])

print LSTM_model.summary()

# train the model
LSTM_model.fit(x_onehot_train, y_onehot_train, validation_data=(x_onehot_valid, y_onehot_valid), epochs=10, batch_size=64)

# evaluate the model
metrics_LSTM_train = LSTM_model.evaluate(x_onehot_train, y_onehot_train, verbose=0) 
metrics_LSTM_valid = LSTM_model.evaluate(x_onehot_valid, y_onehot_valid, verbose=0) 

print "f-measure TRAIN:", metrics_LSTM_train[1]
print "f-measure VALID:", metrics_LSTM_valid[1]

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 20, 8)             40000     
_________________________________________________________________
lstm_3 (LSTM)                (None, 5)                 280       
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 6         
Total params: 40,286
Trainable params: 40,286
Non-trainable params: 0
_________________________________________________________________
None
Train on 22373 samples, validate on 9589 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
f-measure TRAIN: 0.7783411642124329
f-measure VALID: 0.4193384347761096
