In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from collections import defaultdict
from keras.wrappers.scikit_learn import KerasClassifier
import os
import sys
import re
import numpy as np
import keras
import keras.metrics
from sklearn.metrics import make_scorer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.metrics import categorical_accuracy
from keras.layers import Dense, Input, GlobalMaxPooling1D, Dropout
from keras.layers import Conv1D, Conv2D, MaxPooling1D, Embedding, Flatten, concatenate, Activation
from keras.models import Model, Sequential
from sklearn.metrics import recall_score
from sklearn.model_selection import cross_validate
import preprocessor as p
import operator
import pickle

p.set_options(p.OPT.URL, p.OPT.NUMBER, p.OPT.RESERVED, p.OPT.MENTION)

#bert_embedding = BertEmbedding(model='bert_12_768_12', dataset_name='book_corpus_wiki_en_uncased')

DATA_PATH = "./data/tweet_no_dup.tsv"
GOOGLE_EMBEDDING_PATH = "./data/GoogleNews-vectors-negative300.bin"
TWEET_EMBEDDING_PATH = "./data/glove.twitter.27B.200d.txt"
MODEL_PATH = "./CNN.model"
BASE_DIR = 'data'
GLOVE_DIR = os.path.join(BASE_DIR, 'glove.6B')

MAX_SEQUENCE_LENGTH = 40
MAX_NUM_WORDS = 40000 #93428
EMBEDDING_DIM = 200
EPOCHS = 2


def load_sentences_from_df(train_df, id_field = 'id', sentiment_label='label', tweet_field='tweet', lower=True, clean=True):
    """
    Loads sentences.
    :param train_df: pandas.DataFrame containing labeled tweets.
    :return: sents (paired with labels), word doc freq, list of labels.
    """
    sents = []
    lbl = {'negative':0,
           'neutral':1,
          'positive':2}
    ids = set()
    word_df = defaultdict(int)        
    for line in train_df.iterrows():
        
        if not(line[1][id_field] in ids):
            ids.add(line[1][id_field])
            tweet = line[1][tweet_field]
            sentiment = line[1][sentiment_label]

            clean_text = tweet.lower() if lower else text
            clean_text = p.clean(clean_text) if clean else clean_text
            clean_text = re.sub(r'(\\u[0-9A-Fa-f]+)',r'', re.sub(r'[^\x00-\x7f]',r'', clean_text)) if clean else clean_text

            words = clean_text.split()
            for word in set(words):
                word_df[word] += 1
            pair = (words, lbl[sentiment])
            sents.append(pair)

    labels = [0] * len(lbl)
    for l,i in lbl.items():
        labels[i] = l
        
    return sents, word_df, labels


def load_sentences(train_file, tagField=1, textField=2, lower=True, no_dup = False, clean = True):
    """
    Loads sentences.
    :param train_file: filename containing labeled sentences in TSV format.
    :return: sents (paired with labels), word doc freq, list of labels.
    """
    sents = []
    tags = {'negative':0,
           'neutral':1,
          'positive':2}
    ids = set()
    word_df = defaultdict(int)
    with open(train_file, "r", encoding='utf-8') as f:
        for line in f:       
            fields = line.strip().split("\t")
            try: 
                if no_dup:
                    if not(fields[0] in ids):
                        ids.add(fields[0])
                        text = fields[textField]
                        tag = fields[tagField]
                        
                        if lower:
                            clean_text = text.lower()
                        clean_text = p.clean(clean_text)
                        words = clean_text.split()
                        for word in set(words):
                            word_df[word] += 1
                        pair = (words, tags[tag])
                        sents.append(pair)
                else:
                    text = fields[textField]
                    tag = fields[tagField]
                    
                    if lower:
                        clean_text = text.lower()
                    clean_text = p.clean(clean_text)
                    clean_text = re.sub(r'(\\u[0-9A-Fa-f]+)',r'', re.sub(r'[^\x00-\x7f]',r'', clean_text)) if clean else clean_text
                    words = clean_text.split()
                    for word in set(words):
                        word_df[word] += 1
                    pair = (words, tags[tag])
                    sents.append(pair)
            except(e):
                print(e.Message())
                continue;
    labels = [0] * len(tags)
    for tag,i in tags.items():
        labels[i] = tag
    return sents, word_df, labels

def split(df, pct):
    start = 0
    end = 0
    result = []

    for i in range(0,len(pct)):
        end = start+int(len(df)*pct[i])
        result.append(df.iloc[start:end])
        start=end
    
    return result


def createDatasets(df, labels, column, pct, shuffle=False):
    
    results_tmp = []
    result = []
    i=0
    
    for l in labels:
        d_tmp = df[df[column]==l]
        results_tmp.append(split(d_tmp,pct))


    for i in range(0,len(labels)):
        d=pd.DataFrame()
        for j in range(0,len(labels)):
            d=d.append(results_tmp[j][i])
        if shuffle:
            d=d.reindex(np.random.RandomState(seed=2).permutation(d.index))
        result.append(d)

    return result

def macroaveraged_recall(y_true, y_pred):
    """
    Compute recall for each class and average the result (see SemEval2017)
    :return: macroaveraged_recall.
    """
    n_class = len(y_true[0])
    true_vects = [[] for i in range(n_class)]
    pred_vects = [[] for i in range(n_class)]
    
    for i in range(len(y_true)):
        for j in range(n_class):
            true_vects[j].append(y_true[i][j])
            pred_vects[j].append(y_pred[i][j])
    
    recalls = [ recall_score(true_vects[i], pred_vects[i]) for i in range(n_class)]
    return recalls, np.average(recalls)

def get_dummy(n_out, strategy='most_frequent'):
    return np.array([[1., 0., 0.] for i in range(n_out)])

def create_baseline():
    tweet_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    tweet_encoder = Embedding(MAX_NUM_WORDS, EMBEDDING_DIM, weights=[embedding_matrix], 
                              input_length=MAX_SEQUENCE_LENGTH, trainable=True)(tweet_input)
    bigram_branch = Conv1D(filters=100, kernel_size=2, padding='valid', activation='relu', strides=1)(tweet_encoder)
    bigram_branch = GlobalMaxPooling1D()(bigram_branch)
    trigram_branch = Conv1D(filters=100, kernel_size=3, padding='valid', activation='relu', strides=1)(tweet_encoder)
    trigram_branch = GlobalMaxPooling1D()(trigram_branch)
    fourgram_branch = Conv1D(filters=100, kernel_size=4, padding='valid', activation='relu', strides=1)(tweet_encoder)
    fourgram_branch = GlobalMaxPooling1D()(fourgram_branch)
    merged = concatenate([bigram_branch, trigram_branch, fourgram_branch], axis=1)

    merged = Dense(256, activation='relu')(merged)
    merged = Dropout(0.2)(merged)
    merged = Dense(3)(merged)
    output = Activation('sigmoid')(merged)
    model_ZW = Model(inputs=[tweet_input], outputs=[output])
    model_ZW.compile(loss='categorical_crossentropy',
                      optimizer='adam',
                      metrics=['categorical_accuracy'])
    return model_ZW

def get_test_data(test_sents, tokenizer, word_index):
    test_text = []
    test_labls = []

    for s in test_sents:
        test_text.append(s[0])
        test_labls.append(s[1])

    print('Found %s tweets.' % len(test_sents))

    tokenizer.word_index = word_index
    tokenizer.fit_on_texts(test_text)
    tokenizer.word_index = word_index
    test_sequences = tokenizer.texts_to_sequences(test_text)

    #test_word_index = word_index
    print('Found %s unique tokens.' % len(word_index))

    test_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)

    test_labels = to_categorical(np.asarray(test_labls))
    
    print('Shape of data tensor:', test_data.shape)
    print('Shape of label tensor:', test_labels.shape)

    x_test = test_data
    y_test = test_labels
    
    return x_test, y_test

def to_category(y_test_pred):
    y_test_mod = []
    for i in range(len(y_test_pred)):
        tmp = y_test_pred[i]
        y_test_mod.append([0.]*3)
        y_test_mod[-1][tmp.argmax()] = 1.
    y_test_mod = np.array(y_test_mod)
    return y_test_mod


def train_test_split_cv(x, y, n_iter, n_fold = 10):
    """
    ...
    """
   
    x=list(x)
    y=list(y)
    n_iter = n_fold if n_iter > n_fold else n_iter 
    ns_fold = int(len(x)/n_fold)
    
    test_start_idx = (n_iter-1)*ns_fold
    test_end_idx = (n_iter)*ns_fold
    
#    print("Test starts at "+str(test_start_idx)+" ending at "+str(test_end_idx))
    x_train, y_train = [], []
    
    if test_start_idx == 0:
        x_train = x[test_end_idx:]
        y_train = y[test_end_idx:]
    elif test_end_idx == len(x):
        x_train = x[:test_start_idx]
        y_train = y[:test_start_idx]
    else:
        x_train = x[:test_start_idx]+x[test_end_idx:] 
        y_train = y[:test_start_idx]+y[test_end_idx:] 

    x_test = x[test_start_idx:test_end_idx]
    y_test = y[test_start_idx:test_end_idx]

    
    return np.array(x_train), np.array(y_train), np.array(x_test), np.array(y_test)

def cross_validation(x, y, n_fold = 3, n_repeat = 1, **params):
    #macro_averaged_recall, categorical accuracy on validation
    results = []
    batch_size=params['batch_size']
    epochs=params['epochs']
    ks = params['kernel_size']
    nf = params['n_filter']
    dropout = params['dropout']
    
    #random seed init
    step_tot = 0
    for it in range(n_repeat):
        fold_res = []
        for i in range(1, n_fold+1):
            x_train, y_train, x_val, y_val = train_test_split_cv(x, y, i, n_fold=n_fold)
            
            model = create_model(kernel_size = ks, n_filter = nf, dropout = dropout)

            model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, verbose=1)
            #predict + scoring su test/validation
            
            step_tot+=1
            y_pred = to_category(model.predict(x_val, batch_size = batch_size))
            garb, mavg = macroaveraged_recall(y_val, y_pred)#, categorical_accuracy(y_val, y_pred)
            
            print("##################################################################")
            print("Step #"+str(it+1)+"."+str(i)+" ("+str(step_tot)+"/"+str(n_repeat*n_fold)+") mavg_recall on validation = "+str(mavg)[:5])#+" - ca = "+str(ca))
            print("##################################################################")
            
            fold_res.append(mavg)
        step_avg, step_std = np.average(fold_res), np.std(fold_res)
        print("=========================> Average mavg_recall at step #"+str(it+1)+" = "+str(step_avg)[:5]+" +/-"+str(step_std)[:5]+"<=========================")
        results.append(step_avg)
    
    return np.average(results), np.std(results)

def grid_search(x, y, n_fold = 3, n_repeat = 1, embedding = "glovetweet200", **param_grid):
    results = dict({})
    for batch in param_grid['batch_size']:
        for ep in param_grid['epochs']:
            for n_filter in param_grid['n_filters']:
                for ker_size in param_grid['kernel_sizes']:
                    params = dict({
                        "batch_size":batch,
                        "epochs":ep,
                        "n_filter" : n_filter,
                        "kernel_size" : ker_size,
                        "dropout" : 0.
                    })
                    
                    model_tag = "b"+str(batch)+"-ep"+str(ep)+"-nf"+str(n_filter)+"-ks"+str(ker_size)+"-emb"+str(embedding)
                    print("Started a "+str(n_fold)+"-fold cv with "+model_tag)
                    results[model_tag] = cross_validation(x, y, n_fold = n_fold, n_repeat = n_repeat, **params)
    return results
        

def create_model(kernel_size = (2, 3, 4), n_filter = (100, 100, 100), dropout = 0.):
    tweet_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    tweet_encoder = Embedding(MAX_NUM_WORDS, EMBEDDING_DIM, weights=[embedding_matrix], 
                              input_length=MAX_SEQUENCE_LENGTH, trainable=True)(tweet_input)
    bigram_branch = Conv1D(filters=n_filter[0], kernel_size=kernel_size[0], padding='valid', activation='relu', strides=1)(tweet_encoder)
    bigram_branch = GlobalMaxPooling1D()(bigram_branch)
    trigram_branch = Conv1D(filters=n_filter[1], kernel_size=kernel_size[1], padding='valid', activation='relu', strides=1)(tweet_encoder)
    trigram_branch = GlobalMaxPooling1D()(trigram_branch)
    fourgram_branch = Conv1D(filters=n_filter[2], kernel_size=kernel_size[2], padding='valid', activation='relu', strides=1)(tweet_encoder)
    fourgram_branch = GlobalMaxPooling1D()(fourgram_branch)
    merged = concatenate([bigram_branch, trigram_branch, fourgram_branch], axis=1)

    merged = Dense(256, activation='relu')(merged)
    merged = Dropout(dropout)(merged) if dropout > 0 else merged
    merged = Dense(3)(merged)
    output = Activation('sigmoid')(merged)
    model_ZW = Model(inputs=[tweet_input], outputs=[output])
    model_ZW.compile(loss='categorical_crossentropy',
                      optimizer='adam',
                      metrics=['categorical_accuracy'])
    return model_ZW

    

Using TensorFlow backend.


# Load data

In [4]:
df_ = pd.read_csv("./data/tweet_no_dupid.tsv", sep="\t")

In [2]:
def df_from_tsv(path):
    data = []
    with open(path, "r") as f:
        for l in f:
            rec = l[:-1].split('\t')
            if len(rec) == 3:
                data.append(rec)
    return pd.DataFrame(columns=['id', 'label', 'tweet'], data=data)

In [3]:
train_dfs = [ df_from_tsv("./data/train/twitter-train"+str(i)+".txt") for i in range(6)]
train_df = pd.concat(train_dfs)
print("Train with dup has " + str(len(train_df)) + " records")
no_dup = train_df.groupby(as_index=False, by=['id']).first()
print("Train with no dup has " + str(len(no_dup)) + " records, "+str(len(train_df)-len(no_dup))+" less.")
no_dup.head()

Train with dup has 21826 records
Train with no dup has 21240 records, 586 less.


Unnamed: 0,id,label,tweet
0,100000794790727680,positive,One Night like In Vegas I make dat Nigga Famous
1,100000831528632320,positive,Walking through Chelsea at this time of day is...
2,100000950005145600,neutral,"""And on the very first play of the night, Aaro..."
3,100000974885748736,neutral,"""Drove the bike today, about 40 miles. Felt li..."
4,100001038454624257,negative,looking at the temp outside....hpw did it get ...


In [4]:
no_dup.to_csv("tweet-train-no_dup.tsv", sep="\t", index=False)
#nno_dup = pd.read_csv("tweet-train-no_dup.tsv", sep="\t")
#nno_dup.head()
#len(nno_dup)

In [5]:
tweet_df=createDatasets(no_dup,['positive','negative','neutral'],'label',[0.,0.8,0.2],shuffle=True)

train_sents, word_df, train_labels = load_sentences_from_df(tweet_df[1])
max_l = max(len(words) for words,l in train_sents)
print( "number of sentences: %d" % len(train_sents))
print( "vocab size: %d" % len(word_df))
print( "max sentence length: %d" % max_l)

number of sentences: 16991
vocab size: 42130
max sentence length: 33


In [5]:
tweet = tweet_df[1]
print("Train")
print(str(len(tweet[tweet['label'] == 'neutral']))+" neutral tweets "+
      str(len(tweet[tweet['label'] == 'neutral'])*100./len(tweet))[:5]+"% of total")
print(str(len(tweet[tweet['label'] == 'positive']))+" positive tweets "+
      str(len(tweet[tweet['label'] == 'positive'])*100./len(tweet))[:5]+"% of total")
print(str(len(tweet[tweet['label'] == 'negative']))+" negative tweets "+
      str(len(tweet[tweet['label'] == 'negative'])*100./len(tweet))[:5]+"% of total")

tweet = tweet_df[2]
print("Test")
print(str(len(tweet[tweet['label'] == 'neutral']))+" neutral tweets "+
      str(len(tweet[tweet['label'] == 'neutral'])*100./len(tweet))[:5]+"% of total")
print(str(len(tweet[tweet['label'] == 'positive']))+" positive tweets "+
      str(len(tweet[tweet['label'] == 'positive'])*100./len(tweet))[:5]+"% of total")
print(str(len(tweet[tweet['label'] == 'negative']))+" negative tweets "+
      str(len(tweet[tweet['label'] == 'negative'])*100./len(tweet))[:5]+"% of total")

Train
7017 neutral tweets 41.29% of total
7286 positive tweets 42.88% of total
2688 negative tweets 15.82% of total
Test
1754 neutral tweets 41.29% of total
1821 positive tweets 42.87% of total
672 negative tweets 15.82% of total


# Build embedding index from GloVe-Twitter-.27B-200d 

In [6]:
embeddings_index = {}
with open(TWEET_EMBEDDING_PATH, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

In [7]:
text = []
labls = []

for s in train_sents:
    text.append(s[0])
    labls.append(s[1])

print('Found %s tweets.' % len(train_sents))

Found 16991 tweets.


# Tokenization

In [8]:
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS, lower=True)
tokenizer.fit_on_texts(text)
sequences = tokenizer.texts_to_sequences(text)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

train_data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

train_labels = to_categorical(np.asarray(labls))
print('Shape of data tensor:', train_data.shape)
print('Shape of label tensor:', train_labels.shape)

Found 42130 unique tokens.
Shape of data tensor: (16991, 40)
Shape of label tensor: (16991, 3)


# Build embedding matrix for CNN layer

In [9]:
num_words = min(MAX_NUM_WORDS, len(word_index))
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
high = 2.38 / np.sqrt(len(text) + EMBEDDING_DIM) # see (Bottou '88)
for word, i in word_index.items():
    if i >= num_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros
        embedding_matrix[i] = embedding_vector
    else: 
        embedding_matrix[i] = np.random.uniform(-high, high, EMBEDDING_DIM)


In [10]:
pickle.dump(file = open('embedding_matrixTW200', 'wb'), obj = embedding_matrix)
pickle.dump(file = open('train_data', 'wb'), obj = train_data)
pickle.dump(file = open('train_labels', 'wb'), obj = train_labels)

In [11]:
test_sents, test_word_df, test_labels_ = load_sentences_from_df(tweet_df[2])
test_data, test_labels = get_test_data(test_sents, tokenizer, word_index)

test_max_l = max(len(words) for words,l in test_sents)
print( "number of sentences: %d" % len(test_sents))
print( "vocab size: %d" % len(test_word_df))
print( "max sentence length: %d" % test_max_l)

pickle.dump(file = open('test_data', 'wb'), obj = test_data)
pickle.dump(file = open('test_labels', 'wb'), obj = test_labels)

Found 4247 tweets.
Found 42130 unique tokens.
Shape of data tensor: (4247, 40)
Shape of label tensor: (4247, 3)
number of sentences: 4247
vocab size: 14390
max sentence length: 31


# Grid search
We perform 3 5-fold cv for configuration $\theta$ and average its scores. *grid_result* is a dictionary with one entry for each combination $\theta_i$:

model_tag : cv_score, cv_std

where model_tag is a string that represents $\theta_i$ and has this format:

b *batch_size* -ep *epochs* -nf *n_filter* - ks *kernel_size* - emb *embedding* (see grid_search definition for details)

In [None]:
grid_result = dict({})
grid_resut = pickle.load(open("grid_result"))

In [None]:
#cross_validation(train_data, train_labels, n_repeat=4, n_fold=10)
param_grid = dict({
    "batch_size":[16,32,64,128],
    "epochs":[2],
    "n_filters" : [(100, 100, 100)],
    "kernel_sizes" : [(2, 3, 4)]
})

grid_result.update(grid_search(train_data, train_labels, n_fold=5, n_repeat = 3, **param_grid))

Started a 5-fold cv with b16-ep2-nf(100, 100, 100)-ks(2, 3, 4)
Epoch 1/2
Epoch 2/2
##################################################################
Step #1.1 (1/15) mavg_recall on validation = 0.596
##################################################################
Epoch 1/2

# Get GridSearch Results


In [None]:

grid_result_v = dict({})

for k in grid_result.keys():
    grid_result_v[k] = [grid_result[k][0],grid_result[k][1]]

sorted(grid_result.items(), key = operator.itemgetter(1), reverse=True)

In [None]:
bert_sents = [" ".join(text[i]) for i in range(len(text))]
input_sents = bert_embedding(bert_sents)

In [None]:
input_sents[0]

In [16]:
x_train, y_train, x_val, y_val = train_test_split_cv(train_data, train_labels, 1, n_fold=3)

In [21]:
test_sents, test_word_df, test_labels_ = load_sentences_from_df(tweet_df[2])
x_test, y_test = get_test_data(test_sents, tokenizer, word_index)

{'negative': 0, 'neutral': 1, 'positive': 2}
Found 12352 tweets.
Found 90630 unique tokens.
Shape of data tensor: (12352, 40)
Shape of label tensor: (12352, 3)


In [37]:
y_val_predZW = model_ZW.predict(x_val, batch_size=32, verbose=1)
y_val_modZW = to_category(y_val_predZW)


zwvrecalls, zwvavg_recall = macroaveraged_recall(y_val, y_val_modZW)
print(zwvrecalls)
print("Validation marco-averaged recall = "+str(zwvavg_recall)[:4])

zwvscores = model_ZW.evaluate(x_val, y_val, verbose=1)
print('Validation loss:', zwvscores[0])
print('Validation accuracy:', zwvscores[1])

y_test_predZW = model_ZW.predict(x_test, batch_size=32, verbose=1)
y_test_modZW = to_category(y_test_predZW)


zwrecalls, zwavg_recall = macroaveraged_recall(y_test, y_test_modZW)
print(zwrecalls)
print("Test marco-averaged recall = "+str(zwavg_recall)[:4])
# Score trained model.
zwscores = model_ZW.evaluate(x_test, y_test, verbose=1)
print('Test loss:', zwscores[0])
print('Test accuracy:', zwscores[1])

[0.4469895287958115, 0.6244518272425249, 0.7598913228052301]
Validation marco-averaged recall = 0.61
Validation loss: 0.805257450137344
Validation accuracy: 0.6399514268221969
[0.47898799313893653, 0.5756444444444444, 0.5899886234357224]
Test marco-averaged recall = 0.54
Test loss: 1.089451436242909
Test accuracy: 0.5625


In [31]:
SMS_PATH = "./data/sms-2013.tsv"
JOURNAL_PATH = "./data/livej_no_dup.tsv"

model_ = model_ZW

sms_sent, sms_word_df, sms_labels = load_sentences(SMS_PATH, tagField=2, textField=3)
j_sent, j_word_df, j_labels = load_sentences(JOURNAL_PATH, tagField=2, textField=1, no_dup = False)
x_sms, y_sms = get_test_data(sms_sent, tokenizer, word_index)
x_journ, y_journ = get_test_data(j_sent, tokenizer, word_index)

Found 2094 tweets.
Found 90630 unique tokens.
Shape of data tensor: (2094, 40)
Shape of label tensor: (2094, 3)
Found 1142 tweets.
Found 90630 unique tokens.
Shape of data tensor: (1142, 40)
Shape of label tensor: (1142, 3)


In [32]:
y_sms_pred = model_ZW.predict(x_sms, batch_size=32, verbose=1)
y_j_pred = model_ZW.predict(x_journ, batch_size=32, verbose=1)



In [35]:
y_sms_mod = to_category(y_sms_pred)
y_j_mod = to_category(y_j_pred)

srecalls, savg_recall = macroaveraged_recall(y_sms, y_sms_mod)
print(srecalls)
print("SMS marco-averaged recall = "+str(savg_recall)[:4])
sms_scores = model_ZW.evaluate(x_sms, y_sms, verbose=1)
print('Test loss:', sms_scores[0])
print('Test accuracy:', sms_scores[1])

jrecalls, javg_recall = macroaveraged_recall(y_journ, y_j_mod)
print(jrecalls)
print("J marco-averaged recall = "+str(javg_recall)[:4])
j_scores = model_ZW.evaluate(x_journ, y_journ, verbose=1)
print('Test loss:', j_scores[0])
print('Test accuracy:', j_scores[1])

[0.3629441624365482, 0.7706953642384106, 0.556910569105691]
SMS marco-averaged recall = 0.56
Test loss: 0.8319354089761531
Test accuracy: 0.6437440306773727
[0.40460526315789475, 0.6666666666666666, 0.7189695550351288]
J marco-averaged recall = 0.59
Test loss: 0.8358790920411225
Test accuracy: 0.6164623467600701


| Model | Validation | Test | SMS | LiveJournal |
|:-----:|:-------------------:|:-----------------:|:---------------:|:-----------------------:|
|  CNN_(3,4,5)x100 |         0.63 - 0.61         |        0.56 - 0.54       |     0.64 - 0.56      |          0.61 - 0.59          |
|  baseline |             ?   |        0.33       |       0.33      |           0.33          |
|  dummy |             ?   |        0.33       |       0.33      |           0.33          |

# Grid Search on most important parameters

The baseline configuration was ep2-nf(100, 100, 100)-ks(2, 3, 4) with no dropout. We tested:

- batch_size
- #epochs
- nf
- ker_size


| Model | Validation | 
|:-----:|:-------------------:|
batch_size 16  | 0.578 $ \pm $0.123
batch_size 32 | **0.649 $ \mathbf{\pm} $0.016**
batch_size 64 | 0.644 $ \pm $0.023
batch_size 128 | 0.630 $ \pm $0.040
batch_size 256 | 0.632 $ \pm $0.016
batch_size 512 | 0.473 $ \pm $0.135
batch_size 1024 | 0.446 $ \pm $0.133
batch_size 2048 | 0.343 $ \pm $0.037
batch_size 4096 | 0.364 $ \pm $0.043

| Model | Validation | 
|:-----:|:-------------------:|
ks(1, 1, 1) | 0.638 $ \pm $0.031
ks(2, 2, 2) | 0.637 $ \pm $0.014
ks(3, 3, 3) | 0.636 $ \pm $0.020
ks(4, 4, 4) | **0.643 $ \pm $0.028**
ks(5, 5, 5) | 0.634 $ \pm $0.018
ks(6, 6, 6) | 0.640 $ \pm $0.018
ks(7, 7, 7) | 0.575 $ \pm $0.115
ks(8, 8, 8) | 0.635 $ \pm $0.024
ks(9, 9, 9) | **0.644 $ \pm $0.020**
ks(10, 10, 10) | **0.644 $ \pm $0.017**


In [20]:
model_wrapped = KerasClassifier(build_fn=create_model, verbose=1)
mar_score = make_scorer(macroaveraged_recall, greater_is_better=True)

In [20]:
cv_results = cross_validate(model_wrapped, x_train, y_train,scoring=mar_score, cv=5, verbose = 1, n_jobs=-1)

NameError: name 'model_wrapped' is not defined

In [52]:
x_ciao, y_ciao = get_test_data([(["I", "am", "so", "safas", "that", "I", "will", "#happyness"],1)], tokenizer, word_index)

Found 1 tweets.
Found 90630 unique tokens.
Shape of data tensor: (1, 40)
Shape of label tensor: (1, 2)


In [53]:
model_ZW.predict(x_ciao, batch_size=32, verbose=1)



array([[0.28645733, 0.62810594, 0.52667683]], dtype=float32)