# Functions

## Data loading

In [26]:
import pickle

def load_dataset(file):
    """Load a pickled dataset from disk.

    :param file: path of the pickle file to read
    :return: the object stored in the file
    """
    # NOTE(review): pickle.load can execute arbitrary code; only use on trusted files.
    # The context manager closes the handle even if unpickling raises.
    with open(file, "rb") as load_file:
        return pickle.load(load_file)

def dump_dataset(dataset, file):
    """Pickle ``dataset`` to disk, overwriting any existing file.

    :param dataset: object to serialise
    :param file: destination path
    """
    # Use a separate name for the handle instead of rebinding the ``file``
    # argument, and let the context manager close it even on failure.
    with open(file, "wb") as write_file:
        pickle.dump(dataset, write_file)

def dump_features(feature_array, file):
    """Pickle a feature array to disk, overwriting any existing file.

    :param feature_array: object to serialise
    :param file: destination path
    """
    # The context manager closes the handle even if pickling raises.
    with open(file, "wb") as write_file:
        pickle.dump(feature_array, write_file)

def load_features(file):
    """Load a pickled feature array from disk.

    :param file: path of the pickle file to read
    :return: the object stored in the file
    """
    # NOTE(review): pickle.load can execute arbitrary code; only use on trusted files.
    # The context manager closes the handle even if unpickling raises.
    with open(file, "rb") as load_file:
        return pickle.load(load_file)

def dump_labels(labels, file):
    """Pickle the labels to disk, overwriting any existing file.

    :param labels: object to serialise
    :param file: destination path
    """
    # The context manager closes the handle even if pickling raises.
    with open(file, "wb") as write_file:
        pickle.dump(labels, write_file)


def load_labels(file):
    """Load pickled labels from disk.

    :param file: path of the pickle file to read
    :return: the object stored in the file
    """
    # NOTE(review): pickle.load can execute arbitrary code; only use on trusted files.
    # The context manager closes the handle even if unpickling raises.
    with open(file, "rb") as load_file:
        return pickle.load(load_file)

## Preprocessing, encoding and embedding helpers

In [27]:
from nltk.stem import WordNetLemmatizer
import spacy
from hooks.pretokenization import *
from hooks.posttokenization import *
from hooks.spell_check import *
from hooks.annotation_normalization import *
import numpy as np
from sklearn.model_selection import train_test_split
import copy

# Shared spaCy pipeline, loaded once at cell execution with the "parser" and
# "ner" components disabled. `tokenize` uses `nlp.tokenizer` and
# `make_embedding_matrix` calls `nlp(word)` to look up word vectors.
nlp = spacy.load('en_core_web_lg', disable=["parser", "ner"])

def tokenize(raw, tokenizer="split"):
    """Split a raw string into a list of token strings.

    :param raw: text to tokenize
    :param tokenizer: ``"split"`` (default) splits on single spaces via
        ``raw.split(" ")``; ``"spacy"`` uses the module-level ``nlp.tokenizer``
    :return: list of token strings
    :raises ValueError: if ``tokenizer`` is not one of the supported names
    """
    if tokenizer == "spacy":
        return [token.text for token in nlp.tokenizer(raw)]
    if tokenizer == "split":
        # Splitting on a literal space keeps empty strings for repeated spaces.
        return raw.split(" ")
    # Previously an unknown name fell through and returned None, which only
    # surfaced later as a confusing error in the caller.
    raise ValueError("Unknown tokenizer %r; expected 'spacy' or 'split'" % (tokenizer,))


# Single shared NLTK WordNet lemmatizer instance, used by `lemmatize` below.
lemmatizer = WordNetLemmatizer()

def lemmatize(tokens):
    """Return a new list with every token passed through the shared ``lemmatizer``.

    :param tokens: iterable of token strings
    :return: list of lemmatized tokens, in the same order
    """
    return [lemmatizer.lemmatize(word) for word in tokens]


def build_vocab(data):
    """Map every distinct word in ``data`` to a positive integer index.

    Indices are handed out in order of first appearance, starting at 1;
    index 0 is never assigned here (``encode_sentence`` uses 0 for unknown words).

    :param data: iterable of sentences, each an iterable of hashable words
    :return: dict ``{word: index}``
    """
    vocab = {}
    for sentence in data:
        for token in sentence:
            # setdefault inserts only unseen tokens; len(vocab) + 1 is the next free index.
            vocab.setdefault(token, len(vocab) + 1)
    return vocab

def encode_sentence(sentence, vocab):
    """Translate a sentence into vocabulary indices.

    :param sentence: iterable of words
    :param vocab: dict ``{word: index}``
    :return: list of indices; words missing from ``vocab`` are encoded as 0
    """
    return [vocab.get(token, 0) for token in sentence]


def encode_data(data, vocab):
    """Encode every sentence in ``data`` with ``encode_sentence``.

    :param data: iterable of sentences (iterables of words)
    :param vocab: dict ``{word: index}``
    :return: list with one list of indices per sentence
    """
    return [encode_sentence(sentence, vocab) for sentence in data]


def split_train_validate_test(data, labels, train_valtest_ratio, validate_test_ratio, random_state=42):
    """Split data and labels into train, validation and test parts in two steps.

    :param data: samples, passed to ``train_test_split``
    :param labels: labels aligned with ``data``
    :param train_valtest_ratio: ``test_size`` of the first split, i.e. the share
        held out for validation and test together
    :param validate_test_ratio: ``test_size`` of the second split, i.e. the share
        of the held-out pool that becomes the test set
    :param random_state: seed used for both splits
    :return: ``(X_train, X_validate, X_test, y_train, y_validate, y_test)``
    """
    # First cut: training set versus the pool kept for validation and test.
    train_x, pool_x, train_y, pool_y = train_test_split(
        data, labels,
        test_size=train_valtest_ratio,
        random_state=random_state)

    # Second cut: divide the pool into validation and test.
    validate_x, test_x, validate_y, test_y = train_test_split(
        pool_x, pool_y,
        test_size=validate_test_ratio,
        random_state=random_state)

    return train_x, validate_x, test_x, train_y, validate_y, test_y

def process_dataset(data):
    """Normalise and tokenise every item of ``data``.

    Each item goes through, in order: ``repair_chars``,
    ``annotation_normalization``, whitespace ``tokenize``,
    ``spell_check_tokens``, ``replace_slang_tokens`` and
    ``remove_stopwords_tokens``.

    NOTE(review): ``tqdm`` and the helper functions are not imported by name in
    this notebook; presumably they come from the ``hooks.*`` star imports - confirm.

    :param data: collection supporting ``len(data)`` and ``data[i]`` for
        ``i`` in ``range(len(data))``
    :return: list with one ``{"anot_tokens": tokens}`` dict per input item
    """
    processed = []
    for position in tqdm(range(len(data))):
        repaired = repair_chars(data[position])
        annotated = annotation_normalization(repaired)

        tokens = tokenize(annotated, tokenizer="split")
        tokens = spell_check_tokens(tokens)
        tokens = replace_slang_tokens(tokens)
        tokens = remove_stopwords_tokens(tokens)

        processed.append({"anot_tokens": tokens})
    return processed


def create_vocab_encode_data(tokens):
    """Build a vocabulary from ``tokens`` and encode the same sentences with it.

    ``tokens`` is iterated twice (once by ``build_vocab``, once by
    ``encode_data``), so it must be re-iterable.

    :param tokens: collection of tokenized sentences
    :return: ``(vocab, encoded_data)``
    """
    vocabulary = build_vocab(tokens)
    return vocabulary, encode_data(data=tokens, vocab=vocabulary)


def pad_encoded_data(encoded, seq_length):
    """Left-pad (or truncate) encoded sequences to a fixed length.

    Sequences longer than ``seq_length`` keep only their first ``seq_length``
    items; shorter ones are padded with zeros on the left.

    :param encoded: collection of integer sequences (lists, tuples or arrays)
    :param seq_length: width of the returned matrix
    :return: integer ``numpy`` array of shape ``(len(encoded), seq_length)``
    """
    features = np.zeros((len(encoded), seq_length), dtype=int)
    for row, sequence in enumerate(encoded):
        kept = sequence[:seq_length]
        # The matrix is already zero-filled, so only the right-aligned tail is
        # written. This avoids building a float padding list per row and works
        # for tuples and arrays, where ``list + sequence`` failed.
        if len(kept):
            features[row, seq_length - len(kept):] = kept
    return features


def make_embedding_matrix(vocab, embedding_dim=300):
    """Build an embedding matrix with one row per vocabulary index.

    Row ``i`` receives the vector that the module-level ``nlp`` pipeline reports
    for the word mapped to index ``i``. Words without a vector, and row 0,
    stay all-zero. A hit/miss summary is printed.

    :param vocab: dict ``{word: index}``
    :param embedding_dim: number of columns of the matrix
    :return: float array of shape ``(len(vocab) + 1, embedding_dim)``
    """
    matrix = np.zeros((len(vocab) + 1, embedding_dim))
    found = 0
    for word, row in vocab.items():
        doc = nlp(word)
        if not doc.has_vector:
            continue
        matrix[row] = doc.vector
        found += 1

    # Every vocabulary entry is either a hit or a miss.
    missing = len(vocab) - found
    print("Converted %d words (%d misses)" % (found, missing))
    return matrix


# Model

In [32]:
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout, Masking, Embedding
import tensorflow as tf
from keras.callbacks import EarlyStopping, ModelCheckpoint

def compile_model(vocab, embedding_matrix, input_length,
                  trainable=False,
                  recurrent_layer_size=256,
                  dense_size=256,
                  dropout=0.1,
                  recurrent_dropout=0.1,
                  dense_activation='relu',
                  dropout_for_regularization=0.5,
                  output_activation='softmax',
                  optimizer='adam',
                  metrics = tf.keras.metrics.Recall(),
                  loss=tf.keras.losses.SquaredHinge(reduction="auto", name="squared_hinge")):
    """Build and compile the Embedding -> Masking -> LSTM -> Dense classifier.

    :param vocab: dict ``{word: index}``; the embedding gets ``len(vocab) + 1`` rows
    :param embedding_matrix: pre-trained weights with one row per vocabulary
        index (including index 0); its row width sets the embedding dimension
    :param input_length: length of the padded input sequences
    :param trainable: whether the embedding weights are updated during training
    :param recurrent_layer_size: number of LSTM units
    :param dense_size: number of units of the fully connected layer
    :param dropout: input dropout of the LSTM
    :param recurrent_dropout: recurrent dropout of the LSTM
    :param dense_activation: activation of the fully connected layer
    :param dropout_for_regularization: rate of the Dropout layer before the output
    :param output_activation: activation of the 3-unit output layer
    :param optimizer: optimizer passed to ``model.compile``
    :param metrics: single metric; wrapped in a list for ``model.compile``.
        NOTE(review): the default instance is created once at definition time
        and is therefore shared by every call that relies on the default.
    :param loss: loss passed to ``model.compile``
    :return: the compiled ``Sequential`` model
    """
    model = Sequential()

    # Embedding layer. The width comes from the supplied matrix instead of a
    # hard-coded 300, and the ``trainable`` argument is now honoured (it used
    # to be ignored in favour of a literal False).
    model.add(
        Embedding(input_dim=len(vocab) + 1,
                  input_length=input_length,
                  output_dim=len(embedding_matrix[0]),
                  weights=[embedding_matrix],
                  trainable=trainable,
                  mask_zero=True))

    # Masking layer for pre-trained embeddings
    model.add(Masking(mask_value=0.0))

    # Recurrent layer; only the final state is returned.
    # NOTE(review): ``input_shape`` on a non-first layer is presumably ignored
    # by Sequential - confirm before relying on it.
    model.add(LSTM(recurrent_layer_size, return_sequences=False, dropout=dropout, recurrent_dropout=recurrent_dropout,
                   input_shape=(2048, 28, 300)))

    # Fully connected layer
    model.add(Dense(dense_size, activation=dense_activation))

    # Dropout for regularization
    model.add(Dropout(dropout_for_regularization))

    # Output layer: one unit per class (3 classes)
    model.add(Dense(3, activation=output_activation))

    # Compile the model
    model.compile(
        optimizer=optimizer,
        loss=loss,
        metrics=[metrics]
        )
    return model


# CHANGE PATH FOR SERVER
# Checkpoint destinations for the two machines. Only `local` is passed to
# ModelCheckpoint below; `djurdja` is defined but not used in this cell.
local = "/home/ivan/Documents/git_repos/Sentiment-Analysis-on-Twitter/models/model.h5"
djurdja = '/home/ikrizanic/pycharm/zavrsni/models/model.h5'
# Module-level callback list read by `fit_model`: early stopping on `val_loss`
# with patience=20, plus a model checkpoint written to `local`.
callbacks = [EarlyStopping(monitor='val_loss', patience=20),
             ModelCheckpoint(local)]


def fit_model(model, X_train, y_train, X_val, y_val, batch_size=2048, epochs = 10):
    """Train ``model`` and return the training history together with the model.

    The module-level ``callbacks`` list is passed to ``model.fit``.

    :param model: compiled model exposing ``fit``
    :param X_train: training features
    :param y_train: training targets
    :param X_val: validation features
    :param y_val: validation targets
    :param batch_size: batch size forwarded to ``fit``
    :param epochs: maximum number of epochs forwarded to ``fit``
    :return: ``(history, model)``
    """
    training_log = model.fit(
        X_train, y_train,
        batch_size=batch_size,
        epochs=epochs,
        callbacks=callbacks,
        validation_data=(X_val, y_val),
    )
    return training_log, model


def evaluate_model(model, X_test, y_test):
    """Return whatever ``model.evaluate(X_test, y_test)`` produces.

    :param model: object exposing ``evaluate``
    :param X_test: test features
    :param y_test: test targets
    :return: the result of ``model.evaluate``
    """
    return model.evaluate(X_test, y_test)


# Test

## Load data

In [28]:
import pandas as pd

# Data directories for the two machines; `working_path` selects the one in use.
# NOTE(review): these are absolute, machine-specific paths.
local_path = "/home/ivan/Documents/git_repos/Sentiment-Analysis-on-Twitter/data"
djurdja_path = "/home/ikrizanic/pycharm/zavrsni/data"
working_path = local_path
# Locations of the pickled intermediate artefacts under `<working_path>/lstm`.
data_paths = {"train_dataset": working_path + "/lstm/train_dataset.pl",
              "input_dataset": working_path + "/lstm/input_dataset.pl",
              "input_labels": working_path + "/lstm/input_labels.pl",
              "test_dataset": working_path + "/lstm/test_dataset.pl",
              "train_labels": working_path + "/lstm/train_labels.pl",
              "test_labels": working_path + "/lstm/test_labels.pl",
              "embedding_matrix": working_path + "/lstm/embedding_matrix.pl"}

# Raw CSV / label file locations per machine; only `local_paths` is read below.
dataset_name = "main" + "_data"
djurdja_paths = {"dataset": str("~/pycharm/zavrsni/data/" + dataset_name + ".csv"),
                 "labels": "/home/ikrizanic/pycharm/zavrsni/data/labels.txt"}
local_paths = {
    "dataset": str("/home/ivan/Documents/git_repos/Sentiment-Analysis-on-Twitter/data/" + dataset_name + ".csv"),
    "labels": "/home/ivan/Documents/git_repos/Sentiment-Analysis-on-Twitter/data/labels.txt"}

# Raw tab-separated training data with columns (label, text).
# NOTE(review): this frame is overwritten by the pickle load at the end of the cell.
train_dataset = pd.read_csv(local_paths["dataset"], sep="\t", names=["label", "text"])
# train_dataset = pd.read_csv(
# '/home/ivan/Documents/git_repos/Sentiment-Analysis-on-Twitter/data/SemEval2017-task4-dev.subtask-A.english.INPUT.txt',
#                             sep="\t", quotechar='"',
#                             names=["id", "label", "text"])

# Raw tab-separated test data with columns (id, label, text).
# NOTE(review): also overwritten by the pickle load at the end of the cell.
test_dataset = pd.read_csv(
    '/home/ivan/Documents/git_repos/Sentiment-Analysis-on-Twitter/data/SemEval2017-task4-test.subtask-A.english.txt',
    sep="\t", quotechar='\'', names=["id", "label", "text"])
# The commented-out steps below preprocess the raw frames and write the pickles
# that are read back at the end of this cell.
# NOTE(review): `return_tweets_and_labels` is not defined in the visible notebook.
#
# train_dataset, train_labels = return_tweets_and_labels(train_dataset)
# test_dataset, test_labels = return_tweets_and_labels(test_dataset)
#
# train_dataset = process_dataset(train_dataset)
# test_dataset = process_dataset(test_dataset)
#
# dump_dataset(train_dataset, data_paths["train_dataset"])
# dump_dataset(test_dataset, data_paths["test_dataset"])
# dump_labels(train_labels, data_paths["train_labels"])
# dump_labels(test_labels, data_paths["test_labels"])

# Load the pickled datasets and labels; these assignments replace the
# DataFrames read from CSV above.
print("Reading pickle data...")
train_dataset = load_dataset(data_paths["train_dataset"])
test_dataset = load_dataset(data_paths["test_dataset"])
train_labels = load_labels(data_paths["train_labels"])
test_labels = load_labels(data_paths["test_labels"])
print("Done")

Reading pickle data...
Done


## Embed

In [29]:
# The vocabulary is built from the training tokens only; the test tokens are
# encoded with that same vocabulary, so unseen test words become index 0.
print("Train vocab and data encoding...")
train_vocab, enc_train_data = create_vocab_encode_data([row["anot_tokens"] for row in train_dataset])
enc_test_data = encode_data([row["anot_tokens"] for row in test_dataset], train_vocab)
print("Done")

# Both matrices are padded to the length of the longest training sequence.
print("Padding features...")
train_features = pad_encoded_data(enc_train_data, max(map(len, enc_train_data)))
test_features = pad_encoded_data(enc_test_data, max(map(len, enc_train_data)))
print("Done")



Train vocab and data encoding...
Done
Padding features...
Done


In [None]:
# Build the embedding matrix for the training vocabulary.
# NOTE(review): this cell shows no execution count, but the later
# `compile_model(...)` call uses `embedding_matrix`, so it must be run first
# on a fresh kernel.
print("Ebedding matrix...")
embedding_matrix = make_embedding_matrix(train_vocab)
print("Done")

In [30]:
from keras.utils import to_categorical

# Hold out 20% of the padded training features for validation (fixed seed).
X_train, X_val, y_train, y_val = train_test_split(train_features, train_labels, test_size=0.2, random_state=25, shuffle = True)

# One-hot encode the targets.
y_train = to_categorical(y_train)
y_val = to_categorical(y_val)

# NOTE(review): `test_labels` is overwritten in place, so this cell is not
# idempotent - re-running it without reloading the labels would apply
# to_categorical to already-encoded labels.
test_labels = to_categorical(test_labels)

# Length of the longest encoded training sequence; passed to compile_model as
# `input_length` in the run-model cell.
max_len = max(x for x in [len(d) for d in enc_train_data])

## Run model

In [33]:
# Two candidate losses; only categorical cross-entropy is passed to the model.
cce = tf.keras.losses.CategoricalCrossentropy()
sq_hinge = tf.keras.losses.SquaredHinge(reduction="auto", name="squared_hinge")

# Large LSTM/dense layers with every dropout switched off.
model = compile_model(
    train_vocab, embedding_matrix, max_len,
    recurrent_layer_size=1024,
    dense_size=1024,
    dropout=0,
    recurrent_dropout=0,
    dense_activation='relu',
    dropout_for_regularization=0,
    output_activation='softmax',
    optimizer='Adam',
    loss=cce,
)

history, model = fit_model(model, X_train, y_train, X_val, y_val, batch_size=2048, epochs=100)



Train on 31685 samples, validate on 7922 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100


In [34]:
# Evaluate on the padded test features against `test_labels` (one-hot encoded
# by the earlier to_categorical cell).
# NOTE(review): the model was compiled with compile_model's default metric
# (tf.keras.metrics.Recall()), so `res` is presumably [loss, recall] - confirm.
res = model.evaluate(test_features, test_labels)
print(res)

[2.250463369056439, 0.7539452910423279]


In [25]:
import sklearn
from sklearn import metrics

def custom_metric(y_true, y_pred):
    """Per-class recall of ``y_pred`` against ``y_true``.

    :param y_true: ground-truth labels
    :param y_pred: predicted labels
    :return: result of ``recall_score(..., average=None)``, i.e. one recall
        value per class
    """
    # The ground truth goes first; previously y_pred was passed for both
    # arguments, which compared the predictions with themselves.
    score = sklearn.metrics.recall_score(y_true, y_pred, average=None)
    return score

In [35]:
import numpy as np

predictions = model.predict(test_features)


c = np.eye(3, dtype=float)[np.argmax(predictions, axis=1)]

tp, tn, tu, np, nn, nu = 0, 0, 0, 0, 0, 0
for i in range(len(predictions)):
    p = list(c[i])
    l = list(test_labels[i])

    if l == [1.0,0.0,0.0]:
        nn += 1
        if l == p:
            tn += 1
    if l == [0.0,1.0,0.0]:
        nu += 1
        if l == p:
            tu += 1
    if l == [0.0,0.0,1.0]:
        np += 1
        if l == p:
            tp += 1
    
rp = tp/np
ru = tu/nu
rn = tn/nn
print(rp)
print(ru)
print(rn)
print((rp + rn + ru) * 100 / 3)

0.5931558935361216
0.6153197233001518
0.5676084762865792
59.20280310409509


In [None]:
print(tn)
print(tu)
print(tp)
print(nn)
print(nu)
print(np)


In [None]:
# NOTE(review): `good` and `bad` are not defined anywhere in the visible
# notebook and this cell has no execution count; unless they are defined
# elsewhere it raises NameError on a fresh kernel.
print(good)
print(bad)
print(good / (good + bad))