# Preparation

## Imports

In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import warnings; warnings.simplefilter('ignore')
import re

import pickle

# NLTK
import nltk
import codecs
from nltk.tokenize import PunktSentenceTokenizer,sent_tokenize, word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer, PorterStemmer
# nltk.download("stopwords")
# nltk.download("punkt")
# nltk.download("wordnet")

# Spacy
import spacy
nlp = spacy.load('en_core_web_lg', disable=["parser", "ner"])

## Loading data

In [131]:
# Base name of the training CSV; evaluates to "main_data".
dataset_name = "main" + "_data"
# Alternative path set (presumably for a second machine/account); not referenced below.
djurdja_paths = {"dataset": str("~/pycharm/zavrsni/data/" + dataset_name + ".csv"),
                 "labels": "/home/ikrizanic/pycharm/zavrsni/data/labels.txt"}
# Path set actually used in this cell (only the "dataset" entry is read below).
# NOTE(review): absolute, user-specific paths make the notebook non-portable.
local_paths = {"dataset": str("/home/ivan/Documents/git_repos/Sentiment-Analysis-on-Twitter/data/" + dataset_name + ".csv"),
               "labels": "/home/ivan/Documents/git_repos/Sentiment-Analysis-on-Twitter/data/labels.txt"}


# Tab-separated training data. Because `names` is passed and `header` is not, pandas
# does not consume a header row: a header line in the file is read as ordinary data.
raw_data_merge = pd.read_csv(local_paths["dataset"], sep="\t", names=["label", "text"])
# Emoticon table; a separator longer than one character is interpreted by pandas as a
# regular expression and handled by the Python parsing engine.
emoticons_file = pd.read_csv('../data/emoticons.txt', sep="  ->  ", names=["emoji", "meaning"])

# SemEval-2017 Task 4 subtask A test file: tab-separated (id, label, text), with the
# single quote configured as the quote character.
test_dataset = pd.read_csv('/home/ivan/Documents/git_repos/Sentiment-Analysis-on-Twitter/data/SemEval2017-task4-test.subtask-A.english.txt',
                          sep="\t", quotechar='\'', names=["id", "label", "text"])

'Yes we can became hate your neighbor. I am disgusted by our prez and vp elect. #thankyouobama #unclejoe #lovetrumpshate #dontbesilent'

In [51]:
def remove_duplicates(csv_data):
    """Map each distinct tweet text to the label of its first occurrence.

    Parameters
    ----------
    csv_data : pandas.DataFrame
        Frame exposing ``text`` and ``label`` columns.

    Returns
    -------
    dict
        ``{text: label}`` in first-seen order. Rows whose text is not a ``str``
        (for example NaN from an empty cell) are skipped.
    """
    clean_data = dict()
    # Iterate the two columns positionally. The previous ``csv_data.text[i]`` was a
    # label-based lookup done three times per row, which is slow and fails (or picks
    # the wrong row) when the frame does not carry a default 0..n-1 index.
    rows = zip(csv_data.text, csv_data.label)
    for text, label in tqdm(rows, total=len(csv_data)):
        if isinstance(text, str) and text not in clean_data:
            clean_data[text] = label
    return clean_data

In [175]:
# Deduplicate the training frame into a {text: label} dict.
data = remove_duplicates(raw_data_merge)

# The dict keys are the tweet texts, in insertion order.
tweets = list(data)

# Encode polarity strings as integers: positive -> 2, neutral -> 1, anything else -> 0.
labels = [
    2 if polarity == "positive" else 1 if polarity == "neutral" else 0
    for polarity in data.values()
]

# Drop the first label (the popped value is displayed as the cell output).
# `tweets` keeps its first element, so it is one item longer than `labels`.
labels.pop(0)

100%|██████████| 40256/40256 [00:02<00:00, 18885.99it/s]


0

In [132]:
# Deduplicate the SemEval test frame; this rebinds `data`, replacing the training dict.
data = remove_duplicates(test_dataset)

# Same integer coding as the training labels: positive -> 2, neutral -> 1, otherwise 0.
test_labels = [
    2 if polarity == "positive" else 1 if polarity == "neutral" else 0
    for polarity in data.values()
]

# Drop the first label (the popped value is displayed as the cell output).
# NOTE(review): the saved output shows the popped value was 1, i.e. a "neutral" row,
# so this may be discarding a real sample rather than a header line - confirm.
test_labels.pop(0)

100%|██████████| 12258/12258 [00:00<00:00, 18523.46it/s]


1

[2,
 2,
 2,
 1,
 2,
 2,
 1,
 1,
 2,
 1,
 1,
 2,
 1,
 1,
 2,
 1,
 1,
 2,
 2,
 2,
 2,
 1,
 1,
 2,
 1,
 2,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 2,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 2,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 2,
 1,
 0,
 2,
 0,
 0,
 0,
 2,
 0,
 0,
 1,
 2,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 2,
 1,
 1,
 1,
 1,
 2,
 1,
 2,
 1,
 2,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 2,
 1,
 2,
 1,
 2,
 2,
 1,
 2,
 1,
 2,
 1,
 1,
 1,
 2,
 1,
 2,
 1,
 2,
 1,
 0,
 1,
 2,
 2,
 2,
 2,
 1,
 1,
 2,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 2,
 1,
 2,
 0,
 2,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 0,
 1,
 1,
 1,
 2,
 2,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 2,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 2,
 2,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 2,
 2,
 0,
 1,
 2,
 2,
 1,
 2,
 2,
 1,
 2,
 1,
 1,
 2,
 1,
 1,
 1,
 2,
 1,
 2,
 1,
 2,
 1,
 1,
 2,


In [54]:
print(len(labels))
len(tweets)

11905


11906

# Special functions

## Tokenize

In [5]:
import spacy
nlp = spacy.load('en_core_web_lg', disable=["parser", "ner"])

In [6]:
def tokenize(raw, tokenizer="split"):
    """Split a string into tokens.

    Parameters
    ----------
    raw : str
        Text to tokenize.
    tokenizer : {"split", "spacy"}
        ``"split"`` cuts on single space characters, so consecutive spaces yield
        empty-string tokens. ``"spacy"`` uses the tokenizer of the module-level
        ``nlp`` pipeline.

    Returns
    -------
    list of str

    Raises
    ------
    ValueError
        If ``tokenizer`` is not one of the supported names. Previously the function
        fell through and returned ``None``, which only failed later in the pipeline.
    """
    if tokenizer == "spacy":
        return [token.text for token in nlp.tokenizer(raw)]
    if tokenizer == "split":
        return raw.split(" ")
    raise ValueError(
        "Unknown tokenizer {!r}; expected 'split' or 'spacy'".format(tokenizer)
    )

# Hooks

## Pretokenization

### Removing links

In [7]:
def remove_links(raw):
    """Replace every link in ``raw`` with the placeholder ``[URL]``.

    A link is any run of non-whitespace characters starting with ``http``. The
    previous pattern ``http.*\\b`` was greedy and swallowed all text from the first
    link up to the last word on the line, deleting the rest of the tweet.

    Parameters
    ----------
    raw : str

    Returns
    -------
    str
    """
    return re.sub(r'http\S*', '[URL]', raw)

### Repairing some lost characters

In [8]:
def repaire_chars(raw):
    """Decode escape sequences and HTML entities left over in the raw tweet text.

    Handles the literal sequences ``\\u2019`` (apostrophe) and ``\\u002c`` (comma),
    the entities ``&lt;``, ``&gt;`` and ``&amp;``, and removes stray ``\\""`` /
    ``""\\`` quoting residue.

    Parameters
    ----------
    raw : str

    Returns
    -------
    str
    """
    replacements = (
        (r'\\u2019', "'"),
        (r'\\u002c', ','),
        # Fix: '&lt' used to become '>' and '&gt' '<' (swapped), and the trailing
        # ';' of the entity was left behind. The ';' stays optional so that the
        # semicolon-less form the old patterns accepted is still decoded.
        (r'&lt;?', '<'),
        (r'&gt;?', '>'),
        # '&amp;' is decoded last so that '&amp;lt;' becomes '&lt;', not '<'.
        (r'&amp;', '&'),
        (r'\\\"\"', ''),
        (r'\"\"\\', ''),
    )
    for pattern, replacement in replacements:
        raw = re.sub(pattern, replacement, raw)
    return raw

### Removing usernames

In [9]:
def remove_usernames(raw):
    """Replace each ``@`` followed by any run of non-whitespace characters with ``[USER]``.

    A bare ``@`` is replaced as well, because the trailing run may be empty.
    """
    mention = re.compile(r'@[^\s]*')
    return mention.sub('[USER]', raw)

### Replacing unuseful emoticons

In [10]:
def replace_unuseful_emoticons(raw):
    """Replace emoticons found in ``raw`` with their mapped value from ``EMOTICONS``.

    Relies on the module-level mappings ``EMOTICONS`` and ``useful_emoticons``.

    Parameters
    ----------
    raw : str

    Returns
    -------
    str
        The text with the replacements applied.
    """
    for k, v in EMOTICONS.items():
        # NOTE(review): the function name says "unuseful", yet this condition selects
        # keys that ARE in useful_emoticons - confirm whether it should be `not in`.
        if k in useful_emoticons.keys() and k in raw:
            # Fix: str.replace returns a new string; the result used to be thrown
            # away, so the function always returned its input unchanged.
            raw = raw.replace(k, v)
    return raw

### Annotation and normalization (from github)

In [11]:
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons


# Shared ekphrasis pre-processor; annotation_normalization() calls its
# pre_process_doc() method on every tweet.
text_processor = TextPreProcessor(
    # terms that will be normalized
    # (note: 'url' is listed twice in this list)
    normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
        'time', 'url', 'date', 'number'],
    # terms that will be annotated
    annotate={"hashtag", "allcaps", "elongated", "repeated",
        'emphasis', 'censored'},
    fix_html=True,  # fix HTML tokens
    
    # corpus from which the word statistics are going to be used 
    # for word segmentation 
    segmenter="twitter", 
    
    # corpus from which the word statistics are going to be used 
    # for spell correction
    corrector="twitter", 
    
    unpack_hashtags=True,  # perform word segmentation on hashtags
    unpack_contractions=True,  # Unpack contractions (can't -> can not)
    spell_correct_elong=True,  # spell correction for elongated words
    
    # select a tokenizer. You can use SocialTokenizer, or pass your own
    # the tokenizer, should take as input a string and return a list of tokens
    tokenizer=SocialTokenizer(lowercase=True).tokenize,
    
    # list of dictionaries, for replacing tokens extracted from the text,
    # with other expressions. You can pass more than one dictionaries.
    dicts=[emoticons]
)

Reading twitter - 1grams ...
Reading twitter - 2grams ...
Reading twitter - 1grams ...


In [12]:
def annotation_normalization(raw):
    """Run ``raw`` through the shared ``text_processor`` and return one string.

    The items returned by ``pre_process_doc`` are joined with single spaces.
    """
    processed = text_processor.pre_process_doc(raw)
    return " ".join(processed)

### Remove punctuation

In [13]:
import string
def remove_punctuation(raw):
    """Return ``raw`` with every character of ``string.punctuation`` deleted."""
    # Mapping a character to None in a translation table deletes it.
    deletion_table = str.maketrans(dict.fromkeys(string.punctuation))
    return raw.translate(deletion_table)

### Spell check (from github)

In [14]:
import csv
import nltk
from ekphrasis.classes.spellcorrect import SpellCorrector
from nltk.corpus import words

# Two candidate locations of the slang dictionary; only `local` is opened below.
djurdja = "/home/ikrizanic/pycharm/zavrsni/data/slang.csv"
local = "/home/ivan/Documents/git_repos/Sentiment-Analysis-on-Twitter/data/slang.csv"

# Read the ';'-separated file into a dict. dict(reader) needs every row to have
# exactly two fields, otherwise it raises ValueError.
with open(local, mode='r') as infile:
    reader = csv.reader(infile, delimiter=';')
    slang_dict = dict(reader)

# Spell corrector used by spell_check() and spell_check_tokens().
sp = SpellCorrector(corpus="english")
nltk.download("words")
# Rebinds the imported name `words` (the NLTK corpus reader) to a set of its entries,
# so from here on `words` is a plain set used for membership tests.
words = set(words.words())
# Characters treated as punctuation by spell_check_tokens().
punctuations = '''!()-[]{};:\'"\,<>./?@#$%^&*_~'''


Reading english - 1grams ...


[nltk_data] Downloading package words to /home/ivan/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [15]:
def spell_check(raw):
    """Spell-correct a space-separated string word by word.

    Words found in the module-level ``words`` set are kept as they are; every other
    word is replaced by ``sp.correct(word)``. The result is re-joined with spaces.
    """
    return " ".join(
        word if word in words else sp.correct(word)
        for word in raw.split(" ")
    )

## Posttokenization

### Spell check on tokens

In [16]:
def spell_check_tokens(tokens):
    """Spell-correct a token list, dropping annotation tags and punctuation tokens.

    Each token is stripped of surrounding whitespace first. A token is dropped when
    it is empty after stripping, starts with ``<``, or starts with a character from
    the module-level ``punctuations`` string. Remaining tokens are kept if they are
    in the module-level ``words`` set and otherwise replaced by ``sp.correct(token)``.

    Parameters
    ----------
    tokens : iterable of str

    Returns
    -------
    list of str
    """
    correct_tokens = list()
    for token in tokens:
        word = token.strip()
        # Fix: splitting on " " produces empty tokens for consecutive spaces, and
        # indexing [0] on an empty string raised IndexError.
        if not word:
            continue
        if word[0] == "<":
            continue
        if word[0] in punctuations:
            continue
        if word in words:
            correct_tokens.append(word)
        else:
            correct_tokens.append(sp.correct(word))
    return correct_tokens

### Replace slang

In [17]:
def replace_slang(tokenized):
    """Expand slang tokens using the module-level ``slang_dict``.

    Tokens present in the module-level ``words`` set pass through untouched. Any
    other token is compared case-insensitively with every key of ``slang_dict``;
    on a match it is replaced by the mapped value split on spaces, and the
    resulting words are spliced into the output.

    Returns
    -------
    list
        The tokens with slang entries expanded.
    """
    expanded = []
    for token in tokenized:
        if token in words:
            expanded.append(token)
            continue
        candidate = token
        for slang, meaning in slang_dict.items():
            if str(slang).lower() == str(candidate).lower():
                candidate = meaning.split(" ")
        # A list here means a slang entry matched and was split into words.
        if type(candidate) is list:
            expanded.extend(candidate)
        else:
            expanded.append(candidate)
    return expanded

### Removing stopwords

In [18]:
def remove_stopwords(tokens):
    """Return the tokens whose lower-cased form is not an NLTK English stop word.

    Kept tokens retain their original casing.
    """
    english_stopwords = set(stopwords.words('english'))
    return [
        token
        for token in tokens
        if (token if token.islower() else token.lower()) not in english_stopwords
    ]

### Lemmatizer

In [19]:
from nltk.stem import WordNetLemmatizer 
lemmatizer = WordNetLemmatizer() 

def lemmatize(tokens):
    """Return a list with ``lemmatizer.lemmatize`` applied to each token in order."""
    return [lemmatizer.lemmatize(token) for token in tokens]

### Build vocab

In [20]:
def build_vocab(data):
    """Assign consecutive integer ids, starting at 1, to words in first-seen order.

    Parameters
    ----------
    data : iterable of iterables of hashable
        Tokenized sentences.

    Returns
    -------
    dict
        ``{word: id}``; id 0 is never assigned.
    """
    vocab = {}
    for sentence in data:
        for word in sentence:
            # The next free id is always one more than the number of known words.
            vocab.setdefault(word, len(vocab) + 1)
    return vocab

### Encode sentence

In [62]:
def encode_sentence(sentence, vocab):
    """Translate each word of ``sentence`` into its id from ``vocab``.

    Words missing from ``vocab`` are encoded as 0.
    """
    return [vocab.get(word, 0) for word in sentence]

In [63]:
def encode_data(data, vocab):
    """Encode every sentence in ``data`` with ``encode_sentence`` and return the list."""
    return [encode_sentence(sentence, vocab) for sentence in data]

### Split data

In [139]:
from sklearn.model_selection import train_test_split
def split_train_validate_test(data, labels, train_valtest_ratio, validate_test_ratio, random_state=42):
    """Split ``data``/``labels`` into train, validation and test parts in two steps.

    Parameters
    ----------
    data, labels
        Parallel collections accepted by ``train_test_split``.
    train_valtest_ratio
        Passed as ``test_size`` of the first split, i.e. the share held out for
        validation and test together.
    validate_test_ratio
        Passed as ``test_size`` of the second split, i.e. the share of the held-out
        part that becomes the test set.
    random_state
        Seed used for both splits.

    Returns
    -------
    tuple
        ``(X_train, X_validate, X_test, y_train, y_validate, y_test)``.
    """
    # Step 1: separate the training part from everything that is held out.
    first_split = train_test_split(data, labels, test_size=train_valtest_ratio, random_state=random_state)
    X_train, X_held_out, y_train, y_held_out = first_split

    # Step 2: divide the held-out part into validation and test.
    second_split = train_test_split(X_held_out, y_held_out, test_size=validate_test_ratio, random_state=random_state)
    X_validate, X_test, y_validate, y_test = second_split

    return X_train, X_validate, X_test, y_train, y_validate, y_test

# Data processing

## Producing dataset

### New dataset

In [138]:
import copy
# Build one record per tweet with two parallel representations:
#   "tweet"/"tweet_tokens": placeholder-substituted, punctuation-free text and its spaCy tokens
#   "anot"/"anot_tokens":   ekphrasis-annotated text and its cleaned, lemmatized tokens
dataset_raw = list()

# The loop reads tweets[i+1], so the first element of `tweets` is never processed
# and len(tweets) - 1 records are produced.
for i in tqdm(range(len(tweets) - 1)):
    new_tweet = repaire_chars(tweets[i+1])
    # Keep the repaired text for the annotation branch before the plain branch modifies it.
    anot = copy.deepcopy(new_tweet)
    
    # Plain branch: placeholders for users/links, then strip punctuation.
    new_tweet = remove_usernames(new_tweet)
    new_tweet = remove_links(new_tweet)
    new_tweet = remove_punctuation(new_tweet)
    
    tweet_tokens = tokenize(new_tweet, tokenizer="spacy")
    tweet_tokens = remove_stopwords(tweet_tokens)
    
    # Annotation branch: normalize/annotate, split on spaces, then spell check,
    # slang expansion, stop-word removal and lemmatization, in that order.
    anot = annotation_normalization(anot)
    anot_tokens = tokenize(anot, tokenizer="split")
    anot_tokens = spell_check_tokens(anot_tokens)
    anot_tokens = replace_slang(anot_tokens)
    anot_tokens = remove_stopwords(anot_tokens)
    anot_tokens = lemmatize(anot_tokens)
    
    
    
    dataset_raw.append({"tweet": new_tweet, "tweet_tokens": tweet_tokens, "anot": anot, "anot_tokens": anot_tokens})
#     dataset.append({"tweet": new_tweet, "anot_tokens": anot_tokens})
    

100%|██████████| 12257/12257 [01:12<00:00, 167.93it/s]


### Pickle

#### Load dataset from file

In [151]:
import pickle
# Load the previously dumped list of processed tweet records.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
# load_file = open("/home/ikrizanic/pycharm/zavrsni/data/dataset_dump.pkl", "rb")
# The context manager closes the handle even if unpickling raises.
with open("/home/ivan/Documents/git_repos/Sentiment-Analysis-on-Twitter/data/dataset_dump.pickle", "rb") as load_file:
    dataset_raw = pickle.load(load_file)

#### Dump dataset to file

### Print dataset

In [109]:
# Print the cleaned annotation tokens of every record (one list per line).
# The commented-out lines print the other record fields for debugging.
# NOTE(review): this prints the whole dataset; consider slicing dataset_raw.
for d in dataset_raw:
#     print(d["tweet"])
#     print(d["tweet_tokens"])
#     print(d["anot"])
    print(d["anot_tokens"])
#     print(" ".join(d["anot_tokens"]))
#     print("="*50)


['ariana', 'grande', 'kiis', 'f**k', 'truly', 'cd', 'listening', 'party', 'burbank', 'ariana', 'grande']
['ariana', 'grande', 'white', 'house', 'easter', 'egg', 'roll', 'washington', 'ariana', 'grande']
['cd', 'music', 'ariana', 'grande', 'sweet', 'like', 'candy', 'oz', 'ml', 'sealed', 'box', 'authentic', 'new']
['side', 'side', 'side', 'side', 'ariana', 'grande', 'musically', 'comunidad', 'gay', 'lgbt', 'lot']
['hairspray', 'live', 'preview', 'macy', 'thanksgiving', 'day', 'parade', 'ariana', 'grande', 'television', 'nbc']
['lindsay', 'lohan', 'feeling', 'thankful', 'blasting', 'ariana', 'grande', 'wearing', 'much']
['hate', 'love', 'song', 'dammit', 'ariana', 'grande']
['ariana', 'grande', 'right', 'f**k', 'big', 'sean', 'アリアナ', 'ariana', 'grande']
['one', 'would', 'prefer', 'listen', 'whole', 'day', 'could', 'never', 'choose', 'ariana', 'grande', 'side', 'side', 'song', 'poll']
['booty', 'baby', 'ari', 'ariana', 'grande', 'princess', 'ari', 'booty', 'baby', 'dangerous', 'woman', 'to

['overturning', 'electoral', 'college', 'faithless', 'electoral', 'college', 'elector', 'anything', 'avert', 'planet', 'destroying', 'calamity']
['anti', 'trump', 'organizer', 'want', 'electoral', 'college', 'vote', 'election', 'electoral', 'college', 'buzz']
['u', 'electoral', 'college', 'rubber', 'stamp', 'say', 'talbott', 'news', 'timesofindia']
['electoral', 'college', 'member', 'try', 'take', 'victory', 'trump']
['finally', 'someone', 'understands', 'electoral', 'college']
['electoral', 'college', 'mean', 'hamilton', 'elector']
['understand', 'electoral', 'college', 'set']
['electoral', 'college', 'really', 'acting', 'benefit', 'people', 'legally']
['trump', 'unguarded', 'twitter', 'ranting', 'china', 'genuinely', 'dangerous', 'point', 'electoral', 'college', 'bar', 'someone', 'like']
['scare']
['sally', 'field', 'understand', 'electoral', 'college', 'instead', 'blabbing', 'stupid', 'opinion', 'pres', 'trump', 'go', 'prather', 'univ']
['blocking', 'wi', 'mi', 'pa', 'fl', 'appointi

['trump', 'transition', 'news', 'trump', 'pick', 'deputy', 'national', 'security', 'adviser', 'white', 'house', 'counsel']
['hearing', 'lot', 'fake', 'news', 'trump', 'transition', 'activity', 'doubt', 'russia']
['think', 'stop', 'lamenting', 'pace', 'trump', 'transition', 'quite', 'clear', 'real', 'transition']
['damn', 'smooth', 'transition', 'nothing', 'smooth', 'racism', 'trump', 'give']
['still', 'tune', 'watch', 'talk', 'trump', 'transition']
['hillary', 'need', 'fire', 'transition', 'team', 'would', 'certainly', 'get', 'trump', 'back', 'tweet']
['press', 'critical', 'pres', 'elect', 'trump', 'done', 'lot', 'first', 'week', 'transition', 'medium', 'fails']
['sen', 'warren', 'want', 'investigation', 'trump', 'transition', 'team', 'would', 'look', 'holder', 'powell', 'hillary', 'aide', 'afraid']
['trump', 'state', 'transitiondavid', 'petraeus', 'john', 'bolton', 'would', 'strong', 'experienced', 'diplomat', 'editorial']
['pocahontas', 'looking', 'america', 'want', 'b', 'sure', 'tru

### Data frame

In [60]:
# Show up to 200 characters per column when displaying frames.
pd.options.display.max_colwidth = 200
%matplotlib inline
# One entry per record: its list of annotation tokens.
# NOTE(review): the token lists differ in length, so this is a ragged input to
# np.array - confirm it behaves as intended on the installed NumPy version.
corpus = np.array([d["anot_tokens"] for d in dataset_raw])
# Rebinds `labels` from a list to an ndarray.
labels = np.array(labels)
# pandas requires `corpus` and `labels` to have the same length here.
corpus_df = pd.DataFrame({"text": corpus,
                         "label": labels})
# Fix the column order to (text, label).
corpus_df = corpus_df[['text', 'label']]

### Create vocab and encode tweets

In [152]:
vocab = build_vocab([d["anot_tokens"] for d in dataset_raw])

In [28]:
encoded_data = encode_data([d["anot_tokens"] for d in dataset_raw], vocab)

In [140]:
# Encodes the token lists currently held in dataset_raw with `vocab`.
# NOTE(review): the right-hand side is identical to the one that builds
# `encoded_data`, so this only yields test-set encodings if dataset_raw was rebound
# to the processed test tweets before running this cell - confirm execution order.
encoded_test_data = encode_data([d["anot_tokens"] for d in dataset_raw], vocab)

In [141]:
encoded_test_data

[[1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2],
 [1, 2, 10, 11, 12, 13, 14, 15, 1, 2],
 [6, 16, 1, 2, 17, 18, 19, 20, 21, 22, 23, 24, 25],
 [26, 26, 26, 26, 1, 2, 27, 28, 29, 30, 31],
 [32, 33, 34, 35, 36, 37, 38, 1, 2, 39, 40],
 [41, 42, 43, 44, 45, 1, 2, 46, 47],
 [48, 49, 50, 51, 1, 2],
 [1, 2, 52, 4, 53, 54, 55, 1, 2],
 [56, 57, 58, 59, 60, 37, 61, 62, 63, 1, 2, 26, 26, 50, 64],
 [65, 66, 67, 1, 2, 68, 67, 65, 66, 69, 70, 71, 69, 70],
 [41, 42, 72, 73, 74, 75, 76, 77, 1, 2, 78, 79],
 [80, 81, 82, 83, 84, 1, 2],
 [1, 2, 85, 86, 1, 2],
 [16, 1, 2, 87, 88, 6, 89, 90, 91],
 [92, 1, 2, 93, 94, 88, 6, 95, 96, 97, 98, 99],
 [100, 50, 101, 102, 103, 104, 100, 50, 105, 1, 2, 83, 16],
 [1, 2, 106, 107, 108, 88, 1, 2],
 [92, 1, 2, 87, 88, 6, 89, 97, 98, 99],
 [56, 109, 110, 1, 2],
 [25, 111, 112, 93, 113, 1, 2, 114, 115, 116, 117, 118, 119, 120],
 [47, 49, 70, 121, 1, 2],
 [1, 122, 123, 1, 1, 2],
 [1, 2, 124, 125, 126, 127, 128, 129, 1, 2],
 [130, 131, 132, 36, 1, 2, 133],
 [16, 1, 2, 134, 1, 2, 66],
 [2

# Feature extraction

## Pad encoded tweets

In [118]:
def pad_features(encoded, seq_length):
    """Left-pad with zeros, or truncate, every encoded sequence to ``seq_length``.

    Parameters
    ----------
    encoded : sequence of sequences of int
        Encoded sentences of varying length.
    seq_length : int
        Number of columns of the result.

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(len(encoded), seq_length)``. Shorter sequences
        are right-aligned with leading zeros; longer ones keep only their first
        ``seq_length`` items.
    """
    features = np.zeros((len(encoded), seq_length), dtype=int)
    for i, review in enumerate(encoded):
        # Fix: truncation used a hard-coded 28, which raised for seq_length < 28
        # (negative padding size) and cut too much for seq_length > 28.
        kept = review[:seq_length]
        if len(kept) > 0:
            # Right-align the sequence; the columns before it stay zero.
            features[i, seq_length - len(kept):] = kept

    return features

In [30]:
# Length of the longest encoded training tweet; every row is padded to this length.
max_len = max(len(d) for d in encoded_data)
features = pad_features(encoded_data, max_len)

In [142]:
# Overwrites max_len with the longest encoded test tweet ...
max_len = max(x for x in [len(d) for d in encoded_test_data])
# ... but the call below ignores it and pads/truncates to a literal 28.
# NOTE(review): presumably 28 is the training max_len / model input_length - confirm.
test_features = pad_features(encoded_test_data, 28)

In [115]:
test_features

array([[    0,     0,     0, ...,     9,     1,     2],
       [    0,     0,     0, ...,    15,     1,     2],
       [    0,     0,     0, ...,    23,    24,    25],
       ...,
       [    0,     0,     0, ...,  2343, 15319, 15320],
       [    0,     0,     0, ...,  2493, 15393,  5326],
       [    0,     0,     0, ..., 15320,   389, 15396]])

In [31]:
x_train, x_validate, x_test, y_train, y_validate, y_test = split_train_validate_test(features, labels, 0.4, 0.5)

## Embed vocabulary

In [32]:
def embed_vocab(vocab, embedding_dim = 300):
    """Build an embedding matrix for ``vocab`` from the vectors of the ``nlp`` pipeline.

    Parameters
    ----------
    vocab : dict
        ``{word: row_index}``.
    embedding_dim : int
        Number of columns of the matrix.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(vocab) + 1, embedding_dim)``. Row ``i`` holds the
        vector of the word with index ``i``; rows of words without a vector
        stay zero.
    """
    embedding_matrix = np.zeros((len(vocab) + 1, embedding_dim))
    found = 0
    missing = 0
    for word, row in vocab.items():
        doc = nlp(word)
        if not doc.has_vector:
            missing += 1
            continue
        embedding_matrix[row] = doc.vector
        found += 1

    print("Converted %d words (%d misses)" % (found, missing))
    return embedding_matrix

In [145]:
embedding_matrix = embed_vocab(vocab)

KeyboardInterrupt: 

In [146]:
# Load a previously dumped embedding matrix instead of recomputing it.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
# The context manager closes the handle even if unpickling raises.
with open("/home/ivan/Documents/git_repos/Sentiment-Analysis-on-Twitter/data/embedding_matrix.pickle", "rb") as load_file:
    embedding_matrix = pickle.load(load_file)

In [34]:
# Persist the embedding matrix. The context manager guarantees the data is flushed
# and the handle closed even if pickling raises.
with open("/home/ivan/Documents/git_repos/Sentiment-Analysis-on-Twitter/data/embedding_matrix.pickle", "wb") as file:
    pickle.dump(embedding_matrix, file)

### Spacy glove

In [26]:
# doc_glove_vectors = np.array([nlp(str(doc)).vector for doc in corpus])

### Dump features

In [None]:
# Persist the spaCy feature array. The context manager guarantees the handle is
# closed even if pickling raises.
# NOTE(review): `feature_array` is not assigned in any visible cell, and opening
# with "wb" truncates the existing file before that NameError surfaces - confirm
# where feature_array is meant to come from before running this.
with open("/home/ivan/Documents/git_repos/Sentiment-Analysis-on-Twitter/data/spacy_features.pickle", "wb") as write_file:
    pickle.dump(feature_array, write_file)

### Load features

In [13]:
# Load dumped features. This rebinds `features`, replacing any array currently
# held under that name.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
# The context manager closes the handle even if unpickling raises.
with open("/home/ivan/Documents/git_repos/Sentiment-Analysis-on-Twitter/data/spacy_features.pickle", "rb") as load_file:
    features = pickle.load(load_file)

# Model

## Split features into train, validation and test sets

In [87]:
x_train, x_validate, x_test, y_train, y_validate, y_test = split_train_validate_test(features, labels, 0.2, 0.1)

In [35]:
import torch
from torch.utils.data import DataLoader, TensorDataset# create Tensor datasets

# Wrap each (features, labels) pair as a TensorDataset; torch.from_numpy requires
# the inputs to be NumPy arrays.
train_data = TensorDataset(torch.from_numpy(x_train), torch.from_numpy(y_train))
valid_data = TensorDataset(torch.from_numpy(x_validate), torch.from_numpy(y_validate))
test_data = TensorDataset(torch.from_numpy(x_test), torch.from_numpy(y_test))

batch_size = 50

# Shuffling batch loaders for the three splits.
# NOTE(review): the training code visible in this notebook uses Keras and does not
# reference these loaders - confirm they are still needed.
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
valid_loader = DataLoader(valid_data, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_data, shuffle=True, batch_size=batch_size)

In [150]:
len(vocab)

15396

In [71]:
# obtain one batch of training data
# dataiter = iter(train_loader)
# sample_x, sample_y = dataiter.next()
# print('Sample input size: ', sample_x.size()) # batch_size, seq_length
# print('Sample input: \n', sample_x)
# print()
# print('Sample label size: ', sample_y.size()) # batch_size
# print('Sample label: \n', sample_y)

## Model class

In [165]:
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout, Masking, Embedding
import tensorflow as tf
# Sequential LSTM classifier over padded, integer-encoded tweets.
model = Sequential()
# Recall is the only metric tracked during training/evaluation.
m = tf.keras.metrics.Recall()
# Embedding layer: len(vocab) + 1 rows because vocabulary ids start at 1 and id 0
# is used for padding/unknown words; initialised from embedding_matrix and frozen.
# input_length must equal the padded sequence length of the features.
model.add(
    Embedding(input_dim=len(vocab) + 1,
              input_length =28,
              output_dim=300,
              weights=[embedding_matrix],
              trainable=False,
              mask_zero=True))

# Masking layer for pre-trained embeddings
# NOTE(review): the Embedding above already sets mask_zero=True - confirm this
# extra layer is needed.
model.add(Masking(mask_value=0.0))

# Recurrent layer
model.add(LSTM(128, return_sequences=False, dropout=0.01, recurrent_dropout=0.01))

# Fully connected layer
model.add(Dense(128, activation='relu'))

# Dropout for regularization
model.add(Dropout(0.5))

# Output layer: one unit per sentiment class (0, 1, 2)
model.add(Dense(3, activation='softmax'))

# Compile the model
# NOTE(review): hinge loss is combined with a softmax output and one-hot targets
# here - confirm this is intended rather than a categorical loss.
model.compile(
    optimizer='adam', loss=tf.keras.losses.Hinge(reduction="auto", name="hinge"), metrics=[m])

In [37]:
from keras.callbacks import EarlyStopping, ModelCheckpoint

# Create callbacks: stop training once val_loss has not improved for 5 epochs,
# and write the model to ../models/model.h5 through ModelCheckpoint.
callbacks = [EarlyStopping(monitor='val_loss', patience=5), 
             ModelCheckpoint('../models/model.h5')]

In [166]:
model.save_weights('../models/weights.h5')

## Training the network

In [88]:
print(len(x_test))
print(len(x_train))
print(len(x_validate))
print(len(y_test))
print(len(y_train))
print(len(y_validate))

793
31685
7129
793
31685
7129


In [89]:
from keras.utils import to_categorical
y_train = to_categorical(y_train)
y_test = to_categorical(y_test)
y_validate = to_categorical(y_validate)

In [134]:
test_labels

array([[0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       ...,
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.]], dtype=float32)

In [133]:
# One-hot encode the integer test labels. The name is rebound in place, so running
# this cell a second time applies to_categorical to the already-encoded array.
test_labels = to_categorical(test_labels)

In [183]:
labels

[0,
 0,
 1,
 1,
 1,
 1,
 1,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 1,
 1,
 1,
 1,
 2,
 2,
 2,
 2,
 2,
 2,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 1,
 1,
 1,
 1,
 1,
 2,
 2,
 2,
 2,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 2,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 2,
 2,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 2,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 2,
 2,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 2,
 1,
 1,
 1,
 1,
 2,
 2,
 2,
 2,
 2,
 2,
 0,
 0,
 1,
 1,
 1,
 1,
 2,
 2,
 2,
 2,
 0,
 1,
 1,
 1,
 1,
 1,
 2,
 2,
 2,
 2,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 1,
 1,
 1,
 1,


In [226]:
loss = tf.keras.losses.SquaredHinge(reduction="auto", name="squared_hinge")

In [227]:
from sklearn.model_selection import KFold
# Per-fold evaluation results ({metric name: value}) on the SemEval test set.
result = list()

# 5 shuffled folds; no random_state is given, so fold membership changes between runs.
kf = KFold(n_splits=5, shuffle=True)
# The return value of this call is not used.
kf.get_n_splits(features)
# Rebinds `labels` to an ndarray so it can be indexed with the fold index arrays.
labels = np.array(labels)

for train_index, test_index in kf.split(features):
    X_train, X_test = features[train_index], features[test_index]
    y_train, y_test = labels[train_index], labels[test_index]
    
    # One-hot encode the integer labels of this fold.
    y_train = to_categorical(y_train)
    y_test = to_categorical(y_test)
    
    # A fresh model and metric are built for every fold.
    model = Sequential()
    m = tf.keras.metrics.Recall()
    # Embedding layer: initialised from embedding_matrix and frozen; row 0 is the
    # padding/unknown index.
    model.add(
        Embedding(input_dim=len(vocab) + 1,
                  input_length =28,
                  output_dim=300,
                  weights=[embedding_matrix],
                  trainable=False,
                  mask_zero=True))

    # Masking layer for pre-trained embeddings
    model.add(Masking(mask_value=0.0))

    # Recurrent layer
    model.add(LSTM(128, return_sequences=False, 
                   dropout=0.01, recurrent_dropout=0.01))

    # Fully connected layer
    model.add(Dense(128, activation='relu'))

    # Dropout for regularization
    model.add(Dropout(0.3))

    # Output layer
    model.add(Dense(3, activation='softmax'))

    # Compile the model with the module-level `loss` object
    model.compile(
    optimizer='adam', loss=loss, metrics=[m])
       
    # The held-out fold serves as validation data (it drives the EarlyStopping
    # callback); the same `callbacks` list is reused for every fold.
    model.fit(X_train,  y_train, 
                batch_size=2048, epochs=150,
                callbacks=callbacks,
                validation_data=(X_test, y_test))
    # The score recorded per fold is measured on test_features/test_labels, not on
    # the held-out fold.
    result.append(dict(zip(model.metrics_names, model.evaluate(test_features, test_labels))))

Train on 31685 samples, validate on 7922 samples
Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Train on 31685 samples, validate on 7922 samples
Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Train on 31686 samples, validate on 7921 samples
Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Train on 31686 samples, validate on 7921 samples
Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14

In [239]:
# Take the second metric of each of the 5 folds (the recall value; the first is
# the loss) and report its mean and standard deviation as percentages.
res = np.array([list(result[i].values())[1] for i in range(5)])
print("Rezultat cross-validacije: {}%    +/-    {}% ".format(res.mean() * 100, res.std() * 100))
result


Rezultat cross-validacije: 66.67494297027588%    +/-    0.7234835165949747% 


[{'loss': 1.4108915974384442, 'recall_37': 0.6744864583015442},
 {'loss': 1.4013862124836989, 'recall_38': 0.6542628407478333},
 {'loss': 1.413778785693662, 'recall_39': 0.6731648445129395},
 {'loss': 1.4114211455560133, 'recall_40': 0.6646140813827515},
 {'loss': 1.4201690240134672, 'recall_41': 0.6672189235687256}]

## Test on semeval test data

### Load/dump test data

In [None]:
import pickle
# Load the dumped SemEval test features and labels.
# Fix: the original closed the unrelated `load_file` handle and never closed the two
# files opened here; each file is now closed by its own context manager.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open("/home/ivan/Documents/git_repos/Sentiment-Analysis-on-Twitter/data/semeval_test_embed.pickle", "rb") as load_embed:
    semeval_test_embed = pickle.load(load_embed)
# NOTE(review): the variable name is misspelled ("lebels"); kept as is because other
# cells may refer to it.
with open("/home/ivan/Documents/git_repos/Sentiment-Analysis-on-Twitter/data/semeval_test_labels.pickle", "rb") as load_labels:
    semeval_test_lebels = pickle.load(load_labels)

In [158]:
# Dump the padded SemEval test features and the test labels.
# Fix: the original closed the unrelated `file` handle and never closed the two files
# written here, so the data was not guaranteed to be flushed to disk. Each file is
# now closed by its own context manager.
with open("/home/ivan/Documents/git_repos/Sentiment-Analysis-on-Twitter/data/semeval_test_embed.pickle", "wb") as load_embed:
    pickle.dump(test_features, load_embed)
with open("/home/ivan/Documents/git_repos/Sentiment-Analysis-on-Twitter/data/semeval_test_labels.pickle", "wb") as load_labels:
    pickle.dump(test_labels, load_labels)

In [220]:
test_result = model.evaluate(test_features, test_labels)
dict(zip(model.metrics_names, test_result))



{'loss': 0.3287860300701011, 'recall_29': 0.6172352433204651}