# Preparation

## Imports

In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import warnings; warnings.simplefilter('ignore')
import re

import pickle

# NLTK
import nltk
import codecs
from nltk.tokenize import PunktSentenceTokenizer,sent_tokenize, word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer, PorterStemmer
# nltk.download("stopwords")
# nltk.download("punkt")
# nltk.download("wordnet")

# Spacy
import spacy
nlp = spacy.load('en_core_web_lg', disable=["parser", "ner"])

## Loading data

In [2]:
dataset_name = "main" + "_data"
djurdja_paths = {"dataset": str("~/pycharm/zavrsni/data/" + dataset_name + ".csv"),
                 "labels": "/home/ikrizanic/pycharm/zavrsni/data/labels.txt"}
local_paths = {"dataset": str("/home/ivan/Documents/git_repos/Sentiment-Analysis-on-Twitter/data/" + dataset_name + ".csv"),
               "labels": "/home/ivan/Documents/git_repos/Sentiment-Analysis-on-Twitter/data/labels.txt"}


raw_data_merge = pd.read_csv(local_paths["dataset"], sep="\t", names=["label", "text"])
emoticons_file = pd.read_csv('../data/emoticons.txt', sep="  ->  ", names=["emoji", "meaning"])

In [3]:
def remove_duplicates(csv_data):
    clean_data = dict()
    for i in tqdm((range(len(csv_data)))):
        if csv_data.text[i] not in clean_data and isinstance(csv_data.text[i], str):
            clean_data.update({csv_data.text[i]: csv_data.label[i]})
    return clean_data

data = remove_duplicates(raw_data_merge)

tweets = list()
labels = list()

for text in data.keys():
    tweets.append(text)
    
for polarity in data.values():
    if polarity == "positive":
        labels.append(2)
    elif polarity == "neutral":
        labels.append(1)
    else:
        labels.append(0)
        
labels.pop(0)

100%|██████████| 40256/40256 [00:02<00:00, 18851.73it/s]


0

In [45]:
print(len(labels))
len(tweets)

39607


39608

# Special functions

## Tokenize

In [8]:
import spacy
nlp = spacy.load('en_core_web_lg', disable=["parser", "ner"])

In [5]:
def tokenize(raw, tokenizer="split"):
    if tokenizer == "spacy":
        return [token.text for token in nlp.tokenizer(raw)]
    if tokenizer == "split":
        return raw.split(" ")

# Hooks

## Pretokenization

### Removing links

In [6]:
def remove_links(raw):
    return re.sub(r'http.*\b', '[URL]', raw)

### Repairing some lost characters

In [9]:
def repaire_chars(raw):
    raw = re.sub(r'\\u2019', "\'", raw)
    raw = re.sub(r'\\u002c', ',', raw)
    raw = re.sub(r'&lt', '>', raw)
    raw = re.sub(r'&gt', '<', raw)        
    raw = re.sub(r'&amp;', '&', raw)
    raw = re.sub(r'\\\"\"', '', raw)
    raw = re.sub(r'\"\"\\', '', raw)
    return raw

### Removing usernames

In [10]:
def remove_usernames(raw):
    return re.sub(r'@[^\s]*', '[USER]', raw)

### Replacing unuseful emoticons

In [11]:
def replace_unuseful_emoticons(raw):
    for k,v in EMOTICONS.items():
        if k in useful_emoticons.keys() and k in raw:
            raw.replace(k,v)
    return raw

### Annotation and normalization (from github)

In [12]:
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons


text_processor = TextPreProcessor(
    # terms that will be normalized
    normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
        'time', 'url', 'date', 'number'],
    # terms that will be annotated
    annotate={"hashtag", "allcaps", "elongated", "repeated",
        'emphasis', 'censored'},
    fix_html=True,  # fix HTML tokens
    
    # corpus from which the word statistics are going to be used 
    # for word segmentation 
    segmenter="twitter", 
    
    # corpus from which the word statistics are going to be used 
    # for spell correction
    corrector="twitter", 
    
    unpack_hashtags=True,  # perform word segmentation on hashtags
    unpack_contractions=True,  # Unpack contractions (can't -> can not)
    spell_correct_elong=True,  # spell correction for elongated words
    
    # select a tokenizer. You can use SocialTokenizer, or pass your own
    # the tokenizer, should take as input a string and return a list of tokens
    tokenizer=SocialTokenizer(lowercase=True).tokenize,
    
    # list of dictionaries, for replacing tokens extracted from the text,
    # with other expressions. You can pass more than one dictionaries.
    dicts=[emoticons]
)

Reading twitter - 1grams ...
Reading twitter - 2grams ...
Reading twitter - 1grams ...


In [13]:
def annotation_normalization(raw):
    return " ".join(text_processor.pre_process_doc(raw))

### Remove punctation

In [14]:
import string
def remove_punctuation(raw):
    return raw.translate(str.maketrans('', '', string.punctuation))

### Spell check (from github)

In [15]:
import csv
import nltk
from ekphrasis.classes.spellcorrect import SpellCorrector
from nltk.corpus import words

djurdja = "/home/ikrizanic/pycharm/zavrsni/data/slang.csv"
local = "/home/ivan/Documents/git_repos/Sentiment-Analysis-on-Twitter/data/slang.csv"

with open(local, mode='r') as infile:
    reader = csv.reader(infile, delimiter=';')
    slang_dict = dict(reader)

sp = SpellCorrector(corpus="english")
nltk.download("words")
words = set(words.words())
punctuations = '''!()-[]{};:\'"\,<>./?@#$%^&*_~'''


Reading english - 1grams ...


[nltk_data] Downloading package words to /home/ivan/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [16]:
def spell_check(raw):
    correct_raw = list()
    for word in raw.split(" "):
        if word in words:
            correct_raw.append(word)
        else:
            correct_raw.append(sp.correct(word))
    return " ".join(correct_raw)

## Posttokenization

### Spell check on tokens

In [17]:
def spell_check_tokens(tokens):
    correct_tokens = list()
    for token in tokens:
        if token.strip()[0] == "<":
            continue
        if token.strip()[0] in punctuations:
            continue
        if token.strip() in words:
            correct_tokens.append(token.strip())
        else:
            correct_tokens.append(sp.correct(token.strip()))
    return correct_tokens

### Replace slang

In [18]:
def replace_slang(tokenized):
    tokens = []
    for token in tokenized:
        if token not in words:
            for key, value in slang_dict.items():
                if str(key).lower() == str(token).lower():
                    token = value.split(" ")
            if type(token) is list:
                tokens.extend(token)
            else:
                tokens.append(token)
        else:
            tokens.append(token)
    return tokens

### Removing stopwords

In [19]:
def remove_stopwords(tokens):
    stop_words_set = set(stopwords.words('english'))
    new_tokens = []
    for token in tokens:
        token_lower = token if token.islower() else token.lower()
        if token_lower not in stop_words_set:
            new_tokens.append(token)
    return new_tokens

### Lemmatizer

In [20]:
from nltk.stem import WordNetLemmatizer 
lemmatizer = WordNetLemmatizer() 

def lemmatize(tokens):
    lem = list()
    for token in tokens:
        lem.append(lemmatizer.lemmatize(token))
    return lem

### Build vocab

In [4]:
def build_vocab(data):
    vocab = dict()
    index = 1
    for sent in data:
        for word in sent:
            if word not in vocab.keys():
                vocab.update({word: index})
                index += 1     
    return vocab

### Encode sentence

In [5]:
def encode_sentence(sentence, vocab):
    encoded = list()
    for word in sentence:
        encoded.append(vocab[word])
    return encoded

In [6]:
def encode_data(data, vocab):
    encoded_data = list()
    for sent in data:
        encoded_data.append(encode_sentence(sent, vocab))
    return encoded_data

# Data loading

## Loading dataset

### New dataset

In [154]:
import copy
dataset_raw = list()

for i in tqdm(range(len(tweets) - 1)):
    new_tweet = repaire_chars(tweets[i+1])
    anot = copy.deepcopy(new_tweet)
    
    new_tweet = remove_usernames(new_tweet)
    new_tweet = remove_links(new_tweet)
    new_tweet = remove_punctuation(new_tweet)
    
    tweet_tokens = tokenize(new_tweet, tokenizer="spacy")
    tweet_tokens = remove_stopwords(tweet_tokens)
    
    anot = annotation_normalization(anot)
    anot_tokens = tokenize(anot, tokenizer="split")
    anot_tokens = spell_check_tokens(anot_tokens)
    anot_tokens = replace_slang(anot_tokens)
    anot_tokens = remove_stopwords(anot_tokens)
    anot_tokens = lemmatize(anot_tokens)
    
    
    
    dataset_raw.append({"tweet": new_tweet, "tweet_tokens": tweet_tokens, "anot": anot, "anot_tokens": anot_tokens})
#     dataset.append({"tweet": new_tweet, "anot_tokens": anot_tokens})
    

100%|██████████| 19/19 [00:00<00:00, 168.65it/s]


### Pickle

#### Load data

In [8]:
import pickle
# load_file = open("/home/ikrizanic/pycharm/zavrsni/data/dataset_dump.pkl", "rb")
load_file = open("/home/ivan/Documents/git_repos/Sentiment-Analysis-on-Twitter/data/dataset_dump.pickle", "rb")
dataset_raw = pickle.load(load_file)
load_file.close()

#### Dump data

In [None]:
file = open("/home/ivan/Documents/git_repos/Sentiment-Analysis-on-Twitter/data/dataset_dump.pickle", "wb")
pickle.dump(dataset_raw, file)
file.close()

### Print data

In [6]:
for d in dataset_raw:
#     print(d["tweet"])
#     print(d["tweet_tokens"])
#     print(d["anot"])
    print(d["anot_tokens"])
#     print(" ".join(d["anot_tokens"]))
#     print("="*50)


['forgot', 'sad', 'first', 'episode', 'th', 'season', 'dexter', 'depressing', 'dexter', 'dark', 'passenger']
['oct', 'almost', 'far', 'away', 'wait', 'return', 'dexter']
['topic', 'episode', 'first', 'disk', 'dexter', 'please', 'hurry', 'fit', 'web', 'log']
['finally', 'catching', 'dexter', 'season', 'almost', 'finished', 'season', 'today', 'home', 'ill']
['downloaded', 'first', 'season', 'dexter', 'going', 'start', 'watching', 'episode', 'time', 'tweet', 'twit', 'twit']
['let', 'u', 'see', 'wrap', 'season', 'three', 'dexter', 'tonight', 'episode', 'go', 'till', 'season', 'finale', 'roll', 'series', 'four', 'please']
['retweet', 'st', 'rule', 'code', 'get', 'caught', 'dexter']
['word', 'tonight', 'night', 'dexter']
['honestly', 'say', 'dexter', 'good', 'show', 'may', 'order', 'showtime']
['feel', 'obliged', 'discus', 'dexter', 'tonight', 'living', 'large', 'brighton', 'please', 'use', 'direct', 'message']
['make', 'sure', 'girl', 'watch', 'season', 'dexter', 'season', 'premiere', 'seas

['spotted', 'saw', 'elvis', 'beginning', 'like', 'temporary', 'th', 'floor', 'office']
['got', 'rockdale', 'today', 'going', 'back', 'houston', 'tomorrow', 'thank', 'god']
['got', 'sweet', 'shout', 'jon', 'huntsman', 'jr', 'speech', 'st', 'louis', 'monday', 'night', 'well', 'done']
['pay', 'gate', 'saturday', 'bolton']
['china', 'open', 'cultural', 'centre', 'nepal', 'kathmandu', 'Sure', 'china', 'going', 'open', 'cultural', 'cen']
['jay', 'cutler', 'good', 'th', 'qtr', 'others', 'good', 'question', 'espn', 'could', 'worse', 'though', 'could', 'tony', 'romo']
['pacer', 'fan', 'going', 'fun', 'saturday']
['youtube', 'improves', 'upload', 'process', 'optional', 'notification', 'new', 'tag', 'editor', 'google', 'thursday', 'announced', 'th']
['ay', 'still', 'want', 'get', 'pair', 'ticket', 'sunderland', 'match', 'jan', 'ko']
['congratulation', 'scoring', 'first', 'goal', 'swansea', 'yesterday', 'may', 'say', 'look', 'exactly', 'like', 'jonas', 'gutierrez']
['lot', 'work', 'go', 'showing',

['wait', 'spring', 'breaker', 'coming', 'worldwide', 'release', 'date', 'u']
['da', 'bull', 'good', 'rose', 'get', 'back', 'figure', 'b', 'solid', 'th', 'seed', 'return']
['morris', 'hall', 'went', 'dumb', 'tha', 'whole', 'yr', 'memory']
['dtn', 'bangladesh', 'khaleda', 'fly', 'delhi', 'sunday', 'bnp', 'considers', 'visit', 'significant', 'taking', 'place']
['masami', 'si', 'moon', 'embrace', 'sun', 'love', 'rain', 'scent', 'woman', 'fashion', 'king', 'king', 'heart', 'queen', 'hyun', 'man']
['spotify', 'sean', 'parker', 'daniel', 'ek', 'make', 'music', 'noise', 'full', 'interview', 'video', 'th', 'thing', 'dig', 'hyper', 'text', 'transfer', 'protocol', 'bit', 'l']
['think', 'may', 'wasting', 'time', 'leave', 'alone', 'come', 'around', 'go', 'around', 'karm', 'bitch']
['watching', 'fashion', 'king', 'hmm', 'moon', 'embrace', 'sun', 'ito', 'nmn', 'ang', 'pagkakaabalahan', 'ko', 'hooked', 'n', 'ako', 'agad', 'sa', 'st', 'episode']
['love', 'west', 'leeds', 'festival', 'going', 'pudsey', 

['loveiseunhaehyuk', 'dunham', 'icn', 'nanjing', 'geez', 'glued', 'hip', 'whole', 'time']
['ah', 'bee', 'charmer', 'touch', 'wood', 'make', 'lp', 'next', 'friday', 'work', 'depending', 'x', 'x']
['pressor', 'cesc', 'difficult', 'arsenal', 'knew', 'going', 'play', 'every', 'saturday', 'captain', 'team', 'crack']
['hopefully', 'game', 'chardon', 'tomorrow', 'come', 'win', 'tiger']
['original', 'planned', 'release', 'date', 'blue', 'diamond', 'instead', 'nike', 'drop', 'two', 'new', 'bron', 'style', 'day', 'rose', 'launch']
['galaxy', 'available', 'select', 'store', 'weekend', 'metropcs', 'store', 'online', 'monday', 'thank', 'jg']
['haha', 'bucs', 'c', 'mon', 'wolf', 'way', 'better', 'love', 'rubio', 'ak', 'derrick', 'williams', 'b', 'roy', 'shved', 'good']
['late', 'october', 'best', 'picture', 'oscar', 'prediction', 'waiting', 'coverage', 'oscar', 'reached']
['mom', 'watching', 'trayvon', 'martin', 'trial', 'shaking', 'head', 'may', 'say', 'fucked', 'care']
['q', 'teaser', 'oops', 'for

['fun', 'day', 'terra', 'nova', 'followed', 'ho', 'end', 'night', 'got', 'plenty', 'screen', 'time', 'look', 'dancing', 'solder']
['today', 'found', 'rob', 'henry', 'tore', 'acl', 'steve', 'job', 'stepping', 'colt', 'signed', 'kerry', 'collins', 'carter', 'leaked']
['monday', 'night', 'football', 'gary', 'neville', 'well', 'even', 'time', 'like', 'train', 'spotter', 'talking', 'female', 'clitoris', 'easy']
['happy', 'birthday', 'hank', 'williams', 'honor', 'hank', 'turning', 'play', 'hank', 'song', 'row', 'tonite', 'honky', 'tonk']
['somebody', 'please', 'tell', 'say', 'mnf', 'preseason', 'game', 'thursday']
['tiger', 'know', 'big', 'game', 'well', 'resting', 'martinez', 'alburquerque', 'monday', 'bumping', 'verlander']
['new', 'cast', 'dwts', 'anounced', 'tonight', 'excited', 'meeting', 'tonight', 'better', 'done', 'someone', 'lend', 'phone', 'thank']
['recommend', 'turning', 'waiting', 'verlander', 'tomorrow']
['retweet', 'monday', 'monday', 'night', 'football', 'mind', 'retweet', 'l

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



### Data frame

In [9]:
pd.options.display.max_colwidth = 200
%matplotlib inline
corpus = np.array([d["anot_tokens"] for d in dataset_raw])
labels = np.array(labels)
corpus_df = pd.DataFrame({"text": corpus,
                         "label": labels})
corpus_df = corpus_df[['text', 'label']]

### Split data

In [10]:
from sklearn.model_selection import train_test_split
def split_train_validate_test(data, labels, train_valtest_ratio, validate_test_ratio, random_state=42):
    X_train, X_valtest, y_train, y_valtest = train_test_split(data, labels, test_size=train_valtest_ratio, random_state=random_state)
    X_validate, X_test, y_validate, y_test = train_test_split(X_valtest, y_valtest, test_size=validate_test_ratio, random_state=random_state)
    
    return X_train, X_validate, X_test, y_train, y_validate, y_test

### Create vocab and encode tweets

In [11]:
vocab = build_vocab([d["anot_tokens"] for d in dataset_raw])

In [12]:
encoded_data = encode_data([d["anot_tokens"] for d in dataset_raw], vocab)

# Feature extraction

## Pad encoded tweets

In [13]:
def pad_features(encoded, seq_length):
    features = np.zeros((len(encoded), seq_length), dtype = int)
    for i, review in enumerate(encoded):
        zeroes = list(np.zeros(seq_length - len(review)))
        new = zeroes + review        
        features[i,:] = np.array(new)
    
    return features

In [14]:
max_len = max(x for x in [len(d) for d in encoded_data])
features = pad_features(encoded_data, max_len)

In [15]:
x_train, x_validate, x_test, y_train, y_validate, y_test = split_train_validate_test(features, labels, 0.4, 0.5)

## Embed vocabulary

In [16]:
def embed_vocab(vocab, embedding_dim = 300):
    hits, misses = 0, 0
    embedding_matrix = np.zeros((len(vocab) + 1, embedding_dim))
    for word, i in vocab.items():
        token = nlp(word)
        if token.has_vector:
            embedding_matrix[i] = token.vector
            hits += 1
        else:
            misses += 1
    
    print("Converted %d words (%d misses)" % (hits, misses))
    return embedding_matrix

In [17]:
embedding_matrix = embed_vocab(vocab)

Converted 29252 words (0 misses)


In [None]:
load_file = open("/home/ivan/Documents/git_repos/Sentiment-Analysis-on-Twitter/data/embedding_matrix.pickle", "rb")
embedding_matrix = pickle.load(load_file)
load_file.close()

In [18]:
file = open("/home/ivan/Documents/git_repos/Sentiment-Analysis-on-Twitter/data/embedding_matrix.pickle", "wb")
pickle.dump(embedding_matrix, file)
file.close()

### Spacy glove

In [26]:
# doc_glove_vectors = np.array([nlp(str(doc)).vector for doc in corpus])

### Dump features

In [None]:
write_file = open("/home/ivan/Documents/git_repos/Sentiment-Analysis-on-Twitter/data/spacy_features.pickle", "wb")
pickle.dump(feature_array, write_file)
write_file.close()

### Load features

In [13]:
load_file = open("/home/ivan/Documents/git_repos/Sentiment-Analysis-on-Twitter/data/spacy_features.pickle", "rb")
features = pickle.load(load_file)
load_file.close()

# Model

## Split features on train, validate and test

In [56]:
x_train, x_validate, x_test, y_train, y_validate, y_test = split_train_validate_test(features, labels, 0.3, 0.5)

In [69]:
import torch
from torch.utils.data import DataLoader, TensorDataset# create Tensor datasets

train_data = TensorDataset(torch.from_numpy(x_train), torch.from_numpy(y_train))
valid_data = TensorDataset(torch.from_numpy(x_validate), torch.from_numpy(y_validate))
test_data = TensorDataset(torch.from_numpy(x_test), torch.from_numpy(y_test))

batch_size = 50

train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
valid_loader = DataLoader(valid_data, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_data, shuffle=True, batch_size=batch_size)

In [71]:
# obtain one batch of training data
# dataiter = iter(train_loader)
# sample_x, sample_y = dataiter.next()
# print('Sample input size: ', sample_x.size()) # batch_size, seq_length
# print('Sample input: \n', sample_x)
# print()
# print('Sample label size: ', sample_y.size()) # batch_size
# print('Sample label: \n', sample_y)

## Model class

In [82]:
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout, Masking, Embedding
import tensorflow as tf
model = Sequential()
m = tf.keras.metrics.Recall()
# Embedding layer
model.add(
    Embedding(input_dim=len(vocab) + 1,
              input_length =28,
              output_dim=300,
              weights=[embedding_matrix],
              trainable=True,
              mask_zero=True))

# Masking layer for pre-trained embeddings
model.add(Masking(mask_value=0.0))

# Recurrent layer
model.add(LSTM(128, return_sequences=False, 
               dropout=0.01, recurrent_dropout=0.01))

# Fully connected layer
model.add(Dense(128, activation='relu'))

# Dropout for regularization
model.add(Dropout(0.5))

# Output layer
model.add(Dense(3, activation='softmax'))

# Compile the model
model.compile(
    optimizer='adam', loss=tf.keras.losses.Hinge(reduction="auto", name="hinge"), metrics=[m])

In [23]:
from keras.callbacks import EarlyStopping, ModelCheckpoint

# Create callbacks
callbacks = [EarlyStopping(monitor='val_loss', patience=5), 
             ModelCheckpoint('../models/model.h5')]

## Training the network

In [55]:
print(len(x_test))
print(len(x_train))
print(len(x_validate))
print(len(y_test))
print(len(y_train))
print(len(y_validate))

5942
27724
5941
5942
27724
5941


In [57]:
from keras.utils import to_categorical
y_train = to_categorical(y_train)
y_test = to_categorical(y_test)
y_validate = to_categorical(y_validate)

In [83]:
model.reset_states()
model.reset_metrics()
result = list()
for i in range(10):
    history = model.fit(x_train,  y_train, 
                    batch_size=2048, epochs=150,
                    callbacks=callbacks,
                    validation_data=(x_validate, y_validate))
    result.append(dict(zip(model.metrics_names, model.evaluate(x_test, y_test))))

Train on 27724 samples, validate on 5941 samples
Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150


In [84]:
result = model.evaluate(x_test, y_test)
dict(zip(model.metrics_names, result))



{'loss': 0.9197255349528384, 'recall_12': 0.6655492186546326}