# Brexit Polarity Tweets - Deep Learning

## Setup

In [1]:
import re
import os
import string
import pickle

# Data Manipulation and Visualization
import numpy as np
import pandas as pd

# NLP Tools
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# ML Tools
from sklearn.model_selection import ShuffleSplit
from sklearn.preprocessing import LabelEncoder

# TensorFlow
import tensorflow as tf
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D
from tensorflow.keras.layers import Bidirectional, LSTM, GRU
from tensorflow.keras.layers import SimpleRNN, Dense, Dropout

In [2]:
# path to data
PATH_DATA         = "./data/"
PATH_TWEETS_TOKEN = PATH_DATA + "preprocessed/tweets_token.csv"
PATH_TARGETS      = PATH_DATA + "preprocessed/targets.csv"
PATH_TWEETS_RAW   = PATH_DATA + "preprocessed/tweets_raw.csv"
PATH_GLOVE        = PATH_DATA + 'embeddings/glove.6B.100d.txt'

# paths to model checkpoints
PATH_MODEL      = "./model/"
PATH_HISTORY    = PATH_MODEL + "history/history.pkl"
PATH_CHECKPOINT = PATH_MODEL + "checkpoint/cp-{epoch:02d}.ckpt"

DIRNAME_CHECKPOINT = os.path.dirname(PATH_CHECKPOINT)

# settings
N_WORDS        = 30_000
N_ROWS         = 100_000
EMBEDDING_DIMS = 100
TEST_SPLIT     = 0.10
VAL_SPLIT      = 0.10 / 0.90

In [3]:
label_encoder = LabelEncoder()

# 2. Data Preparation

In [4]:
from ast import literal_eval

tweet_tokens = pd.read_csv(PATH_TWEETS_TOKEN, index_col= False)["Hit Sentence"]
tweet_tokens = tweet_tokens.map(literal_eval)

targets = pd.read_csv(PATH_TARGETS)["0"]

In [5]:
# generate indices to split dataset
sss = ShuffleSplit(n_splits = 1, test_size = TEST_SPLIT, random_state = 123)
sss.get_n_splits(tweet_tokens, targets)

train_index, test_index = next(sss.split(tweet_tokens, targets))

print(f"There are {len(train_index)} tweets in the train dataset.")
print(f"There are {len(test_index)} tweets in the test dataset.")

There are 89559 tweets in the train dataset.
There are 9951 tweets in the test dataset.


In [6]:
# generate indices to split dataset
sss = ShuffleSplit(n_splits = 1, test_size = VAL_SPLIT, random_state = 123)
sss.get_n_splits(tweet_tokens[train_index], targets[train_index])

part_train_index, val_index = next(sss.split(tweet_tokens[train_index], targets[train_index]))

print(f"There are {len(part_train_index)} tweets in the train dataset.")
print(f"There are {len(val_index)} tweets in the test dataset.")

There are 79608 tweets in the train dataset.
There are 9951 tweets in the test dataset.


In [7]:
def process_tweet(text):
    output = []
    lemmatizer = nltk.WordNetLemmatizer()
    tokenizer  = nltk.WhitespaceTokenizer()
    unwanted_tokens = nltk.corpus.stopwords.words("english")
    
    patterns = [
        r"(?:RT|QT):? ?@[\w]+:?",
        r"https?://\S+",
        r"<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});",
        r"[$|£]?.?[0-9]+(?:,?[0-9]{3})*(?:\.[0-9]*)*%?(?:st|nd|rd|th)?",
        r"[0-9]+/[0-9]+",
        u"\U0001F600-\U0001F64F",  # emoticons
        u"\U0001F300-\U0001F5FF",  # symbols & pictographs
        u"\U0001F680-\U0001F6FF",  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF",  # flags (iOS)
        u"\U00002500-\U00002BEF",  # chinese char
        u"\U00002702-\U000027B0",
        u"\U000024C2-\U0001F251",
        u"\U0001f926-\U0001f937",
        u"\U00010000-\U0010ffff",
        u"\u2640-\u2642",
        u"\u2600-\u2B55",
        u"\u200d",
        u"\u23cf",
        u"\u23e9",
        u"\u231a",
        u"\ufe0f",  # dingbats
        u"\u3030",
        "[" + string.punctuation + "]"
    ]
    
    pattern = "(" + "|".join(patterns)+ ")"
    
    text = re.sub(pattern, "", text)
    text = re.sub(u"[\u2018|\u2019]", "'", text)
    text = re.sub(u"[\u201c|\u201d]", "\"", text)
    
    for token in tokenizer.tokenize(text.lower()):
        if token in unwanted_tokens: continue
        token = lemmatizer.lemmatize(token)
        
        if len(token) != 0:
            output.append(token)

    return " ".join(output)

In [8]:
tweet_raw = pd.read_csv(PATH_TWEETS_RAW, index_col= False)["Hit Sentence"]
tweet_tokens = tweet_raw.map(process_tweet)
tweet_tokens

0        value brexit tory mp back constituency weekend...
1        michael fabricant absolutely right dangerous e...
2        stokiedre find rich tory voter assume unionist...
3        brexit britain win london named best city youn...
4        afneil johnson taken tory likelihood imposed p...
                               ...                        
99505    mikegalsworthy brexiteers never interested fac...
99506    brexit disaster often obscured covid dover lor...
99507    trying shift blame yet position caused greedy ...
99508      remember rees mogg said brexit mean lower price
99509    reporting observer guardian journalist carole ...
Name: Hit Sentence, Length: 99510, dtype: object

In [9]:
fd = nltk.FreqDist()
for token_list in tweet_tokens:
    for token in token_list.split(" "):
        fd[token] += 1
        
fd.most_common(5)

[('brexit', 57016),
 ('eu', 35752),
 ('uk', 19254),
 ('boris', 13110),
 ('people', 10829)]

In [10]:
vocab_size = 10000
len_oov    = 1
len_seq    = 100

vocab   = [token for token, count in fd.most_common(vocab_size)]
values  = range(2, len(vocab) + 2)

init = tf.lookup.KeyValueTensorInitializer(vocab,
                                           values,
                                           key_dtype = tf.string,
                                           value_dtype = tf.int64)

vocab_table = tf.lookup.StaticVocabularyTable(init, len_oov)

In [19]:
preprocess_layer = tf.keras.layers.TextVectorization(
    max_tokens  = vocab_size + 2,
    output_mode = 'int',
    vocabulary  = vocab,
    output_sequence_length = len_seq)

In [25]:
embedding_dim = 100
vocab_size = len(vocab) + 2
model = tf.keras.models.Sequential([
    Embedding(vocab_size, embedding_dim, input_shape = (100,), name="embedding"),
    tf.keras.layers.GlobalAveragePooling1D(),
    Dense(16, activation='relu'),
    Dense(1, activation = "sigmoid")
])

In [32]:
X_train = preprocess_layer(tweet_tokens[train_index])
X_val   = preprocess_layer(tweet_tokens[val_index])
X_test  = preprocess_layer(tweet_tokens[test_index])

In [33]:
targets = label_encoder.fit_transform(targets)
y_train = targets[train_index]
y_test  = targets[test_index]
y_val   = targets[val_index]

In [28]:
model.compile(
    optimizer='adam',
    loss="BinaryCrossentropy",
    metrics=['accuracy'])

In [30]:
model.fit(X_train, y_train, validation_data = (X_val, y_val), epochs = 10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x19c8bf8b790>

In [36]:
model.evaluate(X_test, y_test)



[0.3077773451805115, 0.8881519436836243]