In [10]:
import re
import nltk
import string

# Data Manipulation and Visualization
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# NLP Tools
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer
from nltk.sentiment.util import mark_negation

# ML Tools
from sklearn.model_selection import ShuffleSplit
from sklearn.preprocessing import LabelEncoder

In [11]:
# Shared NLP helper objects, created once for the whole notebook.
# `tokenizer` and `lemmatizer` are used by the text-processing functions below.
tokenizer = TweetTokenizer()
lemmatizer = WordNetLemmatizer()
label_encoder = LabelEncoder()

# specify unwanted tokens to be filtered in cleaning process
def get_stopwords():
    """Return the English NLTK stopwords followed by their lemmatized forms.

    The result is a plain list: first every stopword lowercased, then the
    output of `lemmatizer.lemmatize` for each of those words (duplicates are
    not removed).
    """
    lowered = []
    for word in nltk.corpus.stopwords.words('english'):
        lowered.append(word.lower())

    lemmas = list(map(lemmatizer.lemmatize, lowered))
    return lowered + lemmas

def get_punctuations(words=None):
    """Build the list of tokens that `clean_tweet` filters out.

    The returned list is, in order: every word of `words` with ASCII
    punctuation removed, a handful of extra symbols, and each character of
    `string.punctuation`.

    Parameters
    ----------
    words : iterable of str, optional
        Words to strip punctuation from. Defaults to the module-level
        `stopwords` list, which must already be defined when this function
        is called without an argument.

    Returns
    -------
    list of str
    """
    if words is None:
        words = stopwords

    # str.translate deletes every punctuation character literally. A character
    # class built from the raw string skipped the backslash, because `\]` was
    # parsed as an escaped bracket.
    strip_table = str.maketrans("", "", string.punctuation)
    no_punctuations_stopwords = [word.translate(strip_table) for word in words]

    # NOTE(review): words that contain no punctuation pass through unchanged,
    # so with the default argument every plain stopword is part of this list
    # too and is dropped wherever the list is used as a filter. Confirm that
    # this is intended.
    custom = ["…", "...", "..", "\xad", "–"]
    punctuations = list(string.punctuation)
    return no_punctuations_stopwords + custom + punctuations

# Order matters: get_punctuations() reads the module-level `stopwords`,
# so `stopwords` has to be assigned before it is called.
stopwords = get_stopwords()
# Tokens removed by clean_tweet().
punctuations = get_punctuations()

In [12]:
# --- run configuration ---
# Total row budget shared by both raw CSV files; `None` imports all rows.
N_ROWS = None
# Fraction of the tweets held out for the test set.
TEST_SPLIT = 0.2

# --- file locations, all rooted at PATH_DATA ---
PATH_DATA = "./data/"
PATH_TRAIN = f"{PATH_DATA}preprocessed/train/"
PATH_TEST = f"{PATH_DATA}preprocessed/test/"
PATH_ANTI = f"{PATH_DATA}raw/TweetDataset_AntiBrexit_Jan-Mar2022.csv"
PATH_PRO = f"{PATH_DATA}raw/TweetDataset_ProBrexit_Jan-Mar2022.csv"

In [13]:
# import both pro and anti tweets data
# N_ROWS is the budget for both files together, so each file gets half of it
nrows = None if N_ROWS is None else N_ROWS // 2

# Only the "Hit Sentence" column is used, so `usecols` skips parsing the others.
# Duplicates are dropped within each file separately; a tweet present in both
# files is kept once per label.
pro  = pd.read_csv(PATH_PRO, nrows = nrows, usecols = ["Hit Sentence"])["Hit Sentence"].drop_duplicates()
anti = pd.read_csv(PATH_ANTI, nrows = nrows, usecols = ["Hit Sentence"])["Hit Sentence"].drop_duplicates()

# combine all tweets data and create targets variable
# (labels are repeated in the same order the two series are concatenated)
tweets = pd.concat([pro, anti]).reset_index(drop = True)
targets = pd.Series(np.repeat(["Pro", "Anti"], [len(pro), len(anti)]))
assert len(tweets) == len(targets), "tweets and targets must stay aligned"

# count the total number of tweets
tweets_length = len(pro) + len(anti)

print(f"There are {len(pro)} tweets from users who support Brexit")
print(f"There are {len(anti)} tweets from users who doesn't support Brexit")

# delete unused variables
del pro, anti

There are 94652 tweets from users who support Brexit
There are 102442 tweets from users who doesn't support Brexit


In [14]:
# The patterns below are compiled once, when this cell runs, instead of on
# every call of clean_tweet().

# Optional "RT"/"QT" prefix followed by an @mention and an optional colon
_QT_RT_RE = re.compile(r'(RT|QT)? ?@[\w]+:?')

# URLs
_URL_RE = re.compile(r'https?://\S+')

# HTML tags and entities
# source of regex: https://stackoverflow.com/a/12982689
_HTML_RE = re.compile(r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')

# Numbers and statistics (currency sign, thousands separators, decimals,
# percent sign, ordinal suffix). Raw string so that `\.` is a regex escape and
# not an invalid Python string escape.
# NOTE(review): the `.?` lets the match swallow one arbitrary character in
# front of the digits (e.g. the letter right before a number). Kept as-is;
# confirm this is intended.
_STATS_RE = re.compile(r"[$£]?.?[0-9]+(?:,?[0-9]{3})*(?:\.[0-9]*)*%?(?:st|nd|rd|th)?")

# Emojis
# Source of regex: https://stackoverflow.com/a/58356570
# NOTE(review): the range U+24C2..U+1F251 spans far more than emoji (it
# includes, for example, the CJK and Hangul blocks), so text in those scripts
# is removed as well.
_EMOJI_RE = re.compile("["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "\U00002500-\U00002BEF"
    "\U00002702-\U000027B0"
    "\U000024C2-\U0001F251"
    "\U0001f926-\U0001f937"
    "\U00010000-\U0010ffff"
    "\u2640-\u2642"
    "\u2600-\u2B55"
    "\u200d"  # zero width joiner
    "\u23cf"
    "\u23e9"
    "\u231a"
    "\ufe0f"  # variation selector-16
    "\u3030"
    "]+", flags = re.UNICODE)

# Unicode quotation marks. Inside a character class `|` is a literal pipe, not
# an alternation, so it must not appear here (it used to turn every "|" in a
# tweet into an apostrophe).
_SINGLE_QUOTE_RE = re.compile("[\u2018\u2019]", flags = re.UNICODE)
_DOUBLE_QUOTE_RE = re.compile("[\u201c\u201d]", flags = re.UNICODE)

# Runs of spaces left behind by the removals above
_SPACES_RE = re.compile(" +")


def clean_tweet(tweet):
    """Strip noise from a raw tweet and return it as lowercase tokens.

    Removed, in this order: RT/QT prefixes with @mentions, URLs, HTML tags and
    entities, numbers and statistics, emojis. Unicode quotation marks are then
    replaced by their ASCII counterparts and repeated spaces are collapsed.
    The result is tokenized with the module-level `tokenizer`, lowercased, and
    every token found in the module-level `punctuations` list is dropped.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        The remaining tokens joined by single spaces (may be empty).
    """
    tweet = _QT_RT_RE.sub("", tweet)
    tweet = _URL_RE.sub("", tweet)
    tweet = _HTML_RE.sub("", tweet)
    tweet = _STATS_RE.sub("", tweet)
    tweet = _EMOJI_RE.sub("", tweet)
    tweet = _SINGLE_QUOTE_RE.sub("'", tweet)
    tweet = _DOUBLE_QUOTE_RE.sub("\"", tweet)
    tweet = _SPACES_RE.sub(" ", tweet).strip()

    # Lowercase each token once, then filter on the lowercase form
    lowered = (t.lower() for t in tokenizer.tokenize(tweet))
    tokens = [t for t in lowered if t not in punctuations]

    return " ".join(tokens)

def remove_stopwords(tweet):
    """Tokenize `tweet`, lowercase each token and drop those listed in `stopwords`.

    Returns the surviving tokens joined by single spaces.
    """
    kept = []
    for token in tokenizer.tokenize(tweet):
        lowered = token.lower()
        if lowered in stopwords:
            continue
        kept.append(lowered)
    return " ".join(kept)

def lemmatize(tweet):
    """Tokenize `tweet` and replace every token by `lemmatizer.lemmatize(token)`.

    Returns the lemmatized tokens joined by single spaces.
    """
    tokens = tokenizer.tokenize(tweet)
    lemmas = map(lemmatizer.lemmatize, tokens)
    return " ".join(lemmas)

def mark_negation(tweet):
    """Tokenize `tweet`, run NLTK's negation marker on the tokens and re-join them.

    This function has the same name as the `mark_negation` imported from
    `nltk.sentiment.util` and replaces it in the notebook namespace, which is
    why the NLTK function is reached through its full module path here.
    """
    tokens = tokenizer.tokenize(tweet)
    marked_tokens = nltk.sentiment.util.mark_negation(tokens)
    return " ".join(marked_tokens)

In [15]:
# Build every preprocessing variant. Each line feeds on the result of an
# earlier one, so the order must be kept.
# NOTE: `tweets` is rebound to its cleaned version here; re-running this cell
# without re-running the loading cell cleans the already-cleaned text again.
tweets = tweets.apply(clean_tweet)
# one extra step on top of the cleaned text
tweets_nostw = tweets.apply(remove_stopwords)
tweets_lemma = tweets.apply(lemmatize)
tweets_negat = tweets.apply(mark_negation)
# two extra steps
tweets_nostw_lemma = tweets_nostw.apply(lemmatize)
tweets_lemma_negat = tweets_lemma.apply(mark_negation)
tweets_nostw_negat = tweets_nostw.apply(mark_negation)
# all three extra steps
tweets_nostw_lemma_negat = tweets_nostw_lemma.apply(mark_negation)

In [16]:
# A row is kept only when its most heavily processed variant is non-empty
non_empty_tweets = tweets_nostw_lemma_negat.map(len).gt(0)

def _keep_non_empty(series):
    """Apply the `non_empty_tweets` mask and renumber the index from 0."""
    return series[non_empty_tweets].reset_index(drop = True)

# the same mask is applied to the labels and to every variant so they stay aligned
targets = _keep_non_empty(targets)
tweets = _keep_non_empty(tweets)
tweets_nostw = _keep_non_empty(tweets_nostw)
tweets_lemma = _keep_non_empty(tweets_lemma)
tweets_negat = _keep_non_empty(tweets_negat)
tweets_nostw_lemma = _keep_non_empty(tweets_nostw_lemma)
tweets_lemma_negat = _keep_non_empty(tweets_lemma_negat)
tweets_nostw_negat = _keep_non_empty(tweets_nostw_negat)
tweets_nostw_lemma_negat = _keep_non_empty(tweets_nostw_lemma_negat)

# report how many tweets were dropped, then refresh the running total
print(
    f"Deleted {tweets_length - len(tweets)} out of {tweets_length} tweets.",
    f"There are {len(tweets)} tweets left.",
)
tweets_length = len(tweets)

Deleted 1087 out of 197094 tweets. There are 196007 tweets left.


In [17]:
# generate indices to split dataset
# NOTE(review): despite its name, `sss` is a plain ShuffleSplit and not a
# StratifiedShuffleSplit, so the Pro/Anti ratio is not enforced in the two
# parts. Confirm whether stratification was intended.
sss = ShuffleSplit(n_splits = 1, test_size = TEST_SPLIT, random_state = 123)

# n_splits is 1, so the generator yields exactly one (train, test) pair
train_index, test_index = next(sss.split(tweets, targets))

print(f"There are {len(train_index)} tweets in the train dataset.")
print(f"There are {len(test_index)} tweets in the test dataset.")

There are 156805 tweets in the train dataset.
There are 39202 tweets in the test dataset.


In [18]:
def save_tweets(tweets, filepath):
    """Write one item per line to a UTF-8 text file, replacing any existing file.

    Parameters
    ----------
    tweets : iterable of str
        Lines to write; a newline is appended to each item.
    filepath : str
        Destination file. Missing parent directories are created.
    """
    import os  # local import: this is the only place in the notebook that needs it

    parent = os.path.dirname(filepath)
    if parent:
        os.makedirs(parent, exist_ok = True)

    # Explicit encoding: the platform default is not UTF-8 everywhere and can
    # fail on the non-ASCII characters that tweets contain.
    with open(filepath, "w", encoding = "utf-8") as f:
        f.writelines(tweet + "\n" for tweet in tweets)

# output file name -> series to write (insertion order is the write order)
_outputs = {
    "0-targets.txt": targets,
    "0-clean.txt": tweets,
    "1-clean-nostw.txt": tweets_nostw,
    "1-clean-lemma.txt": tweets_lemma,
    "1-clean-negat.txt": tweets_negat,
    "2-clean-nostw-lemma.txt": tweets_nostw_lemma,
    "2-clean-lemma-negat.txt": tweets_lemma_negat,
    "2-clean-nostw-negat.txt": tweets_nostw_negat,
    "3-clean-nostw-lemma-negat.txt": tweets_nostw_lemma_negat,
}

# write the train files first, then the test files, one file per variant
for _folder, _rows in ((PATH_TRAIN, train_index), (PATH_TEST, test_index)):
    for _filename, _series in _outputs.items():
        save_tweets(_series[_rows], _folder + _filename)