# Brexit Polarity Tweets - Text Classification

Reference for feature selection and dimensionality reduction: https://arxiv.org/pdf/1905.02845.pdf

## Setup

In [1]:
# Data Manipulation and Visualization
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from nltk.sentiment import SentimentIntensityAnalyzer

# ML Tools
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import ShuffleSplit
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, precision_score
from sklearn.metrics import ConfusionMatrixDisplay, RocCurveDisplay
from sklearn.metrics import recall_score, roc_auc_score

# ML Models
from xgboost import XGBClassifier

import tensorflow as tf
from scikeras.wrappers import KerasClassifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.gaussian_process.kernels import RBF
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# settings
N_ROWS = 1000  # `None` to import all rows
TEST_SPLIT = 0.2  # presumably the held-out fraction of the data -- TODO confirm

# paths to extracted features, grouped as (train, test) pairs
PATH_PREPROCESSED_TRAIN, PATH_PREPROCESSED_TEST = (
    "./data/preprocessed/train/",
    "./data/preprocessed/test/",
)
PATH_FEATURE_TRAIN, PATH_FEATURE_TEST = (
    "./data/features/train/",
    "./data/features/test/",
)

In [3]:
# Shared objects, created once so that individual cells do not re-instantiate them.
sia = SentimentIntensityAnalyzer()  # NLTK sentiment scorer; used through `sia.polarity_scores`
label_encoder = LabelEncoder()      # scikit-learn encoder for class labels

In [4]:
def read_tweet(filepath, encoding="utf-8"):
    """Read a text file holding one entry per line into a pandas Series.

    Parameters
    ----------
    filepath : str or path-like
        Location of the text file to read.
    encoding : str, default "utf-8"
        Text encoding used to decode the file. It is passed explicitly so the
        result does not depend on the platform's locale default, which would
        otherwise corrupt or reject non-ASCII characters on some systems.

    Returns
    -------
    pd.Series
        One element per line, in file order, with the line break removed.
        Blank lines are kept as empty strings so positions stay aligned with
        any companion file read the same way.
    """
    with open(filepath, "r", encoding=encoding) as f:
        # Iterating a text file yields each line with at most one trailing "\n".
        return pd.Series([line.rstrip("\n") for line in f])

## Baseline Model

### Sentiment score-based

In [8]:
# Load the preprocessed test tweets together with their labels.
targets = read_tweet(PATH_PREPROCESSED_TEST + "0-targets.txt")
tweets = read_tweet(PATH_PREPROCESSED_TEST + "0-clean.txt")

# Flag a tweet as positive when its compound sentiment score exceeds 0.5.
polarity = pd.Series(
    [sia.polarity_scores(text)["compound"] for text in tweets]
) > 0.5

# evaluate baseline model: positive sentiment is read as "Pro", anything else as "Anti"
y_pred = polarity.map({True: "Pro", False: "Anti"})
accuracy_score(targets, y_pred)

0.5138003163103924

### Random Forest using Sentiment Score

In [6]:
def sentiment_features(texts):
    """Build the sentiment feature table for an iterable of tweets.

    Each text is scored with `sia.polarity_scores`; the resulting dicts are
    stacked into a DataFrame with one row per text and one column per score
    key. The same function is used for the training and the test split so
    both are featurised identically.
    """
    return pd.DataFrame([sia.polarity_scores(text) for text in texts])


# fit baseline model
tweets = read_tweet(PATH_PREPROCESSED_TRAIN + "0-clean.txt")
targets = read_tweet(PATH_PREPROCESSED_TRAIN + "0-targets.txt")
polarity = sentiment_features(tweets)

# A fixed seed keeps the forest, and therefore the accuracy shown below,
# identical across kernel restarts.
baseline = RandomForestClassifier(random_state=42)
baseline.fit(polarity, targets)


# evaluate baseline model
tweets = read_tweet(PATH_PREPROCESSED_TEST + "0-clean.txt")
targets = read_tweet(PATH_PREPROCESSED_TEST + "0-targets.txt")
polarity = sentiment_features(tweets)

y_pred = baseline.predict(polarity)
accuracy_score(targets, y_pred)

0.5292332023876333

### Feature-based

In [None]:
# Training features and the labels that belong to them.
tweets = pd.read_csv(PATH_FEATURE_TRAIN + "1-clean-nostw.csv")
targets = read_tweet(PATH_PREPROCESSED_TRAIN + "0-targets.txt")

# Features and labels are read from two separate files; fail early with a
# clear message if they no longer have the same number of rows.
assert len(tweets) == len(targets), (
    f"feature rows ({len(tweets)}) and target labels ({len(targets)}) differ in length"
)

# fit baseline model (seeded so repeated runs give the same forest)
baseline = RandomForestClassifier(random_state=42)
baseline.fit(tweets, targets)

In [None]:
# Test-split labels and feature table for scoring the fitted baseline.
targets = read_tweet(PATH_PREPROCESSED_TEST + "0-targets.txt")
tweets = pd.read_csv(PATH_FEATURE_TEST + "1-clean-nostw.csv")

# evaluate baseline model: share of test tweets whose predicted label matches
y_pred = baseline.predict(tweets)
accuracy_score(y_true=targets, y_pred=y_pred)

## Feature Selection

## Model Selection

In [None]:
# Candidate classifiers for the comparison, all with library-default
# hyperparameters. Estimators whose fit involves randomness get a fixed seed
# so the cross-validated ranking is the same on every run.
models = [
    KNeighborsClassifier(),
    LogisticRegression(),
    RandomForestClassifier(random_state=42),
    GradientBoostingClassifier(random_state=42),
    AdaBoostClassifier(random_state=42),
    XGBClassifier(random_state=42),
]

In [None]:
# NOTE(review): no earlier cell defines X_train / y_train, so they are built
# here from the same training files the feature-based baseline uses. Replace
# these two assignments once the Feature Selection section produces the final
# training matrix.
X_train = pd.read_csv(PATH_FEATURE_TRAIN + "1-clean-nostw.csv")
# Labels are integer-encoded: the scikit-learn models accept either form, and
# recent XGBoost releases are understood to require numeric class labels.
y_train = label_encoder.fit_transform(
    read_tweet(PATH_PREPROCESSED_TRAIN + "0-targets.txt")
)

# a dictionary to store each algorithm performance
# (class name -> mean score over the 5 cross-validation folds)
cv_scores = {}

for model in models:
    score = cross_val_score(model, X_train, y_train, cv=5, n_jobs=-1)
    cv_scores[model.__class__.__name__] = np.mean(score)

In [None]:
# Unpack the results dictionary into two parallel arrays (same order as the dict).
model_names = np.array([name for name in cv_scores])
model_scores = np.array(
    [np.mean(fold_scores) for fold_scores in cv_scores.values()]
)

In [None]:
# Human-readable percentages, e.g. 0.5292 -> "52.92 %".
model_scores_percent = [f"{score * 100:.2f} %" for score in model_scores]

# Rank on the numeric score, not on the formatted text: sorting the strings
# would order them character by character (so "9.50 %" would outrank "52.92 %").
(
    pd.DataFrame({
        "Model Name": model_names,
        "Score": model_scores_percent,
        "_raw_score": model_scores,
    })
    .sort_values(by="_raw_score", ascending=False)
    .drop(columns="_raw_score")
    .reset_index(drop=True)
)

In [None]:
# Ascending order, so the best-scoring model ends up as the top bar.
sorting_index = np.argsort(model_scores)

fig, ax = plt.subplots()
ax.barh(y=model_names[sorting_index], width=model_scores[sorting_index])
# Label the figure so it can be read without the surrounding cells.
ax.set(
    title="Mean cross-validation score by model",
    xlabel="Mean cross-validation score",
    ylabel="Model",
)
plt.show()

## Hyperparameter Tuning

## Model Evaluation