Import libraries for preprocessing & download necessary nltk "packages" ( I don't know how else to call them )

In [30]:
import nltk
from nltk.tokenize import wordpunct_tokenize
from nltk.corpus import stopwords
from nltk.stem import *
from nltk.stem.wordnet import WordNetLemmatizer
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.model_selection import train_test_split

nltk.download("punkt")
nltk.download("stopwords")
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

Define function to be applied to the dataframe for preprocessing.
We go through the usual flow of analyzing the text, tokenizing it and normalizing it in this function.

In [31]:
def tokenize(full_string, stop_words, normalizer):
    # I removed all non-ascii characters because the dataset had a lot of character which were probably emojis
    # but were not translated properly. They would not have brought any sort of benefits.
    ascii_full_string = full_string.encode("ascii", "ignore").decode()
    # Tokenization with punkt
    tokenized_string = wordpunct_tokenize(ascii_full_string)
    # Removal of stop words which would not bring any benefit
    tokens_without_stopwords = [word for word in tokenized_string if word not in stop_words]
    # Depending on the normalization type the preprocessing part either uses a stemmer or a lemmatizer
    if type(normalizer) == nltk.stem.porter.PorterStemmer:
        singles = [normalizer.stem(token) for token in tokens_without_stopwords]
    if type(normalizer) == nltk.stem.wordnet.WordNetLemmatizer:
        singles = [normalizer.lemmatize(token) for token in tokens_without_stopwords]
    # Join the string for further processing
    final_string = (' ').join(singles)
    return final_string

Read & preprocess the data.
The first step is to read the csv from a local file. 
After that, during the data exploration part of the project, I noticed that there is a lack of balance in terms of label representation. ( ~29300 labels characterized as not racist - 2100 labels characetized as racist )
Despite this I still trained the model with this initial dataframe, but the results were very skewed towards the label which was better represented (obviously).
This is why I chose to pick the same number of records from label 0 as the ones from label 1.

In [32]:
df = pd.read_csv(".//train.csv", header = 'infer', sep = ",", index_col = "id")
class_1_df = df[df["label"] == 1]
class_0_df = df[df["label"] == 0].sample(frac = 1)[ : len(class_1_df)]
df = pd.concat([class_0_df, class_1_df]).sample(frac = 1)

class_col = df.pop("label")

Define global variables for future uses.

In [33]:
stop_words = stopwords.words("english")
normalizer_hash = {
    "stemmer": PorterStemmer(),
    "lemmatizer": WordNetLemmatizer()
    }
vectorizer_hash = {
    "tfidf": TfidfVectorizer(),
    "count": CountVectorizer()
    }

Use the tokenize() function and then go forward in the flow by vectorizing the manipulated strings and selecting the best features for the job.

In [34]:
def preprocess_dataframe(df_input, normalizer, vectorizer):
    df = df_input.copy()
    # Initial tokenization step
    df["tweet"] = [tokenize(row, stop_words, normalizer_hash[normalizer]) for row in df["tweet"]]

    # Initial vectorizer result
    x = vectorizer_hash[vectorizer].fit_transform(df["tweet"])
    # Choosing the best 50 values
    ch2 = SelectKBest(chi2, k = 50)
    ch2.fit(x, class_col)
    # Mask containing the features I should be using in the model
    mask = ch2.get_support()

    # Going from text to a list of numbers
    df["tweet"] = list(x.toarray())
    # Choosing what specific numbers to retain from that list
    df["tweet"] = [ row[mask] for row in df["tweet"] ]
    # Create new feature names
    column_names = ["feature" + str(i) for i in range(len(df["tweet"].iloc[1]))]
    # Go from df with a column containing a list to a df with multiple features.
    df_preprocessed = pd.DataFrame(df.tweet.to_list(), columns = column_names, index = df.index)

    # Split the data for testing
    X_train, X_test, y_train, y_test = train_test_split(df_preprocessed, class_col, test_size=0.2, random_state=9, stratify=class_col)
    return [X_train, X_test, y_train, y_test]

Choose one of the four preprocessing setups from below and run the respective cell.

Preprocess data with Stemming and CountVectorizer

In [36]:
X_train, X_test, y_train, y_test = preprocess_dataframe(df, "stemmer", "count")

Preprocess data with Stemming and TF-IDF

In [None]:
X_train, X_test, y_train, y_test = preprocess_dataframe(df, "stemmer", "tfidf")

Preprocess data with Lemmatizer and CountVectorizer

In [None]:
X_train, X_test, y_train, y_test = preprocess_dataframe(df, "lemmatizer", "count")

Preprocess data with Lemmatizer and TF-IDF

In [None]:
X_train, X_test, y_train, y_test = preprocess_dataframe(df, "lemmatizer", "tfidf")

Define model evaluation metrics & import necessary libraries.

In [37]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import GaussianNB

def metrics(y_test, y_pred, alg):
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    specificity = tn / (tn + fp)    
    sensitivity = tp / (tp + fn)
    accuracy = (tp+tn) / (tp + fp + tn + fn)
    print("============== {} ==============".format(alg))
    print("Accuracy has a value of {}".format(accuracy))
    print("Specificity has a value of {}".format(specificity))
    print("Sensitivity has a value of {}".format(sensitivity))
    print( pd.DataFrame(
        confusion_matrix(y_test, y_pred), 
        columns = ['Prediction 0', 'Prediction 1'],
        index = ['True 0', 'True 1']))
    print(" ")

Define a simple SVM model.
Given that SVC automatically gets an rbf kernel, other input parameters except C are not applicable.
Therefore, I chose to go with a crossvalidation-trained model with a grid search over parameter C.

In [38]:
svm = SVC(gamma = 'auto', probability = True)
parameters = {
            'C': [5, 10, 15]
}
clf = GridSearchCV(svm, parameters, cv=5)
clf.fit(X_train, y_train)
y_pred_svm = clf.predict(X_test)

metrics(y_test, y_pred_svm, "svm")

Accuracy has a value of 0.7447045707915273
Specificity has a value of 0.8530066815144766
Sensitivity has a value of 0.6361607142857143
        Prediction 0  Prediction 1
True 0           383            66
True 1           163           285
 


Define a simple Naive Bayes model.

In [39]:
gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred_nb = gnb.predict(X_test)

metrics(y_test, y_pred_nb, "naive bayes")

Accuracy has a value of 0.6867335562987736
Specificity has a value of 0.42538975501113585
Sensitivity has a value of 0.9486607142857143
        Prediction 0  Prediction 1
True 0           191           258
True 1            23           425
 


Import necessary libraries for a simple MLP model.

In [40]:
from torch.nn import ReLU
from torch.nn import Module
from torch.nn import Sigmoid
from torch.nn import Linear
from torch import Tensor
from torch.utils.data import *
from torch import LongTensor
from torch.optim import SGD
from torch.nn import BCELoss
from torch.nn.init import xavier_uniform_
import numpy as np

Define the neural network class.
Given that MLPs can also be used for classification I chose to also try this model just as an experiment.
It's created of 3 hidden layers, all of their weights being initialized with a Xavier Uniform initialization.
The 3 hidden layers have 25, 25 and 10 neurons each, with a ReLU activation function.
The input layer has 50 neurons (given that I chose to use 50 features).
The output layer has a single neuron computed using a sigmoid activation function. Then the result is rounded to either 0 or 1.

In [41]:
class MLP(Module):
    def __init__(self):
        super().__init__()
        self.learning_rate = 1e-3
        self.hidden_layer1 = Linear(50, 25)
        xavier_uniform_(self.hidden_layer1.weight)
        self.act1 = ReLU()
        self.hidden_layer2 = Linear(25, 25)
        xavier_uniform_(self.hidden_layer2.weight)
        self.act2 = ReLU()
        self.hidden_layer3 = Linear(25, 10)
        xavier_uniform_(self.hidden_layer3.weight)
        self.act3 = ReLU()
        self.hidden_layer4 = Linear(10, 1)
        xavier_uniform_(self.hidden_layer3.weight)
        self.act4 = Sigmoid()
        
    def forward(self, data):
        data = self.hidden_layer1(data)
        data = self.act1(data)
        data = self.hidden_layer2(data)
        data = self.act2(data)
        data = self.hidden_layer3(data)
        data = self.act3(data)
        data = self.hidden_layer4(data)
        data = self.act4(data)
        return data

Reshape input data, create model, train model, predict the results and evaluate them.

In [42]:
train = TensorDataset(Tensor(np.array(X_train)), LongTensor(np.array(y_train)))
train_dl = DataLoader(train, batch_size = 16)

mlp = MLP()    
optimizer = SGD(mlp.parameters(), lr = 1e-3)
criterion = BCELoss()

# training
for epoch in range(100):
    for i, data in enumerate(train_dl, 0):
        inputs, targets = data
        optimizer.zero_grad()
        y_pred = mlp(inputs)
        targets = targets.unsqueeze(1)
        targets = targets.float()
        loss = criterion(y_pred, targets)
        loss.backward()
        optimizer.step()

# ugly testing
y_pred_mlp = []
for _, item in X_test.iterrows():
    if type(item) == str:
        continue
    y_pred_mlp.append(np.round(mlp(Tensor(np.array(item)))[0].item()))

metrics(y_test, y_pred_mlp, "mlp")

Accuracy has a value of 0.633221850613155
Specificity has a value of 0.5924276169265034
Sensitivity has a value of 0.6741071428571429
        Prediction 0  Prediction 1
True 0           266           183
True 1           146           302
 
