In [9]:
# Mount Google Drive at /content/drive so that files stored there can be
# addressed by path from the cells in this notebook.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# **Imports**

In [10]:
!pip install num2words
!pip install nltk

import re
import pandas as pd
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from num2words import num2words
import nltk
nltk.download('stopwords')
nltk.download('wordnet')


from transformers import AutoTokenizer
import torch
import torch.nn.functional as F
from sklearn.metrics import accuracy_score, confusion_matrix
from transformers import AutoModelForSequenceClassification



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [11]:
# Read the validation CSV from the mounted Drive folder into `data`.
data = pd.read_csv('/content/drive/My Drive/Bachelorprojekt/val_data_no_stopwords.csv')

# **Text-preprocessing**

The `Data_pre_processing` function takes a DataFrame as input, which must contain at least two columns: `labels` (used to calculate the score) and `tweet` (containing the raw text). It cleans the text (removing links, e-mail addresses, user tags and other unwanted content) into a new column called `clean`, then removes stopwords and lemmatizes the remaining words into a new column called `no stopwords`. Additionally, it adds a `score` column (the number of `Y` entries in `labels`) and a binary `class` column (1 if the score is at least 3, otherwise 0). The function returns the processed DataFrame, ready for further analysis or model evaluation.

In [12]:
def Data_pre_processing(df):
    """Clean tweet text and derive the binary target column.

    The columns listed below are added to ``df`` itself (the caller's frame
    is modified), and the frame is then returned after dropping rows whose
    ``no stopwords`` value is missing.

    * ``score``        -- number of ``'Y'`` entries in ``labels``
    * ``class``        -- 1 when ``score >= 3``, otherwise 0
    * ``clean``        -- lower-cased tweet with links, e-mail addresses,
      @-tags, most punctuation and non-ASCII characters removed,
      contractions expanded and numbers spelled out
    * ``no stopwords`` -- ``clean`` without English stopwords (the words
      "but", "nor", "not" and "against" are kept) and with the remaining
      words lemmatized

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``labels`` and ``tweet``.

    Returns
    -------
    pandas.DataFrame
        The frame with the four columns above.
    """

    def calc_score(labels):
        # Count how many entries of the label sequence are 'Y'.
        return sum(1 for flag in labels if flag == 'Y')

    df["score"] = df["labels"].apply(calc_score)

    # A tweet belongs to the positive class when at least three labels are 'Y'.
    df["class"] = (df["score"] >= 3).astype(int)

    # Regex patterns for removing unwanted content
    regex_link = r"http(s?)://(([a-z]|[A-Z]|\d)+\.)+([a-z]|[A-Z]|\d)+/?(([a-z]|[A-Z]|\d)+/?)*"
    regex_email = r"([a-z]|[A-Z]|\d)+\.([a-z]|[A-Z]|\d)+\@([a-z]|[A-Z]|\d)+\.([a-z]|[A-Z]|\d)+"
    regex_tag = r"@([a-z]|[A-Z]|_|\d)+"

    # Ordered substitution pipeline. The order matters: e.g. "can't" has to be
    # expanded before the generic "n't" rule, and links have to be removed
    # before ':' and '.' are turned into spaces.
    substitutions = [
        (regex_link, ""),
        (regex_email, ""),
        (regex_tag, ""),
        (r",", " "),
        (r":", " "),
        (r"\.", " "),
        (r"(ain\'t)|(ain\’t)", " is not"),
        # "cant" is only expanded as a whole word; without the word boundaries
        # it also matched inside words such as "significant" or "vacant".
        (r"(can\'t)|(can\’t)|(\bcant\b)", " can not"),
        (r"(n\'t)|(n\’t)", " not"),
        (r"(\'s)|(\’s)", " is"),
        (r"(\'d)|(\’d)", " would"),
        (r"(\'m)|(\’m)", " am"),
        (r"(\'re)|(\’re)", " are"),
        (r"(\'ve)|(\’ve)", " have"),
        (r"(\'ll)|(\’ll)", " will"),
        (r"&amp;", " and "),
        (r"&gt;", " greater than "),
        (r"&lt;", " less than "),
        (r"\su\s", " you "),
        (r"\'|\||\’|\‘", ""),
        (r"\s\s+", " "),
        (r"\"", ""),
        # Remove non-ASCII characters (like emojis)
        (r'[^\x00-\x7F]+', ''),
        # Delete every run of four consecutive digits (note: this also
        # consumes the first four digits of longer numbers) ...
        (r"\d\d\d\d", ""),
        # ... and spell out whatever digits remain.
        (r"\d+", lambda match: num2words(match.group())),
        # Keep only allowed characters
        (r"[^a-z|\!|\#|\_|\?|\s|\-]", ""),
    ]
    compiled_steps = [(re.compile(pattern), repl) for pattern, repl in substitutions]

    def remove_unwanted(tweet):
        # Missing tweets (NaN/None) are treated as empty text instead of
        # raising AttributeError on ``.lower()``.
        if not isinstance(tweet, str):
            tweet = "" if pd.isna(tweet) else str(tweet)

        cleaned_text = tweet.lower()
        for pattern, repl in compiled_steps:
            cleaned_text = pattern.sub(repl, cleaned_text)
        return cleaned_text

    # Column-wise apply always yields a Series, also for an empty frame.
    df["clean"] = df["tweet"].apply(remove_unwanted)

    # Initialize lemmatizer and stopwords; negation/contrast words are kept.
    lemmatizer = WordNetLemmatizer()
    STOPWORDS = set(stopwords.words('english')) - {"but", "nor", "not", "against"}

    def remove_stopwords(text):
        # Drop stopwords, lemmatize what is left and re-join with single spaces.
        kept = [lemmatizer.lemmatize(word) for word in text.split() if word not in STOPWORDS]
        return " ".join(kept)

    df["no stopwords"] = df["clean"].apply(remove_stopwords)

    # Drop rows where 'no stopwords' is NaN
    df = df.dropna(subset=['no stopwords'])

    return df

# **Text Encoding**

The `Text_encoding` function prepares a DataFrame for input into a trained model by converting the preprocessed texts into numerical tokens. It requires a DataFrame containing a column named `no stopwords`, which holds the cleaned and preprocessed text, and optionally a tokenizer compatible with the trained model (by default the `cardiffnlp/twitter-roberta-base-hate` tokenizer is used). It extracts the text from the `no stopwords` column and uses the tokenizer to produce input IDs and attention masks as PyTorch tensors, truncated to at most 512 tokens and padded to a common length. The output is the tokenizer's dictionary-like encoding, ready for use in the model.

In [13]:
def Text_encoding(df, tokenizer=None):
    """Tokenize the ``no stopwords`` column of ``df`` for the model.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the column ``no stopwords``.
    tokenizer : callable, optional
        Tokenizer to use. When omitted, the
        ``cardiffnlp/twitter-roberta-base-hate`` tokenizer is loaded on the
        first call and reused on later calls.

    Returns
    -------
    The object returned by the tokenizer, requested as PyTorch tensors
    (``return_tensors="pt"``) with truncation to 512 tokens and padding.
    """
    if tokenizer is None:
        # Load lazily instead of in the signature: a default argument is
        # evaluated once at definition time, which triggered the download
        # even when the caller supplied their own tokenizer.
        tokenizer = getattr(Text_encoding, "_default_tokenizer", None)
        if tokenizer is None:
            tokenizer = AutoTokenizer.from_pretrained('cardiffnlp/twitter-roberta-base-hate')
            Text_encoding._default_tokenizer = tokenizer

    # Extract the preprocessed texts; str() guards against non-string cells.
    texts = [str(text) for text in df["no stopwords"].tolist()]

    # Tokenize the texts
    return tokenizer(
        texts,
        truncation=True,
        padding=True,
        max_length=512,
        return_tensors="pt"  # Returns PyTorch tensors
    )

# **Helper function**

In [None]:
def Helper_function():
    """Placeholder without any behaviour yet; calling it returns None."""
    return None

# **Model Prediction**

This method performs predictions using the trained model and calculates the accuracy and confusion matrix for the given set of tokenized data. It requires a trained model (e.g., a Transformer model), a dictionary containing tokenized input data (e.g., 'input_ids' and 'attention_mask'), and the true labels for evaluation. An optional classification threshold (default is 0.5 for binary classification) can be provided. The method returns a dictionary containing the accuracy and confusion matrix.

In [14]:
def model_prediction(model, encodings, labels, threshold=0.5, batch_size=32):
    """Predict classes for tokenized texts and score them against ``labels``.

    Parameters
    ----------
    model
        Trained sequence-classification model; it is switched to eval mode
        and moved to the GPU when one is available, otherwise to the CPU.
    encodings
        Mapping with the tensors ``"input_ids"`` and ``"attention_mask"``.
    labels
        True class labels, one per encoded text.
    threshold : float, default 0.5
        A text is predicted as class 1 when its softmax probability for
        class 1 is strictly greater than this value.
    batch_size : int, default 32
        Number of texts sent through the model per forward pass. Feeding the
        whole dataset at once can exhaust device memory on larger sets.

    Returns
    -------
    dict
        ``{"accuracy": ..., "confusion_matrix": ...}`` as computed by
        ``accuracy_score`` and ``confusion_matrix``.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Set the model to evaluation mode
    model.eval()

    # Ensure model and input data end up on the same device
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    model.to(device)

    input_ids = encodings["input_ids"]
    attention_mask = encodings["attention_mask"]

    # Perform predictions batch by batch; only the small prediction vectors
    # are kept, and they are moved back to the CPU right away.
    predicted_chunks = []
    with torch.no_grad():
        for start in range(0, input_ids.shape[0], batch_size):
            stop = start + batch_size
            outputs = model(
                input_ids=input_ids[start:stop].to(device),
                attention_mask=attention_mask[start:stop].to(device),
            )

            # Softmax returns probabilities for each class
            probabilities = F.softmax(outputs.logits, dim=1)
            # Apply threshold to the class-1 probability (binary classification)
            predicted_chunks.append((probabilities[:, 1] > threshold).int().cpu())

    if predicted_chunks:
        predictions = torch.cat(predicted_chunks).numpy()
    else:
        # No input rows: torch.cat would fail on an empty list.
        predictions = torch.empty(0, dtype=torch.int32).numpy()

    # Calculate accuracy and confusion matrix
    return {
        "accuracy": accuracy_score(labels, predictions),
        "confusion_matrix": confusion_matrix(labels, predictions)
    }

# **Example Workflow: Model Evaluation**

This example demonstrates the process of evaluating a trained model using preprocessed data. The workflow includes preprocessing the dataset, tokenizing the text for model input, loading the trained model, and performing predictions. Finally, it calculates and outputs the accuracy and confusion matrix, providing insights into the model's performance on the given data.

In [15]:
# Preprocess the raw frame and tokenize the cleaned texts.
df = Data_pre_processing(data)
encodings = Text_encoding(df)

model = AutoModelForSequenceClassification.from_pretrained('/content/drive/My Drive/Bachelorprojekt/final_model1') # Specify the path to the trained model directory

# Perform predictions. The labels are taken from the frame returned by the
# preprocessing step (`df`), i.e. the same rows that were encoded, rather than
# from `data`, so texts and labels stay aligned even if rows are dropped.
results = model_prediction(model, encodings, df["class"], threshold=0.5)

# Output results
print(f"Accuracy: {results['accuracy']}")
print(f"Confusion Matrix:\n{results['confusion_matrix']}")

Accuracy: 0.9348914858096828
Confusion Matrix:
[[300  17]
 [ 22 260]]
