# SETUP

In [1]:
# Automates tabulating the evaluation metrics into Google Sheets

import gspread
from oauth2client.service_account import ServiceAccountCredentials
from googleapiclient.discovery import build

import os

# Project root; the service-account key file below is resolved relative to it
new_directory = '/Users/levan/ATENEO MASTERAL/Thesis'
os.chdir(new_directory)

# Scopes requested for the service account, then credentials from the JSON key file
scope = [
    'https://spreadsheets.google.com/feeds',
    'https://www.googleapis.com/auth/drive',
]
creds = ServiceAccountCredentials.from_json_keyfile_name(
    'thesis-432315-12daec8d1ff6.json',
    scope,
)

# Sheets API v4 service object (used later to write metric rows)
service = build('sheets', 'v4', credentials=creds)

# gspread client authorised with the same credentials
client = gspread.authorize(creds)

# Spreadsheet that receives the tabulated results
spreadsheet_id = '1ReMgKOtkETAbtnnqkJy0eo4M2QfIq9fxtV-5ysZSdJ8'

# Hard Voting

## Load Dataset

In [2]:
import os

# Work from the Development folder so the relative 'Corpus/', 'BERT models/'
# and 'Results/' paths used in later cells resolve from there
new_directory = '/Users/levan/ATENEO MASTERAL/Thesis/Development'
os.chdir(new_directory)

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, AutoConfig
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import numpy as np
import random


# Corpus location, relative to the current working directory
file_path = 'Corpus/data_b.csv'
df = pd.read_csv(file_path)

# Hold out 30% of the rows for testing; the fixed random_state keeps the split repeatable
train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)

# Plain Python lists of the 'text' and 'label' columns for each partition
train_texts, train_labels = train_df['text'].tolist(), train_df['label'].tolist()
test_texts, test_labels = test_df['text'].tolist(), test_df['label'].tolist()

## Load Models and Tokenizers

In [4]:
def load_model_and_tokenizer(model_path, tokenizer_path, base_model):
    """Load a sequence-classification model and its tokenizer.

    Args:
        model_path: Location passed to
            ``AutoModelForSequenceClassification.from_pretrained``.
        tokenizer_path: Location passed to ``AutoTokenizer.from_pretrained``.
        base_model: Identifier whose configuration is loaded (with
            ``num_labels=2``) and handed to the model loader.

    Returns:
        A ``(model, tokenizer)`` tuple; the model has been switched to
        evaluation mode.
    """
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

    # The configuration comes from the base model, forced to a two-label head,
    # and is applied while loading the weights found at model_path.
    model = AutoModelForSequenceClassification.from_pretrained(
        model_path,
        config=AutoConfig.from_pretrained(base_model, num_labels=2),
    )

    # Evaluation mode for inference
    model.eval()

    return model, tokenizer

# Checkpoint/tokenizer directories and base-model identifier for each ensemble member
model_info = {
    'BERT cased': dict(
        model_path='BERT models/hs_bert-base-cased-finetuned',
        tokenizer_path='BERT models/hs_bert-base-cased-finetuned',
        base_model='google-bert/bert-base-cased',
    ),
    'DistilBERT uncased': dict(
        model_path='BERT models/hs_distilbert-base-uncased-finetuned',
        tokenizer_path='BERT models/hs_distilbert-base-uncased-finetuned',
        base_model='distilbert/distilbert-base-uncased',
    ),
    'DeBERTa': dict(
        model_path='BERT models/hs_DeBERTa-finetuned',
        tokenizer_path='BERT models/hs_DeBERTa-finetuned',
        base_model='microsoft/deberta-v3-base',
    ),
}

# name -> (model, tokenizer); each entry's keys match the loader's parameter names
models_and_tokenizers = {
    name: load_model_and_tokenizer(**info)
    for name, info in model_info.items()
}

## Apply Tokenization

In [5]:
def texts_to_dataloader(texts, tokenizer, batch_size=32):
    """Tokenize ``texts`` and wrap the encodings in a sequential DataLoader.

    Args:
        texts: Texts to encode; passed to ``tokenizer`` as-is.
        tokenizer: Callable invoked with ``truncation=True``, ``padding=True``,
            ``max_length=512`` and ``return_tensors="pt"``; its result must
            expose ``'input_ids'`` and ``'attention_mask'`` tensors.
        batch_size: Number of samples per batch (default 32).

    Returns:
        A ``DataLoader`` over a ``TensorDataset`` that yields
        ``(input_ids, attention_mask)`` batches in the original text order.

    Note:
        The tensors are left exactly where the tokenizer produced them.
        The previous version also copied them to the MPS/CPU device but then
        discarded the copies, so the whole encoded corpus was transferred for
        nothing. Callers such as ``hard_voting_predict`` move each batch to the
        device themselves.
    """
    encodings = tokenizer(texts, truncation=True, padding=True, max_length=512, return_tensors="pt")

    dataset = TensorDataset(encodings['input_ids'], encodings['attention_mask'])
    return DataLoader(dataset, batch_size=batch_size)

## Perform Hard Voting and Prediction

In [6]:
%load_ext memory_profiler

In [7]:
%%memit

def hard_voting_predict(models_and_tokenizers, texts):
    """Predict one class per text by majority vote across several models.

    Args:
        models_and_tokenizers: Mapping of name -> ``(model, tokenizer)``.
            Each model is moved to the MPS device when available, else CPU.
        texts: Texts to classify; each model encodes them with its own
            tokenizer via ``texts_to_dataloader``.

    Returns:
        A list with one predicted class index per text. When several classes
        share the highest vote count, one of them is picked with
        ``random.choice``.
    """
    device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
    votes = []

    # Collect one list of argmax predictions per model
    for name, (model, tokenizer) in models_and_tokenizers.items():
        model.to(device)

        model_preds = []
        for input_ids, attention_mask in texts_to_dataloader(texts, tokenizer):
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)

            with torch.no_grad():
                logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
                model_preds.extend(torch.argmax(logits, dim=-1).cpu().numpy())
        votes.append(model_preds)

    # Rows become samples, columns become models
    votes = np.array(votes).T

    final_preds = []
    for vote in votes:
        vote_count = np.bincount(vote)
        # Every class index that reached the top vote count
        candidates = np.where(vote_count == np.max(vote_count))[0]
        winner = random.choice(candidates) if len(candidates) > 1 else candidates[0]
        final_preds.append(winner)
    return final_preds


# Run every model over the held-out test texts and majority-vote their predictions
final_predictions = hard_voting_predict(models_and_tokenizers, test_texts)


peak memory: 2080.11 MiB, increment: 182.09 MiB


## Evaluate Model

In [8]:
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix

# Score the ensemble predictions against the original test labels (binary averaging)
precision = precision_score(test_labels, final_predictions, average='binary')
recall = recall_score(test_labels, final_predictions, average='binary')
accuracy = accuracy_score(test_labels, final_predictions)
f1 = f1_score(test_labels, final_predictions, average='binary')

# Confusion matrix of true vs. predicted labels
conf_matrix = confusion_matrix(test_labels, final_predictions)

# Report everything, one item per line
print(
    f"Precision: {precision}",
    f"Recall: {recall}",
    f"Ensemble accuracy: {accuracy}",
    f"F1 Score: {f1}",
    "Confusion Matrix:",
    conf_matrix,
    sep="\n",
)

Precision: 0.7456931911402789
Recall: 0.7952755905511811
Ensemble accuracy: 0.7817890092258323
F1 Score: 0.7696867061812024
Confusion Matrix:
[[1040  310]
 [ 234  909]]


In [9]:
# SAVE TO GOOGLE SHEET

# Destination range: four cells on row 2 of the '3-1' tab
range_name = '3-1!B2:E2'

# One row of percentages (precision, recall, accuracy, F1), two decimals each
values = [[f"{metric * 100:.2f}" for metric in (precision, recall, accuracy, f1)]]

# Request payload expected by values().update
body = {'values': values}

# Write the row through the Sheets API
result = (
    service.spreadsheets()
    .values()
    .update(
        spreadsheetId=spreadsheet_id,
        range=range_name,
        valueInputOption='USER_ENTERED',
        body=body,
    )
    .execute()
)

print('Updated cells count:', result.get('updatedCells'))

Updated cells count: 4


# Validate on Data C

## Load Dataset

In [10]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Validation corpus (data C), read relative to the current working directory
validation_file_path = 'Corpus/data_c.csv'
validation_df = pd.read_csv(validation_file_path)

# Texts as a Python list; labels are kept as the column's underlying array
validation_texts = validation_df['text'].tolist()
validation_labels = validation_df['label'].values

## Perform Hard Voting

In [11]:
%%memit
# Run every model over the validation texts and majority-vote their predictions
validation_predictions = hard_voting_predict(models_and_tokenizers, validation_texts)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


peak memory: 2266.31 MiB, increment: 116.14 MiB


## Evaluate Model

In [12]:
# Score the ensemble predictions against the validation labels (binary averaging)
precision = precision_score(validation_labels, validation_predictions, average='binary')
recall = recall_score(validation_labels, validation_predictions, average='binary')
accuracy = accuracy_score(validation_labels, validation_predictions)
f1 = f1_score(validation_labels, validation_predictions, average='binary')

# Confusion matrix of true vs. predicted labels
conf_matrix = confusion_matrix(validation_labels, validation_predictions)

# Report everything, one item per line
print(
    f"Precision: {precision}",
    f"Recall: {recall}",
    f"Validation accuracy: {accuracy}",
    f"F1 Score: {f1}",
    "Confusion Matrix:",
    conf_matrix,
    sep="\n",
)

Precision: 0.7689804772234273
Recall: 0.8244186046511628
Validation accuracy: 0.8029236599891716
F1 Score: 0.7957351290684624
Confusion Matrix:
[[774 213]
 [151 709]]


In [13]:
# SAVE TO GOOGLE SHEET

# Destination range: four cells on row 2 of the '3-1' tab, next to the test-set columns
range_name = '3-1!F2:I2'

# One row of percentages (precision, recall, accuracy, F1), two decimals each
values = [[f"{metric * 100:.2f}" for metric in (precision, recall, accuracy, f1)]]

# Request payload expected by values().update
body = {'values': values}

# Write the row through the Sheets API
result = (
    service.spreadsheets()
    .values()
    .update(
        spreadsheetId=spreadsheet_id,
        range=range_name,
        valueInputOption='USER_ENTERED',
        body=body,
    )
    .execute()
)

print('Updated cells count:', result.get('updatedCells'))

Updated cells count: 4


In [14]:
import altair as alt
import pandas as pd
from sklearn.metrics import confusion_matrix

# Confusion matrix of the validation predictions
cm = confusion_matrix(validation_labels, validation_predictions)

# Display names applied to the matrix rows/columns, in order.
# NOTE(review): this assumes the first label value means 'Hate Speech' and the
# second 'Non Hatespeech' — confirm against the dataset's label encoding.
class_names = ['Hate Speech', 'Non Hatespeech']

# Long format: one row per (true, predicted) cell with its count
cm_df = (
    pd.DataFrame(cm, index=class_names, columns=class_names)
    .reset_index()
    .melt(id_vars='index')
)
cm_df.columns = ['True', 'Predicted', 'Count']

# Ordered categoricals so both axes follow class_names
cm_df['True'] = pd.Categorical(cm_df['True'], categories=class_names, ordered=True)
cm_df['Predicted'] = pd.Categorical(cm_df['Predicted'], categories=class_names, ordered=True)

# Heatmap layer: cell colour encodes the count.
# NOTE(review): the title mentions "Blending"/"DT" while this notebook is the
# hard-voting run and the figure is saved as 3HV-1 — confirm the intended title.
heatmap = (
    alt.Chart(cm_df)
    .mark_rect()
    .encode(
        x=alt.X('Predicted:O', sort=class_names),
        y=alt.Y('True:O', sort=class_names),
        color='Count:Q',
        tooltip=['True', 'Predicted', 'Count'],
    )
    .properties(width=400, height=300, title='3 Blending-1 DT')
)

# Count labels: white text on cells above half the maximum count, black otherwise
text = heatmap.mark_text(align='center', baseline='middle', fontSize=12).encode(
    text='Count:Q',
    color=alt.condition(
        alt.datum.Count > cm.max() / 2,
        alt.value('white'),
        alt.value('black'),
    ),
)

# Layer the labels over the heatmap and display the result
final_chart = heatmap + text
final_chart.show()

In [15]:
# Output folder for the figure (relative to the current working directory)
folder_path = os.path.expanduser('Results/Ensemble Model Results/On Hatespeech dataset/OH Using HS_Finetuned Models/Hard Voting/')

# Create the folder when it is missing so the save below does not fail
# on a fresh checkout; an existing folder is left untouched.
os.makedirs(folder_path, exist_ok=True)

# Save the plot as a PNG inside that folder
file_path_png = os.path.join(folder_path, '3HV-1.png')
final_chart.save(file_path_png)

print(f"Plot saved to {file_path_png}")

Plot saved to Results/Ensemble Model Results/On Hatespeech dataset/OH Using HS_Finetuned Models/Hard Voting/3HV-1.png
