# **Loading The Dataset**

In [None]:
import pandas as pd

# Read the tab-separated training file into a DataFrame.
train_df = pd.read_csv(
    "olid-training-v1.0.tsv",
    sep="\t",
)

# Display the first rows as a quick sanity check of the parse.
train_df.head()


# **EDA**

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Count how many tweets carry each subtask A label and draw the counts as bars.
label_counts = train_df['subtask_a'].value_counts()
bar_labels, bar_heights = label_counts.index, label_counts.values
sns.barplot(x=bar_labels, y=bar_heights)
plt.xlabel("Label")
plt.ylabel("Count")
plt.title("Label Distribution (NOT vs OFF)")
plt.show()

# Summary statistics of the tweet length, measured in characters.
tweet_lengths = train_df['tweet'].str.len()
print(tweet_lengths.describe())

# **Data Preprocessing**

In [None]:
import re
import emoji

# Basic cleaning functions
def clean_text(text):
    """Normalise a raw tweet into lowercase, letters-only text.

    Steps, in order: drop the ``@USER`` / ``URL`` placeholders, spell emojis
    out as words, lowercase, drop raw links, remove every character that is
    not a letter or whitespace, and collapse runs of whitespace.

    Args:
        text: Raw tweet text. Non-string values (for example NaN coming from
            an empty cell) are treated as an empty tweet.

    Returns:
        The cleaned string, which may be empty.
    """
    if not isinstance(text, str):
        return ""

    # The placeholder patterns are upper-case, so they must be applied BEFORE
    # lowercasing; applied afterwards they can never match and "user"/"url"
    # would remain in the cleaned text. Word boundaries keep words that merely
    # contain the letters (e.g. "curling") intact.
    text = re.sub(r"@USER\b", " ", text)
    text = re.sub(r"\bURL\b", " ", text)

    text = emoji.demojize(text, delimiters=(" ", " "))
    # Emoji names use underscores between words; turn them into spaces so the
    # letters-only filter below does not glue the words into a single token.
    text = text.replace("_", " ")

    text = text.lower()
    # Escaped dot: only a literal "www." starts a link.
    text = re.sub(r"http\S+|www\.\S+", "", text)
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

# Store the cleaned text next to the raw tweet.
train_df['clean_tweet'] = [clean_text(raw) for raw in train_df['tweet']]

# Encode the subtask A label as an integer (NOT -> 0, OFF -> 1).
label_map = dict(NOT=0, OFF=1)
train_df['label'] = train_df['subtask_a'].map(label_map)

# Side-by-side view of raw text, cleaned text and numeric label.
preview_columns = ['tweet', 'clean_tweet', 'label']
train_df[preview_columns].head()


# **BERT**

In [None]:
from transformers import TFBertForSequenceClassification, BertTokenizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score
import tensorflow as tf
import numpy as np

# Maximum number of tokens kept per tweet by the tokenisation helpers.
MAX_LEN = 128

# Tokenizer belonging to the multilingual cased BERT checkpoint.
tokenizer = BertTokenizer.from_pretrained(
    "bert-base-multilingual-cased"
)


In [None]:
# Encode the cleaned tweets
def tokenize_data(texts, labels):
    """Tokenise texts with the BERT tokenizer and pair them with their labels.

    Args:
        texts: Object exposing ``.tolist()``; the notebook passes a pandas
            Series of cleaned tweets.
        labels: Labels aligned with ``texts``; converted with
            ``tf.convert_to_tensor``.

    Returns:
        An unbatched ``tf.data.Dataset`` yielding ``(features_dict, label)``.
    """
    tokenizer_options = {
        "truncation": True,
        "padding": True,
        "max_length": MAX_LEN,
        "return_tensors": "tf",
    }
    encoded = tokenizer(texts.tolist(), **tokenizer_options)

    # A plain dict of tensors is sliced per example together with the labels.
    features = dict(encoded)
    targets = tf.convert_to_tensor(labels)
    return tf.data.Dataset.from_tensor_slices((features, targets))

# Stratified 80/20 hold-out split of the cleaned tweets and their labels.
all_texts = train_df["clean_tweet"]
all_labels = train_df["label"]
X_train, X_val, y_train, y_val = train_test_split(
    all_texts, all_labels,
    test_size=0.2, random_state=42, stratify=all_labels,
)

# Only the training stream is shuffled; both streams are served in batches of 32.
train_dataset = tokenize_data(X_train, y_train)
train_dataset = train_dataset.shuffle(1024).batch(32)
val_dataset = tokenize_data(X_val, y_val).batch(32)


In [None]:
# Multilingual cased BERT with a two-label sequence-classification head.
model_bert = TFBertForSequenceClassification.from_pretrained(
    "bert-base-multilingual-cased",
    num_labels=2,
)

In [None]:
# Training configuration for BERT. The loss is told to expect raw logits
# (from_logits=True) and integer class ids as targets.
metrics = ["accuracy"]
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)

model_bert.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=metrics,
)


In [None]:
# Fine-tune BERT, evaluating on the validation dataset during training.
history = model_bert.fit(
    train_dataset,
    epochs=5,  # can be raised; more epochs may improve performance
    validation_data=val_dataset,
)

In [None]:
# Validation logits from BERT, reduced to the highest-scoring class per row.
bert_val_output = model_bert.predict(val_dataset)
y_preds = bert_val_output["logits"]
y_pred_classes = np.argmax(y_preds, axis=1)

# Per-class report followed by the macro-averaged F1.
bert_report = classification_report(y_val, y_pred_classes, target_names=["NOT", "OFF"])
print(bert_report)
bert_macro_f1 = f1_score(y_val, y_pred_classes, average='macro')
print("Macro F1 Score:", bert_macro_f1)


# **RoBERTa**

In [None]:
from transformers import RobertaTokenizer, TFRobertaForSequenceClassification

# RoBERTa-base tokenizer and a two-label sequence-classification model
# loaded from the same checkpoint name.
roberta_tokenizer = RobertaTokenizer.from_pretrained(
    "roberta-base"
)
roberta_model = TFRobertaForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels=2,
)


In [None]:
# Tokenisation function
def tokenize_roberta(texts, labels):
    """Tokenise texts with the RoBERTa tokenizer and pair them with labels.

    Mirrors ``tokenize_data`` and uses the shared ``MAX_LEN`` constant rather
    than a separate hard-coded length, so both transformer pipelines always
    truncate at the same number of tokens.

    Args:
        texts: Object exposing ``.tolist()``; the notebook passes a pandas
            Series of cleaned tweets.
        labels: Labels aligned with ``texts``; converted with
            ``tf.convert_to_tensor``.

    Returns:
        An unbatched ``tf.data.Dataset`` yielding ``(features_dict, label)``.
    """
    encodings = roberta_tokenizer(
        texts.tolist(),
        truncation=True,
        padding=True,
        max_length=MAX_LEN,
        return_tensors="tf"
    )
    # A plain dict of tensors is sliced per example together with the labels.
    dataset = tf.data.Dataset.from_tensor_slices((
        dict(encodings),
        tf.convert_to_tensor(labels)
    ))
    return dataset

# RoBERTa datasets built from the same split: shuffled batches for training,
# ordered batches for validation.
train_dataset_roberta = tokenize_roberta(X_train, y_train)
train_dataset_roberta = train_dataset_roberta.shuffle(1024).batch(32)
val_dataset_roberta = tokenize_roberta(X_val, y_val).batch(32)


In [None]:
# Fresh optimizer/loss objects for RoBERTa (these rebind the names used for
# BERT). The loss expects raw logits and integer class ids.
metrics = ["accuracy"]
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)

roberta_model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=metrics,
)


In [None]:
# Fine-tune RoBERTa, evaluating on the validation dataset during training.
history_roberta = roberta_model.fit(
    train_dataset_roberta,
    epochs=3,  # adjust as needed
    validation_data=val_dataset_roberta,
)


In [None]:
# Validation logits from RoBERTa, reduced to the highest-scoring class per row.
roberta_val_output = roberta_model.predict(val_dataset_roberta)
y_pred_roberta_logits = roberta_val_output["logits"]
y_pred_roberta = np.argmax(y_pred_roberta_logits, axis=1)

# Per-class report followed by the macro-averaged F1.
roberta_report = classification_report(y_val, y_pred_roberta, target_names=["NOT", "OFF"])
print(roberta_report)
roberta_macro_f1 = f1_score(y_val, y_pred_roberta, average='macro')
print("Macro F1 Score:", roberta_macro_f1)


# **BiLSTM with GloVe**

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
import numpy as np


In [None]:
# Labels
y_lstm = train_df["label"].values

# Decide the train/validation rows FIRST, by splitting row positions with the
# same test_size, seed and stratification as before. The resulting rows are
# the same ones the previous array-level split selected, and the same ones
# used for the transformer models, so validation predictions stay aligned.
row_positions = list(range(len(y_lstm)))
train_rows, val_rows = train_test_split(
    row_positions, test_size=0.2, random_state=42, stratify=y_lstm
)

# Fit the vocabulary on training tweets only. Fitting on every tweet would
# let validation text influence the word index (data leakage); words seen
# only in validation now map to the <OOV> token instead.
tokenizer_lstm = Tokenizer(num_words=20000, oov_token="<OOV>")
tokenizer_lstm.fit_on_texts(train_df["clean_tweet"].iloc[train_rows])

# Convert text to fixed-length integer sequences (padded/truncated at the end)
X_seq = tokenizer_lstm.texts_to_sequences(train_df["clean_tweet"])
X_padded = pad_sequences(X_seq, maxlen=128, padding='post', truncating='post')

# Slice the padded matrix and the labels with the pre-computed row positions
X_train_lstm, X_val_lstm = X_padded[train_rows], X_padded[val_rows]
y_train_lstm, y_val_lstm = y_lstm[train_rows], y_lstm[val_rows]

In [None]:
# Download GloVe (if not already)
!wget -q http://nlp.stanford.edu/data/glove.6B.zip
!unzip -q glove.6B.zip

# Dimensionality of the GloVe vectors loaded below.
embedding_dim = 100

# Parse the GloVe text file: each line is a token followed by its coefficients.
embedding_index = {}
with open("glove.6B.100d.txt", encoding="utf-8") as glove_file:
    for raw_line in glove_file:
        parts = raw_line.split()
        embedding_index[parts[0]] = np.asarray(parts[1:], dtype='float32')

# Build the lookup table used by the Embedding layer: row i holds the GloVe
# vector of the word with tokenizer index i; rows without a match stay zero.
word_index = tokenizer_lstm.word_index
embedding_matrix = np.zeros((20000, embedding_dim))

for token, row in word_index.items():
    if row >= 20000:
        # Index falls outside the 20000-row table.
        continue
    glove_vector = embedding_index.get(token)
    if glove_vector is not None:
        embedding_matrix[row] = glove_vector


In [None]:
# Two stacked bidirectional LSTMs on top of frozen GloVe embeddings, ending in
# a two-way softmax.
model_lstm = Sequential([
    # mask_zero=True: the inputs are post-padded with 0 by pad_sequences and
    # the Keras Tokenizer does not assign index 0 to any word, so 0 always
    # means "padding". Masking lets the recurrent layers skip those timesteps
    # instead of reading the padding as part of the tweet.
    Embedding(input_dim=20000, output_dim=embedding_dim,
              weights=[embedding_matrix], input_length=128,
              trainable=False, mask_zero=True),
    Bidirectional(LSTM(64, return_sequences=True)),
    Dropout(0.3),
    Bidirectional(LSTM(32)),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(2, activation='softmax')
])


In [None]:
# The model ends in a softmax, so the string loss (which expects
# probabilities) is used together with integer class ids.
model_lstm.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Train for five epochs in mini-batches of 32, tracking validation metrics.
history_lstm = model_lstm.fit(
    X_train_lstm,
    y_train_lstm,
    batch_size=32,
    epochs=5,
    validation_data=(X_val_lstm, y_val_lstm),
)


In [None]:
# Class probabilities on the validation rows, reduced to the most likely class.
y_pred_lstm = model_lstm.predict(X_val_lstm)
y_pred_labels_lstm = np.argmax(y_pred_lstm, axis=1)

# Per-class report followed by the macro-averaged F1.
lstm_report = classification_report(y_val_lstm, y_pred_labels_lstm, target_names=["NOT", "OFF"])
print(lstm_report)
lstm_macro_f1 = f1_score(y_val_lstm, y_pred_labels_lstm, average='macro')
print("Macro F1 Score:", lstm_macro_f1)


# **Ensemble Model**

In [None]:
def _as_class_ids(scores):
    """Return 1-D class ids, taking argmax over axis 1 when given 2-D scores."""
    scores = np.asarray(scores)
    return np.argmax(scores, axis=1) if scores.ndim > 1 else scores


# Ensure all predictions are class labels. Every name is now bound
# unconditionally: previously y_pred_bert was only created inside an `if`,
# so it would have been undefined had y_preds already been 1-D.
y_pred_bert = _as_class_ids(y_preds)
y_pred_roberta = _as_class_ids(y_pred_roberta)
y_pred_labels_lstm = _as_class_ids(y_pred_labels_lstm)


In [None]:
import scipy.stats as stats
import numpy as np

# One row per model, one column per validation example.
per_model_preds = (y_pred_bert, y_pred_roberta, y_pred_labels_lstm)
stacked_preds = np.vstack(per_model_preds)

# The most frequent value in each column is the majority vote of the models.
vote_result = stats.mode(stacked_preds, axis=0, keepdims=False)
ensemble_preds = vote_result[0]


In [None]:
# Validation metrics of the majority-vote ensemble.
print("🔮 Ensemble Model Performance (Majority Voting):")
ensemble_report = classification_report(y_val, ensemble_preds, target_names=["NOT", "OFF"])
print(ensemble_report)
ensemble_macro_f1 = f1_score(y_val, ensemble_preds, average='macro')
print("Macro F1 Score:", ensemble_macro_f1)


In [None]:
import matplotlib.pyplot as plt

# Validation predictions of each model, keyed by the name shown on the chart.
predictions_by_model = {
    "BERT": y_pred_bert,
    "RoBERTa": y_pred_roberta,
    "BiLSTM": y_pred_labels_lstm,
    "Ensemble": ensemble_preds,
}
model_names = list(predictions_by_model)
f1_scores = [
    f1_score(y_val, model_preds, average='macro')
    for model_preds in predictions_by_model.values()
]

# Bar chart of macro F1 per model, on a fixed 0-1 scale.
plt.bar(model_names, f1_scores, color='skyblue')
plt.ylabel("Macro F1")
plt.ylim(0, 1)
plt.title("Macro F1 Score Comparison")
plt.show()


# **Ensemble Predictions on testset-levela.tsv**

In [None]:
# Read the test tweets: skip the first line of the file and name the two
# columns explicitly.
test_df = pd.read_csv(
    "testset-levela.tsv",
    sep="\t",
    skiprows=1,
    names=["id", "tweet"],
)

# Run the test tweets through the same cleaning function as the training data.
test_df["clean_tweet"] = [clean_text(raw) for raw in test_df["tweet"]]


In [None]:
# Tokenise the test tweets with the same length limit as training (MAX_LEN).
bert_inputs = tokenizer(
    test_df["clean_tweet"].tolist(),
    padding=True, truncation=True, max_length=MAX_LEN, return_tensors="tf"
)
# Hand Keras a plain dict of tensors, exactly as the training pipeline does
# (tokenize_data uses dict(encodings)). The tokenizer returns a BatchEncoding,
# which Keras itself does not treat as a dict; converting explicitly avoids
# depending on the installed transformers version doing it for us.
bert_logits = model_bert.predict(dict(bert_inputs))["logits"]
bert_preds = np.argmax(bert_logits, axis=1)


In [None]:
# Tokenise the test tweets with the same length limit as training (MAX_LEN).
roberta_inputs = roberta_tokenizer(
    test_df["clean_tweet"].tolist(),
    padding=True, truncation=True, max_length=MAX_LEN, return_tensors="tf"
)
# Hand Keras a plain dict of tensors, exactly as the training pipeline does
# (tokenize_roberta uses dict(encodings)). The tokenizer returns a
# BatchEncoding, which Keras itself does not treat as a dict; converting
# explicitly avoids depending on the installed transformers version.
roberta_logits = roberta_model.predict(dict(roberta_inputs))["logits"]
roberta_preds = np.argmax(roberta_logits, axis=1)


In [None]:
# Encode the test tweets with the already-fitted LSTM tokenizer and pad or
# truncate them at the end to 128 tokens, as was done for training.
lstm_sequences = tokenizer_lstm.texts_to_sequences(test_df["clean_tweet"])
lstm_padded = pad_sequences(
    lstm_sequences,
    maxlen=128,
    padding='post',
    truncating='post',
)

# Class probabilities -> most likely class per tweet.
lstm_probs = model_lstm.predict(lstm_padded)
lstm_preds = np.argmax(lstm_probs, axis=1)


In [None]:
from scipy.stats import mode

# One row per model, one column per test tweet; the per-column mode is the
# majority vote.
per_model_test_preds = (bert_preds, roberta_preds, lstm_preds)
stacked_test_preds = np.vstack(per_model_test_preds)
test_vote = mode(stacked_test_preds, axis=0, keepdims=False)
ensemble_test_preds = test_vote[0]


In [None]:
# Translate the numeric votes back into the label strings.
id_to_label = {0: "NOT", 1: "OFF"}
test_df["label"] = list(map(id_to_label.__getitem__, ensemble_test_preds))

# Write only the id and the predicted label, without the DataFrame index.
submission = test_df[["id", "label"]]
submission.to_csv("final_predictions.csv", index=False)

print("✅ Ensemble predictions saved to final_predictions.csv!")


# **Evaluation on the Test Set**

In [None]:
# Predictions written by the ensemble step (this file has a header row).
preds_df = pd.read_csv("final_predictions.csv")

# Gold labels: column names are supplied here rather than read from the file.
gold_columns = ["id", "gold_label"]
gold_df = pd.read_csv("labels-levela.csv", names=gold_columns)


In [None]:
# Pair each prediction with its gold label through the shared "id" column.
merged_df = preds_df.merge(gold_df, on="id")

# Map both label columns to integers so they can be compared numerically.
label_to_id = {"NOT": 0, "OFF": 1}
gold_encoded = merged_df["gold_label"].map(label_to_id)
pred_encoded = merged_df["label"].map(label_to_id)
y_true = gold_encoded.values
y_pred = pred_encoded.values


In [None]:
from sklearn.metrics import classification_report, confusion_matrix, f1_score

import seaborn as sns
import matplotlib.pyplot as plt

# Display names for class ids 0 and 1, shared by the report and the heatmap.
class_names = ["NOT", "OFF"]

print("Classification Report:")
test_report = classification_report(y_true, y_pred, target_names=class_names)
print(test_report)
test_macro_f1 = f1_score(y_true, y_pred, average='macro')
print("Macro F1 Score :", test_macro_f1)

# Optional: confusion matrix (rows are true labels, columns are predictions)
cm = confusion_matrix(y_true, y_pred)
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap="Blues",
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.xlabel("Predicted")
plt.ylabel("True")
plt.title("Confusion Matrix - Test Set (Ensemble)")
plt.show()
