In [1]:
"""For working in Google Colab"""
# Colab-only cell: mount Google Drive at /content/drive so the notebook can use
# files stored in Drive through ordinary filesystem paths.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
"""For working in Google Colab"""
# IPython automagic form of `%cd` (not plain Python): switch the working directory
# to the Drive notebook folder. Later cells use relative paths such as
# "challenge_data/train_tweets/" and 'train_data.npy', which resolve against it.
cd /content/drive/MyDrive/Colab\ Notebooks

In [1]:
"""Importing Libraries"""
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
# NOTE(review): wildcard import from the project-local PreprocessData module. The
# helpers called later without a definition in this notebook (read_csv, process_csv,
# process_csv_groupe_by_period) presumably come from here — confirm, and prefer
# importing them by name so their origin is visible.
from PreprocessData import *
from transformers import Trainer, TrainingArguments
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!





In [4]:
"""Longformer Model"""

from transformers import LongformerForSequenceClassification, LongformerTokenizer

test_or_final = 0 # 0 for test, 1 for final
model_name = "allenai/longformer-base-4096"
tokenizer = LongformerTokenizer.from_pretrained(model_name)

if not test_or_final:
    # Test run: prepare and tokenize one CSV file only.
    # The meaning of the trailing 32 and 4096 arguments is defined by the
    # preprocessing helper, which is not visible in this notebook.
    train_data = process_csv_groupe_by_period(
        tokenizer,
        "challenge_data/train_tweets/ArgentinaBelgium72.csv",
        32,
        4096,
    )
else:
    # Final submission: hand the whole train_tweets folder to read_csv.
    train_data = read_csv("challenge_data/train_tweets/", tokenizer, 32)

# Two-label sequence classifier loaded from the pretrained checkpoint, then moved to `device`.
model = LongformerForSequenceClassification.from_pretrained(model_name, num_labels=2)
model.to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


KeyboardInterrupt: 

In [None]:
"""Albert Model"""

from transformers import AlbertForSequenceClassification, AlbertTokenizer

test_or_final = 0 # 0 for test, 1 for final
model_name = "albert-base-v2"
tokenizer = AlbertTokenizer.from_pretrained(model_name)

if not test_or_final:
    # Test run: prepare and tokenize one CSV file only.
    # NOTE(review): 4096 matches the value used with the Longformer checkpoint;
    # albert-base-v2 is configured for far shorter sequences (512 positions), so
    # confirm what this argument controls in process_csv before training.
    train_data = process_csv(
        tokenizer,
        "challenge_data/train_tweets/ArgentinaBelgium72.csv",
        32,
        4096,
    )
else:
    # Final submission: hand the whole train_tweets folder to read_csv.
    train_data = read_csv("challenge_data/train_tweets/", tokenizer, 32)

# Two-label sequence classifier loaded from the pretrained checkpoint, then moved to `device`.
model = AlbertForSequenceClassification.from_pretrained(model_name, num_labels=2)
model.to(device)

In [5]:
"""Preparing of the clened data for training"""

"""Compute accuracy metric for the trained model"""
def compute_metrics_from_predictions(predictions, labels):
    """Compute the accuracy of a set of class predictions.

    Args:
        predictions: Array-like of per-class scores; the predicted class of each
            entry is the argmax over the last axis.
        labels: Array-like of true class indices, compared element-wise with the
            predicted classes.

    Returns:
        dict: ``{"accuracy": value}``, where ``value`` is the mean of the
        element-wise "prediction equals label" comparison.
    """
    is_correct = np.argmax(predictions, axis=-1) == labels
    return {"accuracy": is_correct.mean()}

"""Transform the data into a custom dataset with torch tensors"""
class CustomDataset(Dataset):
    """Torch dataset over already-tokenized tweets and their class labels.

    Item ``i`` pairs the ``i``-th token-id sequence with the ``i``-th label and
    is returned as a dict with the keys ``"input_ids"``, ``"attention_mask"``
    and ``"labels"``.
    """

    def __init__(self, tweets, labels):
        """Convert the inputs to ``torch.long`` tensors.

        Args:
            tweets: Iterable of token-id sequences (one per example); each is
                converted with ``torch.tensor``.
            labels: Iterable of integer class labels in the same order as
                ``tweets`` — a list, an array, or a pandas Series with any index.
        """
        self.text = [torch.tensor(tweet, dtype=torch.long) for tweet in tweets]
        # Materialise the labels positionally before building the tensor.
        # torch.tensor() reads a raw pandas Series through series[0], series[1], ...,
        # which is a *label* lookup and fails as soon as the index is not 0..n-1
        # (for example after filtering or shuffling the DataFrame).
        self.labels = torch.tensor(list(labels), dtype=torch.long)

    def __len__(self):
        """Return the number of examples (one per label)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors.

        The attention mask is all ones, i.e. every position is treated as a real
        token.
        NOTE(review): if the pre-tokenized tweets contain padding ids, those
        positions are attended to as well — confirm against the preprocessing.
        """
        token_ids = self.text[idx]
        return {
            "input_ids": token_ids,
            "attention_mask": torch.ones_like(token_ids), # important for Bert models
            "labels": self.labels[idx]
        }

# Wrap the tokenized tweets and their event-type labels in the dataset class.
dataset_train = CustomDataset(train_data['Tweet'], train_data['EventType'])
# Shuffled mini-batches of 512 examples. pin_memory=True only asks the loader to
# return batches in page-locked host memory; it does not move them to the GPU.
train_loader = DataLoader(
    dataset_train,
    batch_size=512,
    shuffle=True,
    pin_memory=True,
)

In [None]:
"""Preparing of the nonclened data for training"""

"""Compute accuracy metric for the trained model"""
def compute_metrics_from_predictions(predictions, labels):
    """Compute the accuracy of a set of class predictions.

    Args:
        predictions: Array-like of per-class scores; the predicted class of each
            entry is the argmax over the last axis.
        labels: Array-like of true class indices, compared element-wise with the
            predicted classes.

    Returns:
        dict: ``{"accuracy": value}``, where ``value`` is the mean of the
        element-wise "prediction equals label" comparison.
    """
    is_correct = np.argmax(predictions, axis=-1) == labels
    return {"accuracy": is_correct.mean()}

"""Loading of the cleaned data"""
# Load the saved training and validation arrays from the working directory.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary Python objects,
# which can execute code — only load .npy files from a trusted source.
# NOTE(review): the cell title describes this data as not cleaned while the string
# above says cleaned — confirm which of the two these files actually hold.
train_data = np.load('train_data.npy', allow_pickle=True)
# val_data is not referenced by any of the cells visible in this notebook section.
val_data = np.load('val_data.npy', allow_pickle=True)

class CustomDataset(Dataset):
    """Torch dataset that tokenizes raw text rows into fixed-length examples.

    Every chunk of ``l`` token ids becomes one example, so each item holds a
    1-D ``input_ids`` tensor of length ``l`` and a scalar label. The default
    collation then yields the usual ``(batch, l)`` inputs and ``(batch,)``
    labels rather than tensors with an extra chunk axis.

    This class has the same name as the dataset class for pre-tokenized data
    defined in another cell of this notebook; whichever cell runs last wins.
    """

    def __init__(self, data, tokenizer, l=4096):
        """Tokenize every row and store one example per ``l``-token chunk.

        Args:
            data: Iterable of rows; ``row[2]`` is passed to the tokenizer as the
                text and ``row[1]`` is used as the class label.
            tokenizer: Object exposing ``encode(text, truncation=...,
                padding=..., max_length=..., add_special_tokens=...)`` that
                returns a list of token ids. Its ``pad_token_id`` attribute,
                when present, is used to build the attention mask.
            l: Number of token ids per example.
        """
        # Without a pad id the mask falls back to all ones (see __getitem__).
        self.pad_token_id = getattr(tokenizer, "pad_token_id", None)

        chunks = []
        chunk_labels = []
        for row in data:
            token_ids = tokenizer.encode(
                row[2],
                truncation=True,
                padding="max_length",
                max_length=l,
                add_special_tokens=True,
            )
            # One example per *complete* chunk of l ids. With truncation and
            # max-length padding the encoder returns exactly l ids, i.e. a single
            # chunk per row; a trailing partial chunk is dropped.
            for start in range(0, len(token_ids) - l + 1, l):
                chunks.append(torch.tensor(token_ids[start:start + l], dtype=torch.long))
                chunk_labels.append(row[1])

        self.text = chunks
        self.labels = torch.tensor(chunk_labels, dtype=torch.long)

    def __len__(self):
        """Return the number of examples (one per stored chunk)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors.

        The attention mask is 0 on padding positions and 1 elsewhere, so the
        model does not attend to the padding added by the tokenizer.
        """
        token_ids = self.text[idx]
        if self.pad_token_id is None:
            attention_mask = torch.ones_like(token_ids)
        else:
            attention_mask = (token_ids != self.pad_token_id).long()
        return {
            "input_ids": token_ids,
            "attention_mask": attention_mask, # important for Bert models
            "labels": self.labels[idx]
        }

# Tokenize the loaded rows with the active tokenizer and wrap them in the dataset class.
dataset_train = CustomDataset(train_data, tokenizer)
# Shuffled mini-batches of 512 examples. pin_memory=True only asks the loader to
# return batches in page-locked host memory; it does not move them to the GPU.
train_loader = DataLoader(
    dataset_train,
    batch_size=512,
    shuffle=True,
    pin_memory=True,
)

In [7]:


# Freeze the pretrained encoder (embeddings and transformer layers) so that only
# the classification head is trained. `base_model` resolves to the backbone of
# whichever architecture is loaded (model.longformer, model.albert, ...), so this
# cell works after either model cell instead of only after the Longformer one.
for param in model.base_model.parameters():
    param.requires_grad = False

training_args = TrainingArguments(
    output_dir='./results',
    fp16=torch.cuda.is_available(), # 16-bit mixed precision; only valid on a CUDA device
    num_train_epochs=1,
    weight_decay=1e-12, # Regularization (very small value)
    logging_dir='./logs',
    # No evaluation data: the evaluation strategy is left at its default ("no").
    # The old `evaluation_strategy` keyword is deprecated, so it is not passed.
    gradient_accumulation_steps=1  # 1 = update after every batch (no accumulation)
)

# The Trainer builds its own batches from the dataset passed here; a separately
# constructed DataLoader is not used by it.
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = dataset_train
)

trainer.train()

# Compute the training accuracy by predicting on the same data the model was
# trained on (this is not a held-out score).
pred_output = trainer.predict(dataset_train)
logits = pred_output.predictions
labels = pred_output.label_ids
metrics = compute_metrics_from_predictions(logits, labels)
print("Training Accuracy:")
print(metrics)

Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/597M [00:00<?, ?B/s]

Step,Training Loss


Training Accuracy:
{'accuracy': 0.6090604026845637}
