In [1]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import pandas as pd
import torch
import torch.nn as nn
import re
from collections import Counter
import numpy as np
import random
from torch.utils.data import Dataset, DataLoader, random_split
from sklearn.model_selection import train_test_split

model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)




In [2]:
def preprocess_text(text):
    text = re.sub(r"https?://\S+|www\.\S+", "http", text)
    text = re.sub(r'\\@\\w+|\\#','', text)
    text = re.sub(r'#(\\w+)', r'\\1', text)
    return text

In [3]:
train_df = pd.read_csv(r"D:\elggak\kaggle\Tweet Disaster Competition\nlp-getting-started\train.csv")
test_df = pd.read_csv(r"D:\elggak\kaggle\Tweet Disaster Competition\nlp-getting-started\test.csv")

In [4]:
train_sentences = train_df["text"].apply(preprocess_text)
train_labels = train_df['target']
test_sentences = test_df["text"].apply(preprocess_text)

In [5]:
train_sentences

0       Our Deeds are the Reason of this #earthquake M...
1                  Forest fire near La Ronge Sask. Canada
2       All residents asked to 'shelter in place' are ...
3       13,000 people receive #wildfires evacuation or...
4       Just got sent this photo from Ruby #Alaska as ...
                              ...                        
7608    Two giant cranes holding a bridge collapse int...
7609    @aria_ahrary @TheTawniest The out of control w...
7610      M1.94 [01:04 UTC]?5km S of Volcano Hawaii. http
7611    Police investigating after an e-bike collided ...
7612    The Latest: More Homes Razed by Northern Calif...
Name: text, Length: 7613, dtype: object

In [6]:
test_id = test_df['id']

In [7]:
class TweetDataset(Dataset):
    def __init__(self, sentence, tokenizer, max_length, labels= None):
        self.sentence = sentence
        self.labels = labels
        self.tokenizer = tokenizer
        self.maxlength = max_length

    def __len__(self):
        return len(self.sentence)

    def __getitem__(self,idx):
        encoded = self.tokenizer(self.sentence[idx], padding="max_length", max_length=self.maxlength, truncation=True, return_tensors="pt")
        if self.labels is not None:
            return {
                "input_ids": encoded["input_ids"].squeeze(0),
                "token_ids": encoded["token_type_ids"].squeeze(0),
                "attention_mask": encoded["attention_mask"].squeeze(0),
                "labels": torch.tensor(self.labels[idx], dtype=torch.long)  
            }
        return {
                "input_ids": encoded["input_ids"].squeeze(0),
                "token_ids": encoded["token_type_ids"].squeeze(0),
                "attention_mask": encoded["attention_mask"].squeeze(0)
            }
        

In [8]:
data = TweetDataset(train_sentences, tokenizer, max_length=64, labels = train_labels)

In [9]:
train_size = int(0.8 * len(data))
val_size = len(data) - train_size
train_dataset, val_dataset = random_split(data, [train_size, val_size])
test_dataset = TweetDataset(test_sentences, tokenizer, max_length=64,labels = None)

In [10]:
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2).to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_dir="./logs",
    fp16=True if torch.cuda.is_available() else False
)

# Train model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)
trainer.train()

comet_ml is installed but the Comet API Key is not configured. Please set the `COMET_API_KEY` environment variable to enable Comet logging. Check out the documentation for other ways of configuring it: https://www.comet.com/docs/v2/guides/experiment-management/configure-sdk/#set-the-api-key


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
predictions = trainer.predict(test_dataset)
logits = predictions.predictions
probs = torch.softmax(torch.tensor(logits), dim=-1)  # Convert logits to probabilities
pred_labels = torch.argmax(probs, dim=-1)


In [None]:
df = pd.DataFrame({"id": test_id, "target": pred_labels.tolist()})
df.to_csv(r"C:\\users\\harish-4072\\downloads\\tweet_predictions.csv", index=False)

In [12]:
-np.log(1/3)

1.0986122886681098