In [None]:
# Necessary imports
import pandas as pd
import nltk
from nltk.corpus import stopwords
import string
import re
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from textblob import TextBlob
from torch.utils.data import DataLoader, Dataset
from transformers import AdamW
from tqdm import tqdm
import torch
from transformers import BertTokenizer, BertForSequenceClassification
import csv

In [None]:
# Read the train and test splits from CSV files in the working directory.
training_data = pd.read_csv("train.csv")
testing_data = pd.read_csv("test.csv")

# Drop the columns that the rest of the notebook never reads.
columns_to_remove = ["keyword", "location"]
training_data = training_data.drop(columns=columns_to_remove)
testing_data = testing_data.drop(columns=columns_to_remove)

# Pull the label column out of the training frame; `pop` returns it as a
# Series and leaves `training_data` without a "target" column.
training_targets = training_data.pop("target")

In [None]:
# Lower-case the "text" column of both splits (other columns are untouched).
training_data = training_data.assign(text=training_data["text"].str.lower())
testing_data = testing_data.assign(text=testing_data["text"].str.lower())

In [None]:
def remove_hashtags(text):
    """Return ``text`` with every hashtag (``#`` plus the word characters after it) deleted.

    Nothing is inserted in place of the hashtag, so the whitespace that
    surrounded it is left as-is. A ``#`` that is not followed by a word
    character is kept.
    """
    hashtag_pattern = re.compile(r'#\w+')
    return hashtag_pattern.sub('', text)

def remove_tags(text):
    """Return ``text`` with every mention (``@`` plus the word characters after it) deleted.

    The match is not anchored to a word boundary, so the ``@b`` inside
    ``a@b.com`` is removed as well. A ``@`` that is not followed by a word
    character is kept.
    """
    mention_pattern = re.compile(r'@\w+')
    return mention_pattern.sub('', text)

def remove_urls(text):
    """Return ``text`` with URL-like tokens deleted.

    A token is removed from the point where ``http``, ``www`` or ``https``
    appears up to the next whitespace character. Nothing is inserted in its
    place.
    """
    url_pattern = re.compile(r'http\S+|www\S+|https\S+', flags=re.MULTILINE)
    return url_pattern.sub('', text)

# Strip hashtags, then mentions, then URLs from every tweet in both splits.
training_data['text'] = (
    training_data['text']
    .map(remove_hashtags)
    .map(remove_tags)
    .map(remove_urls)
)
testing_data['text'] = (
    testing_data['text']
    .map(remove_hashtags)
    .map(remove_tags)
    .map(remove_urls)
)

In [None]:
# Fetch NLTK's stop-word corpus and keep the English list as a set for
# constant-time membership checks.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))


def remove_stopwords(text):
    """Return ``text`` without English stop words.

    The text is split on whitespace, each token is lower-cased for the
    comparison against ``stop_words``, and the surviving tokens (in their
    original casing) are joined back with single spaces.
    """
    kept_tokens = (token for token in text.split() if token.lower() not in stop_words)
    return ' '.join(kept_tokens)


training_data['text'] = training_data['text'].map(remove_stopwords)
testing_data['text'] = testing_data['text'].map(remove_stopwords)

In [None]:
def remove_punctuation(text):
    """Return ``text`` with every character in ``string.punctuation`` deleted.

    Characters are removed, not replaced, so ``"fire/smoke"`` becomes
    ``"firesmoke"``. Punctuation outside ``string.punctuation`` is kept.
    """
    # Mapping a code point to None tells str.translate to drop that character.
    drop_table = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(drop_table)

# Strip ASCII punctuation from every tweet in both splits.
training_data['text'] = training_data['text'].map(remove_punctuation)
testing_data['text'] = testing_data['text'].map(remove_punctuation)

In [None]:
def correct_spelling(text):
    """Return ``text`` after TextBlob's spelling correction, as a plain ``str``."""
    blob = TextBlob(text)
    return str(blob.correct())

# Spell-correct every tweet in both splits.
#
# The column is iterated by position and written back as a list, so this
# works whatever the frame's index looks like. The previous
# `range(len(df))` + `.at[i, 'text']` form treated positions as index labels,
# which raises KeyError on read (or appends new rows on write) as soon as
# the index is not the default 0..n-1.
training_data['text'] = [
    correct_spelling(text)
    for text in tqdm(training_data['text'], desc='Spell-checking train')
]

testing_data['text'] = [
    correct_spelling(text)
    for text in tqdm(testing_data['text'], desc='Spell-checking test')
]

In [None]:
# Resources needed by word_tokenize and WordNetLemmatizer.
# 'punkt' is the tokenizer model used by older NLTK releases; newer releases
# (3.8.2 and later) load 'punkt_tab' instead, so both are fetched to keep a
# fresh-kernel run working on either.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('omw-1.4')

# One shared lemmatizer instance, reused for every tweet.
lemmatizer = WordNetLemmatizer()


def lemmatize_text(text):
    """Tokenize ``text`` with ``word_tokenize`` and lemmatize each token.

    Every token goes through ``lemmatizer.lemmatize`` with its default
    arguments; the results are joined back with single spaces.
    """
    return ' '.join(map(lemmatizer.lemmatize, word_tokenize(text)))

# Lemmatize every tweet in both splits.
training_data['text'] = training_data['text'].map(lemmatize_text)
testing_data['text'] = testing_data['text'].map(lemmatize_text)

In [None]:
# Checkpoint shared by the tokenizer and the classifier.
model_name = 'bert-base-uncased'

# Sequence-classification model with a two-class output, loaded from the
# pre-trained checkpoint.
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Tokenizer loaded from the same checkpoint as the model.
tokenizer = BertTokenizer.from_pretrained(model_name)

In [None]:
class DisasterDataset(Dataset):
    """Torch ``Dataset`` that tokenizes one row of a labelled text frame per item.

    Args:
        dataframe: Frame with a ``text`` column (the string to tokenize) and
            a ``target`` column (integer class id). Rows are addressed by
            position, so the frame's index labels do not matter.
        tokenizer: Callable tokenizer, invoked as ``tokenizer(text, ...)``
            with ``return_tensors='pt'``; must return a mapping containing
            ``input_ids`` and ``attention_mask`` with a leading batch axis
            of size 1.
        max_length: Length every sequence is truncated or padded to.
    """

    def __init__(self, dataframe, tokenizer, max_length):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the tokenized text and label for the row at position ``index``.

        Returns:
            dict with ``input_ids`` and ``attention_mask`` (batch axis
            squeezed away) and ``labels`` (0-d ``torch.long`` tensor).
        """
        # Materialise the row once instead of once per column.
        row = self.data.iloc[index]

        # Calling the tokenizer directly replaces the deprecated
        # `encode_plus`; the keyword arguments are unchanged.
        inputs = self.tokenizer(
            row['text'],
            add_special_tokens=True,
            truncation=True,
            max_length=self.max_length,
            padding='max_length',
            return_tensors='pt'
        )
        return {
            # Drop the batch axis; the DataLoader adds its own when collating.
            'input_ids': inputs['input_ids'].squeeze(0),
            'attention_mask': inputs['attention_mask'].squeeze(0),
            'labels': torch.tensor(int(row['target']), dtype=torch.long)
        }

# Token length every tweet is padded or truncated to.
max_length = 128

# Re-attach the labels column-wise so each dataset row carries its target.
labelled_training_data = pd.concat([training_data, training_targets], axis=1)
train_dataset = DisasterDataset(labelled_training_data, tokenizer, max_length)

# Batches of 16, reshuffled each time the loader is iterated.
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)

In [None]:
# Train on the GPU when torch can see one, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Use PyTorch's AdamW: the `transformers.AdamW` class is deprecated in favour
# of it, so this cell no longer needs the `from transformers import AdamW`
# import. `eps` and `weight_decay` are pinned to the values the transformers
# implementation defaulted to (1e-6 and 0.0) so optimisation behaves as
# before; torch's own defaults are 1e-8 and 0.01.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)

num_epochs = 3
for epoch in range(num_epochs):
    model.train()  # training mode; `predict` switches the model to eval mode
    total_loss = 0
    for batch in tqdm(train_loader, desc=f'Epoch {epoch + 1}/{num_epochs}'):
        # Move the batch to the same device as the model.
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        # Passing `labels` makes the model compute the loss itself.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    # Mean of the per-batch losses over this epoch.
    print(f'Epoch {epoch + 1}/{num_epochs}, Average Training Loss: {total_loss / len(train_loader)}')

# Save the fine-tuned model
model.save_pretrained('./fine_tuned_bert_model')

In [None]:
# Single-text inference helper. It uses the tokenizer and model passed in;
# nothing is loaded from disk here.
def predict(text, tokenizer, model, device, max_length=128):
    """Classify one text and return the index of its highest-scoring class.

    Args:
        text: String to classify.
        tokenizer: Callable tokenizer, invoked as ``tokenizer(text, ...)``
            with ``return_tensors='pt'``; must return a mapping with
            ``input_ids`` and ``attention_mask`` tensors.
        model: Model whose call result exposes ``.logits``. It is switched
            to eval mode as a side effect.
        device: Torch device the inputs are moved to. The model is NOT
            moved by this function.
        max_length: Length the text is truncated or padded to.

    Returns:
        int: position of the largest logit along the class axis.
    """
    # Calling the tokenizer directly replaces the deprecated `encode_plus`;
    # the keyword arguments are unchanged.
    inputs = tokenizer(
        text,
        add_special_tokens=True,
        truncation=True,
        max_length=max_length,
        padding='max_length',
        return_tensors='pt'
    )
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    model.eval()
    with torch.no_grad():  # inference only, no gradients needed
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predicted_label = torch.argmax(logits, dim=1).cpu().item()

    return predicted_label


# Classify every test tweet, in row order.
# Only the text column is needed, so iterate it directly instead of building
# a full row Series per tweet with `iterrows`. `max_length` is passed through
# so inference pads/truncates exactly like the training dataset did.
labels = [
    predict(text, tokenizer, model, device, max_length=max_length)
    for text in tqdm(testing_data['text'], desc="Processing rows")
]

# Store the results back in the DataFrame
testing_data['label'] = labels

# Show a small preview rather than the whole frame.
print(testing_data.head())

In [None]:
# Write the predictions as a two-column CSV: id, target.
filename = 'my_predictions.csv'

predictions_list = testing_data['label'].tolist()
index_list = testing_data['id'].tolist()

# newline='' lets the csv module control line endings itself.
with open(filename, 'w', newline='') as csvfile:
    csvwriter = csv.writer(csvfile)

    # Write header row
    csvwriter.writerow(["id", "target"])

    # Write one row per test example. Pairing the two lists with zip follows
    # the actual size of the test set instead of a hard-coded row count, and
    # avoids shadowing the builtin `id`.
    csvwriter.writerows(zip(index_list, predictions_list))