In [24]:
# Necessary imports
import pandas as pd
import nltk
from nltk.corpus import stopwords
import string
import re
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from textblob import TextBlob
from torch.utils.data import DataLoader, Dataset, random_split
from transformers import AdamW
from tqdm import tqdm
import torch
from transformers import BertTokenizer, BertForSequenceClassification
import csv
import random
from random import shuffle
from nltk.corpus import wordnet
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [25]:
# Read the train/test CSVs and discard the metadata columns that are not used
# by any later cell.
columns_to_remove = ["keyword", "location"]
training_data = pd.read_csv("train.csv").drop(columns=columns_to_remove)
testing_data = pd.read_csv("test.csv").drop(columns=columns_to_remove)

In [26]:
# Lowercase the tweet text in both splits.
for _frame in (training_data, testing_data):
    _frame["text"] = _frame["text"].str.lower()

In [27]:
def remove_hashtags(text):
    """Return ``text`` with every ``#`` + word-character run deleted.

    The hashtag word itself is removed, not just the ``#`` sign. A lone ``#``
    that is not followed by a word character is left untouched.
    """
    hashtag_pattern = re.compile(r'#\w+')
    return hashtag_pattern.sub('', text)

def remove_tags(text):
    """Return ``text`` with every ``@`` + word-character run deleted.

    Anything matching ``@\\w+`` is removed, so the part of an e-mail address
    starting at ``@`` is stripped as well.
    """
    mention_pattern = re.compile(r'@\w+')
    return mention_pattern.sub('', text)

def remove_urls(text):
    """Return ``text`` with URL-like tokens deleted.

    A token is dropped from the point where ``http``, ``www`` or ``https``
    begins up to the next whitespace character.
    """
    url_pattern = re.compile(r'http\S+|www\S+|https\S+', flags=re.MULTILINE)
    return url_pattern.sub('', text)

# Strip hashtags, then @-mentions, then URLs from the text of both splits.
for _cleaner in (remove_hashtags, remove_tags, remove_urls):
    training_data['text'] = training_data['text'].apply(_cleaner)
    testing_data['text'] = testing_data['text'].apply(_cleaner)

In [28]:
# Fetch the NLTK stopword corpus and build the English stopword set used by
# the stopword filter and by the augmentation helpers further down.
nltk.download('stopwords')
stop_words = {*stopwords.words('english')}

def remove_stopwords(text):
    """Drop English stopwords from ``text``.

    The text is split on whitespace, each token is compared case-insensitively
    against the module-level ``stop_words`` set, and the surviving tokens are
    re-joined with single spaces.
    """
    return ' '.join(
        token for token in text.split() if token.lower() not in stop_words
    )

# Filter stopwords out of every tweet in both splits.
training_data['text'] = training_data['text'].map(remove_stopwords)
testing_data['text'] = testing_data['text'].map(remove_stopwords)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ioannisdrossas/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [29]:
def remove_punctuation(text):
    """Return ``text`` with every character in ``string.punctuation`` deleted."""
    # Mapping a code point to None tells str.translate to drop that character.
    deletion_table = {ord(symbol): None for symbol in string.punctuation}
    return text.translate(deletion_table)

# Strip punctuation from every tweet in both splits.
training_data['text'] = training_data['text'].map(remove_punctuation)
testing_data['text'] = testing_data['text'].map(remove_punctuation)

In [30]:
def correct_spelling(text):
    """Return ``text`` after TextBlob's ``correct()`` pass, as a plain ``str``."""
    corrected_blob = TextBlob(text).correct()
    return str(corrected_blob)

# Spell-correct every tweet in place, one row at a time, with a progress bar
# per split. Rows are addressed by the labels 0..len-1, i.e. this relies on
# the default integer index that both frames carry at this point.
for _frame in (training_data, testing_data):
    for row_label in tqdm(range(len(_frame))):
        _frame.at[row_label, 'text'] = correct_spelling(_frame.at[row_label, 'text'])

In [31]:
# Fetch the NLTK resources needed for tokenisation and lemmatisation.
# 'punkt_tab' is included because recent NLTK releases load the Punkt
# tokenizer from that resource rather than from 'punkt'; without it,
# word_tokenize raises LookupError on those versions.
for _resource in ('punkt', 'punkt_tab', 'wordnet', 'omw-1.4'):
    nltk.download(_resource)

# Shared lemmatizer instance used by lemmatize_text.
lemmatizer = WordNetLemmatizer()
def lemmatize_text(text):
    """Tokenize ``text`` with ``word_tokenize`` and lemmatize each token.

    Every token is passed through the module-level ``lemmatizer`` with its
    default settings; the results are re-joined with single spaces.
    """
    return ' '.join(map(lemmatizer.lemmatize, word_tokenize(text)))

# Lemmatize every tweet in both splits.
training_data['text'] = training_data['text'].map(lemmatize_text)
testing_data['text'] = testing_data['text'].map(lemmatize_text)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ioannisdrossas/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ioannisdrossas/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/ioannisdrossas/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [32]:
#taken from EDA [https://github.com/jasonwei20/eda_nlp/blob/master/code/eda.py]
# Easy data augmentation techniques for text classification
# Jason Wei and Kai Zou

def synonym_replacement(words, n):
    """Replace up to ``n`` distinct non-stopwords in ``words`` with a synonym.

    Candidate words are the unique entries of ``words`` that are not in
    ``stop_words``, visited in random order. For each candidate that has at
    least one synonym, every occurrence is replaced by one randomly chosen
    synonym. The input list is not modified.

    Returns:
        A new list of tokens. Multi-word synonyms are split on spaces, so the
        result can be longer than ``words``.
    """
    result = words.copy()
    candidates = list({token for token in words if token not in stop_words})
    random.shuffle(candidates)

    replaced = 0
    for candidate in candidates:
        options = get_synonyms(candidate)
        if options:
            replacement = random.choice(list(options))
            result = [replacement if token == candidate else token for token in result]
            replaced += 1
        # Stop once n candidates have been replaced.
        if replaced >= n:
            break

    # Join and re-split so that a multi-word synonym becomes separate tokens.
    return ' '.join(result).split(' ')

def get_synonyms(word):
    """Collect WordNet lemma names for ``word``, excluding ``word`` itself.

    Each lemma name has ``_`` and ``-`` turned into spaces, is lowercased, and
    is then stripped of every character that is not a lowercase ASCII letter
    or a space.

    Returns:
        A list of the distinct cleaned names (possibly empty).
    """
    allowed_chars = ' qwertyuiopasdfghjklzxcvbnm'
    found = set()
    for synset in wordnet.synsets(word):
        for lemma in synset.lemmas():
            cleaned = lemma.name().replace("_", " ").replace("-", " ").lower()
            found.add("".join(ch for ch in cleaned if ch in allowed_chars))
    found.discard(word)
    return list(found)

########################################################################
# Random deletion
# Randomly delete words from the sentence with probability p
########################################################################

def random_deletion(words, p):
    """Randomly drop words from ``words``, each with probability ``p``.

    Args:
        words: List of tokens. It is never modified.
        p: Deletion probability applied independently to every token.

    Returns:
        ``words`` itself when it has zero or one element (nothing sensible can
        be deleted); otherwise a new list with the surviving tokens in their
        original order. If every token would be deleted, a list holding one
        randomly chosen token is returned instead.
    """
    # An empty list used to fall through to random.randint(0, -1) and raise
    # ValueError; treat it like the single-word case.
    if len(words) <= 1:
        return words

    # Keep a token when its uniform draw exceeds p.
    kept = [word for word in words if random.uniform(0, 1) > p]

    # Never return an empty sentence: fall back to one random token.
    if not kept:
        return [random.choice(words)]

    return kept

########################################################################
# Random swap
# Randomly swap two words in the sentence n times
########################################################################

def random_swap(words, n):
    """Return a copy of ``words`` after ``n`` calls to ``swap_word``.

    The input list is left untouched; all swapping happens on the copy.
    """
    shuffled = words.copy()
    for _step in range(n):
        shuffled = swap_word(shuffled)
    return shuffled

def swap_word(new_words):
    """Swap two randomly chosen positions of ``new_words`` in place.

    A second index different from the first is drawn with a bounded number of
    retries; if none is found the list is returned unchanged.

    Args:
        new_words: List of tokens; mutated in place.

    Returns:
        The same list object that was passed in.
    """
    # Fewer than two tokens: nothing to swap. An empty list previously raised
    # ValueError from random.randint(0, -1).
    if len(new_words) < 2:
        return new_words

    first = random.randint(0, len(new_words) - 1)
    second = first
    attempts = 0
    while second == first:
        second = random.randint(0, len(new_words) - 1)
        attempts += 1
        # Give up after a few collisions instead of looping indefinitely.
        if attempts > 3:
            return new_words

    new_words[first], new_words[second] = new_words[second], new_words[first]
    return new_words

########################################################################
# Random insertion
# Randomly insert n words into the sentence
########################################################################

def random_insertion(words, n):
    """Return a copy of ``words`` after ``n`` calls to ``add_word``.

    ``add_word`` mutates the copy in place, so the original list is not
    modified.
    """
    expanded = words.copy()
    for _step in range(n):
        add_word(expanded)
    return expanded

def add_word(new_words):
    """Insert a synonym of a random word of ``new_words`` at a random position.

    Up to 10 randomly picked words are tried; the first one for which
    ``get_synonyms`` returns a non-empty list supplies the synonym (its first
    entry). If no tried word has a synonym, the list is left unchanged.

    Args:
        new_words: List of tokens; mutated in place.

    Returns:
        None.
    """
    # Nothing to pick from. The previous max(0, len - 1) guard still indexed
    # position 0 of an empty list and raised IndexError.
    if not new_words:
        return

    synonyms = []
    attempts = 0
    while len(synonyms) < 1:
        random_word = new_words[random.randint(0, len(new_words) - 1)]
        synonyms = get_synonyms(random_word)
        attempts += 1
        if attempts >= 10:
            return

    random_synonym = synonyms[0]
    random_idx = random.randint(0, len(new_words) - 1)
    new_words.insert(random_idx, random_synonym)

########################################################################
# main data augmentation function
########################################################################

def eda(sentence, alpha_sr=0.1, alpha_ri=0.1, alpha_rs=0.1, p_rd=0.1, num_aug=9):
    """Generate augmented variants of ``sentence`` with the four EDA operations.

    Args:
        sentence: Input text; it is split on single spaces and empty tokens
            are ignored.
        alpha_sr: Fraction of the word count used as the number of words to
            change in synonym replacement; ``<= 0`` disables the technique.
        alpha_ri: Same, for random insertion.
        alpha_rs: Same, for random swap.
        p_rd: Per-word deletion probability for random deletion; ``<= 0``
            disables the technique.
        num_aug: When ``>= 1``, the maximum number of augmented sentences to
            keep. When ``< 1``, each generated sentence is kept with
            probability ``num_aug / number_generated``.

    Returns:
        A list of augmented sentences in random order, with the original
        ``sentence`` appended as the last element. If ``sentence`` contains no
        words, the result is ``[sentence]``.
    """
    words = [word for word in sentence.split(' ') if word != '']

    # With no words the insertion/swap/deletion helpers would index into an
    # empty list and raise; there is nothing to augment anyway.
    if not words:
        return [sentence]

    num_words = len(words)
    augmented_sentences = []
    num_new_per_technique = int(num_aug / 4) + 1

    # Synonym replacement.
    if alpha_sr > 0:
        n_sr = max(1, int(alpha_sr * num_words))
        for _ in range(num_new_per_technique):
            a_words = synonym_replacement(words, n_sr)
            augmented_sentences.append(' '.join(a_words))

    # Random insertion.
    if alpha_ri > 0:
        n_ri = max(1, int(alpha_ri * num_words))
        for _ in range(num_new_per_technique):
            a_words = random_insertion(words, n_ri)
            augmented_sentences.append(' '.join(a_words))

    # Random swap.
    if alpha_rs > 0:
        n_rs = max(1, int(alpha_rs * num_words))
        for _ in range(num_new_per_technique):
            a_words = random_swap(words, n_rs)
            augmented_sentences.append(' '.join(a_words))

    # Random deletion.
    if p_rd > 0:
        for _ in range(num_new_per_technique):
            a_words = random_deletion(words, p_rd)
            augmented_sentences.append(' '.join(a_words))

    shuffle(augmented_sentences)

    if num_aug >= 1:
        # Each technique over-generates; trim to the requested count.
        augmented_sentences = augmented_sentences[:num_aug]
    elif augmented_sentences:
        # Fractional num_aug: keep each candidate with the matching
        # probability. Skipped when every technique is disabled, which would
        # otherwise divide by zero.
        keep_prob = num_aug / len(augmented_sentences)
        augmented_sentences = [s for s in augmented_sentences if random.uniform(0, 1) < keep_prob]

    # The original sentence is always the last element.
    augmented_sentences.append(sentence)

    return augmented_sentences

In [33]:
# Augment the training set with EDA variants of every non-empty tweet.
#
# NOTE(review): augmentation runs before the train/validation split done in a
# later cell, so variants of one tweet can land on both sides of the split and
# make the validation loss optimistic. Consider splitting first and augmenting
# only the training portion.
augmented_rows = []
for _, row in training_data.iterrows():
    if len(row['text']) == 0:
        continue
    # eda() appends the untouched sentence as its final element. That row is
    # already present in training_data, so keep only the generated variants
    # instead of adding every original tweet a second time.
    for variant in eda(row['text'], num_aug=3)[:-1]:
        augmented_rows.append({'text': variant, 'target': row['target']})

# Concatenate once: growing the frame row by row inside the loop copies the
# whole frame on every append.
if augmented_rows:
    training_data = pd.concat(
        [training_data, pd.DataFrame(augmented_rows)], ignore_index=True
    )

# Separate the labels from the features.
training_targets = training_data["target"]
training_data = training_data.drop(columns=["target"])

In [34]:
# Load pre-trained BERT model and tokenizer
# Instantiate the pre-trained BERT checkpoint with a 2-label classification
# head, plus the tokenizer that belongs to the same checkpoint.
model_name = 'bert-base-uncased'
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)
tokenizer = BertTokenizer.from_pretrained(model_name)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [35]:
class DisasterDataset(Dataset):
    """Map-style dataset that tokenizes one DataFrame row per item.

    The wrapped frame must provide a ``text`` column and a ``target`` column.
    """

    def __init__(self, dataframe, tokenizer, max_length):
        """Store the frame, the tokenizer and the fixed sequence length."""
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return len(self.data)

    def __getitem__(self, index):
        """Tokenize the row at position ``index``.

        Returns:
            A dict with ``input_ids`` and ``attention_mask`` (the tokenizer
            output with its leading dimension squeezed away) and ``labels``
            (the row's ``target`` as a ``torch.long`` tensor).
        """
        row = self.data.iloc[index]
        text = row['text']
        label = row['target']

        # Pad/truncate every example to max_length so items can be stacked
        # into a batch.
        encoded = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            truncation=True,
            max_length=self.max_length,
            padding='max_length',
            return_tensors='pt',
        )

        item = {
            key: encoded[key].squeeze(0)
            for key in ('input_ids', 'attention_mask')
        }
        item['labels'] = torch.tensor(label, dtype=torch.long)
        return item

# Re-attach the labels to the features and hold out 20% of the rows for
# validation (fixed seed so the split is repeatable).
combined_data = pd.concat([training_data, training_targets], axis="columns")
train_data, val_data = train_test_split(combined_data, test_size=0.2, random_state=42)

# Every example is padded/truncated to this many tokens.
max_length = 128
train_dataset, val_dataset = (
    DisasterDataset(split, tokenizer, max_length)
    for split in (train_data, val_data)
)

# Only the training batches are shuffled.
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)

In [36]:
from torch.utils.data import DataLoader
from tqdm import tqdm

# Fine-tune the classifier, tracking the per-epoch training and validation
# loss and checkpointing the weights with the lowest validation loss.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

optimizer = AdamW(model.parameters(), lr=2e-5)

num_epochs = 10

# Per-epoch average losses, used for the plot at the end of the cell.
train_losses = []
val_losses = []

# Lowest validation loss seen so far and where its weights are stored.
best_val_loss = float('inf')
best_model_path = './best_model.pt'

for epoch in range(num_epochs):
    # Training phase
    model.train()
    total_train_loss = 0
    for batch in tqdm(train_loader, desc=f'Epoch {epoch + 1}/{num_epochs}'):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        # Passing labels makes the model return the loss alongside the logits.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_train_loss += loss.item()

        loss.backward()
        optimizer.step()

    avg_train_loss = total_train_loss / len(train_loader)
    train_losses.append(avg_train_loss)
    print(f'Epoch {epoch + 1}/{num_epochs}, Average Training Loss: {avg_train_loss}')

    # Validation phase (no gradient tracking)
    model.eval()
    total_val_loss = 0
    with torch.no_grad():
        for batch in tqdm(val_loader, desc=f'Validation Epoch {epoch + 1}/{num_epochs}'):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_val_loss += loss.item()

    avg_val_loss = total_val_loss / len(val_loader)
    val_losses.append(avg_val_loss)
    print(f'Epoch {epoch + 1}/{num_epochs}, Average Validation Loss: {avg_val_loss}')

    # Save the model if the validation loss is the best we've seen so far.
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        torch.save(model.state_dict(), best_model_path)
        print(f'New best model saved at epoch {epoch + 1} with validation loss: {avg_val_loss}')

# Restore the best checkpoint before exporting. Without this, the exported
# model and the prediction cell below would use the last epoch's weights,
# which may be worse than the checkpoint that was saved.
# best_val_loss stays at +inf only if no epoch ever produced a checkpoint.
if best_val_loss < float('inf'):
    model.load_state_dict(torch.load(best_model_path, map_location=device))

# Save the fine-tuned model
model.save_pretrained('./fine_tuned_bert_model')

# Plot the training and validation loss. The x-axis follows the number of
# epochs actually recorded rather than the configured num_epochs.
epochs_run = range(1, len(train_losses) + 1)
plt.figure(figsize=(10, 5))
plt.plot(epochs_run, train_losses, label='Training Loss')
plt.plot(epochs_run, val_losses, label='Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.title('Training and Validation Loss')
plt.show()

Epoch 1/10:   1%|          | 2/191 [00:31<49:52, 15.83s/it]


KeyboardInterrupt: 

In [None]:
# Run inference with the in-memory fine-tuned model and tokenizer (nothing is loaded from disk here)
def predict(text, tokenizer, model, device, max_length=128):
    """Classify a single piece of text and return the predicted class index.

    Args:
        text: The string to classify.
        tokenizer: Object exposing ``encode_plus``; used to build the inputs.
        model: Callable returning an object with a ``logits`` attribute; it is
            switched to eval mode before the forward pass.
        device: Device the input tensors are moved to.
        max_length: Length every input is padded/truncated to.

    Returns:
        The arg-max over the logits' class dimension, as a Python int.
    """
    encoded = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        truncation=True,
        max_length=max_length,
        padding='max_length',
        return_tensors='pt',
    )
    token_ids = encoded['input_ids'].to(device)
    mask = encoded['attention_mask'].to(device)

    model.eval()
    with torch.no_grad():
        logits = model(token_ids, attention_mask=mask).logits
        return torch.argmax(logits, dim=1).cpu().item()


# Classify every test tweet, one at a time, with a progress bar.
labels = []
for text in tqdm(testing_data['text'], total=testing_data.shape[0], desc="Processing rows"):
    labels.append(predict(text, tokenizer, model, device))

# Store the results back in the DataFrame
testing_data['label'] = labels

In [None]:
# Write the predictions to a two-column CSV: id, target.
filename = 'my_predictions.csv'

predictions_list = testing_data['label'].tolist()
index_list = testing_data['id'].tolist()

with open(filename, 'w', newline='') as csvfile:
    csvwriter = csv.writer(csvfile)

    # Write header row
    csvwriter.writerow(["id", "target"])

    # Write one row per test example. The previous loop was hard-coded to
    # range(5), so only the first five predictions reached the file.
    csvwriter.writerows(zip(index_list, predictions_list))