In [1]:
import pandas as pd
import spacy
from sklearn.svm import SVC

import torch
from torch.utils.data import DataLoader, TensorDataset, Dataset
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
import re
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from transformers import BertTokenizer, BertModel

nlp = spacy.load("en_core_web_sm")
def preprocess_text_lemma_spacy(text):
    doc = nlp(text.lower())
    lemmatized_words = [token.lemma_ for token in doc if not token.is_punct and not token.is_space]
    return ' '.join(lemmatized_words)

def stemming(text):
    stemmer = PorterStemmer()
    words = word_tokenize(text)
    return ' '.join([stemmer.stem(word) for word in words])
    
def preprocess_text(text):
   
    text = re.sub(r'bin laden', 'Binladen', text, flags=re.IGNORECASE)
    text = re.sub(r"http\S+|www\S+|https\S+", 'http', text, flags=re.MULTILINE)
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'#(\w+)', r'\1', text)
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'\b(?<!breaking)news\b|\b(?<!breaking)\w*news\w*\b', 'news', text)
    return text



train_df = pd.read_csv(r"D:\elggak\kaggle\Tweet Disaster Competition\nlp-getting-started\train.csv")
test_df = pd.read_csv(r"D:\elggak\kaggle\Tweet Disaster Competition\nlp-getting-started\test.csv")
train_df['text'] = train_df['text'].apply(preprocess_text_lemma_spacy)
test_df['text'] = test_df['text'].apply(preprocess_text_lemma_spacy)

train_df = pd.read_csv(r"D:\elggak\kaggle\Tweet Disaster Competition\nlp-getting-started\train.csv")
test_df = pd.read_csv(r"D:\elggak\kaggle\Tweet Disaster Competition\nlp-getting-started\test.csv")
train_df['text'] = train_df['text'].apply(preprocess_text)
test_df['text'] = test_df['text'].apply(preprocess_text)
train_df['text'] = train_df['text'].apply(preprocess_text_lemma_spacy)
test_df['text'] = test_df['text'].apply(preprocess_text_lemma_spacy)
train_df['text'] = train_df['text'].apply(stemming)
test_df['text'] = test_df['text'].apply(stemming)
train_df_id = train_df['id']
test_df_id = test_df['id']


In [2]:
X = train_df['text']
y = train_df['target']
X_test = test_df['text']

In [3]:
import gensim.downloader as api
import numpy as np

word2vec_model = api.load("word2vec-google-news-300")

def get_avg_word_vector(text):
    words = text.split()
    word_vectors = []
    for word in words:
        if word in word2vec_model:
            word_vectors.append(word2vec_model[word])
    if len(word_vectors) == 0:
        return np.zeros(word2vec_model.vector_size)  # Return zero vector if no words found
    return np.mean(word_vectors, axis=0)



In [4]:

def get_word_vectors(text):
    words = text.split()
    word_vectors = []

    for word in words:
        if word in word2vec_model:
            word_vectors.append(torch.tensor(word2vec_model[word]))
        else:
            word_vectors.append(torch.zeros(word2vec_model.vector_size))  # OOV words as zero vectors

    return torch.stack(word_vectors) if word_vectors else torch.zeros(1, word2vec_model.vector_size)


In [5]:
X = X.apply(get_word_vectors)
X_test = X_test.apply(get_word_vectors)

In [6]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
def collate_fn(batch: list) -> tuple:    
    length = [len(x) for x in batch]
    X = pad_sequence(X,batch_first=True, padding_value=0)
    return X, torch.tensor(length)

In [9]:
from torch.utils.data import DataLoader, TensorDataset, Dataset


In [10]:
class TweetDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]
train_dataset = TweetDataset(X_train)
val_dataset = TweetDataset(X_val)
test_dataset = TweetDataset(X_test)

In [11]:
BATCH_SIZES = 256

In [12]:
# X_train_tensor = torch.tensor(np.vstack(X_train.to_numpy()), dtype=torch.float32)
# y_train_tensor = torch.tensor(y_train.to_numpy(), dtype=torch.float32)
# X_val_tensor = torch.tensor(np.vstack(X_val.to_numpy()), dtype=torch.float32)
# y_val_tensor = torch.tensor(y_val.to_numpy(), dtype=torch.float32)
# X_test_tensor = torch.tensor(np.vstack(X_test.to_numpy()), dtype=torch.float32)

# train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
# val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
# test_dataset = TensorDataset(X_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZES, shuffle=True,collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZES,shuffle=True,collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZES, collate_fn=collate_fn)

In [13]:
import torch.nn as nn
import torch.optim as optim
from timeit import default_timer as timer 
def print_train_time(start: float, end: float, device: torch.device = None):
    
    total_time = end - start
    print(f"Train time on {device}: {total_time:.3f} seconds")
    return total_time

In [14]:
import requests
from pathlib import Path 

if Path("helper_functions.py").is_file():
  print("helper_functions.py already exists, skipping download")
else:
  print("Downloading helper_functions.py")
  request = requests.get("https://raw.githubusercontent.com/mrdbourke/pytorch-deep-learning/main/helper_functions.py")
  with open("helper_functions.py", "wb") as f:
    f.write(request.content)

helper_functions.py already exists, skipping download


In [21]:
class TweetRNNModel(nn.Module):
    def __init__(self, embed_size, hidden_size, num_layers, output_size):
        super(TweetRNNModel, self).__init__()
        self.rnn = nn.RNN(embed_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
    
    def forward(self, x, lengths):
    
        packed_x = pack_padded_sequence(x, lengths, batch_first=True, enforce_sorted=False)
        packed_rnn_out, _ = self.rnn(packed_x)
        rnn_out, _ = pad_packed_sequence(packed_rnn_out, batch_first=True)
        batch_size = rnn_out.size(0)
        last_outputs = rnn_out[torch.arange(batch_size), lengths - 1]
        out = self.fc(last_outputs)
        

In [22]:
X_train[1].shape

torch.Size([7, 300])

In [23]:
tweet_model = TweetRNNModel(300,16,2,1)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(tweet_model.parameters(), lr=0.001, weight_decay=1e-4)

In [24]:
def calculate_metrics(all_labels, all_preds):
    metrics = {
        'accuracy': float(accuracy_score(all_labels, all_preds)),
        'confusion_matrix': confusion_matrix(all_labels, all_preds),  # It's fine to leave the matrix as-is
        'precision': float(precision_score(all_labels, all_preds)),
        'recall': float(recall_score(all_labels, all_preds)),
        'f1': float(f1_score(all_labels, all_preds)),
        'macro_precision': float(precision_score(all_labels, all_preds, average='macro')),
        'macro_recall': float(recall_score(all_labels, all_preds, average='macro')),
        'macro_f1': float(f1_score(all_labels, all_preds, average='macro')),
        'micro_precision': float(precision_score(all_labels, all_preds, average='micro')),
        'micro_recall': float(recall_score(all_labels, all_preds, average='micro')),
        'micro_f1': float(f1_score(all_labels, all_preds, average='micro'))
    }
    
    return metrics, classification_report(all_labels, all_preds, target_names=['ham', 'spam'],digits = 6)

In [25]:
def train_mode(model: torch.nn.Module, data_loader: torch.utils.data.DataLoader, 
               loss_fn: torch.nn.Module, optimizer: torch.optim.Optimizer, vocab_size: int):
    model.train()
    train_loss = 0
    for X, y, lengths  in data_loader:
       
        output, _ = model(X,lengths)
        loss = loss_fn(output.view(-1,vocab_size), y.view(-1))
        train_loss += loss.item()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    train_loss /= len(data_loader)
    return train_loss

def test_mode(model: torch.nn.Module, data_loader: torch.utils.data.DataLoader, 
              loss_fn: torch.nn.Module, optimizer: torch.optim.Optimizer,  vocab_size: int):
    model.eval()
    test_loss = 0
    with torch.inference_mode():
        for X, y, lengths in data_loader:
            output, _ = model(X,lengths)
            output, hidden = model(X, lengths, hidden)
            loss = loss_fn(output.view(-1,vocab_size), y.view(-1))
            test_loss += loss.item()
            
        test_loss /= len(data_loader)
    return test_loss

In [19]:
def train_mode(model: torch.nn.Module,data_loader: torch.utils.data.DataLoader, loss_fn:torch.nn.Module, optimizer: torch.optim.Optimizer):
    model.train()
    running_loss = 0.0
    all_preds = []
    all_labels = []
    for batch, (X,y) in enumerate(data_loader):
        y_preds = model(X)
        loss = loss_fn(y_preds, y.unsqueeze(1))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        preds = torch.sigmoid(y_preds).round()  # Apply sigmoid and threshold at 0.5
        all_preds.extend(preds.detach().cpu().numpy())
        all_labels.extend(y.cpu().numpy())
        # running_accuracy +=
        if batch % 400 == 0:
            print(f"Looked at {batch * len(X)}/{len(data_loader.dataset)} samples")
    train_loss = running_loss/len(data_loader)
    
    return train_loss, calculate_metrics(all_labels,all_preds)

def test_mode(model: torch.nn.Module, data_loader: torch.utils.data.DataLoader, loss_fn: torch.nn.Module, optimizer: torch.optim.Optimizer):
    model.eval()
    running_loss = 0.0
    all_preds = []
    all_labels = []
    for batch, (X,y) in enumerate(data_loader):
        y_preds = model(X)
        loss = loss_fn(y_preds, y.unsqueeze(1))
        running_loss += loss.item()
        preds = torch.sigmoid(y_preds).round()  # Apply sigmoid and threshold at 0.5
        all_preds.extend(preds.detach().cpu().numpy())
        all_labels.extend(y.cpu().numpy())
        if batch % 400 == 0:
                print(f"Looked at {batch * len(X)}/{len(data_loader.dataset)} samples")
    test_loss = running_loss/len(data_loader)
    
    return test_loss, calculate_metrics(all_labels,all_preds)

def predict_on_test_set(model: torch.nn.Module, test_loader: torch.utils.data.DataLoader):
    model.eval()  
    all_preds = []
    
    with torch.no_grad():  # No need to track gradients during inference
        for batch, X in enumerate(test_loader):
        
            y_preds = model(X[0])
            preds = torch.sigmoid(y_preds).round()  
            all_preds.extend(preds.detach().cpu().numpy()) 

    return all_preds 




In [20]:
from tqdm.auto import tqdm
from timeit import default_timer as timer

# Set the seed for reproducibility
torch.manual_seed(42)

# Start timer
train_time_start_on_cpu = timer()

# Number of epochs
epochs = 30
for epoch in tqdm(range(epochs)):
    print(f"Epoch: {epoch}\n---------")
    
    # Train the model
    train_loss, (train_metrics, train_classification_report)= train_mode(tweet_model, train_loader, criterion, optimizer)
    print(f"Train loss: {train_loss:.5f}")
    # print(f"Train metrics: {train_metrics}")
    print(train_classification_report)
    
    # Test/Validate the model
    test_loss, (test_metrics,test_classification_report) = test_mode(tweet_model, val_loader, criterion, optimizer)
    print(f"Test loss: {test_loss:.5f}")
    # print(f"Test metrics: {test_metrics}")
    print(test_classification_report)

    print("___________________________________")
    
# End timer
train_time_end_on_cpu = timer()

total_train_time_model = print_train_time(start=train_time_start_on_cpu, 
                                           end=train_time_end_on_cpu,
                                           device=str(next(tweet_model.parameters()).device))


  0%|          | 0/30 [00:00<?, ?it/s]

Epoch: 0
---------


KeyError: 3668

In [None]:
y_pred = predict_on_test_set(tweet_model, test_loader)
y_pred = [int(pred[0]) for pred in y_pred]
output_df = pd.DataFrame({
    'id': test_df_id,
    'target': y_pred
})

# Save the DataFrame to a CSV file
output_df.to_csv(r'D:\Kaggle\disaster tweets\embedding_nn.csv', index=False)