In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import torch
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
import re
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix


In [2]:
import spacy
nlp = spacy.load("en_core_web_sm")

def preprocess_text_lemma_spacy(text):
    doc = nlp(text.lower()) 
    lemmatized_words = [token.lemma_ for token in doc if not token.is_punct and not token.is_space]
    return ' '.join(lemmatized_words)


In [3]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

stemmer = PorterStemmer()
def stemming(text):
    words = word_tokenize(text)
    return ' '.join([stemmer.stem(word) for word in words])

In [4]:
train_df = pd.read_csv(r"D:\Kaggle\disaster tweets\nlp-getting-started\train.csv")
test_df = pd.read_csv(r"D:\Kaggle\disaster tweets\nlp-getting-started\test.csv")

In [5]:
date_pattern = r'\b(\d{1,2}[-/]\d{1,2}[-/]\d{2,4}|\d{4}[-/]\d{1,2}[-/]\d{1,2}|(?:January|February|March|April|May|June|July|August|September|October|November|December)\s\d{1,2},?\s\d{4})\b'
time_pattern = r'\b((0?[1-9]|1[0-2]):[0-5]\d\s?(AM|PM)|([01]\d|2[0-3]):[0-5]\d(:[0-5]\d)?)\b'

def preprocess_text(text):
    text = re.sub(r'bin laden', 'Binladen', text, flags=re.IGNORECASE)
    text = re.sub(r"http\S+|www\S+|https\S+", 'http', text, flags=re.MULTILINE)  
    #text = re.sub(r'\@\w+|\#','', text)  
    text = re.sub(r'#(\w+)', r'\1', text)
    text = re.sub(r'[^\w\s]', '', text) 
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'\b(?<!breaking)news\b|\b(?<!breaking)\w*news\w*\b', 'news', text)
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    return text

# train_df['text'] = train_df['text'].apply(lambda x: re.sub(date_pattern, 'DATETIME', x))
# train_df['text'] = train_df['text'].apply(lambda x: re.sub(time_pattern, 'DATETIME', x))
# test_df['text'] = test_df['text'].apply(lambda x: re.sub(date_pattern, 'DATETIME', x))
# test_df['text'] = test_df['text'].apply(lambda x: re.sub(time_pattern, 'DATETIME', x))
train_df['text'] = train_df['text'].apply(preprocess_text)
test_df['text'] = test_df['text'].apply(preprocess_text)
train_df['text'] = train_df['text'].apply(preprocess_text_lemma_spacy)
test_df['text'] = test_df['text'].apply(preprocess_text_lemma_spacy)
train_df['text'] = train_df['text'].apply(stemming)
test_df['text'] = test_df['text'].apply(stemming)



In [6]:
train_df_id = train_df['id']
test_df_id = test_df['id']
X = train_df['text']
y = train_df['target']
X_test = test_df['text']


In [7]:
BATCH_SIZES = 128

In [8]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X).toarray() 
y = y.values
X_test = vectorizer.transform(X_test).toarray()

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
test_dataset = TensorDataset(X_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZES, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZES,shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZES)

In [9]:
import torch.nn as nn
import torch.optim as optim


In [10]:
from timeit import default_timer as timer 
def print_train_time(start: float, end: float, device: torch.device = None):
    
    total_time = end - start
    print(f"Train time on {device}: {total_time:.3f} seconds")
    return total_time

In [11]:
import requests
from pathlib import Path 

if Path("helper_functions.py").is_file():
  print("helper_functions.py already exists, skipping download")
else:
  print("Downloading helper_functions.py")
  request = requests.get("https://raw.githubusercontent.com/mrdbourke/pytorch-deep-learning/main/helper_functions.py")
  with open("helper_functions.py", "wb") as f:
    f.write(request.content)

helper_functions.py already exists, skipping download


In [12]:
class TweetDisasterRNNModel(nn.Module):
    def __init__(self,input_shape,hidden_units,out_shape):
        super().__init__()
        self.rnn = nn.RNN(input_shape, hidden_units, batch_first=True)
        self.fc = nn.Linear(hidden_units, out_shape)

    def forward(self,X):
        X = X.unsqueeze(1)
        output, hidden = self.rnn(X)
        return self.fc(hidden[-1])

In [13]:
tweet_model = TweetDisasterRNNModel(X_train.shape[1],16,1)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(tweet_model.parameters(), lr=0.001, weight_decay=1e-4)

In [14]:
def calculate_metrics(all_labels, all_preds):
    metrics = {
        'accuracy': float(accuracy_score(all_labels, all_preds)),
        'confusion_matrix': confusion_matrix(all_labels, all_preds),  # It's fine to leave the matrix as-is
        'precision': float(precision_score(all_labels, all_preds)),
        'recall': float(recall_score(all_labels, all_preds)),
        'f1': float(f1_score(all_labels, all_preds)),
        'macro_precision': float(precision_score(all_labels, all_preds, average='macro')),
        'macro_recall': float(recall_score(all_labels, all_preds, average='macro')),
        'macro_f1': float(f1_score(all_labels, all_preds, average='macro')),
        'micro_precision': float(precision_score(all_labels, all_preds, average='micro')),
        'micro_recall': float(recall_score(all_labels, all_preds, average='micro')),
        'micro_f1': float(f1_score(all_labels, all_preds, average='micro'))
    }
    
    return metrics, classification_report(all_labels, all_preds, target_names=['ham', 'spam'],digits = 6)

In [15]:
def train_mode(model: torch.nn.Module,data_loader: torch.utils.data.DataLoader, loss_fn:torch.nn.Module, optimizer: torch.optim.Optimizer):
    model.train()
    running_loss = 0.0
    all_preds = []
    all_labels = []
    for batch, (X,y) in enumerate(data_loader):
        y_preds = model(X)
        loss = loss_fn(y_preds, y.unsqueeze(1))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        preds = torch.sigmoid(y_preds).round()  # Apply sigmoid and threshold at 0.5
        all_preds.extend(preds.detach().cpu().numpy())
        all_labels.extend(y.cpu().numpy())
        # running_accuracy +=
        if batch % 400 == 0:
            print(f"Looked at {batch * len(X)}/{len(data_loader.dataset)} samples")
    train_loss = running_loss/len(data_loader)
    
    return train_loss, calculate_metrics(all_labels,all_preds)

def test_mode(model: torch.nn.Module, data_loader: torch.utils.data.DataLoader, loss_fn: torch.nn.Module, optimizer: torch.optim.Optimizer):
    model.eval()
    running_loss = 0.0
    all_preds = []
    all_labels = []
    for batch, (X,y) in enumerate(data_loader):
        y_preds = model(X)
        loss = loss_fn(y_preds, y.unsqueeze(1))
        running_loss += loss.item()
        preds = torch.sigmoid(y_preds).round()  # Apply sigmoid and threshold at 0.5
        all_preds.extend(preds.detach().cpu().numpy())
        all_labels.extend(y.cpu().numpy())
        if batch % 400 == 0:
                print(f"Looked at {batch * len(X)}/{len(data_loader.dataset)} samples")
    test_loss = running_loss/len(data_loader)
    
    return test_loss, calculate_metrics(all_labels,all_preds)

def predict_on_test_set(model: torch.nn.Module, test_loader: torch.utils.data.DataLoader):
    model.eval()  
    all_preds = []
    
    with torch.no_grad():  # No need to track gradients during inference
        for batch, X in enumerate(test_loader):
        
            y_preds = model(X[0])
            preds = torch.sigmoid(y_preds).round()  
            all_preds.extend(preds.detach().cpu().numpy()) 

    return all_preds 




In [16]:
from tqdm.auto import tqdm
from timeit import default_timer as timer

# Set the seed for reproducibility
torch.manual_seed(42)

# Start timer
train_time_start_on_cpu = timer()

# Number of epochs
epochs = 12
for epoch in tqdm(range(epochs)):
    print(f"Epoch: {epoch}\n---------")
    
    # Train the model
    train_loss, (train_metrics, train_classification_report)= train_mode(tweet_model, train_loader, criterion, optimizer)
    print(f"Train loss: {train_loss:.5f}")
    # print(f"Train metrics: {train_metrics}")
    print(train_classification_report)
    
    # Test/Validate the model
    test_loss, (test_metrics,test_classification_report) = test_mode(tweet_model, val_loader, criterion, optimizer)
    print(f"Test loss: {test_loss:.5f}")
    # print(f"Test metrics: {test_metrics}")
    print(test_classification_report)

    print("___________________________________")
    
# End timer
train_time_end_on_cpu = timer()

total_train_time_model = print_train_time(start=train_time_start_on_cpu, 
                                           end=train_time_end_on_cpu,
                                           device=str(next(tweet_model.parameters()).device))


  0%|          | 0/12 [00:00<?, ?it/s]

Epoch: 0
---------
Looked at 0/6090 samples
Train loss: 0.67596
              precision    recall  f1-score   support

         ham   0.572093  0.993080  0.725970      3468
        spam   0.657143  0.017544  0.034175      2622

    accuracy                       0.573071      6090
   macro avg   0.614618  0.505312  0.380072      6090
weighted avg   0.608711  0.573071  0.428123      6090

Looked at 0/1523 samples
Test loss: 0.66520
              precision    recall  f1-score   support

         ham   0.575379  1.000000  0.730464       874
        spam   1.000000  0.006163  0.012251       649

    accuracy                       0.576494      1523
   macro avg   0.787689  0.503082  0.371358      1523
weighted avg   0.756324  0.576494  0.424410      1523

___________________________________
Epoch: 1
---------
Looked at 0/6090 samples
Train loss: 0.64520
              precision    recall  f1-score   support

         ham   0.601249  0.999135  0.750731      3468
        spam   0.990826  0.12

In [17]:
y_pred = predict_on_test_set(tweet_model, test_loader)
y_pred = [int(pred[0]) for pred in y_pred]
output_df = pd.DataFrame({
    'id': test_df_id,
    'target': y_pred
})

# Save the DataFrame to a CSV file
output_df.to_csv(r'D:\Kaggle\disaster tweets\simple_rnn.csv', index=False)