In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import torch
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
import re
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix


In [2]:
import spacy
nlp = spacy.load("en_core_web_sm")

def preprocess_text_lemma_spacy(text):
    doc = nlp(text.lower()) 
    lemmatized_words = [token.lemma_ for token in doc if not token.is_punct and not token.is_space]
    return ' '.join(lemmatized_words)


In [3]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

stemmer = PorterStemmer()
def stemming(text):
    words = word_tokenize(text)
    return ' '.join([stemmer.stem(word) for word in words])

In [4]:
train_df = pd.read_csv(r"D:\elggak\kaggle\Tweet Disaster Competition\nlp-getting-started\train.csv")
test_df = pd.read_csv(r"D:\elggak\kaggle\Tweet Disaster Competition\nlp-getting-started\test.csv")

In [5]:
date_pattern = r'\b(\d{1,2}[-/]\d{1,2}[-/]\d{2,4}|\d{4}[-/]\d{1,2}[-/]\d{1,2}|(?:January|February|March|April|May|June|July|August|September|October|November|December)\s\d{1,2},?\s\d{4})\b'
time_pattern = r'\b((0?[1-9]|1[0-2]):[0-5]\d\s?(AM|PM)|([01]\d|2[0-3]):[0-5]\d(:[0-5]\d)?)\b'

def preprocess_text(text):
    text = re.sub(r'bin laden', 'Binladen', text, flags=re.IGNORECASE)
    text = re.sub(r"http\S+|www\S+|https\S+", 'http', text, flags=re.MULTILINE)  
    #text = re.sub(r'\@\w+|\#','', text)  
    text = re.sub(r'#(\w+)', r'\1', text)
    text = re.sub(r'[^\w\s]', '', text) 
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'\b(?<!breaking)news\b|\b(?<!breaking)\w*news\w*\b', 'news', text)
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    return text

# train_df['text'] = train_df['text'].apply(lambda x: re.sub(date_pattern, 'DATETIME', x))
# train_df['text'] = train_df['text'].apply(lambda x: re.sub(time_pattern, 'DATETIME', x))
# test_df['text'] = test_df['text'].apply(lambda x: re.sub(date_pattern, 'DATETIME', x))
# test_df['text'] = test_df['text'].apply(lambda x: re.sub(time_pattern, 'DATETIME', x))
train_df['text'] = train_df['text'].apply(preprocess_text)
test_df['text'] = test_df['text'].apply(preprocess_text)
train_df['text'] = train_df['text'].apply(preprocess_text_lemma_spacy)
test_df['text'] = test_df['text'].apply(preprocess_text_lemma_spacy)
train_df['text'] = train_df['text'].apply(stemming)
test_df['text'] = test_df['text'].apply(stemming)



In [6]:
train_df_id = train_df['id']
test_df_id = test_df['id']
X = train_df['text']
y = train_df['target']
X_test = test_df['text']


In [7]:
X.shape, X_test.shape

((7613,), (3263,))

In [8]:
BATCH_SIZES = 128

In [9]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X).toarray() 
y = y.values
X_test = vectorizer.transform(X_test).toarray()

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
test_dataset = TensorDataset(X_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZES, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZES,shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZES)

In [10]:
import torch.nn as nn
import torch.optim as optim


In [11]:
from timeit import default_timer as timer 
def print_train_time(start: float, end: float, device: torch.device = None):
    
    total_time = end - start
    print(f"Train time on {device}: {total_time:.3f} seconds")
    return total_time

In [12]:
import requests
from pathlib import Path 

if Path("helper_functions.py").is_file():
  print("helper_functions.py already exists, skipping download")
else:
  print("Downloading helper_functions.py")
  request = requests.get("https://raw.githubusercontent.com/mrdbourke/pytorch-deep-learning/main/helper_functions.py")
  with open("helper_functions.py", "wb") as f:
    f.write(request.content)

helper_functions.py already exists, skipping download


In [13]:
class TweetDisasterModel(nn.Module):
    def __init__(self,input_shape,hidden_units,out_shape):
        super().__init__()
        self.layer = nn.Sequential(
            nn.Linear(in_features=input_shape,out_features=hidden_units),
            nn.ReLU(),
            nn.Linear(in_features=hidden_units,out_features=out_shape)
        )

    def forward(self,X):
        return self.layer(X)

In [14]:
tweet_model = TweetDisasterModel(X_train.shape[1],8,1)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(tweet_model.parameters(), lr=0.001, weight_decay=1e-4)

In [15]:
def calculate_metrics(all_labels, all_preds):
    metrics = {
        'accuracy': float(accuracy_score(all_labels, all_preds)),
        'confusion_matrix': confusion_matrix(all_labels, all_preds),  # It's fine to leave the matrix as-is
        'precision': float(precision_score(all_labels, all_preds)),
        'recall': float(recall_score(all_labels, all_preds)),
        'f1': float(f1_score(all_labels, all_preds)),
        'macro_precision': float(precision_score(all_labels, all_preds, average='macro')),
        'macro_recall': float(recall_score(all_labels, all_preds, average='macro')),
        'macro_f1': float(f1_score(all_labels, all_preds, average='macro')),
        'micro_precision': float(precision_score(all_labels, all_preds, average='micro')),
        'micro_recall': float(recall_score(all_labels, all_preds, average='micro')),
        'micro_f1': float(f1_score(all_labels, all_preds, average='micro'))
    }
    
    return metrics, classification_report(all_labels, all_preds, target_names=['ham', 'spam'],digits = 6)

In [16]:
def train_mode(model: torch.nn.Module,data_loader: torch.utils.data.DataLoader, loss_fn:torch.nn.Module, optimizer: torch.optim.Optimizer):
    model.train()
    running_loss = 0.0
    all_preds = []
    all_labels = []
    for batch, (X,y) in enumerate(data_loader):
        y_preds = model(X)
        loss = loss_fn(y_preds, y.unsqueeze(1))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        preds = torch.sigmoid(y_preds).round()  # Apply sigmoid and threshold at 0.5
        all_preds.extend(preds.detach().cpu().numpy())
        all_labels.extend(y.cpu().numpy())
        # running_accuracy +=
        if batch % 400 == 0:
            print(f"Looked at {batch * len(X)}/{len(data_loader.dataset)} samples")
    train_loss = running_loss/len(data_loader)
    
    return train_loss, calculate_metrics(all_labels,all_preds)

def test_mode(model: torch.nn.Module, data_loader: torch.utils.data.DataLoader, loss_fn: torch.nn.Module, optimizer: torch.optim.Optimizer):
    model.eval()
    running_loss = 0.0
    all_preds = []
    all_labels = []
    for batch, (X,y) in enumerate(data_loader):
        y_preds = model(X)
        loss = loss_fn(y_preds, y.unsqueeze(1))
        running_loss += loss.item()
        preds = torch.sigmoid(y_preds).round()  # Apply sigmoid and threshold at 0.5
        all_preds.extend(preds.detach().cpu().numpy())
        all_labels.extend(y.cpu().numpy())
        if batch % 400 == 0:
                print(f"Looked at {batch * len(X)}/{len(data_loader.dataset)} samples")
    test_loss = running_loss/len(data_loader)
    
    return test_loss, calculate_metrics(all_labels,all_preds)

def predict_on_test_set(model: torch.nn.Module, test_loader: torch.utils.data.DataLoader):
    model.eval()  
    all_preds = []
    
    with torch.no_grad():  # No need to track gradients during inference
        for batch, X in enumerate(test_loader):
        
            y_preds = model(X[0])
            preds = torch.sigmoid(y_preds).round()  
            all_preds.extend(preds.detach().cpu().numpy()) 

    return all_preds 




In [17]:
from tqdm.auto import tqdm
from timeit import default_timer as timer

# Set the seed for reproducibility
torch.manual_seed(42)

# Start timer
train_time_start_on_cpu = timer()

# Number of epochs
epochs = 13
for epoch in tqdm(range(epochs)):
    print(f"Epoch: {epoch}\n---------")
    
    # Train the model
    train_loss, (train_metrics, train_classification_report)= train_mode(tweet_model, train_loader, criterion, optimizer)
    print(f"Train loss: {train_loss:.5f}")
    # print(f"Train metrics: {train_metrics}")
    print(train_classification_report)
    
    # Test/Validate the model
    test_loss, (test_metrics,test_classification_report) = test_mode(tweet_model, val_loader, criterion, optimizer)
    print(f"Test loss: {test_loss:.5f}")
    # print(f"Test metrics: {test_metrics}")
    print(test_classification_report)

    print("___________________________________")
    
# End timer
train_time_end_on_cpu = timer()

# Calculate total training time
total_train_time_model = print_train_time(start=train_time_start_on_cpu, 
                                           end=train_time_end_on_cpu,
                                           device=str(next(tweet_model.parameters()).device))


  0%|          | 0/13 [00:00<?, ?it/s]

Epoch: 0
---------
Looked at 0/6090 samples


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Train loss: 0.67853
              precision    recall  f1-score   support

         ham   0.569458  1.000000  0.725675      3468
        spam   0.000000  0.000000  0.000000      2622

    accuracy                       0.569458      6090
   macro avg   0.284729  0.500000  0.362837      6090
weighted avg   0.324283  0.569458  0.413241      6090

Looked at 0/1523 samples
Test loss: 0.66794
              precision    recall  f1-score   support

         ham   0.573867  1.000000  0.729245       874
        spam   0.000000  0.000000  0.000000       649

    accuracy                       0.573867      1523
   macro avg   0.286934  0.500000  0.364622      1523
weighted avg   0.329324  0.573867  0.418490      1523

___________________________________
Epoch: 1
---------
Looked at 0/6090 samples


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Train loss: 0.64933
              precision    recall  f1-score   support

         ham   0.597622  1.000000  0.748139      3468
        spam   1.000000  0.109458  0.197319      2622

    accuracy                       0.616585      6090
   macro avg   0.798811  0.554729  0.472729      6090
weighted avg   0.770863  0.616585  0.510988      6090

Looked at 0/1523 samples
Test loss: 0.64171
              precision    recall  f1-score   support

         ham   0.625987  0.997712  0.769299       874
        spam   0.984615  0.197227  0.328626       649

    accuracy                       0.656599      1523
   macro avg   0.805301  0.597469  0.548963      1523
weighted avg   0.778810  0.656599  0.581514      1523

___________________________________
Epoch: 2
---------
Looked at 0/6090 samples
Train loss: 0.60989
              precision    recall  f1-score   support

         ham   0.671714  0.997693  0.802877      3468
        spam   0.991480  0.355072  0.522887      2622

    accuracy      

In [18]:
y_pred = predict_on_test_set(tweet_model, test_loader)
y_pred = [int(pred[0]) for pred in y_pred]
output_df = pd.DataFrame({
    'id': test_df_id,
    'target': y_pred
})

# Save the DataFrame to a CSV file
output_df.to_csv(r'D:\Kaggle\disaster tweets\simple_nn.csv', index=False)

In [19]:
X_count = train_df['text']
y_count = train_df['target']
X_test_count = test_df['text']

X_train_count, X_val_count, y_train_count, y_val_count = train_test_split(X_count, y_count, test_size=0.2, random_state=42)
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(X_train_count)
X_val_vec = vectorizer.transform(X_val_count)
X_test_vec = vectorizer.transform(X_test_count)

In [20]:
nb_model = MultinomialNB()
nb_model.fit(X_train_vec, y_train_count)
y_pred = nb_model.predict(X_val_vec)
print(f'Accuracy: {accuracy_score(y_val_count, y_pred)}')
print(classification_report(y_val_count, y_pred, target_names=['ham', 'spam'],digits = 6))
y_pred = nb_model.predict(X_test_vec)
output_df = pd.DataFrame({
    'id': test_df_id,
    'target': y_pred
})

# Save the DataFrame to a CSV file
output_df.to_csv(r'D:\Kaggle\disaster tweets\nb_normal.csv', index=False)

Accuracy: 0.8030203545633617
              precision    recall  f1-score   support

         ham   0.805970  0.864989  0.834437       874
        spam   0.798291  0.719569  0.756888       649

    accuracy                       0.803020      1523
   macro avg   0.802130  0.792279  0.795663      1523
weighted avg   0.802698  0.803020  0.801391      1523



In [21]:
model = LogisticRegression()
model.fit(X_train_vec, y_train_count)

y_pred = model.predict(X_val_vec)
print(f'Accuracy: {accuracy_score(y_val_count, y_pred)}')
print(classification_report(y_val_count, y_pred, target_names=['ham', 'spam'],digits = 6))
y_pred = model.predict(X_test_vec)
output_df = pd.DataFrame({
    'id': test_df_id,
    'target': y_pred
})

# Save the DataFrame to a CSV file
output_df.to_csv(r'D:\Kaggle\disaster tweets\nb_logisticregression.csv', index=False)

Accuracy: 0.8010505581089954
              precision    recall  f1-score   support

         ham   0.801478  0.868421  0.833608       874
        spam   0.800347  0.710324  0.752653       649

    accuracy                       0.801051      1523
   macro avg   0.800913  0.789372  0.793130      1523
weighted avg   0.800996  0.801051  0.799110      1523



In [29]:
from sklearn.svm import SVC

model = SVC(kernel='rbf') 

model.fit(X_train_vec, y_train_count)

y_pred = model.predict(X_val_vec)

print(f'Accuracy: {accuracy_score(y_val_count, y_pred)}')
print(classification_report(y_val_count, y_pred, target_names=['ham', 'spam'], digits=6))
y_pred = model.predict(X_test_vec)
output_df = pd.DataFrame({
    'id': test_df_id,
    'target': y_pred
})

output_df.to_csv(r'D:\Kaggle\disaster tweets\svm_predictions.csv', index=False)

Accuracy: 0.8063033486539725
              precision    recall  f1-score   support

         ham   0.785222  0.911899  0.843833       874
        spam   0.848425  0.664099  0.745030       649

    accuracy                       0.806303      1523
   macro avg   0.816823  0.787999  0.794431      1523
weighted avg   0.812155  0.806303  0.801730      1523



In [31]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd

# Initialize the XGBoost model
model = XGBClassifier(
    n_estimators=100,  # Number of boosting rounds (trees)
    learning_rate=0.1,  # Step size shrinkage, lower values make the model more conservative
    max_depth=6,  # Maximum tree depth for base learners
    objective='binary:logistic',  # Since you have a binary classification problem
    eval_metric='logloss',  # Evaluation metric, can be 'logloss' or 'error'
    use_label_encoder=False  # Disable label encoding for compatibility
)

# Fit the model on training data
model.fit(X_train_vec, y_train_count)

# Predict on the validation set
y_pred = model.predict(X_val_vec)

# Print accuracy and classification report
print(f'Accuracy: {accuracy_score(y_val_count, y_pred)}')
print(classification_report(y_val_count, y_pred, target_names=['ham', 'spam'], digits=6))

# Predict on the test set
y_pred = model.predict(X_test_vec)

# Create DataFrame for test predictions
output_df = pd.DataFrame({
    'id': test_df_id,
    'target': y_pred
})

# Save the DataFrame to a CSV file
output_df.to_csv(r'D:\Kaggle\disaster tweets\xgboost_predictions.csv', index=False)


Parameters: { "use_label_encoder" } are not used.



Accuracy: 0.7800393959290873
              precision    recall  f1-score   support

         ham   0.761397  0.898169  0.824147       874
        spam   0.819106  0.620955  0.706398       649

    accuracy                       0.780039      1523
   macro avg   0.790251  0.759562  0.765272      1523
weighted avg   0.785988  0.780039  0.773970      1523



In [33]:
from lightgbm import LGBMClassifier
import numpy as np

# Convert the training data to float32 or float64
X_train_vec = X_train_vec.astype(np.float32)
X_val_vec = X_val_vec.astype(np.float32)
X_test_vec = X_test_vec.astype(np.float32)

# Initialize LightGBM model
model = LGBMClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=-1,  # No limit on tree depth
    objective='binary',  # For binary classification
    class_weight='balanced'  # Automatically handle class imbalance
)

# Fit the model on training data
model.fit(X_train_vec, y_train_count)

# Predict on the validation set
y_pred = model.predict(X_val_vec)

# Print accuracy and classification report
from sklearn.metrics import accuracy_score, classification_report
print(f'Accuracy: {accuracy_score(y_val_count, y_pred)}')
print(classification_report(y_val_count, y_pred, target_names=['ham', 'spam'], digits=6))

# Predict on the test set
y_pred = model.predict(X_test_vec)

# Create DataFrame for test predictions
output_df = pd.DataFrame({
    'id': test_df_id,
    'target': y_pred
})

# Save the DataFrame to a CSV file
output_df.to_csv(r'D:\Kaggle\disaster tweets\lightgbm_predictions.csv', index=False)


[LightGBM] [Info] Number of positive: 2622, number of negative: 3468
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006476 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1807
[LightGBM] [Info] Number of data points in the train set: 6090, number of used features: 668
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
Accuracy: 0.7892317793827971
              precision    recall  f1-score   support

         ham   0.815279  0.818078  0.816676       874
        spam   0.753870  0.750385  0.752124       649

    accuracy                       0.789232      1523
   macro avg   0.784575  0.784232  0.784400      1523
weighted avg   0.789111  0.789232  0.789168      1523



In [35]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# Initialize AdaBoost model with base estimator as a Decision Tree
model = AdaBoostClassifier(
    n_estimators=100,  # Number of boosting rounds
    learning_rate=0.1  # Step size shrinkage
)

# Fit the model on training data
model.fit(X_train_vec, y_train_count)

# Predict on validation set
y_pred = model.predict(X_val_vec)

# Print accuracy and classification report
print(f'Accuracy: {accuracy_score(y_val_count, y_pred)}')
print(classification_report(y_val_count, y_pred, target_names=['ham', 'spam'], digits=6))

# Predict on the test set
y_pred = model.predict(X_test_vec)

# Create DataFrame for test predictions
output_df = pd.DataFrame({
    'id': test_df_id,
    'target': y_pred
})

# Save the DataFrame to a CSV file
output_df.to_csv(r'D:\Kaggle\disaster tweets\adaboost_predictions.csv', index=False)




Accuracy: 0.7307944845699278
              precision    recall  f1-score   support

         ham   0.704225  0.915332  0.796020       874
        spam   0.808786  0.482280  0.604247       649

    accuracy                       0.730794      1523
   macro avg   0.756505  0.698806  0.700134      1523
weighted avg   0.748782  0.730794  0.714299      1523

