<a href="https://colab.research.google.com/github/jaarck/ai_learn_nlp/blob/main/DisasterTweetClassification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
##  Delete every folder (and its contents) directly under /content in Google Colab.
##  Use this to clear previously downloaded data, or before switching to a different dataset.
import shutil
import os

# Directory whose sub-folders are wiped
content_dir = '/content'


def clear_subdirectories(root_dir, protected=('drive',)):
    """Delete the sub-directories of ``root_dir`` and return the names that were kept.

    Plain files directly inside ``root_dir`` are left untouched.

    Args:
        root_dir: Directory whose immediate sub-directories are removed.
        protected: Folder names that must never be removed. Defaults to
            ``'drive'``, the usual Colab mount point for Google Drive, so a
            mounted Drive is not wiped by accident.

    Returns:
        Sorted list of folder names that were skipped (protected or mounted).

    Raises:
        FileNotFoundError: If ``root_dir`` does not exist.
    """
    skipped = []
    for item in sorted(os.listdir(root_dir)):
        item_path = os.path.join(root_dir, item)

        if os.path.islink(item_path):
            # shutil.rmtree() refuses symbolic links; remove only the link
            # itself so whatever it points at stays intact.
            if os.path.isdir(item_path):
                os.unlink(item_path)
            continue

        if not os.path.isdir(item_path):
            continue

        # Never recurse into a mounted filesystem (e.g. Google Drive):
        # rmtree would delete the user's real files, not a local copy.
        if item in protected or os.path.ismount(item_path):
            skipped.append(item)
            continue

        shutil.rmtree(item_path)
    return skipped


if not os.path.isdir(content_dir):
    print(f"{content_dir} does not exist; nothing to delete.")
else:
    skipped_folders = clear_subdirectories(content_dir)
    if skipped_folders:
        print(f"Folders under {content_dir} have been deleted, except protected/mounted: {', '.join(skipped_folders)}")
    else:
        print("All folders under /content have been deleted.")


All folders under /content have been deleted.


In [None]:
# Upload your own Kaggle API key file (kaggle.json) through the Colab file picker.
# The file holds personal API credentials, so do not share or commit it.
from google.colab import files
files.upload()  # Opens the upload dialog; the file is saved into the current working directory

# Set up Kaggle API credentials. Dataset access depends on this step:
# create ~/.kaggle, copy the key there, and restrict it to owner read/write (mode 600).
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# Install the Kaggle command-line client used by the download cell
!pip install kaggle





Saving kaggle.json to kaggle.json
403 - Forbidden - You must accept this competition's rules before you'll be able to download files.


In [None]:
# Download the "nlp-getting-started" competition archive with the Kaggle CLI.
# Needs the credentials in ~/.kaggle/kaggle.json; the API answers "403 - Forbidden"
# until the competition's rules have been accepted for the account.
!kaggle competitions download -c nlp-getting-started

Downloading nlp-getting-started.zip to /content
  0% 0.00/593k [00:00<?, ?B/s]
100% 593k/593k [00:00<00:00, 107MB/s]


In [None]:
import os
import zipfile
import shutil
from collections import defaultdict

# Target folder for the extracted files and the archive to read from
dataset_dir = '/content/DisasterTweetClassification'
zip_file = 'nlp-getting-started.zip'

# Start from a clean slate: drop any earlier extraction before unpacking again
if os.path.exists(dataset_dir):
    shutil.rmtree(dataset_dir)

# Unpack every member of the archive into the dataset folder
with zipfile.ZipFile(zip_file) as archive:
    archive.extractall(path=dataset_dir)


# Function to report recursive file counts for directories up to a specified depth
def count_files_in_directories(root_dir, max_depth=3):
    """Print how many files each directory under ``root_dir`` contains.

    Every directory from ``root_dir`` (depth 0) down to ``max_depth`` levels
    below it is listed in top-down order. The number printed for a directory
    is recursive: it includes files in all of its sub-directories, even those
    deeper than ``max_depth``.

    Args:
        root_dir: Directory to inspect. A trailing path separator is fine.
        max_depth: Deepest directory level (relative to ``root_dir``) to list.

    Returns:
        None. The report is written to standard output.
    """
    # Pass 1 (bottom-up): a directory's total is its own files plus the totals
    # of its children, so the whole tree is walked once instead of once per
    # reported directory.
    recursive_totals = {}
    for current, subdirs, filenames in os.walk(root_dir, topdown=False):
        recursive_totals[current] = len(filenames) + sum(
            # Symlinked folders are listed but never walked, hence the default.
            recursive_totals.get(os.path.join(current, sub), 0)
            for sub in subdirs
        )

    print("Total files in each directory:")

    # Pass 2 (top-down): report in walk order, pruning below max_depth.
    for current, subdirs, _ in os.walk(root_dir):
        # relpath gives the true nesting level; stripping root_dir with
        # str.replace miscounts when root_dir ends with a separator or when
        # its text re-appears further down the path.
        relative = os.path.relpath(current, root_dir)
        depth = 0 if relative == os.curdir else relative.count(os.sep) + 1

        if depth > max_depth:
            continue
        if depth == max_depth:
            # Stop os.walk from descending any further
            subdirs[:] = []

        print(f'{current}: {recursive_totals.get(current, 0)} files')

# Sanity check: print the recursive file counts under /content to confirm the archive was extracted
count_files_in_directories('/content')

Total files in each directory:
/content: 5 files
/content/DisasterTweetClassification: 3 files


In [None]:
import pandas as pd
import tensorflow as tf
import os

# Folder the Kaggle archive was extracted into
extract_path = '/content/DisasterTweetClassification'

# Report which accelerator TensorFlow can see
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        # Enable memory growth on all GPUs. TensorFlow raises RuntimeError if
        # this is attempted after the GPUs have already been initialised.
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)

        # get_device_details() does not guarantee any particular key, so fall
        # back to the physical device name instead of risking a KeyError.
        gpu_details = tf.config.experimental.get_device_details(gpus[0])
        print(f"Running on GPU: {gpu_details.get('device_name', gpus[0].name)}")
    except RuntimeError as e:
        print(e)
else:
    print("No GPU available, running on CPU.")

# Locations of the train and test CSV files inside the extracted folder
train_file_path = os.path.join(extract_path, 'train.csv')
test_file_path = os.path.join(extract_path, 'test.csv')

# Load datasets into pandas DataFrames
train_data = pd.read_csv(train_file_path)
test_data = pd.read_csv(test_file_path)

# Preview the loaded train data
print("Train Data: ", train_data.head())

# Preview the loaded test data
print("Test Data: ", test_data.head())


Running on GPU: Tesla T4
Train Data:     id keyword location                                               text  \
0   1     NaN      NaN  Our Deeds are the Reason of this #earthquake M...   
1   4     NaN      NaN             Forest fire near La Ronge Sask. Canada   
2   5     NaN      NaN  All residents asked to 'shelter in place' are ...   
3   6     NaN      NaN  13,000 people receive #wildfires evacuation or...   
4   7     NaN      NaN  Just got sent this photo from Ruby #Alaska as ...   

   target  
0       1  
1       1  
2       1  
3       1  
4       1  
Test Data:     id keyword location                                               text
0   0     NaN      NaN                 Just happened a terrible car crash
1   2     NaN      NaN  Heard about #earthquake is different cities, s...
2   3     NaN      NaN  there is a forest fire at spot pond, geese are...
3   9     NaN      NaN           Apocalypse lighting. #Spokane #wildfires
4  11     NaN      NaN      Typhoon Soudelor 

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from xgboost import XGBClassifier
from sklearn.naive_bayes import MultinomialNB

# Basic text normalisation: lower-case every tweet
train_data['text'] = train_data['text'].str.lower()

# Hold out 20% of the labelled tweets for validation (fixed seed keeps the split repeatable)
X_train, X_val, y_train, y_val = train_test_split(
    train_data['text'],
    train_data['target'],
    test_size=0.2,
    random_state=42,
)

# Learn the TF-IDF vocabulary from the training split only, then apply it to the validation split
tfidf = TfidfVectorizer(stop_words='english', max_features=10000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_val_tfidf = tfidf.transform(X_val)

# Helper shared by all classical baselines: fit, predict, report
def train_evaluate_model(model, model_name):
    """Fit ``model`` on the TF-IDF training split and print its validation report.

    Reads the notebook-level variables ``X_train_tfidf``, ``y_train``,
    ``X_val_tfidf`` and ``y_val``.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        model_name: Label printed above the classification report.

    Returns:
        None. The report is printed.
    """
    print(f"\nModel: {model_name}")
    # Train on the training features, then score the held-out validation split
    model.fit(X_train_tfidf, y_train)
    report = classification_report(y_val, model.predict(X_val_tfidf))
    print(report)

# Logistic Regression
lr_model = LogisticRegression()
train_evaluate_model(lr_model, "Logistic Regression")

# Support Vector Machine (SVM) with a linear kernel
svm_model = SVC(kernel='linear')
train_evaluate_model(svm_model, "Support Vector Machine (SVM)")

# Random Forest Classifier (seeded for repeatable results)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
train_evaluate_model(rf_model, "Random Forest Classifier")

# XGBoost Classifier. The target has two classes (0/1), so use the binary
# 'logloss' metric; 'mlogloss' is the multiclass variant.
xgb_model = XGBClassifier(random_state=42, eval_metric='logloss')
train_evaluate_model(xgb_model, "XGBoost Classifier")

# Multinomial Naive Bayes
nb_model = MultinomialNB()
train_evaluate_model(nb_model, "Multinomial Naive Bayes")


Model: Logistic Regression
              precision    recall  f1-score   support

           0       0.79      0.88      0.83       874
           1       0.81      0.68      0.74       649

    accuracy                           0.80      1523
   macro avg       0.80      0.78      0.79      1523
weighted avg       0.80      0.80      0.79      1523


Model: Support Vector Machine (SVM)
              precision    recall  f1-score   support

           0       0.80      0.86      0.83       874
           1       0.79      0.71      0.75       649

    accuracy                           0.80      1523
   macro avg       0.79      0.79      0.79      1523
weighted avg       0.80      0.80      0.79      1523


Model: Random Forest Classifier
              precision    recall  f1-score   support

           0       0.77      0.89      0.83       874
           1       0.81      0.64      0.72       649

    accuracy                           0.78      1523
   macro avg       0.79      0

In [None]:
# Install the libraries imported by the BERT cells below (transformers, datasets, torch)
!pip install transformers datasets torch


Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.1 MB/s[0m eta [36m0:00

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import torch


In [None]:
# Hold out 20% of the tweets for validation; the fixed seed keeps the split repeatable
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_data['text'],
    train_data['target'],
    test_size=0.2,
    random_state=42,
)


In [None]:
# Load the WordPiece tokenizer that matches the 'bert-base-uncased' checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Same settings for both splits: truncate to at most 128 tokens and pad each split to a common length
tokenizer_options = {'truncation': True, 'padding': True, 'max_length': 128}
train_encodings = tokenizer(list(train_texts), **tokenizer_options)
val_encodings = tokenizer(list(val_texts), **tokenizer_options)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



In [None]:
# Wrap the token ids, attention masks and labels of each split in a Hugging Face Dataset
train_dataset = Dataset.from_dict(
    {
        'input_ids': train_encodings['input_ids'],
        'attention_mask': train_encodings['attention_mask'],
        'labels': train_labels,
    }
)
val_dataset = Dataset.from_dict(
    {
        'input_ids': val_encodings['input_ids'],
        'attention_mask': val_encodings['attention_mask'],
        'labels': val_labels,
    }
)

# Pre-trained BERT encoder with a new two-class classification head
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
)



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Define the training arguments
import dataclasses

# The per-epoch evaluation option is called `evaluation_strategy` in older
# transformers releases and `eval_strategy` in newer ones. The install is
# unpinned, so use whichever name this version of TrainingArguments accepts.
_training_arg_names = {f.name for f in dataclasses.fields(TrainingArguments)}
eval_strategy_key = (
    'eval_strategy' if 'eval_strategy' in _training_arg_names else 'evaluation_strategy'
)

training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # number of training epochs
    per_device_train_batch_size=16,  # batch size for training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,                # log the training loss every 10 steps
    **{eval_strategy_key: 'epoch'},  # run validation at the end of every epoch
)
# Initialize the Trainer
trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset             # evaluation dataset
)

# Train the model (the returned TrainOutput is displayed as the cell result)
trainer.train()




Epoch,Training Loss,Validation Loss
1,0.3594,0.411789
2,0.3092,0.44388
3,0.3057,0.540559


TrainOutput(global_step=1143, training_loss=0.37258730337271434, metrics={'train_runtime': 348.34, 'train_samples_per_second': 52.449, 'train_steps_per_second': 3.281, 'total_flos': 788654832890400.0, 'train_loss': 0.37258730337271434, 'epoch': 3.0})

In [None]:
# Evaluate the model on the Trainer's eval_dataset (val_dataset) and display the returned metrics dict
trainer.evaluate()


{'eval_loss': 0.5405588150024414,
 'eval_runtime': 6.1779,
 'eval_samples_per_second': 246.524,
 'eval_steps_per_second': 3.885,
 'epoch': 3.0}

In [None]:
# Run the fine-tuned model over the validation split
predictions = trainer.predict(val_dataset)

# Turn the raw logits into class ids by taking the highest-scoring class per row
logits = torch.tensor(predictions.predictions)
predicted_labels = torch.argmax(logits, dim=1)

# Per-class precision / recall / F1 against the true validation labels
from sklearn.metrics import classification_report
report = classification_report(val_labels, predicted_labels)
print(report)


              precision    recall  f1-score   support

           0       0.85      0.85      0.85       874
           1       0.80      0.80      0.80       649

    accuracy                           0.83      1523
   macro avg       0.83      0.83      0.83      1523
weighted avg       0.83      0.83      0.83      1523

