### Exercise 3: Classification of Airline Tweets with Pretrained Transformers

##### Import the libraries necessary for this project.

In [1]:
import numpy as np
import pandas as pd
import re
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk
from nltk.corpus import stopwords
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer,TFBertModel,BertConfig,TFBertForSequenceClassification,BertForSequenceClassification
import tensorflow as tf
import warnings

import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, accuracy_score, ConfusionMatrixDisplay

warnings.filterwarnings("ignore")
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))


Num GPUs Available:  0


##### Data Preprocessing: Loading the dataset and preprocessing steps as in Exercise 1.

In [2]:
def remove_unwanted(text):
    text = re.sub(r'http\S+|www\S+', '', text)
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'#\w+', '', text)
    emojis = re.compile(
        "[\U0001F600-\U0001F64F" # Emoticons
        "\U0001F300-\U0001F5FF"  # Symbols & Pictographs
        "\U0001F680-\U0001F6FF"  # Transport & Map Symbols
        "\U0001F700-\U0001F77F"  # Alchemical Symbols
        "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
        "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
        "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
        "\U0001FA00-\U0001FA6F"  # Chess Symbols
        "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
        "\U00002702-\U000027B0"  # Dingbats
        "\U000024C2-\U0001F251"  # Enclosed characters
        "]", flags=re.UNICODE
    )
    text = emojis.sub(r'', text)
    return text

def preprocessing(sentence):
    sentence = remove_unwanted(sentence)
    sentence = sentence.lower()
    tokens = word_tokenize(sentence, language='english', preserve_line=True)
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    filtered_tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    text = " ".join(filtered_tokens)
    return filtered_tokens

tweets_df = pd.read_csv("datasets/Tweets.csv", encoding="utf-8")
tokens = [preprocessing(sentence) for sentence in tweets_df['text']]
tweets_df['text'] = [" ".join(token) for token in tokens]
tweets_df['text']

#### 1. Model Fine-Tuning:

##### Encode sentiment labels and set X as texts and numeric labels as y. Split the data into train and validation.

In [3]:
label_encoder = LabelEncoder()
tweets_df['label'] = label_encoder.fit_transform(tweets_df['airline_sentiment'])
X = tweets_df['text']
y = tweets_df['label']

train_input,val_input,train_label,val_label = train_test_split(X,y)

##### Load a Pretrained Transformer Model: .

In [4]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
bert_model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)




All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# Step 6: Define training arguments
training_args = TrainingArguments(
    output_dir='./results',          # output directory for model checkpoints
    num_train_epochs=3,              # number of training epochs
    per_device_train_batch_size=8,   # batch size for training
    per_device_eval_batch_size=8,    # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
    evaluation_strategy="epoch",     # Evaluate at the end of each epoch
)

# Step 7: Define Trainer
trainer = Trainer(
    model=model,                         # the pretrained model
    args=training_args,                  # training arguments
    train_dataset=train_data,            # training dataset
    eval_dataset=valid_data,             # validation dataset
    compute_metrics=lambda p: {'accuracy': accuracy_score(p.predictions.argmax(axis=-1), p.label_ids)}  # accuracy metric
)

# Step 8: Fine-tune the model
trainer.train()

# Step 9: Evaluate the model
eval_results = trainer.evaluate()





NameError: name 'TrainingArguments' is not defined

#### 2. Learning Curves:

In [None]:
# Step 10: Plot Learning Curves (Training & Validation Loss)
# Extract loss data from trainer
train_loss = trainer.state.log_history[::2]  # alternate items are training losses
valid_loss = trainer.state.log_history[1::2]  # alternate items are validation losses

train_epochs = [x['epoch'] for x in train_loss]
train_losses = [x['loss'] for x in train_loss]

valid_epochs = [x['epoch'] for x in valid_loss]
valid_losses = [x['eval_loss'] for x in valid_loss]

# Plot loss curves
plt.plot(train_epochs, train_losses, label='Train Loss')
plt.plot(valid_epochs, valid_losses, label='Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training and Validation Loss Curves')
plt.legend()
plt.show()

#### 3. Confusion Matrix:

In [None]:
# Step 11: Confusion Matrix
# Predict on test data
predictions = trainer.predict(test_data)
pred_labels = predictions.predictions.argmax(axis=-1)
true_labels = test_data['label']

# Generate confusion matrix
cm = confusion_matrix(true_labels, pred_labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot(cmap=plt.cm.Blues)
plt.title('Confusion Matrix')
plt.show()

#### 4. Brief Explanation: