In [None]:
import pandas as pd
import numpy as np
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

# Load the dataset: read the Excel workbook into a DataFrame. Later code reads
# its 'Comments' (text) and 'M-Class' (label) columns.
# NOTE(review): the file name contains a space before ".xlsx" - confirm this
# matches the file on disk.
file_path = 'all_facebook_and_twitter_dataset .xlsx'
df = pd.read_excel(file_path)

print("Dataset loaded successfully.")

# Preprocessing: load the tokenizer for the same 'bert-base-uncased' checkpoint
# that the classifier is loaded from further down.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Encode comments
def tokenize_texts(texts):
    """Tokenize a collection of comments with the module-level ``tokenizer``.

    Args:
        texts: Object exposing ``.tolist()`` (called below with a DataFrame
            column) that yields the raw comment strings.

    Returns:
        The tokenizer output, requested as PyTorch tensors (``return_tensors='pt'``)
        with padding and truncation enabled.
    """
    sentences = texts.tolist()
    encoded = tokenizer(
        sentences,
        padding=True,
        truncation=True,
        return_tensors='pt',
    )
    return encoded

tokenized_texts = tokenize_texts(df['Comments'])

print("Text tokenization completed.")

# Encode labels: map each sentiment name to the integer class id the model expects.
label_map = {'positive': 0, 'neutral': 1, 'negative': 2}

# Normalise case and surrounding whitespace first so variants such as
# "Positive " still map to a class instead of silently becoming NaN.
normalized_labels = df['M-Class'].astype(str).str.strip().str.lower()
encoded_labels = normalized_labels.map(label_map)

# `Series.map` yields NaN for anything not in `label_map`. NaN would turn the
# label column into floats and only fail (cryptically) once training starts,
# so fail early with the offending values instead.
unmapped = encoded_labels.isna()
if unmapped.any():
    unknown = sorted(df.loc[unmapped, 'M-Class'].astype(str).unique())
    raise ValueError(
        f"Unrecognised values in 'M-Class': {unknown}; "
        f"expected one of {list(label_map)}"
    )

df['M-Class'] = encoded_labels.astype('int64')

print("Labels encoded successfully.")

# Keep only as many rows as were tokenized so features and labels stay aligned
n_tokenized = len(tokenized_texts['input_ids'])
df = df.iloc[:n_tokenized]

# Features are the padded token-id matrix; targets are the encoded 'M-Class' values
X, y = tokenized_texts['input_ids'], df['M-Class'].values

# First cut: 80% for training, 20% held out
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)

# Second cut: halve the held-out part into validation and test (10% of the data each)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

print("Data split into training, validation, and test sets.")

# Convert the datasets to torch Dataset format
class CustomDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing per-sample encodings with their labels.

    Args:
        encodings: Mapping from field name (e.g. ``'input_ids'``) to a tensor
            whose first dimension indexes the samples.
        labels: Indexable collection of labels, one per sample.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of samples is defined by the labels, not the encodings.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return a dict with every encoding field for sample ``idx`` plus ``'labels'``."""
        sample = {}
        for field, tensor in self.encodings.items():
            # Copy the row so callers cannot mutate the stored encodings.
            sample[field] = tensor[idx].clone().detach()
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Build the model inputs for each split.
#
# Only `input_ids` went through the train/val/test split, so the attention mask
# is rebuilt here from the padding positions. Without a mask the model attends
# to the padding the tokenizer appended (`padding=True`), which is what the
# "We strongly recommend passing in an `attention_mask`" warning is about.
def _with_attention_mask(input_ids):
    """Return ``{'input_ids', 'attention_mask'}`` for a batch of padded token ids."""
    # 1 marks a real token, 0 marks padding. This assumes the pad id only ever
    # appears as padding, i.e. no comment contains the literal pad token.
    attention_mask = (torch.as_tensor(input_ids) != tokenizer.pad_token_id).long()
    return {'input_ids': input_ids, 'attention_mask': attention_mask}


train_encodings = _with_attention_mask(X_train)
val_encodings = _with_attention_mask(X_val)
test_encodings = _with_attention_mask(X_test)

train_dataset = CustomDataset(train_encodings, y_train)
val_dataset = CustomDataset(val_encodings, y_val)
test_dataset = CustomDataset(test_encodings, y_test)

print("Datasets converted to torch format.")

# Load the pretrained encoder with a 3-way classification head
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

# Pick the GPU when one is visible, otherwise stay on the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)

print("Model loaded and moved to device:", device)

# Training configuration, grouped by concern
training_args = TrainingArguments(
    # Where checkpoints and logs are written
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,
    # Schedule and batch sizes
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Optimisation
    warmup_steps=500,
    weight_decay=0.01,
    # Run validation at the end of every epoch.
    # NOTE(review): newer transformers releases rename this argument to
    # `eval_strategy` - confirm against the installed version.
    evaluation_strategy='epoch'
)

# Define the evaluation metric
def compute_metrics(pred):
    """Compute accuracy and weighted precision/recall/F1 for an evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (true classes) and ``predictions``
            (per-class scores; the highest score along the last axis is taken
            as the predicted class).

    Returns:
        Dict with keys ``'accuracy'``, ``'f1'``, ``'precision'`` and ``'recall'``.
    """
    y_true = pred.label_ids
    y_hat = pred.predictions.argmax(-1)

    # Weighted averaging: per-class scores are averaged by class support.
    scores = precision_recall_fscore_support(y_true, y_hat, average='weighted')
    weighted_precision, weighted_recall, weighted_f1 = scores[0], scores[1], scores[2]

    metrics = {'accuracy': accuracy_score(y_true, y_hat)}
    metrics['f1'] = weighted_f1
    metrics['precision'] = weighted_precision
    metrics['recall'] = weighted_recall
    return metrics

# Wire model, configuration, data and metrics into a Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Fine-tune on the training split
print("Starting training...")
trainer.train()
print("Training completed.")

# Score the fine-tuned model on the validation split passed as `eval_dataset`
print("Starting evaluation...")
eval_result = trainer.evaluate()
print("Evaluation results:", eval_result)

# Predict on the held-out test split and print a per-class report
print("Starting prediction on test set...")
preds = trainer.predict(test_dataset)
# Highest-scoring class per sample
pred_labels = np.argmax(preds.predictions, axis=1)
true_labels = y_test

print("Classification Report:")
# Pass the full label set explicitly: without `labels`, the report infers the
# classes from the data and raises a ValueError when a class is absent from
# both the test labels and the predictions (its count then no longer matches
# `target_names`). `label_map` preserves insertion order, so names and ids line up.
print(classification_report(
    true_labels,
    pred_labels,
    labels=list(label_map.values()),
    target_names=list(label_map.keys()),
))


Dataset loaded successfully.
Text tokenization completed.
Labels encoded successfully.
Data split into training, validation, and test sets.
Datasets converted to torch format.


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded and moved to device: cpu
Starting training...


We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Epoch,Training Loss,Validation Loss
