# **BERT Model**

## **Data Preprocessing**

In [None]:
import pandas as pd
from transformers import BertTokenizer
from sklearn.model_selection import train_test_split
import torch

In [None]:
# Read the partially cleaned CSV into a DataFrame
tf_r2o1_df = pd.read_csv(filepath_or_buffer='data_r2o1_partialclean.csv')

In [None]:
# Fixed sequence length (in tokens) requested from the tokenizer
max_len = 128
# Tokenizer belonging to the 'bert-base-uncased' checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_and_encode(text):
    """Encode a single text into fixed-length model inputs.

    Uses the module-level ``tokenizer`` and ``max_len``.

    Args:
        text: The raw string to tokenize.

    Returns:
        The tokenizer's encoding (including ``input_ids`` and
        ``attention_mask``) as PyTorch tensors, padded or truncated to
        ``max_len`` tokens.
    """
    return tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=max_len,
        padding='max_length',
        # Padding alone only lengthens short texts; without truncation, texts
        # longer than max_len keep their full length and rows cannot be batched.
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )

# Encode every row of the 'text' column and keep the result next to the raw data
tf_r2o1_df['encoded'] = tf_r2o1_df['text'].apply(tokenize_and_encode)

# Targets come from the 'label' column; features are the per-row encodings
y = tf_r2o1_df['label'].values
X = tf_r2o1_df['encoded'].tolist()

# Persist the complete encoded dataset before it is split
torch.save((X, y), 'processed_data.pt')

# Hold out 20% of the rows for validation (fixed seed for a repeatable split)
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Persist the four pieces of the split as one tuple
torch.save((X_train, X_val, y_train, y_val), 'train_val_split.pt')


## **Model Building and Training**

In [None]:
from transformers import BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import torch

# Create training and validation datasets.
# tokenize_and_encode() asks for return_tensors='pt', so every encoding carries a
# leading batch axis of size 1. Each tensor is flattened to a plain list of ids
# here; otherwise batches reach the model as (batch, 1, seq_len) and the forward
# pass fails.
train_dataset = Dataset.from_dict({
    'input_ids': [x['input_ids'].reshape(-1).tolist() for x in X_train],
    'attention_mask': [x['attention_mask'].reshape(-1).tolist() for x in X_train],
    'labels': y_train,
})
val_dataset = Dataset.from_dict({
    'input_ids': [x['input_ids'].reshape(-1).tolist() for x in X_val],
    'attention_mask': [x['attention_mask'].reshape(-1).tolist() for x in X_val],
    'labels': y_val,
})

# Pre-trained BERT checkpoint with a 3-label classification head
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=3,
)

# Hyper-parameters and output locations for the run
training_args = TrainingArguments(
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    evaluation_strategy='epoch',
    output_dir='./results',
    logging_dir='./logs',
)

# Bundle model, settings and both datasets into a Trainer
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
)

# Fine-tune, then compute metrics on the validation dataset
trainer.train()
trainer.evaluate()

In [None]:
# Save the final model
# Writes the in-memory model and the tokenizer into the same directory, which is
# the path the evaluation section later passes to from_pretrained().
# NOTE(review): the directory name says "best", but no best-checkpoint selection
# is configured in TrainingArguments — presumably this is the model state after
# the last epoch; confirm.
model.save_pretrained('tf_bert_best_model')
tokenizer.save_pretrained('tf_bert_best_model')

## **Model Evaluation**

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch

# Restore the saved tokenizer and fine-tuned model from disk if needed
tokenizer = BertTokenizer.from_pretrained('tf_bert_best_model')
model = BertForSequenceClassification.from_pretrained('tf_bert_best_model')

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Predictions on the validation set.
# NOTE: `trainer` still holds the model object it was constructed with, so these
# predictions come from that in-memory model, not from a `model` re-bound later.
predictions = trainer.predict(val_dataset)
# predictions.predictions is a NumPy array of per-class scores; torch.argmax
# rejects NumPy input, so take the arg-max with the array's own method.
y_pred = predictions.predictions.argmax(axis=1)

# Evaluation metrics
report = classification_report(y_val, y_pred, target_names=['negative', 'neutral', 'positive'])
conf_matrix = confusion_matrix(y_val, y_pred)

print("Classification Report:\n", report)
print("Confusion Matrix:\n", conf_matrix)

# **DistilBERT Model**

## **Data Preprocessing**

In [None]:
import pandas as pd
from transformers import DistilBertTokenizer
from sklearn.model_selection import train_test_split
import torch

In [None]:
# Read the partially cleaned CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer='data_r2o1_partialclean.csv')

In [None]:
# Fixed sequence length (in tokens) requested from the tokenizer
max_len = 128
# Tokenizer belonging to the 'distilbert-base-uncased' checkpoint
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

def tokenize_and_encode(text):
    """Encode a single text into fixed-length model inputs.

    Uses the module-level ``tokenizer`` and ``max_len``.

    Args:
        text: The raw string to tokenize.

    Returns:
        The tokenizer's encoding (including ``input_ids`` and
        ``attention_mask``) as PyTorch tensors, padded or truncated to
        ``max_len`` tokens.
    """
    return tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=max_len,
        padding='max_length',
        # Padding alone only lengthens short texts; without truncation, texts
        # longer than max_len keep their full length and rows cannot be batched.
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )

# Encode every row of the 'text' column and keep the result next to the raw data
df['encoded'] = df['text'].apply(tokenize_and_encode)

# Targets come from the 'label' column; features are the per-row encodings
y = df['label'].values
X = df['encoded'].tolist()

# Persist the complete encoded dataset before it is split
torch.save((X, y), 'processed_data_distilbert.pt')

# Hold out 20% of the rows for validation (fixed seed for a repeatable split)
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Persist the four pieces of the split as one tuple
torch.save((X_train, X_val, y_train, y_val), 'train_val_split_distilbert.pt')

## **Model Building and Training**

In [None]:
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset

In [None]:
# Load the train/validation split if needed.
# The file holds pickled tokenizer encodings and label arrays rather than bare
# tensors, which the weights-only loading mode (the default in recent PyTorch
# releases) refuses to unpickle. Full unpickling is acceptable here only because
# this notebook wrote the file itself; never do this with an untrusted file.
X_train, X_val, y_train, y_val = torch.load(
    'train_val_split_distilbert.pt',
    weights_only=False,
)

In [None]:
# Create training and validation datasets.
# tokenize_and_encode() asks for return_tensors='pt', so every encoding carries a
# leading batch axis of size 1. Each tensor is flattened to a plain list of ids
# here; otherwise batches reach the model as (batch, 1, seq_len) and the forward
# pass fails.
train_dataset = Dataset.from_dict({
    'input_ids': [x['input_ids'].reshape(-1).tolist() for x in X_train],
    'attention_mask': [x['attention_mask'].reshape(-1).tolist() for x in X_train],
    'labels': y_train,
})
val_dataset = Dataset.from_dict({
    'input_ids': [x['input_ids'].reshape(-1).tolist() for x in X_val],
    'attention_mask': [x['attention_mask'].reshape(-1).tolist() for x in X_val],
    'labels': y_val,
})

# Pre-trained DistilBERT checkpoint with a 3-label classification head
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=3,
)

# Hyper-parameters, checkpointing policy and output locations for the run
training_args = TrainingArguments(
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    evaluation_strategy='epoch',
    save_steps=10_000,    # checkpoint interval, in steps
    save_total_limit=2,   # retain only the 2 most recent checkpoints
    output_dir='./results_distilbert',
    logging_dir='./logs_distilbert',
)

# Bundle model, settings and both datasets into a Trainer
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
)

# Fine-tune the model
trainer.train()

In [None]:
# Save the final model
# Writes the in-memory model and the tokenizer into the same directory, which is
# the path the evaluation section later passes to from_pretrained().
# NOTE(review): the directory name says "best", but no best-checkpoint selection
# is configured in TrainingArguments — presumably this is the model state after
# the last epoch; confirm.
model.save_pretrained('best_model_distilbert')
tokenizer.save_pretrained('best_model_distilbert')

## **Model Evaluation**

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Load the best model if needed
model = DistilBertForSequenceClassification.from_pretrained('best_model_distilbert')
tokenizer = DistilBertTokenizer.from_pretrained('best_model_distilbert')

# Predictions on the validation set.
# NOTE: `trainer` still holds the model object it was constructed with, so these
# predictions come from that in-memory model, not from the `model` re-bound above.
predictions = trainer.predict(val_dataset)
# predictions.predictions is a NumPy array of per-class scores; torch.argmax
# rejects NumPy input, so take the arg-max with the array's own method.
y_pred = predictions.predictions.argmax(axis=1)

# Evaluation metrics
report = classification_report(y_val, y_pred, target_names=['negative', 'neutral', 'positive'])
conf_matrix = confusion_matrix(y_val, y_pred)

print("Classification Report:\n", report)
print("Confusion Matrix:\n", conf_matrix)