In [None]:
!pip install transformers datasets torch


In [None]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem at the mount point below
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
# Import necessary libraries
import inspect

import pandas as pd
import torch
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments

In [None]:
# Report whether PyTorch can see a CUDA device (True when a GPU runtime is enabled)
gpu_available = torch.cuda.is_available()
print(gpu_available)

In [None]:


# Load your data (replace this with your actual DataFrame)
# Assuming 'df' is your DataFrame with 'cleaned text' and 'rating' columns
# df = pd.read_csv('your_data.csv')

# Convert rating to sentiment (positive: 1, neutral: 0, negative: -1)
def convert_rating_to_sentiment(rating):
    """Map a star rating to a sentiment code.

    Args:
        rating: The rating value to classify.

    Returns:
        1 when the rating equals 4 or 5 (positive), 0 when it equals 3
        (neutral), and -1 for every other value (negative). Anything that is
        not equal to 3, 4 or 5 -- including out-of-range or missing values --
        therefore falls into the negative bucket.
    """
    # Guard clauses: handle the positive and neutral cases, then fall through.
    if rating in (4, 5):
        return 1
    if rating == 3:
        return 0
    return -1

# Derive the sentiment code for every row from its 'rating' value
rating_column = df['rating']
df['sentiment'] = rating_column.apply(convert_rating_to_sentiment)

In [None]:
# Load the tokenizer that belongs to the 'bert-base-uncased' checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


def tokenize_function(examples):
    """Tokenize the 'cleaned text' entries of a batch.

    Args:
        examples: A mapping with a 'cleaned text' key holding the text(s) to
            encode (this is what `Dataset.map(..., batched=True)` passes in).

    Returns:
        The tokenizer's output for those texts, produced with
        `padding='max_length'` and `truncation=True`.
    """
    texts = examples['cleaned text']
    encoded = tokenizer(texts, truncation=True, padding='max_length')
    return encoded


In [None]:
# Split data into train and evaluation datasets (80% train, 20% eval)
train_df, eval_df = train_test_split(df, test_size=0.2, random_state=42)

# The classifier is created with num_labels=3, so its loss expects class ids
# 0, 1, 2. Trainer also drops dataset columns that the model's forward() does
# not accept, so the target has to live in a column named `labels`.
# The human-readable 'sentiment' codes (-1/0/1) are re-encoded here.
SENTIMENT_TO_LABEL = {-1: 0, 0: 1, 1: 2}  # negative, neutral, positive


def build_dataset(frame):
    """Create a Hugging Face Dataset with a model-ready `labels` column.

    Args:
        frame: DataFrame containing 'cleaned text' and 'sentiment' columns.

    Returns:
        A `Dataset` holding 'cleaned text', 'sentiment' and an integer
        'labels' column encoded via SENTIMENT_TO_LABEL.

    Raises:
        ValueError: If 'sentiment' contains a value missing from
            SENTIMENT_TO_LABEL (it would otherwise become a NaN label).
    """
    # Copy so the new column is not written into a slice of the caller's frame.
    subset = frame[['cleaned text', 'sentiment']].copy()
    encoded = subset['sentiment'].map(SENTIMENT_TO_LABEL)
    if encoded.isna().any():
        unexpected = sorted(subset.loc[encoded.isna(), 'sentiment'].astype(str).unique())
        raise ValueError(f"Unexpected sentiment values: {unexpected}")
    subset['labels'] = encoded.astype('int64')
    return Dataset.from_pandas(subset)


# Create Hugging Face datasets from the DataFrames
train_dataset = build_dataset(train_df)
eval_dataset = build_dataset(eval_df)

# Tokenize the datasets
train_dataset = train_dataset.map(tokenize_function, batched=True)
eval_dataset = eval_dataset.map(tokenize_function, batched=True)

In [None]:
# Choose the compute device: CUDA when PyTorch reports it, otherwise the CPU
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)

# Instantiate BERT with a 3-way sequence-classification head from the
# pre-trained 'bert-base-uncased' checkpoint
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=3,
)
# Place the model on the selected device (cuda or cpu)
model.to(device)


In [None]:
# Set up training arguments.
# The keyword that turns on periodic evaluation was renamed from
# `evaluation_strategy` to `eval_strategy` in newer transformers releases, and
# the old spelling is rejected by recent versions. The install cell does not
# pin a version, so use whichever name the installed TrainingArguments accepts.
_accepted_args = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_key = (
    'eval_strategy' if 'eval_strategy' in _accepted_args else 'evaluation_strategy'
)

training_args = TrainingArguments(
    output_dir='./results',          # Output directory
    num_train_epochs=3,              # Number of training epochs
    per_device_train_batch_size=8,   # Batch size for training
    per_device_eval_batch_size=16,   # Batch size for evaluation
    warmup_steps=500,                # Number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # Strength of weight decay
    logging_dir='./logs',            # Directory for storing logs
    logging_steps=10,                # Log every 10 steps
    save_strategy="epoch",           # Save model at the end of each epoch
    load_best_model_at_end=True,     # Load the best model when finished
    **{_eval_strategy_key: "epoch"},  # Evaluate at the end of each epoch
)

# Initialize the Trainer
trainer = Trainer(
    model=model,                         # The instantiated model
    args=training_args,                  # Training arguments
    train_dataset=train_dataset,         # Training dataset
    eval_dataset=eval_dataset,           # Evaluation dataset
)

# Train the model
trainer.train()

In [None]:
# Persist the fine-tuned model, then its tokenizer, into the same Google Drive
# folder so both are stored side by side
drive_path = '/content/drive/MyDrive/sentiment_model'
for artifact in (model, tokenizer):
    artifact.save_pretrained(drive_path)




In [None]:
# Evaluate the model.
# Calls trainer.evaluate() with no arguments and keeps the returned value in
# `eval_results` so the next cell can print it.
# NOTE(review): presumably this scores the eval_dataset given to Trainer -- confirm.
eval_results = trainer.evaluate()

In [None]:
# Show the evaluation output collected by the previous cell
results_label = "Evaluation Results:"
print(results_label, eval_results)