In [29]:
import os

import mlflow
import mlflow.pytorch
import pandas as pd
import torch
from sklearn.metrics import accuracy_score
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_scheduler
from transformers import pipeline

# Initialize MLflow
mlflow.set_experiment("Sentiment Analysis Experiment")

# Seed PyTorch's global RNG before the model is created: the classification
# head of BertForSequenceClassification is newly (randomly) initialised, so
# without a seed every Restart & Run All starts from different weights.
SEED = 42
torch.manual_seed(SEED)

# Check if GPU is available
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")

# Load dataset
# The CSV location can be overridden with the SENTIMENT_DATA_PATH environment
# variable so the notebook is not tied to one machine; the original location
# is kept as the default.
DATA_PATH = os.environ.get(
    "SENTIMENT_DATA_PATH",
    r"C:\Users\Rajendran\Desktop\kartik\BIZMETRIC\PROJECTS\sentiment_analysis\data\reviews.csv",
)
df = pd.read_csv(DATA_PATH, encoding='latin-1')

# Load pre-trained BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)  # 2 classes: Positive, Negative

# Move model to GPU (if available)
model.to(device)

# SentimentDataset class
class SentimentDataset(Dataset):
    """Map-style dataset that tokenizes one text per item and pairs it with its label.

    Args:
        texts: Indexable sequence of input texts, one per sample.
        labels: Indexable sequence of labels aligned with ``texts``; each
            value must be convertible to a ``torch.long`` tensor.
        tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
            padding='max_length', truncation=True, return_tensors="pt")``
            that returns a mapping with ``'input_ids'`` and
            ``'attention_mask'`` tensors carrying a leading batch axis of 1.
        max_len: Length every sample is padded / truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        # __len__ is driven by `texts` alone, so a length mismatch would only
        # show up as an IndexError in the middle of an epoch (labels too
        # short) or as silently ignored labels (labels too long). Fail fast.
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, item):
        """Tokenize sample ``item`` and return its tensors.

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'`` (the tokenizer
            output with the leading batch axis removed) and ``'label'`` (a
            scalar ``torch.long`` tensor).
        """
        encoding = self.tokenizer(
            self.texts[item],
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )
        return {
            # return_tensors="pt" yields a batch of one; drop that axis so the
            # DataLoader can stack samples into a batch itself.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': torch.tensor(self.labels[item], dtype=torch.long)
        }

# Hyperparameters for data preparation and optimisation
MAX_LEN = 128          # tokens per sample after padding / truncation
TRAIN_FRACTION = 0.8   # share of the data used for training
BATCH_SIZE = 16
LEARNING_RATE = 5e-5
SPLIT_SEED = 42

# Prepare dataset
dataset = SentimentDataset(df['Text'].tolist(), df['Sentiment'].tolist(), tokenizer, max_len=MAX_LEN)

# Split dataset
# A dedicated seeded generator keeps the train/validation split identical
# between runs, so validation accuracy is comparable across experiments.
train_size = int(TRAIN_FRACTION * len(dataset))
val_size = len(dataset) - train_size
split_generator = torch.Generator().manual_seed(SPLIT_SEED)
train_dataset, val_dataset = random_split(dataset, [train_size, val_size], generator=split_generator)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

# Optimizer and scheduler
# transformers.AdamW is deprecated in favour of torch.optim.AdamW. eps and
# weight_decay are passed explicitly with the values the transformers
# implementation used by default, so the optimisation setup stays the same.
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE, eps=1e-6, weight_decay=0.0)
epochs = 5
# The scheduler is stepped once per batch, so its horizon is batches * epochs.
num_training_steps = len(train_loader) * epochs
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# Start MLflow run: fine-tune, evaluate, then log and register the model
with mlflow.start_run() as run:
    # Log hyperparameters. Learning rate and batch size are read from the
    # optimizer / loader actually used, so the logged values cannot drift
    # from the real configuration.
    mlflow.log_param("epochs", epochs)
    mlflow.log_param("learning_rate", optimizer.defaults["lr"])
    mlflow.log_param("batch_size", train_loader.batch_size)

    # Training loop
    for epoch in range(epochs):
        model.train()
        total_loss = 0.0
        for batch in train_loader:
            optimizer.zero_grad()

            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            # Passing `labels` makes the model compute the classification
            # (cross-entropy) loss itself; reuse it instead of computing the
            # same loss a second time from the logits.
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()
            loss.backward()
            optimizer.step()
            lr_scheduler.step()  # scheduler advances once per batch

        # Mean per-batch loss for this epoch
        avg_loss = total_loss / len(train_loader)
        mlflow.log_metric("train_loss", avg_loss, step=epoch)
        print(f"Epoch {epoch + 1} completed, Loss: {avg_loss}")

    # Evaluate the model on the validation split
    model.eval()
    predictions = []
    actuals = []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            # Predicted class = index of the largest logit
            preds = torch.argmax(outputs.logits, dim=1)

            predictions.extend(preds.cpu().numpy())
            actuals.extend(labels.cpu().numpy())

    accuracy = accuracy_score(actuals, predictions)
    mlflow.log_metric("validation_accuracy", accuracy)
    print(f"Validation Accuracy: {accuracy}")

    # Log the model as an artifact of this run
    mlflow.pytorch.log_model(model, "sentiment_model")

    # Register the logged model in the MLflow model registry
    model_uri = f"runs:/{run.info.run_id}/sentiment_model"
    model_name = "Sentiment-analysis-model"  # Choose a name for your model
    mlflow.register_model(model_uri, model_name)

    # Print the artifact URI of this run (where the model files were stored)
    print(f"Model and metrics logged to: {mlflow.get_artifact_uri()}")

# Save the model locally
# The tokenizer is written to the same directory so that './sentiment_model'
# is a self-contained checkpoint (weights, config and vocabulary together).
model.save_pretrained('./sentiment_model')
tokenizer.save_pretrained('./sentiment_model')

# Load the model for inference
sentiment_analyzer = pipeline("text-classification", model='./sentiment_model', tokenizer=tokenizer)

# Predict sentiment for a couple of sample sentences.
# `result` ends up holding the prediction for the last sentence.
for sample_text in ("I love this!", "Today stock market was high"):
    result = sentiment_analyzer(sample_text)
    print(result)


Using device: cpu


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1 completed, Loss: 0.4921134604761998
Epoch 2 completed, Loss: 0.08900880413906027
Epoch 3 completed, Loss: 0.015016111622874936
Epoch 4 completed, Loss: 0.014375712518813089
Epoch 5 completed, Loss: 0.004465134271110098
Validation Accuracy: 0.9263157894736842


Successfully registered model 'Sentiment-analysis-model'.
Created version '1' of model 'Sentiment-analysis-model'.


Model and metrics logged to: file:///c:/Users/Rajendran/Desktop/kartik/BIZMETRIC/PROJECTS/ML-Flow%20Sentiment%20Analysis/mlruns/467002745430235884/5e377de169b442588dcbce18bad6b91f/artifacts
[{'label': 'LABEL_1', 'score': 0.9985865354537964}]
[{'label': 'LABEL_0', 'score': 0.9065006375312805}]


In [None]:
# Rebuild the inference pipeline from the checkpoint saved on disk, reusing
# the tokenizer object that is already in memory.
sentiment_analyzer = pipeline(
    task="text-classification",
    model='./sentiment_model',
    tokenizer=tokenizer,
)

# Score one more example sentence and show the raw pipeline output.
result = sentiment_analyzer("Today stock market was low")
print(result)
