In [5]:
import pandas as pd
import os
import glob
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from scipy.special import softmax # For converting logits to probabilities
import numpy as np

In [14]:
# --- 1. Configuration ---
# Define the directory where cleaned data is stored
CLEANED_DATA_DIR = "../data/cleaned"
# Define the directory where sentiment analysis results will be saved
SENTIMENT_RESULTS_DIR = "../data/sentiment_analysis"

# Hugging Face model for sentiment analysis
SENTIMENT_MODEL_NAME = "distilbert-base-uncased-finetuned-sst-2-english"

# Ensure the output directory for sentiment results exists
if not os.path.exists(SENTIMENT_RESULTS_DIR):
    os.makedirs(SENTIMENT_RESULTS_DIR)
    print(f"Created sentiment results directory: {SENTIMENT_RESULTS_DIR}")

In [8]:
# --- 2. Load Model and Tokenizer ---
print(f"Loading sentiment model: {SENTIMENT_MODEL_NAME}...")
try:
    tokenizer = AutoTokenizer.from_pretrained(SENTIMENT_MODEL_NAME)
    model = AutoModelForSequenceClassification.from_pretrained(
        SENTIMENT_MODEL_NAME)
    # Move model to GPU if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    print(f"Model loaded successfully and moved to {device}.")
except Exception as e:
    print(f"Error loading model or tokenizer: {e}")
    print("Please ensure you have an active internet connection and `transformers` library is installed.")
    # Exit or handle gracefully if model cannot be loaded

Loading sentiment model: distilbert-base-uncased-finetuned-sst-2-english...


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Model loaded successfully and moved to cpu.


In [9]:
# --- 3. Data Loading Function ---
def load_cleaned_data(directory):
    """
    Loads all cleaned CSV files from the specified directory into a list of DataFrames.

    Args:
        directory (str): Path to the directory containing cleaned CSV files.

    Returns:
        dict: A dictionary where keys are bank names (derived from filenames)
              and values are pandas DataFrames containing the cleaned reviews.
    """
    all_files = glob.glob(os.path.join(directory, "*_cleaned.csv"))
    if not all_files:
        print(
            f"No cleaned CSV files found in {directory}. Please ensure Task 1 was run correctly.")
        return {}

    bank_data = {}
    for filepath in all_files:
        try:
            df = pd.read_csv(filepath)
            # Extract bank name from filename
            # Example: "Commercial_Bank_of_Ethiopia_mobile_banking_reviews_cleaned.csv"
            # -> "Commercial Bank of Ethiopia"
            filename = os.path.basename(filepath)
            bank_name = filename.replace(
                "_mobile_banking_reviews_cleaned.csv", "").replace("_", " ")
            bank_data[bank_name] = df
            print(
                f"Loaded {len(df)} reviews for '{bank_name}' from {filename}")
        except Exception as e:
            print(f"Error loading {filepath}: {e}")
    return bank_data

In [10]:

# --- 4. Sentiment Analysis Function ---
def analyze_sentiment(texts, tokenizer, model, device, batch_size=32):
    """
    Computes sentiment scores and labels for a list of texts using the pre-trained model.

    Args:
        texts (list): A list of review texts.
        tokenizer: Loaded Hugging Face tokenizer.
        model: Loaded Hugging Face sentiment model.
        device (torch.device): Device to run the model on (cpu or cuda).
        batch_size (int): Number of texts to process in each batch.

    Returns:
        tuple: A tuple containing:
            - list: List of sentiment labels ('positive', 'negative').
            - list: List of sentiment scores (probability of the predicted label).
    """
    if not texts:
        return [], []

    sentiment_labels = []
    sentiment_scores = []

    # Process texts in batches to improve efficiency
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i + batch_size]
        # Tokenize and move to device
        inputs = tokenizer(
            batch_texts,
            padding=True,
            truncation=True,
            max_length=model.config.max_position_embeddings,  # Use model's max length
            return_tensors="pt"
        ).to(device)

        with torch.no_grad():
            outputs = model(**inputs)
            # Logits are the raw outputs from the model before activation (like softmax)
            logits = outputs.logits

        # Apply softmax to convert logits to probabilities
        # Move back to CPU for numpy operation
        probabilities = softmax(logits.cpu().numpy(), axis=1)

        # The SST-2 model usually outputs 2 classes: 0 for negative, 1 for positive.
        # We can verify this from model.config.id2label if needed.
        # model.config.id2label will typically be {0: 'LABEL_0', 1: 'LABEL_1'}
        # and from documentation, LABEL_0 is negative, LABEL_1 is positive.

        for j in range(len(batch_texts)):
            # Get the index of the highest probability
            predicted_class_id = np.argmax(probabilities[j])
            # Score for the predicted label
            score = probabilities[j, predicted_class_id]

            # Map the predicted class ID to sentiment label
            # Based on standard SST-2 fine-tuning: 0 -> negative, 1 -> positive
            if predicted_class_id == 0:
                label = 'negative'
            else:  # predicted_class_id == 1
                label = 'positive'

            sentiment_labels.append(label)
            # Ensure float for CSV saving
            sentiment_scores.append(float(score))

    print(f"Analyzed sentiment for {len(texts)} reviews.")
    return sentiment_labels, sentiment_scores

In [11]:
# --- 5. Aggregation Function ---
def aggregate_sentiment(df):
    """
    Aggregates sentiment by bank and rating, computing mean sentiment scores.

    Args:
        df (pd.DataFrame): DataFrame with 'bank', 'rating', and 'sentiment_score' columns.

    Returns:
        pd.DataFrame: A DataFrame with aggregated sentiment results.
    """
    print("\n--- Aggregating sentiment data ---")

    # Convert sentiment_label to a numerical value for mean calculation
    # For simplicity, assign -1 for negative, 1 for positive. Neutral can be 0 if we implement it.
    df['numerical_sentiment'] = df['sentiment_label'].apply(
        # 0 for neutral if present
        lambda x: 1 if x == 'positive' else (-1 if x == 'negative' else 0)
    )

    # Aggregate by bank and rating
    # We'll calculate the mean of 'numerical_sentiment'
    # and also count reviews for context
    aggregated_df = df.groupby(['bank', 'rating']).agg(
        mean_sentiment=('numerical_sentiment', 'mean'),
        num_reviews=('review', 'count')
    ).reset_index()

    print("Sentiment aggregation complete.")
    return aggregated_df

In [15]:
# --- Main Execution ---
if __name__ == "__main__":
    print("Starting sentiment analysis pipeline...")

    # Load all cleaned data
    all_banks_data = load_cleaned_data(CLEANED_DATA_DIR)

    if not all_banks_data:
        print("No data to process. Exiting.")
    else:
        # List to hold DataFrames after sentiment analysis for all banks
        processed_dfs = []

        # List to hold aggregated sentiment results
        all_aggregated_sentiment_dfs = []

        for bank_name, df in all_banks_data.items():
            print(f"\nProcessing sentiment for {bank_name}...")

            if 'review' not in df.columns or df['review'].empty:
                print(
                    f"No 'review' column or empty reviews for {bank_name}. Skipping sentiment analysis.")
                continue

            # Convert reviews to a list for batch processing
            reviews_text = df['review'].tolist()

            # Perform sentiment analysis
            sentiment_labels, sentiment_scores = analyze_sentiment(
                reviews_text, tokenizer, model, device
            )

            # Add sentiment results to the DataFrame
            df['sentiment_label'] = sentiment_labels
            # This is the probability of the predicted label
            df['sentiment_score'] = sentiment_scores

            processed_dfs.append(df)
            print(f"Added sentiment to {bank_name}'s DataFrame.")

            # Aggregate sentiment for the current bank
            if not df.empty:
                bank_aggregated_sentiment = aggregate_sentiment(df)
                all_aggregated_sentiment_dfs.append(bank_aggregated_sentiment)
            else:
                print(
                    f"No reviews left in DataFrame for {bank_name} after sentiment analysis, skipping aggregation.")

            # Save the DataFrame with sentiment results for the current bank
            output_filename = f"{bank_name.replace(' ', '_').replace('/', '_')}_reviews_with_sentiment.csv"
            output_filepath = os.path.join(
                SENTIMENT_RESULTS_DIR, output_filename)
            try:
                df.to_csv(output_filepath, index=False, encoding='utf-8')
                print(
                    f"Successfully saved reviews with sentiment for {bank_name} to: {output_filepath}")
            except Exception as e:
                print(
                    f"Error saving sentiment data for {bank_name} to CSV: {e}")

        # Combine all aggregated sentiment results into a single DataFrame if there's data
        if all_aggregated_sentiment_dfs:
            final_aggregated_df = pd.concat(
                all_aggregated_sentiment_dfs).reset_index(drop=True)
            aggregated_output_filename = os.path.join(
                SENTIMENT_RESULTS_DIR, "aggregated_sentiment_by_bank_and_rating.csv")
            try:
                final_aggregated_df.to_csv(
                    aggregated_output_filename, index=False, encoding='utf-8')
                print(
                    f"\nSuccessfully saved aggregated sentiment results to: {aggregated_output_filename}")
                print("\nSample Aggregated Sentiment Data:")
                print(final_aggregated_df.head())
            except Exception as e:
                print(f"Error saving final aggregated sentiment data: {e}")
        else:
            print("\nNo aggregated sentiment data to save.")

    print("\nSentiment analysis pipeline complete.")

Starting sentiment analysis pipeline...
Loaded 400 reviews for 'Commercial Bank of Ethiopia Mobile' from Commercial_Bank_of_Ethiopia_Mobile_mobile_banking_reviews_cleaned.csv
Loaded 400 reviews for 'Dashen Bank Mobile' from Dashen_Bank_Mobile_mobile_banking_reviews_cleaned.csv
Loaded 400 reviews for 'Bank of Abyssinia Mobile' from Bank_of_Abyssinia_Mobile_mobile_banking_reviews_cleaned.csv

Processing sentiment for Commercial Bank of Ethiopia Mobile...
Analyzed sentiment for 400 reviews.
Added sentiment to Commercial Bank of Ethiopia Mobile's DataFrame.

--- Aggregating sentiment data ---
Sentiment aggregation complete.
Successfully saved reviews with sentiment for Commercial Bank of Ethiopia Mobile to: ../data/sentiment_analysis/Commercial_Bank_of_Ethiopia_Mobile_reviews_with_sentiment.csv

Processing sentiment for Dashen Bank Mobile...
Analyzed sentiment for 400 reviews.
Added sentiment to Dashen Bank Mobile's DataFrame.

--- Aggregating sentiment data ---
Sentiment aggregation compl