In [10]:
# Install the transformers library from Hugging Face
from transformers import BertTokenizer, BertForSequenceClassification, pipeline
import numpy as np
import os


In [2]:
# Downlaod and load the fine-tuned FinBERT sentiment classification model and the vocabulary (FinVocab, the finance vocabulary from Huang et al) from Hugging Face
finbert_model = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone', num_labels=3)
finbert_tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')

In [3]:
# Add additional required imports
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize
import torch

In [16]:
# Download nltk punkt for sentence tokenization 
nltk.download('punkt', download_dir=r'D:\nltk_data')
nltk.data.path.append(r'D:\nltk_data')

[nltk_data] Downloading package punkt to D:\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


In [19]:
nltk.download('punkt_tab', download_dir=r'D:\nltk_data')
nltk.data.path.append(r'D:\nltk_data')

[nltk_data] Downloading package punkt_tab to D:\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


In [20]:
# Test the tokenization to verify that 'punkt' is available
test_text = "hello world. this is a test!"
sentences = sent_tokenize(test_text)
print(sentences)

['hello world.', 'this is a test!']


In [21]:
# Create a sentiment analysis pipeline
sentiment_pipeline = pipeline("sentiment-analysis", model=finbert_model, tokenizer=finbert_tokenizer)

Device set to use cpu


#### Define Helper Functions

In [22]:
def map_sentiment_to_score(label:str) -> int:
    """
    Map a sentiment label to a numeric score.
    Scores:
    - Neutral -> 0
    - Positive -> 1
    - Negative -> 2
    """
    label = label.lower()
    if "neutral" in label:
        return 0
    elif "positive" in label:
        return 1
    elif "negative" in label:
        return 2
    else:
        return 0 # default to neutral if label is unrecognized
    
def compute_avg_sentiment(text: str) -> float:
    """
    Splits the text into sentences using NLTK, applies sentiment analysis using FinBERT, and returns the average sentiment score for the text.
    
    Args:
        text (str): The input text from the componenttext column
    
    Returns:
        float: The average sentiment score
    """
    if not isinstance(text, str) or not text.strip():
        return 0.0 #If the text is empty or not a string, return 0.
    
    # Split text into sentences
    sentences = sent_tokenize(text)
    if not sentences:
        return 0.0
    
    # Batch all sentences at once for efficiency
    results = sentiment_pipeline(sentences)
    
    # Map each sentence's sentiment to its corresponding score
    scores = [map_sentiment_to_score(result['label']) for result in results]
    
    # Return the average score
    avg_score = sum(scores) / len(scores)
    return avg_score


#### Process all CSV files in the specified directory

In [23]:
# Set Input working directory
input_directory = r'D:/wrdsTables/ciqtranscriptcomponent_chunks'

# Set output directory for processed CSV files
output_directory = r'D:/wrdsTables/ciqtranscriptcomponent_chunks_scored'
os.makedirs(output_directory, exist_ok=True) # create the folder if it does not exist

In [26]:
# Use glob to list all CSV files in the directory
import glob

csv_files = glob.glob(os.path.join(input_directory, '*.csv'))
print(f"Found {len(csv_files)} CSV files.")

for file in csv_files:
    print(f"\nProcessing file: {file}")
    try:
        # Read the csv file into a dataframe
        df = pd.read_csv(file)
    except Exception as e:
        print(f"Error reading file {file}: {e}")
        continue
    
    # Check that the expected "componenttext" column exists
    if 'componenttext' not in df.columns:
        print(f"Column 'componenttext' not found in file {file}. Skipping file.")
        continue
        
    # Apply the compute_avg_sentiment function to the 'componenttext' column
    df['avg_sentiment'] = df['componenttext'].apply(compute_avg_sentiment)
    
    # Get the transcriptid fromt he first row (assumes all rows share the same transcriptid)
    try:
        transcript_id = str(df['transcriptid'].iloc[0])
    except Exception as e:
        print(f"Error retrieving transcriptid from file {file}: {e}")
        continue
    
    # Create the output file name as '{transcriptid}_scored.csv' in the output directory
    output_file = os.path.join(output_directory, f"{transcript_id}_scored.csv")
    
    try:
        df.to_csv(output_file, index=False)
        print(f"Saved updated file: {output_file}")
    except Exception as e:
        print(f"Error saving file {output_file}: {e}")
    

Found 1007396 CSV files.

Processing file: D:/wrdsTables/ciqtranscriptcomponent_chunks\756002.csv
Saved updated file: D:/wrdsTables/ciqtranscriptcomponent_chunks_scored\756002_scored.csv

Processing file: D:/wrdsTables/ciqtranscriptcomponent_chunks\503514.csv


KeyboardInterrupt: 