In [21]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import torch.nn.functional as F # To apply softmax for probabilities
import time

In [22]:
# Mount Google Drive into the Colab filesystem so the files under
# /content/drive/MyDrive used by the cells below are reachable.
# NOTE: google.colab is only importable inside a Google Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [26]:
# Input CSV lives directly in the Drive root (MyDrive), not inside 'Colab Notebooks'.
FILE_PATH = '/content/drive/MyDrive/Bitcoin_submissions_finbert_preprocessed_20250425_142802.csv'

# Derive the output path from the input path (same rule as the config cell)
# so the two can never drift apart when FILE_PATH is edited.
OUTPUT_FILE_PATH = FILE_PATH.replace('.csv', '_finbert_analyzed.csv')

In [27]:
# --- Run configuration ---
# Hugging Face model id passed to from_pretrained (standard FinBERT model
# fine-tuned for financial sentiment).
MODEL_NAME = "ProsusAI/finbert"
# Name of the CSV column whose text is scored.
TEXT_COLUMN = "text_to_analyze"
# Results are written next to the input file, with a suffix inserted before the extension.
OUTPUT_FILE_PATH = FILE_PATH.replace(".csv", "_finbert_analyzed.csv")

In [28]:
# Pick the compute device: CUDA when torch reports one, otherwise CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Tell the reader which device the model will run on.
if device.type == "cuda":
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")
else:
    print("Using CPU. Processing might be slow.")


Using GPU: Tesla T4


In [31]:
# Load the FinBERT tokenizer and sequence-classification model, then move
# the model onto the device selected earlier.
print(f"Loading FinBERT model ({MODEL_NAME}) and tokenizer...")
try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
    model.to(device) # Move model to GPU if available
    print("Model and tokenizer loaded successfully.")
except Exception as e:
    print(f"Error loading model/tokenizer: {e}")
    # Re-raise rather than calling exit(): inside a notebook kernel exit()
    # requests a kernel shutdown (losing all state) instead of halting the
    # cell, and it hides the traceback needed to diagnose the failure.
    raise

Loading FinBERT model (ProsusAI/finbert) and tokenizer...
Model and tokenizer loaded successfully.


In [30]:
# Sanity check: list the Drive root to confirm the input CSV exists at the
# location FILE_PATH points to before trying to read it.
!ls -l /content/drive/MyDrive

total 121745
-rw------- 1 root root 124653165 Apr 25 12:28  Bitcoin_submissions_finbert_preprocessed_20250425_142802.csv
drwx------ 3 root root      4096 Oct 24  2024 'Colab Notebooks'
drwx------ 2 root root      4096 Sep  6  2016  Dokumente
drwx------ 2 root root      4096 Sep 19  2022  GoodNotes
-rw------- 1 root root       172 Apr  7 19:21  thesheet.gsheet


In [32]:
# Load the preprocessed submissions CSV into `df` and validate that the
# text column to be scored is present.
print(f"\nLoading data from: {FILE_PATH}")
try:
    # pandas decodes as UTF-8 by default; retry with latin-1 if that fails.
    try:
        df = pd.read_csv(FILE_PATH)
    except UnicodeDecodeError:
        print("UTF-8 decoding failed, trying latin-1...")
        df = pd.read_csv(FILE_PATH, encoding='latin-1')
except FileNotFoundError:
    print(f"Error: File not found at {FILE_PATH}")
    # Re-raise instead of exit(): in a notebook kernel exit() requests a
    # kernel shutdown rather than stopping the cell at this line.
    raise
except Exception as e:
    print(f"Error loading or reading CSV: {e}")
    raise

print(f"Successfully loaded {len(df)} rows.")
print("DataFrame head:\n", df.head())

# Verify the text column exists. This check sits outside the try block so a
# schema problem is not misreported as a CSV read error.
if TEXT_COLUMN not in df.columns:
    print(f"\nError: Column '{TEXT_COLUMN}' not found in the CSV.")
    print("Available columns:", df.columns.tolist())
    raise KeyError(f"Column '{TEXT_COLUMN}' not found in {FILE_PATH}")

# Report missing values in the text column; get_finbert_sentiment labels
# such rows 'no_text' instead of running the model on them.
missing_text = df[TEXT_COLUMN].isnull().sum()
if missing_text > 0:
    print(f"\nWarning: Found {missing_text} missing values in '{TEXT_COLUMN}'. These will be skipped.")


Loading data from: /content/drive/MyDrive/Bitcoin_submissions_finbert_preprocessed_20250425_142802.csv
Successfully loaded 384151 rows.
DataFrame head:
          author  score                                               link  \
0   u/[deleted]      0  https://www.reddit.com/r/Bitcoin/comments/ko10...   
1  u/randum-guy      0  https://www.reddit.com/r/Bitcoin/comments/ko12...   
2    u/Mari0805    119  https://www.reddit.com/r/Bitcoin/comments/ko15...   
3   u/[deleted]      0  https://www.reddit.com/r/Bitcoin/comments/ko17...   
4   u/[deleted]      1  https://www.reddit.com/r/Bitcoin/comments/ko18...   

               created                                    text_to_analyze  
0  2021-01-01 01:00:00                first time saved made money deleted  
1  2021-01-01 01:02:00  btc dip to 20k is it possible for bitcoin to d...  
2  2021-01-01 01:07:00  btc just had the monthly and yearly close 2020...  
3  2021-01-01 01:10:00                       i believe in bitcoin deleted  
4  

In [33]:
# --- Define Sentiment Analysis Function ---
# Class order this notebook originally assumed (0: positive, 1: negative,
# 2: neutral). Used only when the loaded model's config does not expose
# exactly these three label names.
_DEFAULT_FINBERT_LABELS = ('positive', 'negative', 'neutral')


def _finbert_label_order():
    """Return the lower-cased class labels ordered by logit index.

    Reads `model.config.id2label` so the mapping follows whichever checkpoint
    MODEL_NAME points to; falls back to the default order when the config is
    missing or does not contain exactly positive/negative/neutral.
    """
    id2label = getattr(getattr(model, 'config', None), 'id2label', None)
    if id2label:
        by_index = sorted(id2label.items(), key=lambda item: int(item[0]))
        ordered = [str(label).lower() for _, label in by_index]
        if sorted(ordered) == sorted(_DEFAULT_FINBERT_LABELS):
            return ordered
    return list(_DEFAULT_FINBERT_LABELS)


def get_finbert_sentiment(text):
    """
    Analyzes the sentiment of a given text using the loaded FinBERT model.

    Relies on the module-level `tokenizer`, `model` and `device` objects.

    Args:
        text (str): The input text.

    Returns:
        tuple: (sentiment_label, prob_positive, prob_negative, prob_neutral)
               Returns ('no_text', 0.0, 0.0, 0.0) for non-string, empty or
               whitespace-only input (this covers None and NaN).
               Returns ('error', 0.0, 0.0, 0.0) if analysis fails; a
               RuntimeWarning is emitted so failures are not silent.
    """
    # A str is never NaN, so the isinstance check alone rejects None/NaN/numbers.
    if not isinstance(text, str) or text.strip() == "":
        return 'no_text', 0.0, 0.0, 0.0

    try:
        # Tokenize text - truncate long inputs to the 512-token limit
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
        inputs = {k: v.to(device) for k, v in inputs.items()} # Same device as the model

        # Inference only: no gradients needed
        with torch.no_grad():
            outputs = model(**inputs)

        # Softmax over the class dimension; [0] selects the single input row
        probabilities = F.softmax(outputs.logits, dim=-1)[0].cpu()

        # Map logit positions to label names using the model's own config,
        # so a checkpoint with a different class order is not mislabelled.
        labels = _finbert_label_order()
        sentiment_label = labels[int(torch.argmax(probabilities))]
        prob_by_label = dict(zip(labels, probabilities.tolist()))

        return (
            sentiment_label,
            prob_by_label['positive'],
            prob_by_label['negative'],
            prob_by_label['neutral'],
        )

    except Exception as e:
        # Best-effort: keep processing the remaining rows, but surface the
        # failure. The message omits the text so Python's default warning
        # filter collapses repeats of the same error type.
        import warnings
        warnings.warn(
            f"FinBERT analysis failed with {type(e).__name__}; returning 'error' result",
            RuntimeWarning,
        )
        return 'error', 0.0, 0.0, 0.0

In [34]:
# --- Score every row with FinBERT ---
print(f"\nStarting FinBERT sentiment analysis on the '{TEXT_COLUMN}' column...")
print(f"Processing {len(df)} rows. This may take a while (especially on CPU)...")

start_time = time.time()

# One model call per row via Series.apply; every element of `results` is the
# 4-tuple returned by get_finbert_sentiment. Batched inference would be the
# next step if this becomes too slow.
results = df[TEXT_COLUMN].apply(get_finbert_sentiment)

# --- Attach the scores to the DataFrame ---
# Expand the tuples into a frame aligned on df's index, then assign the four
# new columns in one go.
sentiment_columns = [
    'finbert_sentiment',
    'finbert_prob_positive',
    'finbert_prob_negative',
    'finbert_prob_neutral',
]
sentiment_frame = pd.DataFrame(results.tolist(), index=df.index)
df[sentiment_columns] = sentiment_frame

end_time = time.time()
print(f"\nSentiment analysis complete in {end_time - start_time:.2f} seconds.")
print("\nDataFrame head with new sentiment columns:\n", df.head())

# How many rows landed in each sentiment class
print("\nSentiment Distribution:")
print(df['finbert_sentiment'].value_counts())

# --- Persist the scored DataFrame ---
print(f"\nSaving results to: {OUTPUT_FILE_PATH}")
try:
    # utf-8-sig writes a BOM, which helps Excel detect the encoding
    df.to_csv(OUTPUT_FILE_PATH, index=False, encoding='utf-8-sig')
except Exception as e:
    print(f"Error saving results to CSV: {e}")
else:
    print("Results saved successfully.")

print("\nScript finished.")


Starting FinBERT sentiment analysis on the 'text_to_analyze' column...
Processing 384151 rows. This may take a while (especially on CPU)...

Sentiment analysis complete in 3556.20 seconds.

DataFrame head with new sentiment columns:
          author  score                                               link  \
0   u/[deleted]      0  https://www.reddit.com/r/Bitcoin/comments/ko10...   
1  u/randum-guy      0  https://www.reddit.com/r/Bitcoin/comments/ko12...   
2    u/Mari0805    119  https://www.reddit.com/r/Bitcoin/comments/ko15...   
3   u/[deleted]      0  https://www.reddit.com/r/Bitcoin/comments/ko17...   
4   u/[deleted]      1  https://www.reddit.com/r/Bitcoin/comments/ko18...   

               created                                    text_to_analyze  \
0  2021-01-01 01:00:00                first time saved made money deleted   
1  2021-01-01 01:02:00  btc dip to 20k is it possible for bitcoin to d...   
2  2021-01-01 01:07:00  btc just had the monthly and yearly close 2020.