In [None]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import torch.nn.functional as F
import time

In [None]:
from google.colab import drive
drive.mount('/content/drive')
# FILE_PATH = '/content/drive/MyDrive/CryptoCurrency_comments_finbert_preprocessed_20250503_175536.csv'
FILE_PATH = '/content/drive/MyDrive/Bitcoin_comments_finbert_preprocessed_20250503_173153.csv'

Mounted at /content/drive


In [None]:
# config
TEXT_COLUMN = 'text_to_analyze'
OUTPUT_FILE_PATH = FILE_PATH.replace('.csv', '_finbert_analyzed.csv')
MODEL_NAME = "ProsusAI/finbert" # Standard FinBERT model fine-tuned for financial sentiment

In [None]:
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")
else:
    device = torch.device("cpu")
    print("Using CPU. Processing might be slow.")

Using GPU: Tesla T4


In [None]:
# load tokenizer
print(f"Loading FinBERT model ({MODEL_NAME}) and tokenizer...")
try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
    model.to(device) # Move model to GPU if available
    print("Model and tokenizer loaded successfully.")
except Exception as e:
    print(f"Error loading model/tokenizer: {e}")
    exit()

Loading FinBERT model (ProsusAI/finbert) and tokenizer...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


Model and tokenizer loaded successfully.


In [None]:
# Add this in a new cell RIGHT BEFORE your pd.read_csv cell
!ls -l /content/drive/MyDrive

total 3873459
-rw------- 1 root root 1697876297 May  3 15:32  Bitcoin_comments_finbert_preprocessed_20250503_173153.csv
-rw------- 1 root root  124653165 Apr 25 12:28  Bitcoin_submissions_finbert_preprocessed_20250425_142802.csv
-rw------- 1 root root  150161340 May  4 20:21  Bitcoin_submissions_finbert_preprocessed_20250425_142802_finbert_analyzed.csv
drwx------ 2 root root       4096 Oct 24  2024 'Colab Notebooks'
-rw------- 1 root root  817020183 May  3 15:55  CryptoCurrency_comments_finbert_preprocessed_20250503_175536.csv
-rw------- 1 root root  941341835 May  6 23:39  CryptoCurrency_comments_finbert_preprocessed_20250503_175536_finbert_analyzed.csv
-rw------- 1 root root  112123124 Apr 25 12:20  CryptoCurrency_submissions_finbert_preprocessed_20250425_142002.csv
-rw------- 1 root root  123230755 May  5 17:11  CryptoCurrency_submissions_finbert_preprocessed_20250425_142002_finbert_analyzed.csv
drwx------ 2 root root       4096 Sep  6  2016  Dokumente
drwx------ 2 root root       4

In [None]:
# load data
print(f"\nLoading data from: {FILE_PATH}")
try:

    try:
        df = pd.read_csv(FILE_PATH)
    except UnicodeDecodeError:
        print("UTF-8 decoding failed, trying latin-1...")
        df = pd.read_csv(FILE_PATH, encoding='latin-1')

    print(f"Successfully loaded {len(df)} rows.")
    print("DataFrame head:\n", df.head())

    if TEXT_COLUMN not in df.columns:
        print(f"\nError: Column '{TEXT_COLUMN}' not found in the CSV.")
        print("Available columns:", df.columns.tolist())
        exit()

    missing_text = df[TEXT_COLUMN].isnull().sum()
    if missing_text > 0:
        print(f"\nWarning: Found {missing_text} missing values in '{TEXT_COLUMN}'. These will be skipped.")
        # Optionally fill NaN values if needed, e.g., df[TEXT_COLUMN].fillna('', inplace=True)

except FileNotFoundError:
    print(f"Error: File not found at {FILE_PATH}")
    exit()
except Exception as e:
    print(f"Error loading or reading CSV: {e}")
    exit()


Loading data from: /content/drive/MyDrive/Bitcoin_comments_finbert_preprocessed_20250503_173153.csv
Successfully loaded 6336680 rows.
DataFrame head:
          author  score                                               link  \
0   u/andreasma      5  https://www.reddit.com/r/Bitcoin/comments/knch...   
1   u/[deleted]      1  https://www.reddit.com/r/Bitcoin/comments/km3g...   
2  u/IceCl4nHat      3  https://www.reddit.com/r/Bitcoin/comments/ko0a...   
3   u/[deleted]      1  https://www.reddit.com/r/Bitcoin/comments/km3g...   
4    u/zefy_zef      1  https://www.reddit.com/r/Bitcoin/comments/knmp...   

               created                                    text_to_analyze  
0  2021-01-01 01:00:00  i understand what youre saying perhaps i shoul...  
1  2021-01-01 01:00:00                                            deleted  
2  2021-01-01 01:00:00      they have electrum on ios too edit they don’t  
3  2021-01-01 01:01:00                                            removed  
4  20

In [None]:
# define SA function
def get_finbert_sentiment(text):
    """
    Analyzes the sentiment of a given text using the loaded FinBERT model.

    Args:
        text (str): The input text.

    Returns:
        tuple: (sentiment_label, prob_positive, prob_negative, prob_neutral)
               Returns ('no_text', 0.0, 0.0, 0.0) for invalid input.
               Returns ('error', 0.0, 0.0, 0.0) if analysis fails.
    """
    # Handle non-string or empty/NaN input
    if not isinstance(text, str) or pd.isna(text) or text.strip() == "":
        return 'no_text', 0.0, 0.0, 0.0

    try:
        # Tokenize text - Truncate long texts (BERT has a limit, often 512 tokens)
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
        inputs = {k: v.to(device) for k, v in inputs.items()} # Move inputs to the same device as the model

        # Get predictions (run inference) !!
        with torch.no_grad(): # Disable gradient calculations for efficiency
            outputs = model(**inputs)

        # Process logits (raw model output) to get probabilities and prediction
        logits = outputs.logits
        probabilities = F.softmax(logits, dim=-1).cpu() # Apply softmax and move back to CPU
        predicted_class_id = torch.argmax(probabilities, dim=-1).item()

        # Map prediction to label (FinBERT labels: 0: positive, 1: negative, 2: neutral)
        labels = ['positive', 'negative', 'neutral']
        sentiment_label = labels[predicted_class_id]

        # Get probabilities for each class
        prob_positive = probabilities[0][0].item()
        prob_negative = probabilities[0][1].item()
        prob_neutral = probabilities[0][2].item()

        return sentiment_label, prob_positive, prob_negative, prob_neutral

    except Exception as e:
        # print(f"Error processing text: '{text[:50]}...' - {e}") # Uncomment for debugging
        return 'error', 0.0, 0.0, 0.0

In [9]:
# apply SA
print(f"\nStarting FinBERT sentiment analysis on the '{TEXT_COLUMN}' column...")
print(f"Processing {len(df)} rows. This may take a while (especially on CPU)...")

start_time = time.time()

# Apply the function row by row.
# Note: For very large datasets (> millions of rows), consider batch processing
# or libraries like pandarallel/Dask for speedup. For 150MB, this should be okay.
results = df[TEXT_COLUMN].apply(get_finbert_sentiment)

# --- Add Results to DataFrame ---
# Create new columns from the tuple returned by the function
df[['finbert_sentiment', 'finbert_prob_positive', 'finbert_prob_negative', 'finbert_prob_neutral']] = pd.DataFrame(results.tolist(), index=df.index)

end_time = time.time()
print(f"\nSentiment analysis complete in {end_time - start_time:.2f} seconds.")
print("\nDataFrame head with new sentiment columns:\n", df.head())

# Display sentiment distribution
print("\nSentiment Distribution:")
print(df['finbert_sentiment'].value_counts())

# --- Save Results ---
print(f"\nSaving results to: {OUTPUT_FILE_PATH}")
try:
    df.to_csv(OUTPUT_FILE_PATH, index=False, encoding='utf-8-sig')
    print("Results saved successfully.")
except Exception as e:
    print(f"Error saving results to CSV: {e}")

print("\nScript finished.")


Starting FinBERT sentiment analysis on the 'text_to_analyze' column...
Processing 6336680 rows. This may take a while (especially on CPU)...

Sentiment analysis complete in 54140.26 seconds.

DataFrame head with new sentiment columns:
          author  score                                               link  \
0   u/andreasma      5  https://www.reddit.com/r/Bitcoin/comments/knch...   
1   u/[deleted]      1  https://www.reddit.com/r/Bitcoin/comments/km3g...   
2  u/IceCl4nHat      3  https://www.reddit.com/r/Bitcoin/comments/ko0a...   
3   u/[deleted]      1  https://www.reddit.com/r/Bitcoin/comments/km3g...   
4    u/zefy_zef      1  https://www.reddit.com/r/Bitcoin/comments/knmp...   

               created                                    text_to_analyze  \
0  2021-01-01 01:00:00  i understand what youre saying perhaps i shoul...   
1  2021-01-01 01:00:00                                            deleted   
2  2021-01-01 01:00:00      they have electrum on ios too edit they d