<a href="https://colab.research.google.com/github/kumpaten/masters-thesis-code/blob/main/SentimentAnalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Attach Google Drive to the Colab VM; the rest of this notebook reads and
# writes paths below /content/drive.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [None]:
!pip install -U openai tiktoken pandas chardet

In [None]:
#@title Run GPT-4o on the 10-K (20-F) filings of each company and return sentiment scores for different aspects

from google.colab import userdata

import openai
import tiktoken
import os
import time
import pandas as pd
from openai import RateLimitError, APIError, OpenAIError

# Make sure Google Drive is mounted in Colab
# from google.colab import drive
# drive.mount('/content/drive')

# Configuration
# The API key is fetched via google.colab's `userdata.get` under the name
# 'API_KEY_OPENAI' and handed to the OpenAI client used for every request.
env_key = userdata.get('API_KEY_OPENAI')
client = openai.OpenAI(api_key=env_key)
# Default model for both the chat completion call and the tiktoken encoding
# lookup (see the `model=token_model` defaults of the functions below).
# NOTE(review): the cell title mentions GPT-4o, but the model requested here
# is gpt-4o-mini -- confirm which one is intended.
token_model = 'gpt-4o-mini'

# Intangible aspects to score
INTANGIBLE_ASPECTS = [
    "Leadership Quality",
    "Corporate Culture",
    "Customer Satisfaction Sentiment",
    "Innovation Sentiment",
    "Intellectual Property Strength",
    "Brand Strength",
    "Corporate Governance Quality"
]

# Directory containing one subfolder per company, each with cleaned .txt filings
BASE_DIR = '/content/drive/MyDrive/10K-20F-Filings-Cleaned'
# os.listdir returns entries in arbitrary order; sort so that companies are
# processed (and written to the CSVs) in the same order on every run.
companies = sorted(
    d for d in os.listdir(BASE_DIR)
    if os.path.isdir(os.path.join(BASE_DIR, d))
)

# Prepare DataFrames
# scores_df: one row per (company, year) with one column per aspect.
scores_df = pd.DataFrame(columns=['Company', 'Year'] + INTANGIBLE_ASPECTS)
# justifications_df: one row per (company, year, aspect).
justifications_df = pd.DataFrame(columns=['Company', 'Year', 'Aspect', 'Justification'])

# Function to split long text into chunks
def chunk_text(text, max_tokens=100000, model=token_model):
    """Split *text* into consecutive pieces of at most *max_tokens* tokens.

    The text is tokenised with the tiktoken encoding registered for *model*,
    the token sequence is cut into back-to-back windows of *max_tokens*
    tokens, and each window is decoded back into a string.

    Args:
        text: The document to split.
        max_tokens: Upper bound on the number of tokens per piece.
        model: Model name passed to ``tiktoken.encoding_for_model``.

    Returns:
        A list of decoded text pieces, in document order.
    """
    encoder = tiktoken.encoding_for_model(model)
    token_ids = encoder.encode(text)

    pieces = []
    for start in range(0, len(token_ids), max_tokens):
        window = token_ids[start:start + max_tokens]
        pieces.append(encoder.decode(window))
    return pieces

# Function to call GPT for one chunk
def analyze_chunk(chunk, aspects, previous_scores=None, model=token_model):
    """Ask the chat model to score *aspects* for one chunk of a filing.

    Builds a single-user-message prompt containing the aspect list, the
    optional previous-year scores and the chunk text, and requests a markdown
    table in response. The request is attempted up to five times, waiting 60
    seconds between attempts.

    Args:
        chunk: Text of the filing chunk to analyse.
        aspects: Iterable of aspect names to be rated.
        previous_scores: Optional scores from the previous filing; included
            in the prompt verbatim when truthy.
        model: Chat model name passed to the OpenAI client.

    Returns:
        The text content of the first choice, or an empty string when the
        API returned no text content.

    Raises:
        RuntimeError: If every attempt failed; the last OpenAI error is
            attached as ``__cause__``.
    """
    max_attempts = 5
    retry_delay_s = 60

    ref = f"\nPrevious year scores: {previous_scores}\n" if previous_scores else ''
    prompt = f"""
You are a financial analyst assessing a company's annual filing.
Rate the following intangible aspects from 0 (very negative) to 10 (very positive):
{', '.join(aspects)}
{ref}
Respond strictly with a markdown table:
| Aspect | Score (0-10) | Justification (max 2 sentences) |
|--------|--------------|---------------------------------|
---
{chunk}
---
"""
    last_error = None
    for attempt in range(1, max_attempts + 1):
        try:
            resp = client.chat.completions.create(
                model=model,
                messages=[{"role": "user", "content": prompt}],
                temperature=0
            )
        # OpenAIError is the base class of RateLimitError and APIError, so
        # catching it alone covers the same set of failures.
        except OpenAIError as e:
            last_error = e
            if attempt == max_attempts:
                # No point in waiting another minute when we will not retry.
                print(f"Error (attempt {attempt}): {e}. Giving up.")
                break
            print(f"Error (attempt {attempt}): {e}. Retrying in 60s...")
            time.sleep(retry_delay_s)
        else:
            # `content` may be None; normalise so that callers can always
            # treat the result as a string.
            return resp.choices[0].message.content or ''
    raise RuntimeError("Failed after multiple retry attempts.") from last_error

# Function to parse GPT's markdown table output
def parse_results(markdown):
    """Parse the model's markdown table into ``{aspect: (score, justification)}``.

    Only lines that (after stripping surrounding whitespace) start with ``|``
    and contain at least three cells are considered. The header row, the
    separator row and rows whose score cannot be read as a number are
    skipped instead of raising.

    Tolerated deviations from the requested format:
      * leading/trailing whitespace around a row,
      * markdown emphasis around the aspect or score (``**Brand Strength**``),
      * scores written as a fraction (``8/10`` is read as ``8``),
      * ``|`` characters inside the justification (extra cells are re-joined).

    Args:
        markdown: The raw text returned by the model; may be empty or None.

    Returns:
        A dict mapping the aspect name to ``(float score, justification)``.
    """
    data = {}
    if not markdown:
        return data

    emphasis_chars = '*_` '
    for raw_line in markdown.splitlines():
        line = raw_line.strip()
        if not line.startswith('|'):
            continue

        cells = [c.strip() for c in line.strip('|').split('|')]
        if len(cells) < 3:
            continue

        # Separator rows consist solely of dashes, colons and spaces.
        if set(cells[0]) <= set('-: '):
            continue

        aspect = cells[0].strip(emphasis_chars)
        # Match the header by its first cell only, so that a justification
        # that happens to contain the word "Aspect" is not discarded.
        if aspect.casefold() == 'aspect':
            continue

        # Accept "8", "**8**" and "8/10"; anything else is skipped.
        score_text = cells[1].strip(emphasis_chars).split('/')[0].strip()
        try:
            score = float(score_text)
        except ValueError:
            continue

        # A '|' inside the justification yields extra cells; stitch them back.
        justification = ' | '.join(cells[2:])
        data[aspect] = (score, justification)
    return data

# Main loop processing all companies and years
# For every company folder, each .txt filing is chunked, every chunk is scored
# by the model, the per-chunk scores are averaged per aspect, and the running
# results are written to Drive after each filing.

# Case-insensitive lookup from whatever label the model returns to the
# canonical aspect name used as the accumulator key / CSV column.
_aspect_lookup = {asp.casefold(): asp for asp in INTANGIBLE_ASPECTS}

for company in companies:
    comp_dir = os.path.join(BASE_DIR, company)
    # Alphabetical order; presumably the file names end in the year so that
    # this is also chronological -- verify against the Drive folder.
    files = sorted(f for f in os.listdir(comp_dir) if f.endswith('.txt'))
    previous = None

    for fname in files:
        # The year is taken as the last '_'-separated token of the file stem.
        year = os.path.splitext(fname)[0].split('_')[-1]
        print(f"Analyzing {company} ({year})...")
        filepath = os.path.join(comp_dir, fname)

        # errors='replace' keeps a single undecodable byte from aborting the
        # whole run with UnicodeDecodeError.
        with open(filepath, 'r', encoding='utf-8', errors='replace') as f:
            text = f.read()

        chunks = chunk_text(text)
        scores_accum = {asp: [] for asp in INTANGIBLE_ASPECTS}
        just_accum = {asp: [] for asp in INTANGIBLE_ASPECTS}

        for idx, chunk in enumerate(chunks, 1):
            print(f" Chunk {idx}/{len(chunks)}")
            md_output = analyze_chunk(chunk, INTANGIBLE_ASPECTS, previous)
            parsed = parse_results(md_output)
            for asp, (sc, just) in parsed.items():
                # Map the returned label onto a known aspect; ignore labels
                # the model invented instead of crashing with KeyError.
                canonical = _aspect_lookup.get(asp.strip('*_` ').casefold())
                if canonical is None:
                    print(f" Ignoring unrecognised aspect: {asp!r}")
                    continue
                scores_accum[canonical].append(sc)
                just_accum[canonical].append(just)
            # Pause between requests (kept from the original pacing).
            time.sleep(61)

        # Aggregate average scores; an aspect with no parsed score (empty
        # file, missing table row) is recorded as None rather than raising
        # ZeroDivisionError.
        avg_scores = {
            asp: round(sum(v) / len(v), 2) if v else None
            for asp, v in scores_accum.items()
        }
        row = {'Company': company, 'Year': year, **avg_scores}
        scores_df = pd.concat([scores_df, pd.DataFrame([row])], ignore_index=True)

        # Build justifications entries: concatenate the per-chunk
        # justifications and keep only the first two sentences.
        for asp, justs in just_accum.items():
            combined = ' '.join(justs)
            sents = combined.split('. ')
            summary = '. '.join(sents[:2]) + ('.' if len(sents) > 2 else '')
            just_row = {'Company': company, 'Year': year, 'Aspect': asp, 'Justification': summary}
            justifications_df = pd.concat([justifications_df, pd.DataFrame([just_row])], ignore_index=True)

        # Only aspects that actually received a score are passed on as the
        # reference for the next filing; an empty dict means "no reference".
        previous = {asp: sc for asp, sc in avg_scores.items() if sc is not None}

        # Save interim results to Drive
        scores_df.to_csv('/content/drive/MyDrive/intangible_scores.csv', index=False)
        justifications_df.to_csv('/content/drive/MyDrive/intangible_justifications.csv', index=False)
        print("Saved interim results.")

print("All done!")

