In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import scipy.special
import torch
import gradio as gr
import pandas as pd
import re
import PyPDF2 
import numpy as np
import matplotlib.pyplot as plt
from experta import *

# Load the stopwords (one per line). splitlines() is used instead of
# split("\n")[:-1] so the last stopword is kept even when the file does not
# end with a trailing newline.
with open("stopwords.txt", "r") as f:
    stopwords = f.read().splitlines()

# Load the Loughran-McDonald dictionary and collect, lowercased, the words
# whose "Positive" / "Negative" column is non-zero.
lm_dict = pd.read_csv("Loughran-McDonald_MasterDictionary_1993-2023.csv")
pos_words = lm_dict[lm_dict["Positive"] != 0]["Word"].str.lower().to_list()
neg_words = lm_dict[lm_dict["Negative"] != 0]["Word"].str.lower().to_list()

# Load FinBert model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")

def preprocess_text(text):
    """Normalize raw text into a space-separated string of lowercase words.

    The text is split on whitespace; each token is lowercased and has any
    leading/trailing ASCII punctuation stripped, so that "profit," or
    "growth." become "profit" and "growth" instead of being discarded.
    Tokens are then dropped when they are stopwords (per the module-level
    ``stopwords`` collection) or are not purely alphabetic, which removes
    numbers and words with embedded punctuation such as "year-end".

    Args:
        text: The raw text to clean.

    Returns:
        The surviving words joined by single spaces ("" if none survive).
    """
    import string  # local import: only needed for the punctuation table

    # Set lookup keeps the stopword test O(1) per token.
    stopword_set = set(stopwords)

    kept = []
    for token in text.split():
        word = token.lower().strip(string.punctuation)
        # isalpha() is False for "", so tokens that were pure punctuation
        # are dropped here as well.
        if word.isalpha() and word not in stopword_set:
            kept.append(word)

    return " ".join(kept)

def calculate_sentiment(text, cutoff=0.4):
    """Score ``text`` against the Loughran-McDonald positive/negative words.

    The text is split on whitespace and every token is compared as-is with
    the module-level ``pos_words`` and ``neg_words`` lists (both lowercased),
    so callers should pass lowercased text. The word counts, both scores and
    the final label are printed as a side effect.

    Args:
        text: Whitespace-separated words to score.
        cutoff: Non-negative threshold applied to the second score. A score
            strictly above ``cutoff`` is "positive", strictly below
            ``-cutoff`` is "negative", anything in between is "neutral".

    Returns:
        A tuple ``(lm_score1, lm_score2, sentiment)`` where
        ``lm_score1 = (n_pos - n_neg) / n_total`` and
        ``lm_score2 = (n_pos - n_neg) / (n_pos + n_neg)`` (0 when the text
        contains no dictionary words). For text without any words the
        result is ``(0, 0, "neutral")``.

    Raises:
        ValueError: If ``cutoff`` is negative. A negative cutoff makes the
            positive and negative ranges overlap and the neutral band
            unreachable.
    """
    if cutoff < 0:
        raise ValueError(f"cutoff must be non-negative, got {cutoff}")

    words = text.split()

    # Build sets once so each membership test is O(1) instead of a list scan.
    pos_set = set(pos_words)
    neg_set = set(neg_words)
    n_pos = sum(1 for w in words if w in pos_set)
    n_neg = sum(1 for w in words if w in neg_set)
    n_total = len(words)

    # Print the number of positive and negative words
    print(f"Positive words count: {n_pos}")
    print(f"Negative words count: {n_neg}")
    print(f"Total words count: {n_total}")
    if n_total == 0:
        print("No words to analyze. Sentiment is neutral by default.")
        return 0, 0, "neutral"

    # Calculate Loughran-McDonald Scores
    n_matched = n_pos + n_neg
    lm_score1 = (n_pos - n_neg) / n_total
    lm_score2 = (n_pos - n_neg) / n_matched if n_matched != 0 else 0

    print(f"Loughran-McDonald Score 1: {lm_score1:.4f}")
    print(f"Loughran-McDonald Score 2: {lm_score2:.4f}")

    # Determine sentiment from lm_score2 using a symmetric band around zero.
    if lm_score2 > cutoff:
        sentiment = "positive"
    elif lm_score2 < -cutoff:
        sentiment = "negative"
    else:
        sentiment = "neutral"

    print(f"Final Sentiment: {sentiment.capitalize()}")

    return lm_score1, lm_score2, sentiment

def get_finbert_sentiment(text: str) -> tuple[float, float, float, str]:
    """Classify ``text`` with the module-level FinBERT ``model``.

    The text is tokenized with padding and truncation to at most 512 tokens,
    run through the model with gradient tracking disabled, and the squeezed
    logits are turned into probabilities with a softmax. Probabilities are
    paired, in order, with the values of ``model.config.id2label``.

    Args:
        text: The text to classify.

    Returns:
        ``(positive, negative, neutral, label)``: the three class scores
        followed by the label with the highest score. The scores are the
        elements of the NumPy array returned by the softmax.
        Assumes the model's labels are "positive", "negative" and "neutral";
        a missing label raises ``KeyError``.
    """
    with torch.no_grad():
        encoded = tokenizer(
            text, return_tensors="pt", padding=True, truncation=True, max_length=512
        )
        raw_logits = model(**encoded).logits

        label_names = model.config.id2label.values()
        probabilities = scipy.special.softmax(raw_logits.numpy().squeeze())
        scores = dict(zip(label_names, probabilities))

        positive = scores["positive"]
        negative = scores["negative"]
        neutral = scores["neutral"]
        top_label = max(scores, key=scores.get)
        return positive, negative, neutral, top_label

  torch.utils._pytree._register_pytree_node(
  return torch.load(checkpoint_file, map_location=map_location)


In [2]:

# Example interpretation functions
def interpret_beneish_score(results):
    """Turn the Beneish M-Score in ``results`` into a one-sentence verdict.

    Args:
        results: Mapping that must contain the key ``'M-Score'``.

    Returns:
        A sentence reporting a low probability of earnings manipulation for
        scores below -1.78, a moderate one for scores from -1.78 to -1.22
        inclusive, and a high one otherwise.

    Raises:
        KeyError: If ``'M-Score'`` is missing from a dict ``results``.
    """
    m_score = results['M-Score']

    if m_score < -1.78:
        return "Depends on Beneish M Score Indicator, this analysis indicates a low probability of earnings manipulation for the company."

    # Explicit two-sided check so non-comparable values (e.g. NaN) fall
    # through to the final branch.
    in_moderate_band = -1.78 <= m_score <= -1.22
    if in_moderate_band:
        return "Depends on Beneish M Score Indicator, this analysis indicates a moderate probability of earnings manipulation for the company."

    return "Depends on Beneish M Score Indicator, this analysis suggests a high probability of earnings manipulation for the company."

def interpret_piotroski_score(results):
    """Describe the company's financial position from its Piotroski F-Score.

    Args:
        results: Mapping read with ``.get('PiotroskiFScore', 0)``; a missing
            key is treated as a score of 0.

    Returns:
        A headline sentence ("strong" for scores of 8 or more, "moderate"
        for scores from 5 up to but excluding 8, "weak" otherwise) followed
        by a fixed explanation of the nine criteria, one per line.
    """
    f_score = results.get('PiotroskiFScore', 0)

    if score_is_strong := f_score >= 8:
        headline = "Depends on Piotroski Indicator, The company shows a strong financial position.\n"
    elif 5 <= f_score < 8:
        headline = "Depends on Piotroski Indicator, The company has a moderate financial position.\n"
    else:
        headline = "Depends on Piotroski Indicator, The company is in a weak financial position.\n"

    # Explanation shared by every verdict: a leading blank line, then one
    # newline-terminated line per criterion.
    criteria = (
        "Net Income: Indicates if the company is profitable.",
        "Return on Assets (ROA): Reflects the efficiency of asset use.",
        "Cash Flow from Operations (CFO): Shows operational cash generation.",
        "Accrual Accounting: Compares CFO to Net Income.",
        "Leverage: Lower leverage is preferred compared to the previous year.",
        "Liquidity: Higher current ratio compared to the previous year is favorable.",
        "Dilution: No increase in shares outstanding indicates better equity health.",
        "Gross Margin: Indicates cost control relative to revenue.",
        "Asset Turnover: Higher turnover indicates better asset utilization.",
    )
    explanation = "\n" + "".join(f"{line}\n" for line in criteria)

    return headline + explanation

def interpret_springate_score(results):
    """Summarize the Springate score in ``results`` as a one-line verdict.

    Args:
        results: Mapping read with ``.get('SpringateScore', 0)``; a missing
            key is treated as a score of 0.

    Returns:
        The "stable state" sentence when the score is strictly greater than
        0.862, otherwise the "financial stress" sentence.
    """
    springate = results.get('SpringateScore', 0)
    return (
        "Depends on Springate Indicator, The company is in a stable state."
        if springate > 0.862
        else "Depends on Springate Indicator, The company might be under financial stress."
    )

def interpret_sentiment(lm_sentiment, finbert_sentiment):
    """Render both sentiment labels as two colored HTML paragraphs.

    Each label is lowercased and looked up in a fixed table of
    (explanation, color) pairs for "positive", "negative" and "neutral";
    any other label is rendered as "Unclear" in black.

    Args:
        lm_sentiment: Label from the Loughran-McDonald analysis.
        finbert_sentiment: Label from the FinBERT analysis.

    Returns:
        An HTML snippet with one ``<p>`` per analysis, Loughran-McDonald
        first, each styled with the color of its sentiment.
    """
    fallback = ("Unclear", "black")
    table = {
        'positive': ("The overall sentiment is positive, suggesting a favorable financial outlook.", "green"),
        'negative': ("The overall sentiment is negative, indicating potential concerns.", "red"),
        'neutral': ("The overall sentiment is neutral, implying a stable outlook.", "gray"),
    }

    lm_text, lm_shade = table.get(lm_sentiment.lower(), fallback)
    bert_text, bert_shade = table.get(finbert_sentiment.lower(), fallback)

    # The snippet begins with a newline, indents each paragraph by four
    # spaces and ends with a four-space line.
    pieces = [
        "\n",
        f'    <p style="color: {lm_shade};">Loughran-McDonald Sentiment: {lm_text}</p>\n',
        f'    <p style="color: {bert_shade};">FinBERT Sentiment: {bert_text}</p>\n',
        "    ",
    ]
    return "".join(pieces)

