In [None]:
import os, time, requests, pandas as pd
from datetime import date, timedelta, datetime
from dateutil.relativedelta import relativedelta
import re, html
from typing import Iterable, Dict, List
import numpy as np
import ast

In [None]:
# Maximum number of characters kept per extracted snippet (passed as `max_len` below).
MAX_CHAR = 2000
# Number of neighbouring sentences kept on each side of a matching sentence (passed as `window` below).
WINDOW = 1

In [None]:
# Load the Guardian article dump (path is relative to the notebook's working directory).
# NOTE(review): the rendered output shows a "version https://git-lfs.github.com/spec/v1" column and a
# first row holding only "size ..." — this looks like a git-lfs pointer header mixed into the CSV; confirm
# the file was exported cleanly.
news = pd.read_csv('../data/guardian_financial_news_master.csv')
# Use the positional row index as a per-article identifier that survives later filtering/concatenation.
news['article_id'] = news.index
news

Unnamed: 0,version https://git-lfs.github.com/spec/v1,url,title,pub_date,summary,body,article_id
0,size 371154095,,,,,,0
1,,https://www.theguardian.com/technology/2017/fe...,Facebook's Oculus must pay $500m in virtual re...,2017-02-01T22:36:43Z,Jury determined that VR company’s CEO Palmer L...,The Facebook-owned company Oculus has been ord...,1
2,,https://www.theguardian.com/politics/blog/live...,MPs vote to give May power to trigger article ...,2017-02-01T21:21:40Z,Rolling coverage of the day’s political develo...,MPs have taken a historic step towards taking ...,2
3,,https://www.theguardian.com/business/live/2017...,"Federal Reserve leaves interest rates on hold,...",2017-02-01T21:07:12Z,US central bank resists raising interest rate...,"And finally, Wall Street has closed a little h...",3
4,,https://www.theguardian.com/business/nils-prat...,TalkTalk hoping for rapid results as Dunstone ...,2017-02-01T19:29:32Z,Investors should welcome Sir Charles’s decisio...,You can understand why Sir Charles Dunstone mi...,4
...,...,...,...,...,...,...,...
32981,,https://www.theguardian.com/world/2022/dec/02/...,EU states agree $60 a barrel cap on Russian oi...,2022-12-02T18:09:34Z,"Poland, which was pushing for low cap, says de...",European Union member states have agreed to pu...,32981
32982,,https://www.theguardian.com/business/2022/dec/...,"Supermarkets don’t create jobs, they destroy t...",2022-12-08T17:34:21Z,Letter: <strong>Alistair Herbert </strong>says...,You report that Asda “is planning to open 300 ...,32982
32983,,https://www.theguardian.com/business/2022/dec/...,Insurers end war-risk cover for shipping in Uk...,2022-12-28T16:58:35Z,Move comes as a result of reinsurers exiting t...,Ship insurers are cancelling war-risk coverage...,32983
32984,,https://www.theguardian.com/politics/2022/dec/...,A treatment-based approach to tackling hard dr...,2022-12-22T17:14:33Z,Letters:<strong> Barry Coppinger </strong>writ...,Simon Jenkins (Even England’s police want to d...,32984


In [None]:
entities = pd.read_excel('../data/sp_500_constituents.xlsx')

# Ticker -> list of search aliases; the sheet stores the aliases as one comma-separated string.
TICKER_MAP = {}
for ticker, keywords in zip(entities['Ticker'], entities['Search Keywords']):
    aliases = []
    for piece in keywords.split(','):
        piece = piece.strip()
        if piece:
            aliases.append(piece)
    TICKER_MAP[ticker] = aliases

TICKER_MAP

{'ORCL': ['ORCL', 'Oracle', 'Oracle Corp'],
 'MSFT': ['MSFT', 'Microsoft', 'Microsoft Corp'],
 'TROW': ['T Rowe Price', 'T Rowe Price Group Inc', 'TROW', 'T Rowe'],
 'HON': ['HON', 'Honeywell', 'Honeywell International Inc'],
 'ADM': ['ADM',
  'Archer Daniels Midland',
  'Archer Daniels Midland Co',
  'Archer Daniels'],
 'FISV': ['FISV', 'Fiserv', 'Fiserv Inc'],
 'KO': ['Coca Cola', 'Cocala', 'KO', 'Coke'],
 'CDNS': ['CDNS',
  'Cadence Design',
  'Cadence Design Systems',
  'Cadence Design Systems Inc'],
 'ED': ['Consolidated Edison', 'Consolidated Edison Inc', 'ED'],
 'XRAY': ['Dentsply Sirona', 'Dentsply Sirona Inc', 'XRAY'],
 'FAST': ['FAST', 'Fastenal', 'Fastenal Co'],
 'DTE': ['DTE Energy', 'DTE Energy Co', 'DTE'],
 'ETN': ['ETN', 'Eaton', 'Eaton Corp', 'Eaton Corp Plc'],
 'SIVB': ['SVB Financial',
  'SVB Financial Group',
  'SIVB',
  'Silicon Valley Bank'],
 'XOM': ['Exxon Mobil', 'Exxon Mobil Corp', 'XOM'],
 'MGM': ['MGM Resorts', 'MGM Resorts International', 'MGM'],
 'WM': ['WM

In [None]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/yiruoli/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
def split_sentences(text: str):
    """Split ``text`` into sentences by delegating to ``nltk.sent_tokenize``.

    Presumably relies on the ``punkt`` tokenizer data fetched by the
    ``nltk.download('punkt')`` call earlier in the notebook.
    """
    return nltk.sent_tokenize(text)

In [None]:
# ---- Drop-in: extract sentences mentioning each company from your `articles` DF ----
def _strip_html(x: str) -> str:
    x = html.unescape(x or "")
    return re.sub(r"<[^>]+>", " ", x)

# Simple sentence splitter for news-style English: matches whitespace that follows '.', '!' or '?'
# and is followed by an uppercase letter or a digit.
# NOTE(review): the `(?<!\b[A-Z])` lookbehind inspects the same single character as `(?<=[.!?])`,
# so it can never reject a split — presumably it was meant to skip initials such as "J."; confirm.
# NOTE(review): not referenced by the functions visible in this notebook
# (`extract_company_sentences_from_text` uses its own inline pattern).
_SENT_SPLIT = re.compile(r'(?<!\b[A-Z])(?<=[.!?])\s+(?=[A-Z0-9])')

def _compile_alias_regex(aliases: Iterable[str]) -> re.Pattern:
    al = []
    for a in aliases:
        a = a.strip()
        if not a: 
            continue
        a = r"\s+".join(map(re.escape, a.split()))
        al.append(fr"\b{a}\b(?:'s|’s)?")
    return re.compile("|".join(al), flags=re.IGNORECASE) if al else re.compile(r"$^")


def _mark_alias_in_sentence(sent: str, aliases: Iterable[str]) -> str:
    if not isinstance(sent, str) or not sent.strip():
        return sent

    original = sent
    for a in aliases:
        a = a.strip()
        if not a:
            continue
        patt = r"\b" + r"\s+".join(map(re.escape, a.split())) + r"\b(?:'s|’s)?"
        regex = re.compile(patt, flags=re.IGNORECASE)
        if regex.search(original):
            return regex.sub(" [TGT] ", original, count=1)

    return original

def extract_company_sentences_from_text(
    text: str,
    aliases: Iterable[str],
    window: int,
    max_len: int
) -> List[str]:
    """Return one context snippet per sentence of ``text`` that mentions an alias.

    The text is cleaned with ``_strip_html`` and split after ``.``, ``!`` or
    ``?`` followed by whitespace. For every sentence matched by the alias
    regex, the sentence plus up to ``window`` neighbours on each side are
    joined with spaces; the matching sentence is passed through
    ``_mark_alias_in_sentence`` first. Each snippet is cut to ``max_len``
    characters.
    """
    cleaned = _strip_html(text)
    sentences = [
        piece
        for piece in re.split(r'(?<=[.!?])\s+', cleaned)
        if piece.strip()
    ]
    alias_pattern = _compile_alias_regex(aliases)
    total = len(sentences)

    snippets = []
    for pos, sentence in enumerate(sentences):
        if not alias_pattern.search(sentence):
            continue

        start = max(0, pos - window)
        stop = min(total, pos + window + 1)
        context = sentences[start:stop]  # slicing yields a fresh list, safe to edit

        offset = pos - start
        context[offset] = _mark_alias_in_sentence(context[offset], aliases)

        joined = " ".join(context).strip()
        snippets.append(joined[:max_len])

    return snippets



def get_company_mentions(
    articles: pd.DataFrame,
    alias_map: Dict[str, Iterable[str]],
    text_col: str = "body",      # or "summary" / "title"
    window: int = 0,
    max_len: int = 500,
) -> pd.DataFrame:
    """Build a long table of company-mention snippets, one row per snippet.

    Parameters
    ----------
    articles : pd.DataFrame
        Must contain ``ticker`` and ``article_id`` columns; ``pub_date``,
        ``title``, ``url``, ``id`` and ``text_col`` are read when present.
    alias_map : Dict[str, Iterable[str]]
        Ticker -> aliases. Only rows whose ``ticker`` equals the key are
        scanned with that ticker's aliases.
    text_col : str
        Column holding the text to scan.
    window, max_len : int
        Forwarded to ``extract_company_sentences_from_text``.

    Returns
    -------
    pd.DataFrame
        Columns ``ticker, pub_date, article_id, title, url, snippet,
        article_row``. The columns are present even when no snippet is found.
    """
    columns = [
        "ticker", "pub_date", "article_id", "title", "url", "snippet", "article_row",
    ]
    rows = []
    for tic, aliases in alias_map.items():
        subset = articles.loc[articles["ticker"] == tic]
        for _, r in subset.iterrows():
            original_idx = r["article_id"]

            # Missing text arrives as NaN (a float), which `or ""` does not
            # catch; treat anything that is not a string as empty text.
            text = r.get(text_col, "")
            if not isinstance(text, str):
                text = ""

            for snip in extract_company_sentences_from_text(
                text,
                aliases,
                window=window,
                max_len=max_len
            ):
                rows.append({
                    "ticker": tic,
                    "pub_date": r.get("pub_date"),
                    # Fall back to the row's own `article_id` when the frame has
                    # no separate `id` column (previously this was always empty).
                    "article_id": r.get("id", original_idx),
                    "title": r.get("title"),
                    "url": r.get("url"),
                    "snippet": snip,
                    "article_row": original_idx,
                })
    return pd.DataFrame(rows, columns=columns)


In [None]:
# One searchable string per article: title + body, with missing values as "".
# Do NOT strip the substring "nan" here: after fillna("") there is no "nan"
# placeholder left, and removing it corrupts real words ("financial" -> "ficial"),
# which made aliases such as "SVB Financial" impossible to match.
news2 = news.copy()
news2["full_text"] = (
    news2["title"].fillna("").astype(str)
    + " "
    + news2["body"].fillna("").astype(str)
)

# Compile each ticker's alias regex once, up front.
compiled_patterns = {
    ticker: _compile_alias_regex(aliases)
    for ticker, aliases in TICKER_MAP.items()
}

# An article mentioning several tickers is emitted once per ticker.
expanded_dfs = []

for ticker, pat in compiled_patterns.items():
    mask = news2["full_text"].str.contains(pat)
    sub = news2.loc[mask].copy()
    if sub.empty:
        continue

    sub["ticker"] = ticker
    expanded_dfs.append(sub)

news3_with_tickers = (
    pd.concat(expanded_dfs, ignore_index=True)
    .drop(columns=["full_text"])
)
news3_with_tickers

Unnamed: 0,version https://git-lfs.github.com/spec/v1,url,title,pub_date,summary,body,article_id,ticker
0,,https://www.theguardian.com/global-development...,World's eight richest people have same wealth ...,2017-01-16T00:01:08Z,A new report by Oxfam warns of the growing and...,The world’s eight richest billionaires control...,193,ORCL
1,,https://www.theguardian.com/technology/2017/fe...,Meet the rightwing power players lurking benea...,2017-02-10T10:00:49Z,Despite promoting an image of innovative icono...,When a group of 97 technology companies filed ...,712,ORCL
2,,https://www.theguardian.com/business/2017/mar/...,Has the tech bubble peaked? Signs that the sta...,2017-03-17T07:02:02Z,Startups are beginning to run out of money and...,If you were looking for an apartment in the Ba...,1082,ORCL
3,,https://www.theguardian.com/technology/2017/ap...,Palantir to pay $1.7m over accusation it discr...,2017-04-26T19:17:39Z,Government lawsuit against the huge Silicon Va...,"Palantir, a Silicon Valley company with ties t...",1465,ORCL
4,,https://www.theguardian.com/technology/2017/ap...,Google accused of 'extreme' gender pay discrim...,2017-04-07T22:48:42Z,Allegations of possible employment violations ...,Google has discriminated against its female em...,1653,ORCL
...,...,...,...,...,...,...,...,...
361179,,https://www.theguardian.com/business/2022/sep/...,Could Wael Sawan usher in a renewable revoluti...,2022-09-15T15:29:14Z,Campaigners hope to see a radical shift under ...,Ben van Beurden sat stony faced as climate act...,31098,BG
361180,,https://www.theguardian.com/business/2022/sep/...,Chris O’Shea: Centrica chief executive with a ...,2022-09-10T15:00:06Z,The energy boss is committed to nuclear and re...,Even for an executive in the eye of the gather...,31132,BG
361181,,https://www.theguardian.com/business/2022/sep/...,Shell boss Ben van Beurden prepares to stand d...,2022-09-02T08:18:45Z,Energy firm shortlists four internal candidate...,"Shell’s long-serving chief executive, Ben van ...",31234,BG
361182,,https://www.theguardian.com/business/2022/sep/...,Shell appoints Wael Sawan to replace outgoing ...,2022-09-15T11:51:39Z,Energy firm’s head of integrated gas and renew...,"Shell has appointed Wael Sawan, a 25-year comp...",31281,BG


In [None]:
# Drop matched rows that have no body text; .copy() gives an independent frame.
news3_with_tickers = news3_with_tickers.dropna(subset=["body"]).copy()

In [None]:
# One row per body sentence that mentions one of the ticker's aliases, with WINDOW sentences of
# context on each side and each snippet cut to MAX_CHAR characters.
mentions = get_company_mentions(news3_with_tickers, TICKER_MAP, text_col="body", window=WINDOW, max_len = MAX_CHAR)
mentions

Unnamed: 0,ticker,pub_date,article_id,title,url,snippet,article_row
0,ORCL,2017-01-16T00:01:08Z,,World's eight richest people have same wealth ...,https://www.theguardian.com/global-development...,Oxfam said the world’s poorest 50% owned the s...,193
1,ORCL,2017-02-10T10:00:49Z,,Meet the rightwing power players lurking benea...,https://www.theguardian.com/technology/2017/fe...,Rodgers and McNealy combined the standard smal...,712
2,ORCL,2017-03-17T07:02:02Z,,Has the tech bubble peaked? Signs that the sta...,https://www.theguardian.com/business/2017/mar/...,When you’ve seen your income go down by 80% as...,1082
3,ORCL,2017-04-26T19:17:39Z,,Palantir to pay $1.7m over accusation it discr...,https://www.theguardian.com/technology/2017/ap...,"Trump’s chief strategist, Steve Bannon, has pr...",1465
4,ORCL,2017-04-26T19:17:39Z,,Palantir to pay $1.7m over accusation it discr...,https://www.theguardian.com/technology/2017/ap...,"An official with the department, which is seek...",1465
...,...,...,...,...,...,...,...
2228866,BG,2022-09-10T15:00:06Z,,Chris O’Shea: Centrica chief executive with a ...,https://www.theguardian.com/business/2022/sep/...,"Born in Kirkcaldy, Fife, O’Shea studied accoun...",31132
2228867,BG,2022-09-02T08:18:45Z,,Shell boss Ben van Beurden prepares to stand d...,https://www.theguardian.com/business/2022/sep/...,His departure would end a near-40-year career ...,31234
2228868,BG,2022-09-15T11:51:39Z,,Shell appoints Wael Sawan to replace outgoing ...,https://www.theguardian.com/business/2022/sep/...,“Ben can look back with great pride on an extr...,31281
2228869,PODD,2021-08-22T12:58:33Z,,PCR Covid test firm with links to former minis...,https://www.theguardian.com/business/2021/aug/...,"“I’ve called them three or four times, whereup...",25015


In [None]:
# Tickers excluded from the cleaned mention table.
BAD_TICKERS = {"A", "ON", "IT", "HAS", "ARE",
               "ALL", "NOW", "SEE", "WELL", "COST",
               "RE", "V", "C", "F", "TECH"}  

#unsure: PM, D, O, FAST, PEAK, SO, LOW, KEY

mentions_clean = mentions[~mentions["ticker"].isin(BAD_TICKERS)].copy()

# Name the index and the values explicitly instead of renaming afterwards.
# value_counts() names its result differently across pandas versions, and the
# old rename produced two columns both called "count" on pandas >= 2.0.
ticker_counts_clean = (
    mentions_clean["ticker"]
    .value_counts()
    .rename_axis("ticker")
    .reset_index(name="count")
)

ticker_counts_clean

### At this point `FinABSA.py` was run separately on a GPU; the cells below post-process its output into the final scores.

In [None]:
# Load the scored snippets (presumably the FinABSA output; the cells below read its `absa_probs`,
# `article_row`, `ticker` and `date` columns).
# NOTE(review): this reads a "2024" file while the final cell writes 'daily_absa_2022.csv' — confirm the years.
df = pd.read_csv('../data/top_50_absa_2024.csv')

In [None]:
def to_dict(x):
    """Coerce ``x`` to a dict, or return ``None`` when that is not possible.

    A dict is returned as-is. Anything else is parsed with
    ``ast.literal_eval`` (safe: literals only, no code execution). Input that
    cannot be parsed, or that parses to something other than a dict, yields
    ``None``.
    """
    if isinstance(x, dict):
        return x
    try:
        parsed = ast.literal_eval(x)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # The exceptions literal_eval raises for malformed or non-string
        # input. A bare `except` would also swallow KeyboardInterrupt.
        return None
    return parsed if isinstance(parsed, dict) else None

def normalize_label(prob_dict):
    """Return the key with the largest value in ``prob_dict``.

    Yields ``None`` for anything that is not a non-empty dict. On ties the
    key that appears first in the dict wins.
    """
    if isinstance(prob_dict, dict) and len(prob_dict) > 0:
        top_key, _ = max(prob_dict.items(), key=lambda item: item[1])
        return top_key
    return None

# Parse the raw `absa_probs` values into dicts (None when unparseable), then label each snippet
# with the key that carries the largest value.
df["absa_prob_dict"] = df["absa_probs"].apply(to_dict)
df["label_clean"] = df["absa_prob_dict"].apply(normalize_label)

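The functions below compute article-level sentiment scores in several alternative ways (confidence-weighted average, simple average and majority vote), so that the most suitable one can be chosen for later strategy generation.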

In [None]:
def extract_probs(d):
    """Turn a label -> probability dict into a ``POS_prob/NEU_prob/NEG_prob`` Series.

    Label keys are matched case-insensitively ("positive" and "POSITIVE" are
    equivalent), consistent with how labels are normalised elsewhere in this
    notebook. A label missing from the dict contributes ``0.0``.

    Parameters
    ----------
    d : dict or any
        Probability dict; any non-dict value yields a Series of ``NaN``.

    Returns
    -------
    pd.Series
        Indexed by ``POS_prob``, ``NEU_prob``, ``NEG_prob``.
    """
    if not isinstance(d, dict):
        return pd.Series(
            {"POS_prob": np.nan, "NEU_prob": np.nan, "NEG_prob": np.nan}
        )

    by_label = {str(key).upper(): value for key, value in d.items()}
    return pd.Series(
        {
            "POS_prob": by_label.get("POSITIVE", 0.0),
            "NEU_prob": by_label.get("NEUTRAL", 0.0),
            "NEG_prob": by_label.get("NEGATIVE", 0.0),
        }
    )

def get_group_date(group):
    """Return a representative date for ``group``: its first non-null ``date``.

    The earlier version fell back to the value in the first row whenever the
    group held more than one distinct date, which returned a null whenever that
    first row happened to be missing. It also called ``.unique()``, which
    raises on unhashable (list-like) cells before the list-handling branch
    could run.

    Parameters
    ----------
    group : pd.DataFrame
        Must contain a ``date`` column.

    Returns
    -------
    The first non-null date value, or ``None`` when every value is null. If
    that value is itself list-like, its first element is returned (``None``
    when it is empty).
    """
    non_null = group["date"].dropna()
    if non_null.empty:
        return None

    first = non_null.iloc[0]
    if isinstance(first, (pd.Series, list, np.ndarray)):
        if len(first) == 0:
            return None
        # Positional access for a Series so a non-default index cannot mislead.
        return first.iloc[0] if isinstance(first, pd.Series) else first[0]
    return first

def _implied_from_dict(d):
    if d is None:
        return None
    if any(v is None for v in d.values()):
        return None
    key = max(d, key=d.get)   # 'positive' / 'neutral' / 'negative'
    return key.upper()        # 'POSITIVE' / 'NEUTRAL' / 'NEGATIVE'

def analyze_score_article(df, score_threshold=0.15):
    """Aggregate snippet-level sentiment into one row per (article, ticker).

    Parameters
    ----------
    df : pd.DataFrame
        Snippet-level frame with columns ``article_row``, ``ticker``,
        ``absa_prob_dict``, ``label_clean`` and ``date``.
    score_threshold : float, default 0.15
        Half-width of the neutral band for ``weighted_score``: scores above
        ``+score_threshold`` are POSITIVE, below ``-score_threshold`` NEGATIVE,
        and anything in between NEUTRAL.

    Returns
    -------
    pd.DataFrame
        One row per (``article_row``, ``ticker``) group with:

        * ``weighted_score`` – confidence-weighted P(pos) - P(neg), where a
          snippet's confidence is its largest class probability;
        * ``weighted_softmax`` / ``softmax`` – confidence-weighted and simple
          mean class probabilities (``None`` where undefined);
        * ``majority_label`` – most frequent snippet label, or ``None`` when
          the group has no valid label (previously defaulted to "POSITIVE");
        * ``score_implied_label``, ``weighted_implied_label``,
          ``implied_label`` – labels derived from the three scores above;
        * ``is_conflict`` – whether ``score_implied_label`` disagrees with
          ``majority_label`` (``False`` when either is missing);
        * label counts, ``total_snippets`` and the group ``date``.
    """
    prob_cols = ["POS_prob", "NEU_prob", "NEG_prob"]
    label_order = ["POSITIVE", "NEUTRAL", "NEGATIVE"]

    def _float_or_none(value):
        # NaN means "undefined"; store it as None in the softmax dicts.
        return None if np.isnan(value) else float(value)

    results = []

    for (article_id, ticker), group in df.groupby(["article_row", "ticker"]):
        g = group.copy()

        g[prob_cols] = g["absa_prob_dict"].apply(extract_probs)

        # Confidence of a snippet = its largest class probability.
        confidences = g[prob_cols].max(axis=1)
        total_confidence = confidences.sum()

        if confidences.isna().all() or total_confidence == 0:
            weighted_pos = weighted_neu = weighted_neg = np.nan
        else:
            weighted_pos = (g["POS_prob"] * confidences).sum() / total_confidence
            weighted_neu = (g["NEU_prob"] * confidences).sum() / total_confidence
            weighted_neg = (g["NEG_prob"] * confidences).sum() / total_confidence

        if np.isnan(weighted_pos) or np.isnan(weighted_neg):
            weighted_score = np.nan
        else:
            weighted_score = weighted_pos - weighted_neg

        weighted_softmax = {
            "positive": _float_or_none(weighted_pos),
            "neutral":  _float_or_none(weighted_neu),
            "negative": _float_or_none(weighted_neg),
        }

        softmax_simple = {
            "positive": _float_or_none(g["POS_prob"].mean()),
            "neutral":  _float_or_none(g["NEU_prob"].mean()),
            "negative": _float_or_none(g["NEG_prob"].mean()),
        }

        labels = (
            g["label_clean"]
            .dropna()
            .astype(str)
            .str.upper()
        )
        labels = labels[labels.isin(label_order)]

        if labels.empty:
            # No valid snippet label: there is no majority. Taking max() over
            # all-zero counts would silently report "POSITIVE".
            label_counts = dict.fromkeys(label_order, 0)
            majority_label = None
        else:
            label_counts = (
                labels.value_counts()
                .reindex(label_order, fill_value=0)
                .to_dict()
            )
            majority_label = max(label_counts, key=label_counts.get)

        if pd.isna(weighted_score):
            score_implied_label = None
        elif weighted_score > score_threshold:
            score_implied_label = "POSITIVE"
        elif weighted_score < -score_threshold:
            score_implied_label = "NEGATIVE"
        else:
            score_implied_label = "NEUTRAL"

        weighted_implied_label = _implied_from_dict(weighted_softmax)
        implied_label = _implied_from_dict(softmax_simple)

        if score_implied_label is None or majority_label is None:
            is_conflict = False
        else:
            is_conflict = (score_implied_label != majority_label)

        results.append({
            "date": get_group_date(g),
            "article_id": article_id,
            "ticker": ticker,

            "weighted_score": weighted_score,
            "majority_label": majority_label,                  # most snippet votes
            "score_implied_label": score_implied_label,        # from weighted_score
            "weighted_implied_label": weighted_implied_label,  # from weighted_softmax
            "implied_label": implied_label,                    # from simple softmax
            "is_conflict": is_conflict,

            "POS_count": label_counts["POSITIVE"],
            "NEU_count": label_counts["NEUTRAL"],
            "NEG_count": label_counts["NEGATIVE"],
            "total_snippets": len(g),

            "weighted_softmax": weighted_softmax,
            "softmax": softmax_simple,
        })

    out = pd.DataFrame(results)
    return out


In [None]:
# Convert `date` to plain `datetime.date` objects (time of day dropped), then aggregate the
# snippet-level rows into one row per (article, ticker).
df['date'] = pd.to_datetime(df['date']).dt.date 
consistency_df = analyze_score_article(df)
consistency_df

In [None]:
def _softmax_to_series(d):
    if not isinstance(d, dict):
        return pd.Series({"pos_prob": np.nan, "neu_prob": np.nan, "neg_prob": np.nan})
    
    dd = {str(k).lower(): v for k, v in d.items()}
    return pd.Series({
        "pos_prob": dd.get("positive", np.nan),
        "neu_prob": dd.get("neutral",  np.nan),
        "neg_prob": dd.get("negative", np.nan),
    })

def _implied_from_probs(pos, neu, neg):
    if np.isnan(pos) or np.isnan(neu) or np.isnan(neg):
        return None
    
    scores = {
        "POSITIVE": pos,
        "NEUTRAL":  neu,
        "NEGATIVE": neg,
    }
    return max(scores, key=scores.get)

def aggregate_daily_ticker(df_article, softmax):
    """Average article-level probabilities into one row per (date, ticker).

    ``softmax`` names the column of ``df_article`` holding the per-article
    probability dicts. The result has the columns ``date``, ``ticker``,
    ``classification`` (label with the highest mean probability),
    ``daily_softmax`` (dict of the mean probabilities) and ``article_count``
    (rows in the group).
    """
    expanded = df_article[softmax].apply(_softmax_to_series)
    articles = pd.concat([df_article.copy(), expanded], axis=1)

    daily = articles.groupby(["date", "ticker"], as_index=False).agg(
        pos_prob_mean=("pos_prob", "mean"),
        neu_prob_mean=("neu_prob", "mean"),
        neg_prob_mean=("neg_prob", "mean"),
        article_count=(softmax, "size")
    )

    def _row_softmax(row):
        # Package the three daily means as a plain dict of floats.
        return {
            "positive": float(row["pos_prob_mean"]),
            "neutral":  float(row["neu_prob_mean"]),
            "negative": float(row["neg_prob_mean"]),
        }

    def _row_label(row):
        return _implied_from_probs(
            row["pos_prob_mean"], row["neu_prob_mean"], row["neg_prob_mean"]
        )

    daily["daily_softmax"] = daily.apply(_row_softmax, axis=1)
    daily["classification"] = daily.apply(_row_label, axis=1)

    output_columns = [
        "date",
        "ticker",
        "classification",
        "daily_softmax",
        "article_count",
    ]
    return daily[output_columns]


In [None]:
# Daily aggregation of the simple-mean softmax, with the output columns renamed in one step.
daily = (
    aggregate_daily_ticker(consistency_df, 'softmax')
    .rename(columns={"ticker": "entity", "daily_softmax": "softmax"})
)

In [None]:
# Distribution of the daily classifications; dropna=False also counts rows without a label.
counts = daily["classification"].value_counts(dropna=False)
print("\nRaw counts:")
print(counts)

# Same counts in a fixed POSITIVE / NEGATIVE / NEUTRAL order. reindex keeps only these three labels;
# a label absent from `counts` would show up as NaN.
ordered = counts.reindex(["POSITIVE", "NEGATIVE", "NEUTRAL"])
print("\nOrdered counts (POS / NEG / NEU):")
print(ordered)


Raw counts:
classification
NEUTRAL     37242
NEGATIVE     4429
POSITIVE     2710
Name: count, dtype: int64

Ordered counts (POS / NEG / NEU):
classification
POSITIVE     2710
NEGATIVE     4429
NEUTRAL     37242
Name: count, dtype: int64


In [None]:
# Persist the daily per-entity sentiment table (the DataFrame index is written as the first CSV column).
# NOTE(review): the scored input read earlier is 'top_50_absa_2024.csv' but this file is named 2022 — confirm the year.
daily.to_csv('../data/daily_absa_2022.csv')