In [1]:
import os, time, requests, pandas as pd
from datetime import date, timedelta, datetime
from dateutil.relativedelta import relativedelta
import re, html
from typing import Iterable, Dict, List
import pandas as pd
import numpy as np

In [2]:
MAX_CHAR = 2000
WINDOW = 1

In [3]:
news = pd.read_csv('../data/guardian_financial_news_master.csv')
news['article_id'] = news.index
news

Unnamed: 0,version https://git-lfs.github.com/spec/v1,url,title,pub_date,summary,body,article_id
0,size 371154095,,,,,,0
1,,https://www.theguardian.com/technology/2017/fe...,Facebook's Oculus must pay $500m in virtual re...,2017-02-01T22:36:43Z,Jury determined that VR company’s CEO Palmer L...,The Facebook-owned company Oculus has been ord...,1
2,,https://www.theguardian.com/politics/blog/live...,MPs vote to give May power to trigger article ...,2017-02-01T21:21:40Z,Rolling coverage of the day’s political develo...,MPs have taken a historic step towards taking ...,2
3,,https://www.theguardian.com/business/live/2017...,"Federal Reserve leaves interest rates on hold,...",2017-02-01T21:07:12Z,US central bank resists raising interest rate...,"And finally, Wall Street has closed a little h...",3
4,,https://www.theguardian.com/business/nils-prat...,TalkTalk hoping for rapid results as Dunstone ...,2017-02-01T19:29:32Z,Investors should welcome Sir Charles’s decisio...,You can understand why Sir Charles Dunstone mi...,4
...,...,...,...,...,...,...,...
16479,,https://www.theguardian.com/politics/2019/dec/...,Coogan and Klein lead cultural figures backing...,2019-12-03T17:07:00Z,"Actors, writers and musicians praise Labour fo...",A host of leading cultural figures including t...,16479
16480,,https://www.theguardian.com/politics/2019/dec/...,Queen’s speech: disability pledge and renters’...,2019-12-22T18:14:10Z,<strong>Letters:</strong> Lib Dem peer <strong...,One Conservative party manifesto commitment th...,16480
16481,,https://www.theguardian.com/politics/2019/dec/...,"Polling day weather to be wet and cold, say fo...",2019-12-11T09:53:29Z,Met Office predicts showers across much of UK ...,The UK’s first December general election in al...,16481
16482,,https://www.theguardian.com/politics/2019/dec/...,Quiz: could you be an election returning officer?,2019-12-04T12:00:36Z,Find out how much you know about the mechanics...,You’ve stayed up late watching every election ...,16482


In [4]:
entities = pd.read_excel('../data/sp_500_constituents.xlsx')

TICKER_MAP = {}
for _, row in entities.iterrows():
    ticker = row['Ticker']
    name = row['Search Keywords']    
    TICKER_MAP[ticker] = [t.strip() for t in name.split(',') if t.strip()]

TICKER_MAP

{'ORCL': ['ORCL', 'Oracle', 'Oracle Corp'],
 'MSFT': ['MSFT', 'Microsoft', 'Microsoft Corp'],
 'TROW': ['T Rowe Price', 'T Rowe Price Group Inc', 'TROW', 'T Rowe'],
 'HON': ['HON', 'Honeywell', 'Honeywell International Inc'],
 'ADM': ['ADM',
  'Archer Daniels Midland',
  'Archer Daniels Midland Co',
  'Archer Daniels'],
 'FISV': ['FISV', 'Fiserv', 'Fiserv Inc'],
 'KO': ['Coca Cola', 'Cocala', 'KO', 'Coke'],
 'CDNS': ['CDNS',
  'Cadence Design',
  'Cadence Design Systems',
  'Cadence Design Systems Inc'],
 'ED': ['Consolidated Edison', 'Consolidated Edison Inc', 'ED'],
 'XRAY': ['Dentsply Sirona', 'Dentsply Sirona Inc', 'XRAY'],
 'FAST': ['FAST', 'Fastenal', 'Fastenal Co'],
 'DTE': ['DTE Energy', 'DTE Energy Co', 'DTE'],
 'ETN': ['ETN', 'Eaton', 'Eaton Corp', 'Eaton Corp Plc'],
 'SIVB': ['SVB Financial',
  'SVB Financial Group',
  'SIVB',
  'Silicon Valley Bank'],
 'XOM': ['Exxon Mobil', 'Exxon Mobil Corp', 'XOM'],
 'MGM': ['MGM Resorts', 'MGM Resorts International', 'MGM'],
 'WM': ['WM

In [5]:
import nltk
nltk.download('punkt')

def split_sentences(text: str):
    return nltk.sent_tokenize(text)

[nltk_data] Downloading package punkt to /Users/yiruoli/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# ---- Drop-in: extract sentences mentioning each company from your `articles` DF ----
def _strip_html(x: str) -> str:
    x = html.unescape(x or "")
    return re.sub(r"<[^>]+>", " ", x)

# simple sentence splitter that works well on newsy English
_SENT_SPLIT = re.compile(r'(?<!\b[A-Z])(?<=[.!?])\s+(?=[A-Z0-9])')

def _compile_alias_regex(aliases: Iterable[str]) -> re.Pattern:
    al = []
    for a in aliases:
        a = a.strip()
        if not a: 
            continue
        a = r"\s+".join(map(re.escape, a.split()))
        al.append(fr"\b{a}\b(?:'s|’s)?")
    return re.compile("|".join(al), flags=re.IGNORECASE) if al else re.compile(r"$^")


def _mark_alias_in_sentence(sent: str, aliases: Iterable[str]) -> str:
    if not isinstance(sent, str) or not sent.strip():
        return sent

    original = sent
    for a in aliases:
        a = a.strip()
        if not a:
            continue
        patt = r"\b" + r"\s+".join(map(re.escape, a.split())) + r"\b(?:'s|’s)?"
        regex = re.compile(patt, flags=re.IGNORECASE)
        if regex.search(original):
            return regex.sub(" [TGT] ", original, count=1)

    return original

def extract_company_sentences_from_text(
    text: str,
    aliases: Iterable[str],
    window: int,
    max_len: int
) -> List[str]:
    text = _strip_html(text)

    sents = re.split(r'(?<=[.!?])\s+', text)
    sents = [s for s in sents if s.strip()] 

    pat = _compile_alias_regex(aliases)

    hits_idx = [i for i, s in enumerate(sents) if pat.search(s)]
    chunks = []

    for i in hits_idx:
        lo, hi = max(0, i - window), min(len(sents), i + window + 1)

        window_sents = sents[lo:hi]

        center_rel_idx = i - lo
        center_sent = window_sents[center_rel_idx]
        center_marked = _mark_alias_in_sentence(center_sent, aliases)
        window_sents = list(window_sents)
        window_sents[center_rel_idx] = center_marked

        chunk = " ".join(window_sents).strip()
        chunks.append(chunk[:max_len])

    return chunks



def get_company_mentions(
    articles: pd.DataFrame,
    alias_map: Dict[str, Iterable[str]],
    text_col: str = "body",      # or "summary" / "title"
    window: int = 0, 
    max_len: int = 500, 
) -> pd.DataFrame:
    rows = []
    for tic, aliases in alias_map.items():
        subset = articles.loc[articles["ticker"] == tic]
        for idx, r in subset.iterrows():
            original_idx = r["article_id"]
            text = r.get(text_col, "") or ""
            for snip in extract_company_sentences_from_text(
                text,
                aliases,
                window=window,
                max_len=max_len
            ):
                rows.append({
                    "ticker": tic,
                    "pub_date": r.get("pub_date"),
                    "article_id": r.get("id"),
                    "title": r.get("title"),
                    "url": r.get("url"),
                    "snippet": snip,
                    "article_row": original_idx,   
                })
    return pd.DataFrame(rows)


In [None]:
news2 = news[:50].copy()
news2["full_text"] = (
    news2["title"].fillna("").astype(str).str.replace("nan", "", regex=False)
    + " "
    + news2["body"].fillna("").astype(str).str.replace("nan", "", regex=False)
)

compiled_patterns = {
    ticker: _compile_alias_regex(aliases)
    for ticker, aliases in TICKER_MAP.items()
}

expanded_dfs = []

for ticker, pat in compiled_patterns.items():
    mask = news2["full_text"].str.contains(pat)
    sub = news2.loc[mask].copy()
    if sub.empty:
        continue

    sub["ticker"] = ticker
    expanded_dfs.append(sub)

news3_with_tickers = (
    pd.concat(expanded_dfs, ignore_index=True)
    .drop(columns=["full_text"])
)
news3_with_tickers

Unnamed: 0,version https://git-lfs.github.com/spec/v1,url,title,pub_date,summary,body,article_id,ticker
0,,https://www.theguardian.com/technology/2017/ja...,#DeleteUber: how tech companies are taking sid...,2017-01-31T11:00:20Z,With ride-hailing services a focal point amid ...,For the average ride-hail user in a major city...,27,MSFT
1,,https://www.theguardian.com/technology/2017/ja...,Amazon pledges legal support to action against...,2017-01-31T09:56:50Z,CEO Jeff Bezos says company’s legal and lobbyi...,"Amazon chief executive, Jeff Bezos, has pledge...",28,MSFT
2,,https://www.theguardian.com/business/2017/jan/...,"Starbucks vows to hire 10,000 refugees as US c...",2017-01-30T10:23:07Z,Coffee chain unveils plan to hire staff as top...,"Starbucks has promised to hire 10,000 refugees...",39,MSFT
3,,https://www.theguardian.com/politics/blog/live...,Brexit: MPs debate article 50 bill - as it hap...,2017-02-01T00:18:03Z,Rolling coverage of the day’s political develo...,The debate has now ended for the night after n...,16,HON
4,,https://www.theguardian.com/politics/blog/live...,MPs vote to give May power to trigger article ...,2017-02-01T21:21:40Z,Rolling coverage of the day’s political develo...,MPs have taken a historic step towards taking ...,2,ED
...,...,...,...,...,...,...,...,...
572,,https://www.theguardian.com/technology/2017/fe...,#DeleteUber: company automates account removal...,2017-02-01T11:35:06Z,"Taxi company, perceived to be pro-Trump, accus...",So many people have been deleting their Uber a...,14,UBER
573,,https://www.theguardian.com/business/2017/jan/...,Guarantee minimum wage for gig economy workers...,2017-01-31T15:59:30Z,Labour MP wants government to set up national ...,Theresa May should guarantee the minimum wage ...,24,UBER
574,,https://www.theguardian.com/technology/2017/ja...,#DeleteUber: how tech companies are taking sid...,2017-01-31T11:00:20Z,With ride-hailing services a focal point amid ...,For the average ride-hail user in a major city...,27,UBER
575,,https://www.theguardian.com/technology/2017/ja...,#DeleteUber: how tech companies are taking sid...,2017-01-31T11:00:20Z,With ride-hailing services a focal point amid ...,For the average ride-hail user in a major city...,27,ABNB


In [20]:
mentions = get_company_mentions(news3_with_tickers, TICKER_MAP, text_col="body", window=WINDOW, max_len = MAX_CHAR)
mentions

Unnamed: 0,ticker,pub_date,article_id,title,url,snippet,article_row
0,MSFT,2017-01-31T11:00:20Z,,#DeleteUber: how tech companies are taking sid...,https://www.theguardian.com/technology/2017/ja...,"The floodgates opened on Friday, with a cautio...",27
1,MSFT,2017-01-31T11:00:20Z,,#DeleteUber: how tech companies are taking sid...,https://www.theguardian.com/technology/2017/ja...,“There’s an understandable tendency to speak a...,27
2,MSFT,2017-01-31T11:00:20Z,,#DeleteUber: how tech companies are taking sid...,https://www.theguardian.com/technology/2017/ja...,“What I want us as the tech workers and users ...,27
3,MSFT,2017-01-31T11:00:20Z,,#DeleteUber: how tech companies are taking sid...,https://www.theguardian.com/technology/2017/ja...,"Companies such as Google, Facebook, Amazon and...",27
4,MSFT,2017-01-31T11:00:20Z,,#DeleteUber: how tech companies are taking sid...,https://www.theguardian.com/technology/2017/ja...,"Indeed, Google, Facebook, Amazon and Microsoft...",27
...,...,...,...,...,...,...,...
4809,UBER,2017-01-31T11:00:20Z,,#DeleteUber: how tech companies are taking sid...,https://www.theguardian.com/technology/2017/ja...,"Indeed, Google, Facebook, Amazon and Microsoft...",27
4810,ABNB,2017-01-31T11:00:20Z,,#DeleteUber: how tech companies are taking sid...,https://www.theguardian.com/technology/2017/ja...,"The floodgates opened on Friday, with a cautio...",27
4811,ABNB,2017-01-31T11:00:20Z,,#DeleteUber: how tech companies are taking sid...,https://www.theguardian.com/technology/2017/ja...,"Indeed, Google, Facebook, Amazon and Microsoft...",27
4812,ABNB,2017-01-31T11:00:20Z,,#DeleteUber: how tech companies are taking sid...,https://www.theguardian.com/technology/2017/ja...,"The Guardian asked Facebook, Google, Amazon, M...",27


In [21]:
mentions.to_csv('../data/chopped_data.csv')

In [39]:
mentions["approx_tokens"] = mentions["snippet"].apply(lambda x: len(str(x).split()))
np.mean(mentions["approx_tokens"])

136.71082009446116

In [None]:
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

class FinABSALonger:

    def __init__(
        self,
        ckpt_path: str = "amphora/FinABSA-Longer",
        max_input_length: int = 1024,
        max_gen_length: int = 32,
        num_beams: int = 4,
    ):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = AutoModelForSeq2SeqLM.from_pretrained(ckpt_path).to(self.device)
        self.tokenizer = AutoTokenizer.from_pretrained(ckpt_path)
        self.max_input_length = max_input_length
        self.max_gen_length = max_gen_length
        self.num_beams = num_beams

    @staticmethod
    def _extract_label_from_output(text: str):
        if not isinstance(text, str):
            return None
        up = text.upper()
        if "POSITIVE" in up:
            return "POSITIVE"
        if "NEGATIVE" in up:
            return "POSITIVE"
        if "NEUTRAL" in up:
            return "NEUTRAL"
        return None

    def analyze(self, text: str):
        if not isinstance(text, str) or not text.strip():
            return {
                "label": "neutral",
                "raw_output": None,
            }

        inputs = self.tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            max_length=self.max_input_length,
        )
        inputs = {k: v.to(self.device) for k, v in inputs.items()}

        with torch.no_grad():
            gen_ids = self.model.generate(
                **inputs,
                max_length=self.max_gen_length,
                num_beams=self.num_beams,
            )

        decoded = self.tokenizer.decode(gen_ids[0], skip_special_tokens=True)
        label = self._extract_label_from_output(decoded)

        if label is None:
            label = "neutral"

        return {
            "label": label,
        }


In [None]:
from tqdm.auto import tqdm

absa = FinABSALonger()

tqdm.pandas(desc="FinABSA-Longer")

def run_finabsa(text: str):
    return absa.analyze(text)

cur = mentions[:50]
cur["absa"] = cur["snippet"].progress_apply(run_finabsa)
cur

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


FinABSA-Longer:   0%|          | 0/50 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cur["absa"] = cur["snippet"].progress_apply(run_finabsa)


Unnamed: 0,ticker,pub_date,article_id,title,url,snippet,article_row,absa
0,MSFT,2017-01-31T11:00:20Z,,#DeleteUber: how tech companies are taking sid...,https://www.theguardian.com/technology/2017/ja...,"The floodgates opened on Friday, with a cautio...",27,"{'label': 'neutral', 'raw_output': 'The sentim..."
1,MSFT,2017-01-31T11:00:20Z,,#DeleteUber: how tech companies are taking sid...,https://www.theguardian.com/technology/2017/ja...,“There’s an understandable tendency to speak a...,27,"{'label': 'neutral', 'raw_output': 'The sentim..."
2,MSFT,2017-01-31T11:00:20Z,,#DeleteUber: how tech companies are taking sid...,https://www.theguardian.com/technology/2017/ja...,“What I want us as the tech workers and users ...,27,"{'label': 'neutral', 'raw_output': 'The sentim..."
3,MSFT,2017-01-31T11:00:20Z,,#DeleteUber: how tech companies are taking sid...,https://www.theguardian.com/technology/2017/ja...,"Companies such as Google, Facebook, Amazon and...",27,"{'label': 'positive', 'raw_output': 'The senti..."
4,MSFT,2017-01-31T11:00:20Z,,#DeleteUber: how tech companies are taking sid...,https://www.theguardian.com/technology/2017/ja...,"Indeed, Google, Facebook, Amazon and Microsoft...",27,"{'label': 'neutral', 'raw_output': 'The sentim..."
5,MSFT,2017-01-31T09:56:50Z,,Amazon pledges legal support to action against...,https://www.theguardian.com/technology/2017/ja...,"These are our roots, this is our soul. All era...",28,"{'label': 'neutral', 'raw_output': 'The sentim..."
6,MSFT,2017-01-30T10:23:07Z,,"Starbucks vows to hire 10,000 refugees as US c...",https://www.theguardian.com/business/2017/jan/...,"Starbucks has promised to hire 10,000 refugees...",39,"{'label': 'neutral', 'raw_output': 'The sentim..."
7,MSFT,2017-01-30T10:23:07Z,,"Starbucks vows to hire 10,000 refugees as US c...",https://www.theguardian.com/business/2017/jan/...,Technology firms were the first to come out pu...,39,"{'label': 'neutral', 'raw_output': 'The sentim..."
8,MSFT,2017-01-30T10:23:07Z,,"Starbucks vows to hire 10,000 refugees as US c...",https://www.theguardian.com/business/2017/jan/...,"“As an immigrant and as a CEO, I’ve both exper...",39,"{'label': 'neutral', 'raw_output': 'The sentim..."
9,MSFT,2017-01-30T10:23:07Z,,"Starbucks vows to hire 10,000 refugees as US c...",https://www.theguardian.com/business/2017/jan/...,"Microsoft’s president, Brad Smith, said 76 emp...",39,"{'label': 'neutral', 'raw_output': 'The sentim..."


In [24]:
cur.to_csv('cur.csv')