In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt 



In [29]:
signal_df = pd.read_csv('../data/signaltest1.csv')
sp500_df = pd.read_excel('../data/sp_500_constituents.xlsx')

In [30]:
signal_df.head()

Unnamed: 0,date,entity,classification,softmax_scores
0,2020-06-01 19:26:12+00:00,Ted Baker,NEUTRAL,"{'positive': 0.001163970329798758, 'negative':..."
1,2020-06-01 19:26:12+00:00,Rottweiler,NEUTRAL,"{'positive': 0.0001504293759353459, 'negative'..."
2,2020-06-01 19:26:12+00:00,Kelvin,NEGATIVE,"{'positive': 0.0002105771709466353, 'negative'..."
3,2020-06-01 19:26:12+00:00,Toscafund,NEUTRAL,"{'positive': 0.0004427890235092491, 'negative'..."
4,2020-06-01 19:26:12+00:00,Ugly Brown Building,NEUTRAL,"{'positive': 0.00026831135619431734, 'negative..."


In [31]:
sp500_df

Unnamed: 0,PERMNO,Company Name,Ticker,SP500 Start,SP500 End,Search Keywords
0,10104,ORACLE CORP,ORCL,"Aug. 3, 1989","Dec. 31, 2024","ORCL, Oracle, Oracle Corp"
1,10107,MICROSOFT CORP,MSFT,1994-06-07 00:00:00,"Dec. 31, 2024","MSFT, Microsoft, Microsoft Corp"
2,10138,T ROWE PRICE GROUP INC,TROW,"Oct. 13, 1999","Dec. 31, 2024","T Rowe Price, T Rowe Price Group Inc, TROW, T ..."
3,10145,HONEYWELL INTERNATIONAL INC,HON,"Dec. 31, 1925","Dec. 31, 2024","HON, Honeywell, Honeywell International Inc"
4,10516,ARCHER DANIELS MIDLAND CO,ADM,1981-07-30 00:00:00,"Dec. 31, 2024","ADM, Archer Daniels Midland, Archer Daniels Mi..."
...,...,...,...,...,...,...
540,85331,Axon Enterprise,AXON,2023-05-04 00:00:00,"Dec. 31, 2024","AXON, Axon Enterprise"
541,76043,Fair Isaac,FICO,2023-03-20 00:00:00,"Dec. 31, 2024","FICO, Fair Isaac"
542,88537,Bunge Global,BG,2023-03-15 00:00:00,"Dec. 31, 2024","BG, Bunge Global"
543,92083,Insulet,PODD,2023-03-15 00:00:00,"Dec. 31, 2024","Insulet, PODD"


In [22]:
'Facebook' in signal_df['entity']

False

In [20]:
from collections import Counter
entity_counts = Counter(signal_df['entity'])
entity_counts.most_common(10)


[('EU', 568),
 ('Guardian', 536),
 ('Labour', 520),
 ('Brexit', 461),
 ('Covid-19', 392),
 ('Treasury', 391),
 ('Conservative', 320),
 ('Tory', 296),
 ('Facebook', 288),
 ('Bank of England', 288)]

In [35]:
import re
import pandas as pd

# --- 1) Helpers --------------------------------------------------------------

# normalize names/tickers to make matching robust
def _norm(s: str) -> str:
    if pd.isna(s): 
        return ""
    s = s.lower()
    s = s.replace("&", " and ")
    # remove common company suffixes
    s = re.sub(r"\b(co|corp|corporation|inc|inc\.|ltd|plc|company|the)\b", " ", s)
    # keep only letters/digits/spaces
    s = re.sub(r"[^a-z0-9 ]+", " ", s)
    # collapse spaces
    s = re.sub(r"\s+", " ", s).strip()
    return s

def _split_keywords(s: str):
    if pd.isna(s) or not s:
        return []
    # keywords appear comma-separated (and sometimes with slashes/pipes/semicolons)
    return [k.strip() for k in re.split(r"[,\|/;]", s) if k.strip()]

# --- 2) Build a normalized keyword → ticker map from sp500_df ----------------

# Collect all aliases for each row: Ticker, Company Name, and Search Keywords
aliases_map = {}
for _, r in sp500_df.iterrows():
    ticker = str(r.get("Ticker", "")).strip()
    comps = [ticker, str(r.get("Company Name", ""))]
    comps += _split_keywords(str(r.get("Search Keywords", "")))
    # add both raw and normalized variants to the dictionary
    for alias in comps:
        if not alias:
            continue
        aliases_map[_norm(alias)] = ticker

# --- 3) Exact/alias match on normalized entity --------------------------------

signal_df = signal_df.copy()
signal_df["entity_norm"] = signal_df["entity"].map(_norm)
signal_df["matched_ticker"] = signal_df["entity_norm"].map(aliases_map)



In [42]:
signal_df = signal_df.drop_duplicates(subset=['matched_ticker', 'date'])

In [43]:
signal_df

Unnamed: 0,date,entity,classification,softmax_scores,entity_norm,matched_ticker
87,2020-06-01 18:02:51+00:00,DFS,NEUTRAL,"{'positive': 1.3598596524388995e-05, 'negative...",dfs,DFS
221,2020-06-01 19:13:31+00:00,Facebook,NEUTRAL,"{'positive': 7.744879258098081e-05, 'negative'...",facebook,META
263,2020-06-01 17:30:48+00:00,Whirlpool,NEUTRAL,"{'positive': 6.437378760892898e-05, 'negative'...",whirlpool,WHR
283,2020-06-01 13:32:08+00:00,American International Group,NEUTRAL,"{'positive': 1.2754047020280268e-05, 'negative...",american international group,AIG
288,2020-06-01 13:32:08+00:00,Chubb,NEUTRAL,"{'positive': 1.2503792277129833e-05, 'negative...",chubb,CB
...,...,...,...,...,...,...
44555,2020-12-17 14:10:20+00:00,American Airlines,NEUTRAL,"{'positive': 4.628747046808712e-05, 'negative'...",american airlines,AAL
44674,2020-12-29 18:55:51+00:00,Microsoft,NEUTRAL,"{'positive': 0.0003028673236258328, 'negative'...",microsoft,MSFT
44752,2020-12-09 18:59:54+00:00,Uber,NEUTRAL,"{'positive': 0.000121633245726116, 'negative':...",uber,UBER
44754,2020-12-09 18:59:54+00:00,Airbnb,NEUTRAL,"{'positive': 0.00011642227036645636, 'negative...",airbnb,ABNB


In [40]:
classification_counts = Counter(signal_df['classification'])
classification_counts.most_common(10)

[('NEUTRAL', 2287), ('NEGATIVE', 238), ('POSITIVE', 76)]