# Imports

In [None]:
import os
import glob
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.decomposition import TruncatedSVD
import pandas as pd
import re

# Data Settings
We sets up domain-specific text preprocessing for earnings call transcript analysis. It includes

## 1. Stopword lists
These are lists for filtering uninformative terms as well as terms that may give away too much information (bias).

---

  - `company_stopwords`             : Company names and their variants (e.g., "AMD", "META platforms inc", "TSMA").
  - `domain_stopwords`              : Common executive titles and conversational filler (e.g., "ceo", "thanks", "welcome").
  - `name_stopwords`                : First names of frequently mentioned executives or analysts in documents.
  - `month_stopwords`               : Full and abbreviated month names.
  - `transcript_stopwords`          : Common transcript platform or formatting terms.
  - `common_mid_sentence_stopwords` : Frequent low-value adverbs and intensifiers (e.g., "actually", "especially").


These are combined into `custom_stopwords`, which is passed into the `TfidfVectorizer` for vocabulary pruning.

## 2. Company_to_sector lists
These are a mapping from company display names to their primary industry sectors for downstream aggregation or grouping in analysis/visualization. We currently have 6 sectors

---

- Semiconductor
- Tech
- Finance
- Real Estate
- Airlines
- Consumer Goods

In [None]:
# Company names and their variations with LLC, Inc., Corp, etc.
company_stopwords = {
    # 3M
    "3m",
    # AMD
    "amd", "advanced micro devices", "amd inc",
    # American Airlines
    "americanairlines", "american airlines", "american airlines inc", "aal",
    # ASML
    "asml", "asml holding", "asml holding nv",
    # Blackstone
    "blackstone", "blackstone group", "blackstone inc",
    # CBRE
    "cbre", "cbre group", "cbre inc",
    # Citigroup
    "citigroup", "citi", "citigroup inc",
    # Cushman & Wakefield
    "cushmanwakefield", "cushman", "wakefield", "cushman wakefield", "cushman & wakefield", "cushman & wakefield plc",
    # Delta Airlines
    "delta", "delta airlines", "delta air lines", "delta air lines inc", "dal",
    # Google
    "google", "alphabet", "alphabet inc", "google llc",
    # Intel
    "intel", "intel corp", "intel corporation", "intel corp inc",
    # JPMorgan Chase
    "jpmc", "jpmorgan", "jpmorgan chase", "jp morgan", "jpmorgan chase & co", "jpmorgan chase & co inc", "chase",
    # Meta
    "meta", "facebook", "meta platforms", "meta platforms inc", "facebook inc",
    # Microsoft
    "microsoft", "msft", "microsoft corp", "microsoft corporation", "microsoft corp inc",
    # Netflix
    "netflix", "netflix inc",
    # Nvidia
    "nvidia", "nvda", "nvidia corp", "nvidia corporation", "nvidia corp inc",
    # Southwest Airlines
    "southwest", "southwest airlines", "southwest airlines co",
    # TSMC
    "tsmc", "taiwan semiconductor", "taiwan semiconductor manufacturing", "taiwan semiconductor manufacturing company limited",
    # Uber
    "uber", "uber technologies", "uber technologies inc",
    # United Airlines
    "united", "united airlines", "united airlines holdings", "united airlines holdings inc", "ual",
}

# Domain-specific: roles, filler, conversational
domain_stopwords = {
    # Roles
    "ceo", "president", "officer", "vp", "executive", "director", "analyst",
    "chairman", "coo", "cfo", "cto", "board", "partner", "manager",  
    
    # Filler / Conversational
    "thanks", "thank", "welcome", "yes", "okay", "just", "like", "right", 
    "think", "know", "sure", "guess", "guys", "let", "talk", "talking",
    "question", "answer", "comment", "comments", "operator", "hello", "hi",
    "today", "afternoon", "morning", "hey", "appreciate", "awesome", "llc",
    "inc", "corp", "company", "team", "everyone", "anyone", "anybody", "et cetera", "etc", "nii", "factset", "factset copyright", "consent logo", "cetera",
}

# Company-specific stopwords
name_stopwords = {
    "amy", "andrew", "barnum", "bob", "brett", "bruce", "dara", "david", "devon", "duane",
    "emma", "glen", "huang", "isom", "jamie", "jane", "jean", "jeff", "jensen", "jeremy",
    "jon", "jordan", "lisa", "su", "mark", "mason", "michael", "mike", "monish", "patolawala",
    "peter", "robert", "roger", "ryan", "satya", "scott", "spencer", "steve", "sundar",
    "tammy", "vasu", "wang", "wendell", "wennink", "william", "wei", "hauenstein", "morgan",
    "joe", "john", "james", "stephen", "bernstein", "brian", "brad", "bradley", "brian", "carl", "chris",
    "khosrowshahi", "philipp", "phil", "peter", "robert", "roger", "ryan", "sundar", "tammy",
    'dan', 'daniel', 'stanley', 'raymond', 'michelle', 'michael', 'mike', 'mary', 'pfennigwerth', 'fraser',
}

month_stopwords = {
    "january", "february", "march", "april", "may", "june",
    "july", "august", "september", "october", "november", "december",
    # Optional: add common abbreviations
    "jan", "feb", "mar", "apr", "jun", "jul", "aug", "sep", "sept", "oct", "nov", "dec"
}

transcript_stopwords = {
    "transcript", "chief", "marketintelligence", "com", "ve", "td", "ll", "et", "oq", "tw", "fourth quarter", "third quarter",
    "second quarter", "first quarter", "quarter", "quarterly", "year", "annual", "goldman", "goldman sachs", "cowen",
    "wells fargo", "fargo", "jefferies", "bank of america", "bofa", "deutsche bank", "db", "barclays",
    "bank", "baird", "bmo", "citi", "ubs", "wells", "month", "closing", "rights reserved", 'republication', "bank of america", "boa", "bank america",
    "rights reserved", "reserved redistribution", 'written consent', 'copyright global', 'copyright group', 'copyright',
}

common_mid_sentence_stopwords = {
    "absolutely", "actually", "essentially", "especially", "yeah", "yep", "yup", "uh", "um", "ah", "okay", "ok",
}

bad_words = {
    "didn", "doesn", "don", "hadn", "hasn", "haven", "isn", "ma", "mightn", "mustn", "needn",
}

platform_stopwords = {
    "facebook", "instagram", "whatsapp", "reels", "metaverse", "messenger",
    "tiktok", "snapchat", "twitter", "x", "youtube", "linkedin", "android", "search", "chrome", "pixel", "adsense", "radeon", "epyc", "ryzen",
    "x86", "pentium", "core i7", "geforce", "cuda", 'rtx', 'gtx', "skyclub", "skymiles", "skyteams", 'euv', "morgan stanely", 'gemini', "llama",
    "airline", "airlines", "copilot", "copyright global", "american group", 'azure',
}

more_english_stopwords = {
    "a", "as", "able", "about", "above", "according", "accordingly",
	     "across", "actually", "after", "afterwards", "again", "against", "aint", "all", "allow",
	     "allows", "almost", "alone", "along", "already", "also", "although", "always", "am", "among",
	     "amongst", "an", "and", "another", "any", "anybody", "anyhow", "anyone", "anything", "anyway",
	     "anyways", "anywhere", "apart", "appear","appreciate", "appropriate", "are", "arent", "around",
	     "as", "aside", "ask", "asking", "associated", "at", "available", "away", "awfully", "be", "became",
	     "because", "become", "becomes", "becoming", "been", "before", "beforehand", "behind",
	     "being", "believe", "below", "beside", "besides", "best", "better", "between", "beyond",
	     "both", "brief", "but", "by", "cmon", "cs", "came", "can", "cant", "cannot", "cant",
	     "cause", "causes", "certain", "certainly", "changes", "clearly", "co", "com", "come",
	     "comes", "concerning", "consequently", "consider", "considering", "contain", "containing",
	     "contains", "corresponding", "could", "couldnt", "course", "currently", "definitely",
	     "described", "despite", "did", "didnt", "different", "do", "does", "doesnt", "doing",
	     "dont", "done", "down", "downwards", "during", "each", "edu", "eg", "eight", "either",
	     "else", "elsewhere", "enough", "entirely", "especially", "et", "etc", "even", "ever",
	     "every", "everybody", "everyone", "everything", "everywhere", "ex", "exactly", "example",
	     "except", "far", "few", "ff", "fifth", "first", "five", "followed", "following", "follows",
	     "for", "former", "formerly", "forth", "four", "from", "further", "furthermore", "get",
	     "gets", "getting", "given", "gives", "go", "goes", "going", "gone", "got", "gotten",
	     "greetings", "had", "hadnt", "happens", "hardly", "has", "hasnt", "have", "havent",
	     "having", "he", "hes", "hello", "help", "hence", "her", "here", "heres", "hereafter",
	     "hereby", "herein", "hereupon", "hers", "herself", "hi", "him", "himself",
	     "his", "hither", "hopefully", "how", "howbeit", "however", "i", "id", "ill", "im", "ive",
	     "ie", "if", "ignored", "immediate", "in", "inasmuch", "inc", "indeed", "indicate",
	     "indicated", "indicates", "inner", "insofar", "instead", "into", "inward", "is",
	     "isnt", "it", "itd", "itll", "its", "its", "itself", "just", "keep", "keeps", "kept",
	     "know", "knows", "known", "last", "lately", "later", "latter", "latterly", "least",
	     "less", "lest", "let", "lets", "like", "liked", "likely", "little", "look", "looking",
	     "looks", "ltd", "mainly", "many", "may", "maybe", "me", "mean", "meanwhile", "merely",
	     "might", "more", "moreover", "most", "mostly", "much", "must", "my", "myself",
	     "name", "namely", "nd", "near", "nearly", "necessary", "need", "needs", "neither",
	     "never", "nevertheless", "new", "next", "nine", "no", "nobody", "non", "none", "noone",
	     "nor", "normally", "not", "nothing", "novel", "now", "nowhere", "obviously", "of",
	     "off", "often", "oh", "ok", "okay", "old", "on", "once", "one", "ones", "only",
	     "onto", "or", "other", "others", "otherwise", "ought", "our", "ours", "ourselves",
	     "out", "outside", "over", "overall", "own", "particular", "particularly",
	     "per", "perhaps", "placed", "please", "plus", "possible", "presumably", "probably",
	     "provides", "que", "quite", "qv", "rather", "rd", "re", "really", "reasonably",
	     "regarding", "regardless", "regards", "relatively", "respectively", "right", "said",
	     "same", "saw", "say", "saying", "says", "second", "secondly", "see", "seeing",
	     "seem", "seemed", "seeming", "seems", "seen", "self", "selves", "sensible", "sent",
	     "serious", "seriously", "seven", "several", "shall", "she", "should", "shouldnt",
	     "since", "six", "so", "some", "somebody", "somehow", "someone", "something",
	     "sometime", "sometimes", "somewhat", "somewhere", "soon", "sorry", "specified", "specify",
	     "specifying", "still", "sub", "such", "sup", "sure", "ts", "take", "taken", "tell", "tends",
	     "th", "than", "thank", "thanks", "thanx", "that", "thats", "thats", "the", "their", "theirs",
	     "them", "themselves", "then", "thence", "there", "theres", "thereafter", "thereby",
	     "therefore", "therein", "theres", "thereupon", "these", "they", "theyd",
	     "theyll", "theyre", "theyve", "think", "third", "this", "thorough",
	     "thoroughly", "those", "though", "three", "through", "throughout", "thru",
	     "thus", "to", "together", "too", "took", "toward", "towards", "tried", "tries",
	     "truly", "try", "trying", "twice", "two", "un", "under", "unfortunately",
	     "unless", "unlikely", "until", "unto", "up", "upon", "us", "use", "used",
	     "useful", "uses", "using", "usually", "value", "various", "very", "via", "viz",
	     "vs", "want", "wants", "was", "wasnt", "way", "we", "wed", "well", "were", "weve",
	     "welcome", "well", "went", "were", "werent", "what", "whats", "whatever", "when",
	     "whence", "whenever", "where", "wheres", "whereafter", "whereas", "whereby",
	     "wherein", "whereupon", "wherever", "whether", "which", "while", "whither", "who",
	     "whos", "whoever", "whole", "whom", "whose", "why", "will", "willing", "wish",
	     "with", "within", "without", "wont", "wonder", "would", "would", "wouldnt", "yes",
	     "yet", "you", "youd", "youll", "youre", "youve", "your", "yours", "yourself",
	     "yourselves", "zero"
}

custom_stopwords = ENGLISH_STOP_WORDS.union(domain_stopwords) \
        .union(company_stopwords \
        .union(name_stopwords) \
        .union(month_stopwords) \
        .union(transcript_stopwords) \
        .union(common_mid_sentence_stopwords) \
        .union(platform_stopwords) \
        .union(more_english_stopwords) \
        .union(bad_words))

In [49]:
company_to_sector = {
    # Industrials
    "Delta": "Airlines",
    "SouthWest": "Airlines",
    "United": "Airlines",
    "AmericanAirlines": "Airlines",

    # Real Estate
    "CushmanWakefield": "RealEstate",
    "CBRE": "RealEstate",

    # Semiconductors
    "AMD": "Semiconductors",
    "ASML": "Semiconductors",
    "Intel": "Semiconductors",
    "Nvidia": "Semiconductors",
    "TSMC": "Semiconductors",

    # Tech
    "Microsoft": "Tech",
    "Google": "Tech",
    "Meta": "Tech",
    "Netflix": "Tech",
    "Uber": "Tech",

    # Finance
    "Blackstone": "Finance",
    "Citigroup": "Finance",
    "JPMC": "Finance",

    # Consumer Goods 
    "3M": "ConsumerGoods",
}


In [64]:
def clean_text(text):
    # Lowercase
    text = text.lower()
    
    # Remove URLs, emails
    text = re.sub(r"http\S+|www\S+|\S+@\S+", "", text)
    
    # Remove legal boilerplate or repeated transcript phrases
    patterns = [
        r"\b(q[1-4]|fy\d{2}|eps|gaap|non[- ]?gaap|ebitda|na|nm)\b",
        r"\b(thank you|good morning|good afternoon|prior written consent|refinitiv|spglobal|callstreet)\b",
        r"\b\d+\b",  # numbers
    ]
    for p in patterns:
        text = re.sub(p, "", text, flags=re.IGNORECASE)

    return text

# Load Data

In [101]:
# 1. Load all .txt files and metadata
base_dir = "../data"
documents = []
companies = []
quarters = []

for company in os.listdir(base_dir):
    company_path = os.path.join(base_dir, company)
    if os.path.isdir(company_path):
        for file_path in glob.glob(f"{company_path}/*.txt"):
            with open(file_path, "r", encoding="latin1") as f:
                text = f.read()
            cleaned = clean_text(text)
            documents.append(cleaned)
            companies.append(company)
            quarters.append(os.path.basename(file_path).split('-')[0] + "-" + os.path.basename(file_path).split('-')[1])  # e.g., "Q1-2023"

# 2. TF-IDF Vectorization with ngrams
vectorizer = TfidfVectorizer(
    stop_words=list(custom_stopwords),
    ngram_range=(1, 2),
    max_features=1000,
    min_df=2,
    max_df=0.9,
    token_pattern=r'(?u)\b[a-zA-Z]{2,}\b'
)

tfidf_matrix = vectorizer.fit_transform(documents)
tfidf_terms = vectorizer.get_feature_names_out()



In [None]:
svd = TruncatedSVD(n_components=50, random_state=42)
svd_matrix = svd.fit_transform(tfidf_matrix)

# 4. Save metadata and features
df = pd.DataFrame(svd_matrix)
df["company"] = companies
df["quarter"] = quarters

In [91]:
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_terms)
tfidf_df["company"] = companies
tfidf_df["quarter"] = quarters

# TODO
We need to figure out which words to filter out from these (like single letters, "LISA" - name for a CEO, etc)

In [102]:
print(tfidf_terms)

['ability' 'accelerate' 'accelerated' 'acceleration' 'access' 'account'
 'accounting' 'accuracy' 'achieve' 'achieved' 'acquisition' 'actions'
 'active' 'activities' 'activity' 'ad' 'add' 'added' 'adding' 'addition'
 'additional' 'address' 'adjusted' 'adjusted operating' 'adjustments'
 'adoption' 'ads' 'advanced' 'advantage' 'advertisers' 'advertising'
 'affiliated' 'affiliated companies' 'ago' 'agreement' 'ai' 'air'
 'air lines' 'aircraft' 'allocation' 'america' 'american' 'american group'
 'announced' 'anticipate' 'app' 'applicable' 'applications' 'approach'
 'approximately' 'approximately billion' 'approximately million' 'apps'
 'architecture' 'area' 'areas' 'asia' 'asset' 'assets' 'associates'
 'assume' 'assumptions' 'audience' 'automotive' 'average' 'baker'
 'balance' 'balance sheet' 'balances' 'banking' 'base' 'based current'
 'basically' 'basis' 'basis points' 'begin' 'beginning' 'benefit'
 'benefits' 'big' 'bigger' 'biggest' 'billion billion' 'boeing' 'book'
 'bookings' 'brand' 

In [88]:
tfidf_df.head()

Unnamed: 0,ability,able,accelerate,accelerated,acceleration,access,account,achieve,acquisition,actions,...,working,workloads,world,written,written consent,yeah,yield,york,company,quarter
0,0.004314,0.020861,0.0,0.005748,0.006721,0.006107,0.006777,0.0,0.0,0.231291,...,0.053877,0.0,0.0084,0.0,0.0,0.0,0.006453,0.0,3M,Q1-2023
1,0.004657,0.036033,0.005981,0.01241,0.007256,0.0,0.007317,0.012985,0.007137,0.080982,...,0.049215,0.0,0.009069,0.0,0.0,0.0,0.0,0.0,3M,Q1-2024
2,0.004744,0.022941,0.0,0.0,0.0,0.006716,0.0,0.0,0.0,0.130614,...,0.027346,0.0,0.0,0.0,0.0,0.0,0.014192,0.0,3M,Q2-2023
3,0.021074,0.020381,0.0,0.0,0.0,0.0,0.008277,0.0,0.008074,0.015268,...,0.045551,0.0,0.025646,0.0,0.0,0.007179,0.01576,0.0,3M,Q2-2024
4,0.014057,0.045314,0.006017,0.006243,0.0,0.006633,0.0,0.0,0.0,0.10863,...,0.031509,0.0,0.0,0.0,0.0,0.0,0.035042,0.0,3M,Q3-2023


In [11]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,42,43,44,45,46,47,48,49,company,quarter
0,0.386147,-0.089628,-0.02964,-0.232534,0.078826,-0.164527,0.18076,-0.068727,0.250279,-0.16735,...,-0.039736,-0.058058,-0.038541,-0.033137,0.054544,0.041494,0.027568,-0.010882,3M,Q1-2023
1,0.368425,-0.096676,-0.037689,-0.253953,0.088661,-0.174781,0.154138,-0.063419,0.24532,-0.156168,...,0.020505,0.004807,0.072851,0.02626,-0.03361,0.006023,-0.064832,0.0354,3M,Q1-2024
2,0.383213,-0.088868,-0.034083,-0.251979,0.103883,-0.180724,0.18135,-0.07221,0.244447,-0.171659,...,-0.043107,-0.040514,-0.040197,0.029044,0.022915,0.021052,-0.007279,-0.045466,3M,Q2-2023
3,0.459565,-0.074357,-0.056754,-0.241421,0.020082,-0.178733,0.122672,-0.063598,0.238666,-0.177607,...,0.111287,0.063363,-0.066542,0.037426,0.032765,-0.023974,-0.017382,0.0023,3M,Q2-2024
4,0.371547,-0.086633,-0.037332,-0.246429,0.074185,-0.176502,0.182363,-0.071818,0.257649,-0.174626,...,-0.06054,0.012191,0.005026,-0.030554,-0.037274,0.010453,-0.010907,0.000605,3M,Q3-2023


# Preprocessing