In [None]:
from concurrent.futures import ThreadPoolExecutor, as_completed
from transformers import pipeline
import pandas as pd
import os

# Zero-shot classification pipeline, built once when this cell runs and reused
# by classify_title for every headline. Inference is pinned to the CPU.
classifier = pipeline(
    "zero-shot-classification",
    model="facebook/bart-large-mnli",
    device="cpu",
)


def classify_title(title: str):
    """Score a headline against the Negative/Positive/Neutral labels.

    Runs the module-level zero-shot ``classifier`` on *title* and flattens its
    output into a single dict: ``"title"`` holds the sequence echoed back by
    the classifier, and each returned label, lower-cased, maps to its score.
    """
    prediction = classifier(
        title, candidate_labels=["Negative", "Positive", "Neutral"]
    )

    # Labels and scores come back as parallel lists; pair them up directly.
    scores_by_label = {
        name.lower(): score
        for name, score in zip(prediction["labels"], prediction["scores"])
    }
    return {"title": prediction["sequence"], **scores_by_label}


def save_classified_titles(ticker: str, executor: ThreadPoolExecutor, *, index: int, length: int):
    """Classify a random sample of a ticker's article titles and save the scores.

    Reads ``merged_articles/{ticker}.csv``, samples up to 100 rows, runs
    ``classify_title`` on each sampled ``title`` through *executor*, and writes
    the collected results to ``sentiments/{ticker}.csv``. Returns immediately
    when that output file already exists, so already-processed tickers are
    skipped.

    Args:
        ticker: Symbol used to build both the input and the output file name.
        executor: Pool the per-title classification jobs are submitted to. It
            is not shut down here.
        index: Zero-based position of this ticker in the caller's loop; used
            only in the progress message.
        length: Total number of tickers in the caller's loop; used only in the
            progress message.

    Raises:
        Exception: Anything raised while reading the input file or by a
            classification job propagates to the caller.
    """
    output_path = f"sentiments/{ticker}.csv"
    if os.path.exists(output_path):
        return

    df = pd.read_csv(f"merged_articles/{ticker}.csv")

    # DataFrame.sample raises ValueError when n exceeds the number of rows
    # (sampling without replacement), so cap the sample size for tickers that
    # have fewer than 100 articles instead of aborting the whole run.
    sampled_df = df.sample(n=min(100, len(df)))
    sampled_titles = sampled_df["title"].tolist()

    futures = [executor.submit(classify_title, t) for t in sampled_titles]
    # Results are gathered in completion order, not in sample order.
    results = [future.result() for future in as_completed(futures)]

    os.makedirs("sentiments", exist_ok=True)

    sentiment_df = pd.DataFrame(results)
    sentiment_df.to_csv(output_path)

    print(f"({index + 1:3d}/{length}) SAVED: {ticker}")

In [None]:
# Driver: classify sampled headlines for each S&P 500 ticker, in sorted order.
sp500 = pd.read_csv("sp500.csv")
tickers = sorted(sp500["Symbol"])

# Only the tickers from position 472 onward are processed.
# NOTE(review): 472 looks like a manual resume offset from an earlier run —
# confirm before reusing; save_classified_titles already skips tickers whose
# output file exists.
pending = tickers[472:]

# The context manager shuts the pool down even when a ticker fails part-way,
# so worker threads are not left behind if the cell is re-run after an error.
with ThreadPoolExecutor(max_workers=24) as executor:
    for i, ticker in enumerate(pending):
        save_classified_titles(ticker, executor, index=i, length=len(pending))