In [92]:
import os
os.sys.path.append('../')
from glob import glob
from urllib.parse import urlparse
import pandas as pd
import requests
from bs4 import BeautifulSoup
from threading import Thread, Lock
from concurrent.futures import ThreadPoolExecutor, as_completed
from queue import Queue
import os
from tqdm import tqdm
import time
import numpy as np
from src.vllm import batch_call_llm_chat, llm_chat, message_template
from src.utils import load_json, chunk_documents
import outlines
from outlines.integrations.vllm import JSONLogitsProcessor, RegexLogitsProcessor
from openai import OpenAI
import threading
import queue
import logging
import requests
import json
from transformers import AutoTokenizer
import concurrent.futures
import json

In [93]:
# Tokenizer of the target model; used further down to split article text
# into token-bounded chunks.
model_name = "meta-llama/Meta-Llama-3-70B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Raw input CSVs and the output directories for per-chunk documents and summaries.
ticker_paths = sorted(glob("/data/kai/forecasting/data/raw_v0.2/*.csv"))
summary_dir = "/data/kai/forecasting/data/summary_v0.2"
document_dir = "/data/kai/forecasting/data/document_v0.2"
os.makedirs(summary_dir, exist_ok=True)
os.makedirs(document_dir, exist_ok=True)

# Fail with a clear message instead of a bare IndexError when no input exists.
if not ticker_paths:
    raise FileNotFoundError(
        "No ticker CSV files found in /data/kai/forecasting/data/raw_v0.2"
    )

# The ticker symbol is derived from the file name of the first CSV
# (a previously hard-coded "AAPL" default was always overwritten here).
ticker_path = ticker_paths[0]
ticker = os.path.splitext(os.path.basename(ticker_path))[0]

# Drop the saved index column if present; errors="ignore" keeps the cell
# working for CSVs that were written without an index.
df = pd.read_csv(ticker_path)
df = df.drop(columns=["Unnamed: 0"], errors="ignore")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [63]:
def process_document(prompt, doc, guided_json=None):
    """Send one prompt/document pair to the chat LLM and decode the reply.

    Args:
        prompt: Instruction text handed to ``message_template``.
        doc: Document text handed to ``message_template`` alongside the prompt.
        guided_json: Optional value forwarded unchanged to ``llm_chat`` as
            its ``guided_json`` keyword argument.

    Returns:
        The object produced by ``json.loads`` on the LLM response.

    Raises:
        json.JSONDecodeError: If the response is not valid JSON.
    """
    return json.loads(
        llm_chat(message_template(prompt, doc), guided_json=guided_json)
    )

def combine_results(results):
    """Merge a sequence of per-chunk result dicts into one dict of lists.

    Every known summary field is always present in the output, even when no
    chunk supplied it. List values are concatenated; any other value is
    appended as a single element.

    Keys that are not part of the predefined set (for example
    ``macroeconomic_numbers_or_trends``, which the summary prompt asks for)
    are collected under their own name instead of raising ``KeyError``.

    Args:
        results: Iterable of dicts mapping field name to a list or a scalar.

    Returns:
        dict mapping each field name to the combined list of values.
    """
    results_dict = {
        'key_numbers': [],
        'growth_trends': [],
        'overall_market_outlook': [],
        'major_stock_movements': [],
        'significant_economic_indicators': [],
        'notable_company_specific_news': [],
        'summary': []
    }

    for result in results:
        for key, value in result.items():
            # setdefault tolerates fields the template above does not list.
            bucket = results_dict.setdefault(key, [])
            if isinstance(value, list):  # e.g. lists of dicts such as 'key_numbers'
                bucket.extend(value)
            else:
                bucket.append(value)
    return results_dict


In [64]:
# Test case: free-form summarisation prompt.
# (A backslash continuation previously glued "a" and "stock" together, so the
# model was sent "of astock news website"; the sentence is now on one line.)
prompt = """You are a helpful assistant for converting raw text of a stock news website into relevant text information. Summarize the following raw text by following the guideline:
1. Filter out irrelevant information unrelated to stock news.
2. Provide a concise summary that includes key numbers, growth trends, and the overall market outlook.
3. Mention major stock movements, significant economic indicators, and any notable company-specific news.
4. Avoid making up any information.
"""
# Prompt used for the structured (guided JSON) per-chunk summary.
# NOTE(review): this prompt names `macroeconomic_numbers_or_trends`, while
# combine_results initialises `significant_economic_indicators` - confirm
# which field name the guided JSON template actually uses.
SUMMARY_PROMPT = """You are a helpful assistant for converting raw text of a stock news website into relevant text information.
1. Include key_numbers, growth_trends, overall_market_outlook, major_stock_movements, macroeconomic_numbers_or_trends, notable_company_specific_news, and a final summary.
2. Preserve as much information as you can by always adding relevant units or details.
3. Avoid making up any information.
"""
# Prompt used to merge several per-chunk JSON results into one.
COMBINE_JSON_PROMPT = """
Combine the list of json into one json format.
"""

# Phrases found in scraped pages that are error/paywall placeholders rather
# than article text; rows containing any of them are skipped.
blocked_words = [
    "thestreet.comPlease enable JS and disable any ad blocker",
    "wsj.comPlease enable JS and disable any ad blocker",
    "Access Denied Access Denied You don't have permission to access",
    "Access to this page has been denied",
    "Sorry! Temporarily Unavailable Sorry, this page is temporarily unavailable for technical reasons."
]


In [65]:
# Work on just the first row while the pipeline is being developed.
temp_df = df.head(1)

In [66]:
# Display the one-row sample so its columns can be checked before processing.
temp_df

Unnamed: 0,timestamp,ticker,url,summary,sentiment_score,sentiment_label,text
0,2022-03-01,AAPL,https://www.aljazeera.com/economy/2022/3/1/us-...,A surge in oil sent shivers through risky asse...,-0.24775,Somewhat-Bearish,"US stocks fall, oil tops $105 as Ukraine crisi..."


In [None]:
# Per-ticker CSV that accumulates one row per summarised chunk.
document_path = os.path.join(document_dir, f"{ticker}.csv")
if os.path.exists(document_path) and os.path.getsize(document_path) > 0:
    # Keep rows written by earlier runs: the worker skips chunks that are
    # already present, so recreating the file here would discard finished
    # work and force every chunk through the LLM again.
    document_df = pd.read_csv(document_path)
else:
    # First run for this ticker: write a header-only CSV for the workers to append to.
    document_df = pd.DataFrame(columns=["row_idx", "doc_idx", "timestamp", "summary"])
    document_df.to_csv(document_path, index=False)

In [91]:
# Loaded from the v0.3 summary template and forwarded to process_document as
# `guided_json` - presumably a JSON schema constraining the LLM output; TODO confirm.
guided_json = load_json('../templates/guided_json_summary_v0.3.json')

# Bounded hand-off queue between the producer loop and the worker threads;
# put() blocks while 10 items are already waiting.
document_queue = Queue(maxsize=10)
# Guards every read and write of the CSV at `document_path` done by the workers.
document_lock = Lock()

def worker(queue):
    """Consume chunks from ``queue`` and append their summaries to the CSV.

    Each queue item is a ``(row_idx, doc_idx, timestamp, document)`` tuple;
    ``None`` is the shutdown sentinel. A chunk whose
    ``(row_idx, doc_idx, timestamp)`` already appears in the CSV at
    ``document_path`` is skipped, which makes re-runs resumable.

    Every item taken from the queue - including the sentinel and items whose
    processing fails - is acknowledged with ``task_done()``. Failures are
    logged and the worker moves on; previously an exception (for example an
    LLM reply that is not valid JSON) ended the thread without
    acknowledging the item, so ``queue.join()`` blocked forever.

    Args:
        queue: A ``queue.Queue``-like object providing ``get`` and ``task_done``.
    """
    while True:
        data = queue.get()
        try:
            if data is None:
                break
            row_idx, doc_idx, timestamp, document = data

            with document_lock:
                document_df = pd.read_csv(document_path)

            already_done = (
                (document_df['row_idx'] == row_idx)
                & (document_df['doc_idx'] == doc_idx)
                & (document_df['timestamp'] == timestamp)
            ).any()
            if already_done:
                continue

            # The LLM call runs outside the lock so workers can overlap.
            result = process_document(SUMMARY_PROMPT, document, guided_json)
            # NOTE(review): str(result) stores the Python repr of the parsed
            # object, not JSON text - readers of this CSV must parse it accordingly.
            new_row = pd.DataFrame({"row_idx": [row_idx], "doc_idx": [doc_idx], "timestamp": timestamp, "summary": [str(result)]})
            with document_lock:
                # Re-read under the lock so rows appended by other workers
                # in the meantime are not overwritten.
                document_df = pd.read_csv(document_path)
                document_df = pd.concat([document_df, new_row], ignore_index=True)
                document_df.to_csv(document_path, index=False)
        except Exception:
            # Keep the worker alive; the chunk stays absent from the CSV and
            # will be retried on the next run.
            logging.exception("Failed to process queue item %r", data)
        finally:
            # Runs on break/continue/exception alike, keeping join() accurate.
            queue.task_done()


# Number of concurrent summarisation threads.
doc_workers = 4
# Chunking parameters passed to chunk_documents (chunk size and overlap).
CHUNK_SIZE = 2048
CHUNK_OVERLAP = 512

doc_threads = []
for _ in range(doc_workers):
    thread = Thread(target=worker, args=(document_queue,))
    thread.start()
    doc_threads.append(thread)

try:
    # Add documents to the queue
    for row_idx, row in tqdm(temp_df.iterrows(), total=temp_df.shape[0]):
        text = row['text']
        # Missing text is read back from CSV as NaN (a float); skip it instead
        # of crashing on .strip() after the workers have already started.
        if not isinstance(text, str):
            continue

        # Lower-case once per row rather than once per blocked phrase.
        text_lower = text.strip().lower()
        if any(p.lower() in text_lower for p in blocked_words):
            continue

        documents = chunk_documents(tokenizer, text, CHUNK_SIZE, overlap=CHUNK_OVERLAP)
        for doc_idx, doc in enumerate(documents):
            document_queue.put((row_idx, doc_idx, row['timestamp'], doc))

    # Wait until every queued chunk has been acknowledged by a worker.
    document_queue.join()
finally:
    # Stop the worker threads even if producing failed; otherwise they would
    # stay blocked in queue.get() for the life of the kernel.
    for _ in range(doc_workers):
        document_queue.put(None)
    for thread in doc_threads:
        thread.join()

100%|██████████| 1/1 [00:00<00:00, 166.09it/s]


In [42]:
# NOTE(review): `document_summaries` is not defined in any visible cell, and this
# cell's execution count (42) predates the cells above - it appears to rely on
# leftover kernel state and will raise NameError on a fresh Restart & Run All.
# Merge the per-chunk results for the first entry into one dict of lists...
result_dict = combine_results(document_summaries[0])
# ...then ask the LLM to consolidate that merged dict (passed as its str() form)
# into a single JSON object using the same guided template.
final_summary = process_document(COMBINE_JSON_PROMPT, str(result_dict), guided_json)