In [1]:
import pandas as pd
from glob import glob
import json
from collections import defaultdict
import ast
from src.vllm import llm_chat, message_template, call_llm_chat
from templates.PROMPTS import Prompts
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
from queue import Queue
from threading import Thread
# check size of summary value
from transformers import AutoTokenizer
import time

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# --- Configuration: output location, model/tokenizer, inputs and schema ---
summary_path = "/data/kai/forecasting/data/summary_v0.2"
ticker = "AMD"
# Combined summaries are written to <summary_path>/<ticker>/.
save_dir = os.path.join(summary_path, ticker)
os.makedirs(save_dir, exist_ok=True)


model_name = "meta-llama/Meta-Llama-3-70B-Instruct"
# model_name = "casperhansen/llama-3-70b-instruct-awq"
# The tokenizer is only used to measure chunk sizes (see chunk_summaries).
tokenizer = AutoTokenizer.from_pretrained(model_name)

# One CSV per file name under the ticker's document directory, in sorted order.
paths = sorted(glob(f"/data/kai/forecasting/data/document_v0.2/{ticker}/*.csv"))
# Build the prompts from `ticker` so that changing the ticker above is enough.
prompts = Prompts(ticker)
# Read the guided-decoding schema with a context manager so the handle is closed.
with open("/data/kai/forecasting/multimodal/financial/templates/guided_json_summary_v0.3.json", 'r') as schema_file:
    guided_json = json.load(schema_file)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
# Instruction text passed as the first argument of message_template() by
# llm_combine_chunk and combine_summaries.
# NOTE: this constant is assigned again in a later cell (In [227]) with different
# wording for the third and fourth instructions; whichever cell ran last is the
# value those functions read, because they look the name up at call time.
COMBINE_JSON_PROMPT = \
f"""For the list of stock news of {ticker} in JSON format, combine them into one json format.

1. Combine the list of key_numbers, growth_trends, overall_market_outlook, major_stock_movements, significant_economic_indicators, notable_company_specific_news, and a final summary.
2. Reorder the text as needed.
2. Retain as much information and always adding relevant units or details.
3. Avoid making up any information.
4. Provide a concise summary without using introductory phrases like 'Here is a summary of ___' or similar. Focus directly on the key points.
"""

In [161]:
import numpy as np

# parse raw JSON
def collapse_metrics(json_summaries):
    """Normalise raw summary records into a list of flat dictionaries.

    Accepted inputs:
      * ``np.ndarray`` of strings: each element is parsed with
        ``ast.literal_eval``; keys whose value is falsy are dropped.
      * ``list`` of dicts: keys whose value is falsy are dropped.
      * ``str``: one literal dict; every key is kept, including empty ones.

    Inside every list-valued field, dict items shaped ``{'metric', 'value'}``
    or ``{'key', 'value'}`` are flattened to ``"<name>: <value>"`` strings,
    dict items with neither shape are dropped, and non-dict items are kept
    unchanged. Non-list values are copied as they are.

    Returns:
        list[dict]: one dict per input record; records that end up with no
        keys are omitted.

    Raises:
        TypeError: if ``json_summaries`` is not an ndarray, list or str
            (previously this surfaced as an UnboundLocalError).
    """
    if isinstance(json_summaries, np.ndarray):
        data = [{k: v for k, v in ast.literal_eval(summary).items() if v} for summary in json_summaries]
    elif isinstance(json_summaries, list):
        data = [{k: v for k, v in summary.items() if v} for summary in json_summaries]
    elif isinstance(json_summaries, str):
        # A single raw reply: empty fields are deliberately kept here.
        data = [dict(ast.literal_eval(json_summaries))]
    else:
        raise TypeError(
            "collapse_metrics expects a numpy array, list or str, "
            f"got {type(json_summaries).__name__}"
        )

    collapsed_data = []
    for entry in data:
        new_entry = {}
        for key, value in entry.items():
            if not isinstance(value, list):
                new_entry[key] = value
                continue
            new_value = []
            for item in value:
                if not isinstance(item, dict):
                    new_value.append(item)
                elif 'metric' in item and 'value' in item:
                    new_value.append(f"{item['metric']}: {item['value']}")
                elif 'key' in item and 'value' in item:
                    new_value.append(f"{item['key']}: {item['value']}")
                # Dicts with any other shape are skipped.
            new_entry[key] = new_value
        if new_entry:
            collapsed_data.append(new_entry)

    return collapsed_data

# chunk JSON summary in a list
def chunk_summaries(tokenizer, summaries: list[dict], max_split_token=4096):
    """
    Pack summary fields into text chunks without splitting a key/value pair.

    Every (key, value) pair of every summary is rendered as "key value"
    (or "key [value]" for the "summary" key). A rendered pair is joined onto
    the latest chunk with ", " as long as the joined text encodes to at most
    max_split_token tokens; otherwise it starts a new chunk. A single pair is
    never split, so one oversized pair becomes a chunk of its own.
    """
    packed = []
    for record in summaries:
        for field, content in record.items():
            if field == "summary":
                piece = f"{field} [{content}]"
            else:
                piece = f"{field} {content}"

            if packed:
                candidate = packed[-1] + ", " + piece
                if len(tokenizer.encode(candidate)) <= max_split_token:
                    # Still within budget: grow the current chunk.
                    packed[-1] = candidate
                    continue
            packed.append(piece)
    return packed



def collapse_results(results):
    """Flatten one level of nesting: concatenate the items of every element of results."""
    return [item for group in results for item in group]

def combine_results(results):
    """Merge several summary records into one mapping of key -> list.

    Each element of results is either a dict or a string holding a literal
    dict (parsed with ast.literal_eval). List values are concatenated under
    their key; any other value is appended as a single item.

    Returns a defaultdict(list).
    """
    merged = defaultdict(list)

    for entry in results:
        record = ast.literal_eval(entry) if type(entry) is str else entry
        for field in record:
            payload = record[field]
            if isinstance(payload, list):
                # e.g. 'key_numbers': splice the items in rather than nesting.
                merged[field].extend(payload)
            else:
                merged[field].append(payload)
    return merged


def llm_combine_chunk(chunk):
    """Send one text chunk to the chat model with COMBINE_JSON_PROMPT and return the reply after collapse_metrics."""
    messages = message_template(COMBINE_JSON_PROMPT, chunk)
    raw_reply = llm_chat(messages, model=model_name, guided_json=guided_json)
    return collapse_metrics(raw_reply)

def batch_llm_combine_summaries(summaries, max_workers=20):
    """Chunk the summaries and combine every chunk with the LLM in parallel.

    Args:
        summaries: list of summary dicts, as accepted by chunk_summaries.
        max_workers: size of the thread pool (defaults to the previous
            hard-coded value of 20).

    Returns:
        list: one llm_combine_chunk result per chunk, in the same order as
        the chunks. The earlier as_completed-based version returned them in
        completion order, which varied from run to run.
    """
    summary_chunks = chunk_summaries(tokenizer, summaries)
    if not summary_chunks:
        # Nothing to combine; skip creating a pool.
        return []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # executor.map submits every chunk up front and yields results in input order.
        return list(executor.map(llm_combine_chunk, summary_chunks))


In [None]:
# NOTE(review): combine_results is defined with one required positional
# parameter (`results`), so this bare call raises TypeError if the cell is run.
combine_results()

In [227]:
# Instruction text passed as the first argument of message_template() when
# combining summaries. The instruction list is numbered 1-5; it previously
# contained two items labelled "2.".
COMBINE_JSON_PROMPT = \
f"""For the list of stock news of {ticker} in JSON format, combine them into one json format.

1. Combine the list of key_numbers, growth_trends, overall_market_outlook, major_stock_movements, significant_economic_indicators, notable_company_specific_news, and a final summary.
2. Reorder the text as needed.
3. Remove repetitive details and always add relevant units or details
4. Avoid making up any information and repeating information.
5. Provide a concise summary without using introductory phrases like 'Here is a summary of ___' or similar. Focus directly on the key points.
"""

In [233]:
# Input CSV for the single-file walkthrough; its "summary" column is read with pd.read_csv.
path = "/data/kai/forecasting/data/document_v0.2/AMD/2022-05-04.csv"

In [228]:
# Load the per-document summaries of one day and flatten them.
json_summaries = pd.read_csv(path)['summary'].values
# json_summaries = pd.read_csv("/data/kai/forecasting/data/document_v0.2/AMD/2022-03-04.csv")['summary'].values
cleaned_results = collapse_metrics(json_summaries) # list[{k: v}, {k: {k:v}}]

print("First combination task...")
json_results = batch_llm_combine_summaries(cleaned_results)
formatted_results = collapse_metrics(collapse_results(json_results)) # this is like cleaned_results
def get_current_summary_length(results):
    """Return how many "summary" values remain across all records in results."""
    return len(combine_results(results)["summary"])

# Upper bound on re-combination rounds: every round issues LLM calls, and the
# loop would otherwise never end if the summaries stop shrinking to one.
MAX_COMBINE_ROUNDS = 10

combine_count = 0
summary_count = get_current_summary_length(formatted_results)
while summary_count > 1:
    if combine_count >= MAX_COMBINE_ROUNDS:
        print(f"Stopping after {combine_count} rounds with {summary_count} summaries still uncombined.")
        break
    combine_count += 1
    print(f"Combining summaries {combine_count} ... length summaries: {summary_count}")
    json_results = batch_llm_combine_summaries(formatted_results)
    formatted_results = collapse_metrics(collapse_results(json_results)) # this is like cleaned_results
    # Count once per round; each count re-merges every record.
    summary_count = get_current_summary_length(formatted_results)

First combination task...
Combining summaries 1 ... length summaries: 4
Combining summaries 2 ... length summaries: 2


In [234]:
# Derive the output file name from the input CSV name: text after the last '/'
# and before the first '.csv', saved as <save_dir>/<timestamp>.json.
timestamp = path.rsplit('/', 1)[-1].partition('.csv')[0]
save_path = os.path.join(save_dir, f"{timestamp}.json")

In [244]:
# Write the first record of the first result of the latest combination round as indented JSON.
with open(save_path, mode='w') as json_file:
    json.dump(obj=json_results[0][0], fp=json_file, indent=4)

In [243]:
# Inspect the record that the save cell writes to save_path.
json_results[0][0]

{'key_numbers': ['Revenue: $5,887 million',
  'Gross Margin: 48%',
  'Operating Income: $951 million',
  'Net Income: $786 million',
  'Earnings Per Share (EPS): $0.56',
  'Year-over-year growth: 71%',
  'Quarter-over-quarter growth: 22%',
  'Non-GAAP gross margin: 53%',
  'Non-GAAP operating income: $1.8 billion',
  'Non-GAAP net income: $1.6 billion',
  'Non-GAAP earnings per share: $1.13',
  'Adjusted EBITDA: $1,967 million',
  'Free Cash Flow: $924 million',
  'Annual Revenue Forecast: well above estimates',
  'Pre-market Trade: +7%',
  'Current Stock Price: $89.66',
  'Expected EPS for Q1: $0.91',
  'Expected Revenue for Q1: $5.52 billion',
  '52-week High: $164.46',
  '52-week Low: $72.50',
  'Revenue: 9.3 billion',
  'Operating Income: -34%',
  'Market Cap: 626B',
  "Today's Change: -4.82%",
  'Current Price: 196.37',
  'Annual Revenue Growth: 195%',
  'Operating Margins: 19.2%',
  'Diluted EPS: 2.86',
  'Revenue: 18.8 billion',
  'Revenue Growth: 53%',
  'Research and Developme

In [230]:
# Inspect the flattened output of the most recent combination round.
formatted_results

[{'key_numbers': ['Revenue: $5,887 million',
   'Gross Margin: 48%',
   'Operating Income: $951 million',
   'Net Income: $786 million',
   'Earnings Per Share (EPS): $0.56',
   'Year-over-year growth: 71%',
   'Quarter-over-quarter growth: 22%',
   'Non-GAAP gross margin: 53%',
   'Non-GAAP operating income: $1.8 billion',
   'Non-GAAP net income: $1.6 billion',
   'Non-GAAP earnings per share: $1.13',
   'Adjusted EBITDA: $1,967 million',
   'Free Cash Flow: $924 million',
   'Annual Revenue Forecast: well above estimates',
   'Pre-market Trade: +7%',
   'Current Stock Price: $89.66',
   'Expected EPS for Q1: $0.91',
   'Expected Revenue for Q1: $5.52 billion',
   '52-week High: $164.46',
   '52-week Low: $72.50',
   'Revenue: 9.3 billion',
   'Operating Income: -34%',
   'Market Cap: 626B',
   "Today's Change: -4.82%",
   'Current Price: 196.37',
   'Annual Revenue Growth: 195%',
   'Operating Margins: 19.2%',
   'Diluted EPS: 2.86',
   'Revenue: 18.8 billion',
   'Revenue Growth: 5

In [53]:

def combine_summaries(summary_path, save_dir, model="casperhansen/llama-3-70b-instruct-awq"):
    """Combine the summaries of one CSV into a single JSON file.

    The output is ``<save_dir>/<name>.json``, where ``<name>`` is the CSV file
    name up to the first ``.csv``. An output file that already exists is
    treated as done and nothing is recomputed.

    Args:
        summary_path: path of a CSV file with a ``summary`` column.
        save_dir: directory that receives the JSON file.
        model: model name passed to llm_chat (defaults to the previously
            hard-coded value).

    Returns:
        The path written, or None when the output already existed.
    """
    timestamp = os.path.basename(summary_path).split('.csv')[0]
    save_path = os.path.join(save_dir, timestamp + ".json")

    if os.path.exists(save_path):
        return None

    summary_values = pd.read_csv(summary_path)['summary'].values
    # NOTE(review): combine_until_one is not defined in the visible part of this
    # notebook; it presumably comes from a cell that is not shown - confirm.
    summary_values = combine_until_one(summary_values)
    print(save_path)

    combined = combine_results(summary_values)
    messages = message_template(COMBINE_JSON_PROMPT, str(dict(combined)))

    response = llm_chat(messages, model, guided_json)
    # The reply is parsed as a Python literal. Only a reply that is still a
    # string afterwards (double-encoded JSON) needs json.loads; calling
    # json.loads on an already-parsed dict raises TypeError.
    response_data = ast.literal_eval(response) if isinstance(response, str) else response
    if isinstance(response_data, str):
        response_data = json.loads(response_data)

    # Serialise before opening the file: a failure here must not leave an empty
    # JSON behind, because an existing file makes later runs skip this input.
    serialized = json.dumps(response_data, indent=4)
    with open(save_path, 'w') as json_file:
        json_file.write(serialized)

    return save_path


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [55]:
# Run the per-file pipeline on one CSV; the call returns the written path, or
# None when the output JSON already exists in save_dir.
combine_summaries("/data/kai/forecasting/data/document_v0.2/AMD/2022-05-04.csv", save_dir)

Summarizing 5 /data/kai/forecasting/data/document_v0.2/AMD/2022-05-04.csv
Summarizing 5 /data/kai/forecasting/data/document_v0.2/AMD/2022-05-04.csv
Summarizing 5 /data/kai/forecasting/data/document_v0.2/AMD/2022-05-04.csv
Summarizing 5 /data/kai/forecasting/data/document_v0.2/AMD/2022-05-04.csv
Summarizing 5 /data/kai/forecasting/data/document_v0.2/AMD/2022-05-04.csv
Summarizing 5 /data/kai/forecasting/data/document_v0.2/AMD/2022-05-04.csv
Summarizing 5 /data/kai/forecasting/data/document_v0.2/AMD/2022-05-04.csv
Summarizing 5 /data/kai/forecasting/data/document_v0.2/AMD/2022-05-04.csv
Summarizing 5 /data/kai/forecasting/data/document_v0.2/AMD/2022-05-04.csv
Summarizing 5 /data/kai/forecasting/data/document_v0.2/AMD/2022-05-04.csv
Summarizing 5 /data/kai/forecasting/data/document_v0.2/AMD/2022-05-04.csv
Summarizing 5 /data/kai/forecasting/data/document_v0.2/AMD/2022-05-04.csv
Summarizing 5 /data/kai/forecasting/data/document_v0.2/AMD/2022-05-04.csv
Summarizing 5 /data/kai/forecasting/da

KeyboardInterrupt: 

In [8]:
# Process every input CSV through combine_summaries on a thread pool.
# The loop variable is named csv_path so that it does not rebind the
# notebook-level `path` used by the single-file cells.
summary_workers = 1
with ThreadPoolExecutor(max_workers=summary_workers) as executor:
    future_to_path = {
        executor.submit(combine_summaries, csv_path, save_dir): csv_path
        for csv_path in paths
    }
    for future in as_completed(future_to_path):
        csv_path = future_to_path[future]
        try:
            result = future.result()
        except Exception as exc:
            # Report the failing input and keep going with the remaining files.
            print(f"{csv_path} generated an exception: {exc}")
        else:
            # combine_summaries returns None for inputs whose output already exists.
            if result:
                print(f"Processed and saved: {result}")

/data/kai/forecasting/data/summary_v0.2/AMD/2022-03-14.json
/data/kai/forecasting/data/summary_v0.2/AMD/2022-03-15.json
/data/kai/forecasting/data/summary_v0.2/AMD/2022-03-16.json
/data/kai/forecasting/data/summary_v0.2/AMD/2022-03-04.json
/data/kai/forecasting/data/summary_v0.2/AMD/2022-03-17.json
/data/kai/forecasting/data/summary_v0.2/AMD/2022-03-18.json
/data/kai/forecasting/data/summary_v0.2/AMD/2022-03-08.json
/data/kai/forecasting/data/summary_v0.2/AMD/2022-03-19.json
/data/kai/forecasting/data/summary_v0.2/AMD/2022-03-21.json
/data/kai/forecasting/data/summary_v0.2/AMD/2022-03-22.json
/data/kai/forecasting/data/summary_v0.2/AMD/2022-03-26.json
/data/kai/forecasting/data/summary_v0.2/AMD/2022-03-24.json
/data/kai/forecasting/data/summary_v0.2/AMD/2022-03-25.json
/data/kai/forecasting/data/summary_v0.2/AMD/2022-03-12.json
/data/kai/forecasting/data/summary_v0.2/AMD/2022-03-23.json
/data/kai/forecasting/data/summary_v0.2/AMD/2022-03-27.json
/data/kai/forecasting/data/summary_v0.2/

In [92]:
# Display the current value of the notebook-level `timestamp`.
timestamp

'2022-03-03'

In [91]:
# NOTE(review): in the visible cells response_data is only assigned inside
# combine_summaries, where it is a local; this cell presumably relies on a
# notebook-level variable from a cell that is not shown - confirm before
# relying on Restart & Run All.
print(response_data)

{'key_numbers': [], 'growth_trends': [], 'overall_market_outlook': [], 'major_stock_movements': [], 'significant_economic_indicators': [], 'notable_company_specific_news': ['Intel Corp forecast second-quarter revenue and profit below Wall Street expectations on worries of demand weakness in its largest end market, PCs, and increased supply-chain uncertainty due to COVID-19 lockdowns in China.'], 'summary': "Intel Corp's shares fell after forecasting a gloomy quarter due to supply-chain woes, citing demand weakness in PCs and increased uncertainty from COVID-19 lockdowns in China."}
