In [8]:
from utils import *

from pydantic import BaseModel, Field
from typing import Literal
from langchain_core.output_parsers import JsonOutputParser
from langchain.output_parsers import OutputFixingParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.prompts.chat import SystemMessagePromptTemplate, HumanMessagePromptTemplate
from langchain_openai import ChatOpenAI

In [9]:
# Load environment settings first; `load_dotenv` arrives via `from utils import *`
# (presumably python-dotenv's loader, used here for the OpenAI credentials — TODO confirm).
load_dotenv()

# Read every tab-separated feed dump found under 'output' and stack them into one frame.
file_list = get_file_paths('output', file_pattern='txt')
feed_frames = [pd.read_csv(path, sep='\t') for path in file_list]
news_raw = pd.concat(feed_frames, ignore_index=True)

# Prefix each description with its headline so the document text is self-contained.
# An empty description is read back as NaN, and `str + NaN` is NaN, which would discard
# the headline as well; filling with '' keeps the title for such rows.
dfs = news_raw.assign(
    description=news_raw['title'] + '. ' + news_raw['description'].fillna('')
)


In [10]:
# Preview the first five rows of the combined news frame.
dfs.head(n=5)

Unnamed: 0,link,guid,type,id,sponsored,title,description,pubDate
0,https://www.cnbc.com/2025/07/30/microsoft-mark...,108179389,cnbcnewsstory,108179389,False,Microsoft tops $4 trillion in market cap after...,Microsoft tops $4 trillion in market cap after...,"Wed, 30 Jul 2025 22:27:00 GMT"
1,https://www.cnbc.com/2025/07/30/metas-reality-...,108178021,cnbcnewsstory,108178021,False,Meta’s Reality Labs posts $4.53 billion loss i...,Meta’s Reality Labs posts $4.53 billion loss i...,"Wed, 30 Jul 2025 20:20:49 GMT"
2,https://www.cnbc.com/2025/07/25/india-under-pr...,108171329,cnbcnewsstory,108171329,False,Trump has slapped steep tariffs on India. Here...,Trump has slapped steep tariffs on India. Here...,"Thu, 31 Jul 2025 04:28:17 GMT"
3,https://www.cnbc.com/2025/07/31/trumps-aug-1-t...,108178228,cnbcnewsstory,108178228,False,Trump’s tariff deadline is near. Here’s a look...,Trump’s tariff deadline is near. Here’s a look...,"Thu, 31 Jul 2025 03:49:02 GMT"
4,https://www.cnbc.com/2025/07/30/fed-leaves-int...,108179045,cnbcnewsstory,108179045,False,"Divided Fed holds key interest rate steady, de...","Divided Fed holds key interest rate steady, de...","Wed, 30 Jul 2025 20:08:27 GMT"


In [11]:
# Columns carried along as per-document metadata; the text itself comes from 'description'.
metadata_columns = ['link', 'guid', 'type', 'id', 'sponsored', 'pubDate']
docs = df_to_docs(
    dfs,
    content_column='description',
    metadata_columns=metadata_columns,
)
# Show the first six documents as a sanity check.
docs[:6]

[Document(metadata={'link': 'https://www.cnbc.com/2025/07/30/microsoft-market-cap-tops-4-trillion-after-hours-on-earnings-beat.html', 'guid': '108179389', 'type': 'cnbcnewsstory', 'id': '108179389', 'sponsored': 'False', 'pubDate': 'Wed, 30 Jul 2025 22:27:00 GMT'}, page_content="Microsoft tops $4 trillion in market cap after hours, joining Nvidia in exclusive club. Based on its post-market trading, Microsoft has become the world's second $4 trillion company following quarterly earnings.  "),
 Document(metadata={'link': 'https://www.cnbc.com/2025/07/30/metas-reality-labs-second-quarter-2025.html', 'guid': '108178021', 'type': 'cnbcnewsstory', 'id': '108178021', 'sponsored': 'False', 'pubDate': 'Wed, 30 Jul 2025 20:20:49 GMT'}, page_content='Meta’s Reality Labs posts $4.53 billion loss in second quarter. Meta’s Reality Labs posts $4.53 billion loss in second quarter'),
 Document(metadata={'link': 'https://www.cnbc.com/2025/07/25/india-under-pressure-to-seal-trade-deal-with-us-as-deadline

In [17]:
# Closed set of sector labels accepted by the `sector` field.
SectorName = Literal[
    "Commercial Services",
    "Communications",
    "Consumer Durables",
    "Consumer Non-Durables",
    "Consumer Services",
    "Distribution Services",
    "Electronic Technology",
    "Energy Minerals",
    "Finance",
    "Health Services",
    "Health Technology",
    "Industrial Services",
    "Non-Energy Minerals",
    "Process Industries",
    "Producer Manufacturing",
    "Retail Trade",
    "Technology Services",
    "Transportation",
    "Utilities"
]


# Pydantic model describing the structured trading insight extracted from one news item.
# Documentation is kept in `#` comments on purpose: a class docstring would be emitted
# into the model's JSON schema and therefore alter the generated format instructions.
class Insights(BaseModel):
    # Who the news is about.
    company_or_institution: str = Field(..., description="Name of the listed company or institution")
    # One of the sector labels enumerated in SectorName.
    sector: SectorName = Field(..., description="Name of the sector")
    # Free-text instrument type.
    asset_type: str = Field(..., description='Type of asset to trade')
    # Binary signal: only "Buy" or "Sell" are allowed by the type.
    trading_decision: Literal["Buy", "Sell"] = Field(..., description="The trading decision signal")
    # Short justification; the length limit is stated only in the description text.
    motivation: str = Field(..., description="Reason for the trading decision, max 20 words")
    # Short topic tag; the length limit is stated only in the description text.
    news_topic: str = Field(..., description="Topic of the news, e.g., balance sheet, market share, new appointments, or other. max 3 words")

In [18]:

# Create the base JSON output parser; the Insights model supplies the schema used for the
# format instructions.
# NOTE(review): JsonOutputParser yields plain dicts and does not appear to enforce the
# model's Literal constraints (the recorded batch output contains values such as 'N/A'
# and 'Monitor'). Consider PydanticOutputParser if strict validation is required.
base_parser = JsonOutputParser(pydantic_object=Insights)

# Wrap the base parser so that unparsable output is sent to a second LLM call for repair.
llm_for_fixing = ChatOpenAI(temperature=0)
fixing_parser = OutputFixingParser.from_llm(parser=base_parser, llm=llm_for_fixing)

# Define a rich persona in the system message with added expertise
system_message = SystemMessagePromptTemplate.from_template(
    """You are Ava, a sharp and insightful trader assistant with deep expertise in quantitative finance, advanced statistical models, and short selling techniques.
You provide clear, concise, and actionable investment insights based on news feeds.
Maintain a friendly, confident, and professional tone, making complex concepts accessible and useful."""
)

# Human message: the task, the JSON schema the answer must follow, and the news text.
# Previously the schema was never shown to the model, so the chain depended on the
# fixing parser to reshape free-form answers.
human_message = HumanMessagePromptTemplate.from_template(
    """Extract the investment decision from the news feed below.

{format_instructions}

News feed:
{news_feed}

"""
)

# Create the chat prompt template. The format instructions are bound once via `partial`,
# so callers still pass only `news_feed`; being a substituted value, the braces inside
# the JSON schema are not re-interpreted as template variables.
chat_prompt = ChatPromptTemplate.from_messages([system_message, human_message]).partial(
    format_instructions=base_parser.get_format_instructions()
)

# Compose the chain: prompt -> LLM -> fixing parser
llm = ChatOpenAI(temperature=0)
chain = chat_prompt | llm | fixing_parser

# Example news feed: text of the first document
news = docs[0].page_content

# Invoke the chain on the single example and show the parsed dict
result = chain.invoke({"news_feed": news})
print(result)

{'company_or_institution': 'Microsoft', 'sector': 'Technology Services', 'asset_type': 'Stock', 'trading_decision': 'Buy', 'motivation': 'Strong quarterly earnings and market cap surpassing $4 trillion', 'news_topic': 'Market Cap, Earnings, Growth'}


In [15]:
def _as_news_feed(doc):
    """Render one document as its text followed by a 'key: value' listing of its metadata."""
    # A missing/empty metadata mapping is treated as no metadata lines at all.
    meta_lines = [f"{key}: {value}" for key, value in (doc.metadata or {}).items()]
    metadata_text = "\n".join(meta_lines)
    return f"{doc.page_content}\n\nMetadata:\n{metadata_text}"


# One prompt-input dict per document, in the same order as `docs`.
batch_inputs = [{"news_feed": _as_news_feed(doc)} for doc in docs]

batch_inputs

[{'news_feed': "Microsoft tops $4 trillion in market cap after hours, joining Nvidia in exclusive club. Based on its post-market trading, Microsoft has become the world's second $4 trillion company following quarterly earnings.  \n\nMetadata:\nlink: https://www.cnbc.com/2025/07/30/microsoft-market-cap-tops-4-trillion-after-hours-on-earnings-beat.html\nguid: 108179389\ntype: cnbcnewsstory\nid: 108179389\nsponsored: False\npubDate: Wed, 30 Jul 2025 22:27:00 GMT"},
 {'news_feed': 'Meta’s Reality Labs posts $4.53 billion loss in second quarter. Meta’s Reality Labs posts $4.53 billion loss in second quarter\n\nMetadata:\nlink: https://www.cnbc.com/2025/07/30/metas-reality-labs-second-quarter-2025.html\nguid: 108178021\ntype: cnbcnewsstory\nid: 108178021\nsponsored: False\npubDate: Wed, 30 Jul 2025 20:20:49 GMT'},
 {'news_feed': 'Trump has slapped steep tariffs on India. Here\'s why New Delhi did not rush into a deal with Washington. U.S. President Donald Trump on Wednesday announced 25% dut

In [19]:
# Run the chain over the first five prepared inputs only, then display the parsed results.
sample_inputs = batch_inputs[:5]
result = chain.batch(sample_inputs)
result

[{'company_or_institution': 'Microsoft',
  'sector': 'Technology Services',
  'asset_type': 'Stock',
  'trading_decision': 'Buy',
  'motivation': 'Market Cap Over $4 Trillion, Strong Quarterly Earnings',
  'news_topic': 'Market Share Growth'},
 {'company_or_institution': "Meta's Reality Labs",
  'sector': 'Technology Services',
  'asset_type': 'Stock',
  'trading_decision': 'Sell',
  'motivation': 'significant financial setback',
  'news_topic': 'Financial Performance'},
 {'company_or_institution': 'N/A',
  'sector': 'N/A',
  'asset_type': 'N/A',
  'trading_decision': 'N/A',
  'motivation': 'Monitor impact of tariffs on imports from India',
  'news_topic': 'Tariff Impact on Imports'},
 {'company_or_institution': 'U.S.',
  'sector': 'Trade',
  'asset_type': 'Tariffs',
  'trading_decision': 'Monitor',
  'motivation': 'Potential impact on international trade and market dynamics',
  'news_topic': 'Tariff Deadline'},
 {'company_or_institution': 'Federal Reserve',
  'sector': 'Finance',
  'a