In [38]:
from utils import *
import rich
from pydantic import BaseModel, Field
from typing import Literal, List, Optional
from langchain_core.output_parsers import JsonOutputParser
from langchain.output_parsers import OutputFixingParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.prompts.chat import SystemMessagePromptTemplate, HumanMessagePromptTemplate
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser

In [39]:
load_dotenv()

True

In [40]:
file_list = get_file_paths('output', file_pattern='.txt')
dfs = [pd.read_csv(file, sep='\t') for file in file_list]
dfs = [df.drop_duplicates(subset=["description"]) for df in dfs]
df = pd.concat(dfs, ignore_index=True)
len(df)

14347

In [41]:
df = df.drop_duplicates(subset=["description"])
len(df)

6113

In [42]:
df['pubDate'] = pd.to_datetime(df['pubDate'])
df['pubDate'] = df['pubDate'].dt.strftime('%Y-%m-%d')


In [43]:
df

Unnamed: 0,link,guid,type,id,sponsored,title,description,pubDate
0,https://www.cnbc.com/2025/07/30/microsoft-mark...,108179389,cnbcnewsstory,108179389,False,Microsoft tops $4 trillion in market cap after...,"Based on its post-market trading, Microsoft ha...",2025-07-30
1,https://www.cnbc.com/2025/07/30/metas-reality-...,108178021,cnbcnewsstory,108178021,False,Meta’s Reality Labs posts $4.53 billion loss i...,Meta’s Reality Labs posts $4.53 billion loss i...,2025-07-30
2,https://www.cnbc.com/2025/07/25/india-under-pr...,108171329,cnbcnewsstory,108171329,False,Trump has slapped steep tariffs on India. Here...,U.S. President Donald Trump on Wednesday annou...,2025-07-31
3,https://www.cnbc.com/2025/07/31/trumps-aug-1-t...,108178228,cnbcnewsstory,108178228,False,Trump’s tariff deadline is near. Here’s a look...,The U.S. has managed to make only eight deals ...,2025-07-31
4,https://www.cnbc.com/2025/07/30/fed-leaves-int...,108179045,cnbcnewsstory,108179045,False,"Divided Fed holds key interest rate steady, de...",A divided Federal Reserve on Wednesday voted t...,2025-07-30
...,...,...,...,...,...,...,...,...
14282,https://www.cnbc.com/2026/01/23/india-live-ent...,108249766,cnbcnewsstory,108249766,False,India’s youth are turning concerts into an eco...,India saw a 17% rise in the live events space ...,2026-01-23
14283,https://www.cnbc.com/2026/01/23/japan-december...,108255438,cnbcnewsstory,108255438,False,"Japan inflation cools to 2.1%, lowest since Ma...",Core inflation touched its lowest level since ...,2026-01-23
14284,https://www.cnbc.com/2026/01/22/musk-says-tesl...,108255973,cnbcnewsstory,108255973,False,Elon Musk says Tesla taking safety supervisors...,Tesla CEO Elon Musk said on X that his EV comp...,2026-01-23
14285,https://www.cnbc.com/2026/01/22/intel-intc-ear...,108255813,cnbcnewsstory,108255813,False,"Intel stock plunges 13% on soft guidance, conc...",Intel reported fourth-quarter earnings Thursda...,2026-01-22


In [44]:
dfs = [g.copy() for _, g in df.groupby('pubDate')]
len(dfs)

186

In [45]:
dfs = [ df.assign(description=df['title'] + ': ' + df['description']) for df in dfs ]
dfs = [ df.assign(description=df['pubDate'] + ': ' + df['description']) for df in dfs ]
docs = [df_to_docs(df, content_column='description', metadata_columns=['link', 'guid', 'type', 'id', 'sponsored', 'pubDate']) for df in dfs]
len(docs)


186

In [22]:
# docs = [doc[:5] for doc in docs]
# len(docs)

In [46]:
joined_contents = [".".join(doc.page_content for doc in sublist) for sublist in docs]
# len(joined_contents)
joined_contents = joined_contents[:5]

In [26]:
def extract_info_from_rss(all_text):
    # all_text = ". ".join(doc.page_content for doc in docs)

    # Define a rich persona in the system message with added expertise
    system_message = SystemMessagePromptTemplate.from_template(
        """You are Ava, a sharp and insightful trader assistant of a hedge fund.
    You provide clear, concise, and actionable insights based on news feeds focusing on sectors rotation.
    In addition, for each sector provide outlook, primary catalyst, and trading insights.
    In addition, Describe more pro-cyclical and geopolitical shift if any.
    In addition, list bullish and bearish sectors/companies to look at into 2026.
    In addition, provide TACO (Trump Always Chickens Out) trade if any.
    Maintain a friendly, confident, and professional tone, making complex concepts accessible and useful."""
    )

    # Define the human message template with instructions and JSON schema
    human_message = HumanMessagePromptTemplate.from_template(
        """Extract the information required from the news feed below.


    News feed:
    {news_feed}

    """
    )
    # Create the chat prompt template
    chat_prompt = ChatPromptTemplate.from_messages([system_message, human_message])


    llm = ChatOpenAI(temperature=0)
    chain = chat_prompt | llm | StrOutputParser()

    result = chain.invoke({"news_feed": all_text})
    return(result)

In [27]:
infos = [extract_info_from_rss(doc) for doc in joined_contents]
[rich.print(info) for info in infos]

[None, None]

In [28]:
def extract_info_from_rss(all_text):

    # Define the Pydantic model for structured output
    class Sector(BaseModel):
        Date: str = Field(..., description='The publication date of the news')
        Name: Literal[
            "Commercial Services",
            "Communications",
            "Consumer Durables",
            "Consumer Non-Durables",
            "Consumer Services",
            "Distribution Services",
            "Electronic Technology",
            "Energy Minerals",
            "Finance",
            "Health Services",
            "Health Technology",
            "Industrial Services",
            "Non-Energy Minerals",
            "Process Industries",
            "Producer Manufacturing",
            "Retail Trade",
            "Technology Services",
            "Transportation",
            "Utilities"
            ] = Field(..., description="Name of the sector")
        Outlook: str = Field(..., description='A sector outlook describes the expected future performance and conditions based on data, trends, and risks.')
        Catalyst: str = Field(..., description="A primary catalyst of a sector is the single most influential force that is expected to drive major change—positive or negative—across an entire industry.")
        Trading_insights: str = Field(..., description="Practical, actionable interpretation of the sector’s expected future conditions—something that can guide investment or trading decisions.")
        Direction_momentum: Literal["Strength", "Weakness", "Volatility", "Rotation in favor", "Rotation out of favor"] = Field(..., description="The direction of momentum of a sector is the prevailing trend in how that sector’s prices, performance, and investor sentiment are moving over a given period.")
        Sector_vs_market_position: Literal['Leading', 'Lagging', 'In Line'] = Field(..., description="Relative positioning of a sector to the overall market is the sector’s performance and strength compared to the broader market, showing whether it is leading, lagging, or moving in line with the market trend.")
    
    
    # Create the base JSON output parser
    base_parser = JsonOutputParser(pydantic_object=Sector)

    # Create the output-fixing parser wrapping the base parser and using an LLM to fix errors
    llm_for_fixing = ChatOpenAI(temperature=0)
    fixing_parser = OutputFixingParser.from_llm(parser=base_parser, llm=llm_for_fixing)

    # Define a rich persona in the system message with added expertise
    system_message = SystemMessagePromptTemplate.from_template(
        """You are Ava, a sharp and insightful trader assistant skilled is deriving sector trading insights from news."""
    )

    # Define the human message template with instructions and JSON schema
    human_message = HumanMessagePromptTemplate.from_template(
        """Extract information required.


    News feed:
    {news_feed}

    """
    )

    # Create the chat prompt template
    chat_prompt = ChatPromptTemplate.from_messages([system_message, human_message])

    # Compose the chain: prompt -> LLM -> fixing parser
    llm = ChatOpenAI(temperature=0)
    chain = chat_prompt | llm | fixing_parser
    # all_text = ". ".join(doc.page_content for doc in docs)
    result = chain.invoke({"news_feed": all_text})
    return result

# extract_info_from_rss(joined_contents[0])



In [47]:
joined_contents

['2025-07-21: Chinese companies are waging fierce price wars — but the consumer isn’t always the winner: Though consumers may be lured by ultra-cheap deals, the trade-offs for them are more complicated than they might seem.',
 '2025-07-22: Japan could be in a weaker position in trade talks after Ishiba\'s upper house defeat, experts say:  \u2060\\Trump will show no mercy to [either] a lame duck Ishiba or because of a political vacuum,\\" advisory firm Quantum Strategy said. ".2025-07-22: How China’s $7 billion micro drama industry is taking on the U.S. entertainment industry: China\'s micro drama market surpassed the country\'s box-office revenue for the first time in 2024..2025-07-22: China\'s affluent are as pessimistic about the economy as they were during the Covid-19 pandemic: A study of affluent Chinese by Oliver Wyman found 22% of respondents were negative about the economy when surveyed in May, just above the 21% in October 2022..2025-07-22: The race to roll out solid-state bat

In [48]:
[rich.print(doc) for doc in joined_contents]

[None, None, None, None, None]

In [49]:
infos = [extract_info_from_rss(doc) for doc in joined_contents]
[rich.print(info) for info in infos]

[None, None, None, None, None]

In [58]:
def extract_info_from_rss(all_text):



    # Keep your existing Sector model unchanged
    class Sector(BaseModel):
        Date: str = Field(..., description='The publication date of the news')
        Name: Literal[
            "Commercial Services",
            "Communications",
            "Consumer Durables",
            "Consumer Non-Durables",
            "Consumer Services",
            "Distribution Services",
            "Electronic Technology",
            "Energy Minerals",
            "Finance",
            "Health Services",
            "Health Technology",
            "Industrial Services",
            "Non-Energy Minerals",
            "Process Industries",
            "Producer Manufacturing",
            "Retail Trade",
            "Technology Services",
            "Transportation",
            "Utilities"
            ] = Field(..., description="Name of the sector")
        Outlook: str = Field(..., description='A sector outlook describes the expected future performance and conditions based on data, trends, and risks.')
        Catalyst: str = Field(..., description="A primary catalyst of a sector is the single most influential force that is expected to drive major change—positive or negative—across an entire industry.")
        Trading_insights: str = Field(..., description="Practical, actionable interpretation of the sector's expected future conditions—something that can guide investment or trading decisions.")
        Direction_momentum: Literal["Strength", "Weakness", "Volatility", "Rotation in favor", "Rotation out of favor"] = Field(..., description="The direction of momentum of a sector is the prevailing trend in how that sector's prices, performance, and investor sentiment are moving over a given period.")
        Sector_vs_market_position: Literal['Leading', 'Lagging', 'In Line'] = Field(..., description="Relative positioning of a sector to the overall market is the sector's performance and strength compared to the broader market, showing whether it is leading, lagging, or moving in line with the market trend.")

    # New wrapper model for multiple sectors
    class MultiSectorAnalysis(BaseModel):
        """Analysis of multiple sectors from news feed."""
        sectors: List[Sector] = Field(
            ...,
            description="List of ALL relevant sectors found in the news (1-8 max). Only include sectors with clear evidence.",
            default_factory=list
        )
    
    # 1. Create messages WITHOUT partial first
    system_message = SystemMessagePromptTemplate.from_template(
        """You are Ava, a sharp trader assistant. Analyze the FULL news feed and extract **ALL relevant sectors** mentioned.

    CRITICAL RULES:
    - Output ONLY valid JSON matching this schema: {schema}
    - If NO sector: use empty [] 
    - Each sector must use a Name from the enum
    - Fill ALL required fields for each sector
    - No duplicate sectors
    - No extra text outside JSON"""
    )

    human_message = HumanMessagePromptTemplate.from_template(
        """News feed:
    {news_feed}"""
    )

    # 2. Create ChatPromptTemplate
    chat_prompt = ChatPromptTemplate.from_messages([system_message, human_message])

    # 3. NOW partial with schema (after creation)
    base_parser = JsonOutputParser(pydantic_object=MultiSectorAnalysis)
    chat_prompt = chat_prompt.partial(schema=base_parser.get_format_instructions())

    # 4. Build chain
    llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
    chain = chat_prompt | llm.with_structured_output(MultiSectorAnalysis)

    # Invoke
    result = chain.invoke({"news_feed": all_text})
    # results = [chain.invoke({"news_feed": doc}) for doc in docs[:5]]
    return result



In [59]:
infos = [extract_info_from_rss(doc) for doc in joined_contents]
[rich.print(info) for info in infos]

[None, None, None, None, None]

In [54]:
def extract_info_from_rss(all_text):



    class TradingEntity(BaseModel):
        """Single trading signal."""
        Date: str = Field(..., description='The publication date of the news')
        name: str = Field(..., description="Sector or company name (exact match from news)")
        note: str = Field("", description="Optional caution/note like '(with caution)'")

    class TradingSignals(BaseModel):
        """Bullish vs Bearish signals from news."""
        bullish: List[TradingEntity] = Field(
            default_factory=list,
            description="Bullish sectors/companies with growth momentum"
        )
        bearish: List[TradingEntity] = Field(
            default_factory=list, 
            description="Bearish sectors/companies with risks/weakness"
        )

    # Prompt - NO {schema} variable needed!
    system_message = SystemMessagePromptTemplate.from_template(
        """You are Ava, elite trader scanning news for actionable signals.

    From the news, extract **BULLISH** (growth/strength) vs **BEARISH** (risks/weakness) sectors/companies.

    Output as JSON array of exact names from news. Use "note" for cautions.

    If NO bullish signals: use empty [] for bullish
    If NO bearish signals: use empty [] for bearish 

    Examples:
    - Bullish: "Private credit sector", note="with caution"
    - Bearish: "European carmakers"

    Rules:
    - 3-8 per list maximum
    - Only signals with clear news evidence
    - Companies + sectors both OK"""
    )

    human_message = HumanMessagePromptTemplate.from_template(
        """News feed:
    {news_feed}"""
    )

    # Create prompt (variables: only {news_feed})
    chat_prompt = ChatPromptTemplate.from_messages([system_message, human_message])

    # with_structured_output AUTOMATICALLY injects schema!
    llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
    chain = chat_prompt | llm.with_structured_output(TradingSignals)

    # Run
    # all_text = ". ".join(doc.page_content for doc in docs)
    result = chain.invoke({"news_feed": all_text})

    return result



In [55]:
[rich.print(extract_info_from_rss(cont)) for cont in joined_contents]

[None, None, None, None, None]

In [56]:
def extract_info_from_rss(all_text):


    class TradingEntity(BaseModel):
        """Detailed trading signal with conviction & catalysts."""
        Date: str = Field(..., description='The publication date of the news')
        name: str = Field(..., description="Exact sector/company name from news")
        
        # Trading conviction (strength of signal)
        conviction: Literal["High", "Medium", "Low"] = Field(
            ..., 
            description="Signal strength: High=strong evidence, Low=emerging/weak"
        )
        
        # Key catalyst driving the signal
        catalyst: str = Field(
            ..., 
            description="Primary news catalyst (1 sentence max)"
        )
        
        # Optional caution/risk
        note: Optional[str] = Field(
            None, 
            description="Risks/cautions like '(regulatory risk)' or '(overbought)'"
        )
        
        # Price action suggestion
        action: Literal["Buy", "Sell", "Watch", "Avoid"] = Field(
            ..., 
            description="Clear trading action"
        )

    class TradingSignals(BaseModel):
        """Detailed bullish/bearish signals."""
        bullish: List[TradingEntity] = Field(min_items=1, max_items=10)
        bearish: List[TradingEntity] = Field(min_items=1, max_items=10)

    # Prompt - NO {schema} variable needed!
    system_message = SystemMessagePromptTemplate.from_template(
        """Ava: Elite trader extracting HIGH-CONVICTION signals from news.

    For each signal provide:
    - name: Exact sector/company from news
    - conviction: High/Medium/Low (evidence strength)  
    - catalyst: 1-sentence key driver
    - action: Buy/Sell/Watch/Avoid
    - note: Risks/cautions (optional)

    Examples:
    - High conviction Buy: "Private credit", catalyst="3.4T→4.9T growth", note="regulatory risk"

    Only clear signals."""
    )

    human_message = HumanMessagePromptTemplate.from_template(
        """News feed:
    {news_feed}"""
    )

    # Create prompt (variables: only {news_feed})
    chat_prompt = ChatPromptTemplate.from_messages([system_message, human_message])

    # with_structured_output AUTOMATICALLY injects schema!
    llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
    chain = chat_prompt | llm.with_structured_output(TradingSignals)

    # Run
    # all_text = ". ".join(doc.page_content for doc in docs)
    result = chain.invoke({"news_feed": all_text})

    return result



In [57]:
[rich.print(extract_info_from_rss(cont)) for cont in joined_contents]

ValidationError: 1 validation error for TradingSignals
bullish
  List should have at least 1 item after validation, not 0 [type=too_short, input_value=[], input_type=list]
    For further information visit https://errors.pydantic.dev/2.9/v/too_short