In [1]:
# !uv add langchain
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from langchain_core.prompts import PromptTemplate
from typing import List, Dict

In [3]:
# Usage
news = """Apple Inc. reported quarterly earnings that beat Wall Street expectations, 
with revenue up 8% year-over-year driven by strong iPhone sales in emerging markets."""

In [4]:
class FinBERTSentimentAnalyzer:
    def __init__(self):
        """initialize FinBERT model"""
        model_name = "ProsusAI/finbert"
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name)
        self.labels = ["positive", "negative", "neutral"]
    
    def analyze(self, news_text: str) -> Dict[str, any]:
        """Analyze sentiment with probabilities"""
        inputs = self.tokenizer(news_text, return_tensors="pt",
                                truncation=True, max_length=512)
        
        with torch.no_grad():
            outputs = self.model(**inputs)
            predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
        
        scores = predictions[0].tolist()
        sentiment_dict = {label: score for label, score in zip(self.labels, scores)}
        
        # Get primary sentiment
        primary_sentiment = self.labels[scores.index(max(scores))]
        confidence = max(scores)
        
        return {
            "sentiment": primary_sentiment,
            "confidence": confidence,
            "scores": sentiment_dict,
            "text": news_text
        }
    
    def batch_analyze(self, news_list: List[str]) -> List[Dict]:
        """Analyze multiple news items"""
        return [self.analyze(news) for news in news_list]


In [None]:

# Usage
news = """Apple Inc. reported quarterly earnings that beat Wall Street expectations, 
with revenue up 8% year-over-year driven by strong iPhone sales in emerging markets."""

analyzer = FinBERTSentimentAnalyzer()
result = analyzer.analyze(news)
print(f"Sentiment: {result['sentiment']}")
print(f"Confidence: {result['confidence']:.2%}")
print(f"All scores: {result['scores']}")

In [None]:
# from langchain_core.chains import LLMChain
from langchain_ollama import OllamaLLM
from langchain_core.prompts import PromptTemplate

class HybridFinancialAnalyzer:
    def __init__(self):
        self.finbert = FinBERTSentimentAnalyzer()
        self.llm = OllamaLLM(model="minimax-m2:cloud", temperature=0.1)
        
        self.explanation_prompt = PromptTemplate(
            input_variables=["news_text", "sentiment", "confidence"],
            template="""The following financial news has been classified as {sentiment} with {confidence:.1%} confidence:

News: {news_text}

Provide a detailed analysis explaining:
1. Why this sentiment classification makes sense
2. Key financial indicators or events mentioned
3. Potential market implications
4. Any risks or uncertainties

Analysis:"""
        )
        
        # self.chain = LLMChain(llm=self.llm, prompt=self.explanation_prompt)
    
    def analyze(self, news_text: str) -> Dict:
        # Get quick sentiment from FinBERT
        print("Analyzing sentiment with FinBERT...")
        finbert_result = self.finbert.analyze(news_text)

        # Create the chain using the | operator (modern LangChain syntax)
        print("Generating detailed analysis with LLM...")
        chain = self.explanation_prompt | self.llm
        # Get detailed explanation from LLM
        explanation = chain.invoke({
            "news_text": news_text,
            "sentiment": finbert_result['sentiment'],
            "confidence": finbert_result['confidence']
        })
        print("Analysis complete!")
        return {
            **finbert_result,
            "detailed_analysis": explanation
        }
# print(result)

In [14]:
# Usage
hybrid_analyzer = HybridFinancialAnalyzer()
result = hybrid_analyzer.analyze(news)
print(f"Sentiment: {result['sentiment']}")
print(f"Confidence: {result['confidence']:.2%}")
print(f"All scores: {result['scores']}")
print(result['detailed_analysis'])

Analyzing sentiment with FinBERT...
Generating detailed analysis with LLM...
Analysis complete!
Sentiment: positive
Confidence: 95.90%
All scores: {'positive': 0.9590075612068176, 'negative': 0.02066086418926716, 'neutral': 0.020331567153334618}
## Analysis of Apple Earnings News Classification

### 1. Why This Sentiment Classification Makes Sense

The 95.9% confidence positive classification is justified for several compelling reasons:

**Fundamental Beat**: Apple "beat Wall Street expectations" represents a fundamental positive surprise for investors, as beating earnings estimates typically drives stock prices higher and signals company management is outperforming market consensus.

**Growth Confirmation**: The 8% year-over-year revenue increase demonstrates continuous business growth, which is particularly important for a mature, large-cap company like Apple where investors often worry about growth deceleration.

**Market Expansion**: Strong iPhone sales in "emerging markets" (likel

In [16]:

# Complete workflow
news_retriever = NewsRetriever()
analyzer = HybridFinancialAnalyzer()

# Get news and analyze
news_items = news_retriever.get_financial_news()
for item in news_items:
    text = f"{item['title']}. {item['summary']}"
    sentiment = analyzer.analyze(text)
    print(f"\nNews: {item['title']}")
    print(f"Sentiment: {sentiment['sentiment']} ({sentiment['confidence']:.1%})")
    llm_result = analyzer.analyze(text)
    print(llm_result['detailed_analysis'])

Analyzing sentiment with FinBERT...
Generating detailed analysis with LLM...
Analysis complete!

News: Shutdown means another missed jobs report Friday. Here's what it probably would have shown
Sentiment: negative (96.1%)
Analyzing sentiment with FinBERT...
Generating detailed analysis with LLM...
Analysis complete!
**Analysis of Negative Sentiment Classification (96.1% Confidence)**

## 1. Why This Sentiment Classification Makes Sense

The 96.1% negative confidence classification is highly justified because this news represents a significant institutional failure with multiple negative implications:

- **Governance Disruption**: A "record-long government shutdown" signals systemic dysfunction in federal operations
- **Information Asymmetry**: The absence of official jobs data creates a critical information void in financial markets
- **Economic Monitoring Failure**: Jobs reports are among the most closely watched economic indicators; their absence hampers market participants' ability 

In [1]:
from newsdataapi import NewsDataApiClient
api = NewsDataApiClient(apikey='pub_965468a202be412d80928d294b632639')

In [2]:
all_articles = []
max_articles = 500
# for i in range(100):
response = api.news_api(q='stock market India economy finance',
                        category='business',
                        country='in',
                        language='en',
                        max_result=1)

  response = api.news_api(q='stock market India economy finance',


In [4]:
response.get('results',[])

[{'article_id': '3ff8df26b83c247be5d18a6e75f45b7b',
  'link': 'https://www.thehindubusinessline.com/portfolio/big-story/nifty-50-sensex-mr-perma-bull-takes-some-market-lessons-from-mr-doom/article70257284.ece',
  'title': 'Nifty 50, Sensex: Mr Perma Bull takes some market lessons from Mr Doom',
  'description': 'What is holding back markets? Two fictitious characters — Perma Bull, a long-term investor in Indian markets and a die-hard optimist, and Doom, a fundamental analyst — discuss the stock market performance',
  'content': 'ONLY AVAILABLE IN PAID PLANS',
  'keywords': ['big story'],
  'creator': ['Hari Viswanath'],
  'language': 'english',
  'country': ['india'],
  'category': ['business'],
  'pubDate': '2025-11-08 17:38:22',
  'pubDateTZ': 'UTC',
  'image_url': 'https://bl-i.thgim.com/public/incoming/vi1xwv/article70256968.ece/alternates/LANDSCAPE_1200/PO09_bull%20n%20bear.jpg',
  'video_url': None,
  'source_id': 'thehindubusinessline',
  'source_name': 'The Hindu - Business Lin

### Using Mediastack

In [6]:
import requests
api_key = 'ad0657133e24010278b9e52debe35356'
base_url = 'https://api.mediastack.com/v1/news'

params = {
    'access_key': api_key,
    'countries': 'in',
    'categories': 'business',
    'keywords': 'market stock rupee economy',
    'limit': 100
}
response = requests.get(base_url, params=params)

In [8]:
response.json().get('data', [])

[]

In [1]:
import http.client, urllib.parse

conn = http.client.HTTPConnection('api.mediastack.com')

params = urllib.parse.urlencode({
    'access_key': 'ad0657133e24010278b9e52debe35356',
    'categories': 'business',
    'countries': 'in',
    'keywords': 'market stock rupee economy',
    'limit': 10,
    })

conn.request('GET', '/v1/news?{}'.format(params))

res = conn.getresponse()
data = res.read()

print(data.decode('utf-8'))

{"pagination":{"limit":10,"offset":0,"count":0,"total":0},"data":[]}


In [3]:
import requests

# Use HTTP (not HTTPS) for free tier
url = 'http://api.mediastack.com/v1/news'

params = {
    'access_key': 'ad0657133e24010278b9e52debe35356',
    'countries': 'in',  # India
    'languages': 'en',
    'categories': 'business',  # Just business, don't use negatives
    'sort': 'published_desc',
    'limit': 100
}

response = requests.get(url, params=params)
data = response.json()

In [13]:
data['data'][8]

{'author': None,
 'title': 'Lenskart IPO Allotment LIVE: GMP falls; Know step-by-step guide to check share allotment status online - livemint.com',
 'description': "Lenskart IPO Allotment LIVE: GMP falls; Know step-by-step guide to check share allotment status online&nbsp;&nbsp;livemint.comLenskart IPO GMP falls sharply before listing. Is it heading for a weak debut?&nbsp;&nbsp;India TodaySoha Ali Khan simplifies stock market buzz around Lenskart’s Rs 7278 crore IPO, 'If you’re hoping to get&nbsp;&nbsp;Times of IndiaLIVE: Lenskart IPO allotment to be finalised, GMP up 10% – Step-by-step guide to check status online&nbsp;&nbsp;financialexpress.comLenskart IPO Allotment Live Updates: How to check status online on MUFG Intime India, BSE, NSE&nbsp;&nbsp;...",
 'url': 'https://news.google.com/rss/articles/CBMihwJBVV95cUxQZF96Z0ZzWWtzMGs4SG1DWXBtWG1rRTUxZHh2bFdhTlhDRVBDNXVLTzRaXzNUMjJUZXFlNEpHaExCSTJXZ3Q0MUZLMXNlTzRpRHdMNW9aRWJ0TnM3Mi1uSjRlcXZseDZEbktMMTFRZUNMOWllMm1xTmk2U1lJamVUNU5pLUM0ZFM3

In [4]:
# Check if successful
if 'data' in data:
    articles = data['data']
    print(f"Found {len(articles)} articles\n")
    
    for i, article in enumerate(articles[:5], 1):
        print(f"{i}. {article['title']}")
        print(f"   Source: {article['source']}")
        print(f"   Published: {article['published_at']}\n")
else:
    print("Error:", data)

Found 100 articles

1. Should a few public sector banks be merged with State Bank of India (SBI) to create a big, world-class bank? - BusinessLine
   Source: Google News Business IN
   Published: 2025-11-07T20:56:05+00:00

2. Stock Market LIVE Updates: Sensex, Nifty trade flat as metals pack outshines - Moneycontrol
   Source: Google News Business IN
   Published: 2025-11-07T09:08:18+00:00

3. Stock Market LIVE: Sensex drops 510 pts; Talking to stakeholders on MF charges, says Sebi chairman - Business Standard
   Source: Google News Business IN
   Published: 2025-11-07T05:21:00+00:00

4. Stock Market LIVE: Sensex drops 560 pts; Talking to stakeholders on MF charges, says Sebi chairman - Business Standard
   Source: Google News Business IN
   Published: 2025-11-07T05:11:51+00:00

5. SBI, French partner Amundi to divest 10% in SBI Funds via IPO
   Source: The Economic Times
   Published: 2025-11-07T01:30:51+00:00



### Newsdata.io and newspaper3k

In [3]:
from newspaper import Article
from newsdataapi import NewsDataApiClient
api = NewsDataApiClient(apikey='pub_965468a202be412d80928d294b632639')
# step 1: get articles from API
response = api.latest_api(
    q='stock market India economy finance',
    country='in',
    category='business',
    max_result=10
)
articles = response.get('results',[])

In [20]:

full_articles = []

# step 2: scrape full content from each URL
for idx, article in enumerate(articles, 1):
    try:
        print(f"Processing {idx}/{len(articles)}: {article['title'][:50]}...")

        # get full content
        news_article = Article(article['link'])
        news_article.download()
        news_article.parse()
        full_articles.append({
            'title': article['title'],
            'description': article['description'],
            'source': article['source_name'],
            'url': article['link'],
            'pubDate': article['pubDate'],
            'category': article['category'],
            'full_content': news_article.text,
            'authors': ', '.join(news_article.authors),
            'image_url': article['image_url']
        })
    except Exception as e:
        print(f"Error scraping {article['link']}: {e}")

Processing 1/6: Nifty 50, Sensex: Mr Perma Bull takes some market ...
Processing 2/6: Business cycle funds: Hit or miss?...
Processing 3/6: Business News | Bajaj Finserv Banking and Financia...
Processing 4/6: Bajaj Finserv Banking and Financial Services Fund ...
Processing 5/6: Editorial: Big banks, bigger questions...
Processing 6/6: Bajaj Finserv Banking and Financial Services Fund ...


In [30]:
import pandas as pd
import os
import urllib.parse

from newspaper import Article
from newsdataapi import NewsDataApiClient

In [77]:
def newsdata_connect():
    try:
        api = NewsDataApiClient(apikey='pub_965468a202be412d80928d294b632639')
        print('connected to API')
        return api
    except Exception as e:
        print("Not connected to API")
        return None
    

def extract_news(query, limit:int=5, country:str='in'):
    api = newsdata_connect()
    # step 1: get articles from API
    # if query==None:
    #     query = 'stock market India economy finance NSE BSE'
    # # query = query.strip()
    # print(f"Using query: '{query}'")
    # print(type(query))

    try:
        response = api.latest_api(
            q=query,
            country=country,
            category='top',
            size=limit,
            language='en'
        )
        articles = response.get('results',[])
        print('reached here')
        print(articles)
    except Exception as e:
        print("No articles returned")
        return []

    # step 2: scrape full content from each URL
    full_articles = []
    for idx, article in enumerate(articles, 1):
        print(idx)
        try:
            print(f"Processing {idx}/{len(articles)}: {article['title'][:50]}...")
            url = article.get('link') # article['link']
            if not url:
                print("Skipping: No url")
                continue
            # get full content
            news_article = Article(url)
            news_article.download()
            news_article.parse()
            full_articles.append({
                'article_id': article['article_id'],
                'title': article['title'],
                'description': article['description'],
                'source': article['source_name'],
                'url': article['link'],
                'pubDate': article['pubDate'],
                'category': article['category'],
                'full_content': news_article.text,
                'authors': ', '.join(news_article.authors),
                'image_url': article['image_url']
            })
        except Exception as e:
            print(f"Error scraping {article['link']}: {e}")
    return full_articles

def save_newsdata(full_articles, data_dir):
    os.makedirs(data_dir, exist_ok=True)
    filepath = os.path.join(data_dir, 'news_articles.parquet')
    if os.path.exists(filepath):
        os.remove(filepath)
    full_articles.to_parquet(filepath, compression='brotli')

In [78]:
# List of multiple phrases
queries = ["Hospital", "Nifty 50", "Tata Motors"]

# Combine queries into a single OR-separated string with quotes
combined_query = " OR ".join([f'"{q}"' for q in queries])
# query = query.strip()
full_articles = extract_news(query=combined_query, limit=5)


connected to API
reached here
[{'article_id': '42c00821355d99693a3cb31103c106db', 'link': 'https://timesofindia.indiatimes.com/city/bhubaneswar/bmc-plans-commercial-complex-convention-centre-in-chandrasekharpur/articleshow/125257493.cms', 'title': 'BMC plans commercial complex & convention centre in Chandrasekharpur', 'description': None, 'content': 'ONLY AVAILABLE IN PAID PLANS', 'keywords': ['chandrasekharpur convention centre', 'commercial complex in bhubaneswar', 'urban infrastructure development', 'convention centre project', 'bhubaneswar municipal corporation'], 'creator': ['Sandip Mishra'], 'language': 'english', 'country': ['india'], 'category': ['business', 'top'], 'pubDate': '2025-11-11 23:36:53', 'pubDateTZ': 'UTC', 'image_url': 'https://static.toiimg.com/thumb/msid-125257491,width-1070,height-580,imgsize-62048,resizemode-75,overlay-toi_sw,pt-32,y_pad-40/photo.jpg', 'video_url': None, 'source_id': 'toi', 'source_name': 'The Times Of India', 'source_priority': 2178, 'source_u

In [13]:
from newspaper import Article

url = 'http://fox13now.com/2013/12/30/new-year-new-laws-obamacare-pot-guns-and-drones/'
article = Article(url)

In [14]:
article.download()

In [16]:
article.html
article.parse()

In [17]:
article.text

'By Leigh Ann Caldwell\n\nWASHINGTON (CNN) — Not everyone subscribes to a New Year’s resolution, but Americans will be required to follow new laws in 2014.\n\nSome 40,000 measures taking effect range from sweeping, national mandates under Obamacare to marijuana legalization in Colorado, drone prohibition in Illinois and transgender protections in California.\n\nAlthough many new laws are controversial, they made it through legislatures, public referendum or city councils and represent the shifting composition of American beliefs.\n\nFederal: Health care, of course, and vending machines\n\nThe biggest and most politically charged change comes at the federal level with the imposition of a new fee for those adults without health insurance.\n\nFor 2014, the penalty is either $95 per adult or 1% of family income, whichever results in a larger fine.\n\nThe Obamacare, of Affordable Care Act, mandate also requires that insurers cover immunizations and some preventive care.\n\nAdditionally, mil