In [2]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import time
from fake_useragent import UserAgent
from collections import deque

class News_crawler:
    def __init__(self, start_url, max_articles=20, max_depth=5, delay=1.5):
        self.start_url = start_url
        self.max_articles = max_articles
        self.max_depth = max_depth
        self.delay = delay
        self.visited_urls = set()
        self.articles = []
        self.ua = UserAgent()
        self.session = requests.Session()
        self.domain = urlparse(start_url).netloc
        self.queue = deque()

        #session headers
        self.session.headers.update({
            'User-Agent': self.ua.random,
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Referer': 'https://www.google.com/',
            'DNT': '1',
        })

    def is_valid_url(self, url):
        parsed = urlparse(url)
        return (parsed.netloc == self.domain and url not in self.visited_urls and not any(ext in url.lower() for ext in ['.pdf', '.jpg', '.png']))

    def is_article_page(self, url, soup):
        article = soup.find('article')
        headline = soup.find('h1')
        date_published = soup.find('time') or soup.find('meta', property='article:published_time')
        return (article is not None or headline is not None or date_published is not None or '/article/' in url or any(seg in url for seg in ['/news/', '/story/', '/post/']))

    def extract_article_content(self, soup):
        article = (soup.find('article') or soup.find('div', class_=lambda x: x and 'article' in x.lower()) or soup.find('div', id=lambda x: x and 'content' in x.lower()) or soup.find('main'))
        if not article:
            return None
        for element in article.find_all(['script', 'style', 'nav', 'footer', 'aside', 'figure', 'img']):
            element.decompose()
        return article.get_text(separator='\n', strip=True)

    def process_page(self, url, depth):
        try:
            time.sleep(self.delay)
            response = self.session.get(url)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            self.visited_urls.add(url)

            #check for article page
            if self.is_article_page(url, soup):
                content = self.extract_article_content(soup)
                if content:
                    title = soup.find('h1').get_text(strip=True) if soup.find('h1') else "No title"
                    date = (soup.find('time') or soup.find('meta', property='article:published_time') or
                           soup.find('span', class_=lambda x: x and 'date' in x.lower()))
                    date = date.get('datetime') if hasattr(date, 'get') else date.get_text(strip=True) if date else "Unknown"

                    self.articles.append({
                        'url': url,
                        'title': title,
                        'date': date,
                        'content': content[:5000] + "..." if len(content) > 5000 else content,
                        'depth': depth
                    })
                    print(f"Article found at depth {depth}: {title[:50]}...")

            #max depth reached?
            if depth < self.max_depth and len(self.articles) < self.max_articles:
                for link in soup.find_all('a', href=True):
                    absolute_url = urljoin(self.start_url, link['href'])
                    if self.is_valid_url(absolute_url):
                        self.queue.append((absolute_url, depth + 1))

        except Exception as e:
            print(f"Error processing {url}: {str(e)}")

    def crawl(self):
        print(f"\nStarting crawl on {self.start_url}(max depth: {self.max_depth})")
        self.queue.append((self.start_url, 0))
        while self.queue and len(self.articles) < self.max_articles:
            url, depth = self.queue.popleft()
            if url not in self.visited_urls:
                self.process_page(url, depth)

        print(f"\nCrawling complete. Found {len(self.articles)} articles.")

    def save_results(self, filename="news_articles.json"):
        import json
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(self.articles, f, ensure_ascii=False, indent=2)
        print(f"Results saved to {filename}")

    def print_summary(self):
        print("\nCollected Articles Summary:")
        for i, article in enumerate(self.articles, 1):
            print(f"\n{i}. {article['title']}")
            print(f"   Depth: {article['depth']} | Date: {article['date']}")
            print(f"   URL: {article['url']}")
            print(f"   Preview: {article['content'][:100]}...")

if __name__ == "__main__":
    sites={
        'AP News': 'https://apnews.com',
        'PBS NewsHour': 'https://www.pbs.org/newshour/',
        'BBC News': 'https://www.bbc.com/news',
        'NPR News': 'https://www.npr.org/sections/news/'}
    print("Available crawler friendly news sites:")
    for i, (name, url) in enumerate(sites.items(), 1):
        print(f"{i}. {name} ({url})")
    choice = int(input("\nSelect a site to crawl (1-4): ")) - 1
    selected_url = list(sites.values())[choice]
    crawler = News_crawler(
        start_url=selected_url,
        max_articles=100,
        max_depth=5,
        delay=1.5
    )
    crawler.crawl()
    crawler.print_summary()
    crawler.save_results()

Available crawler friendly news sites:
1. AP News (https://apnews.com)
2. PBS NewsHour (https://www.pbs.org/newshour/)
3. BBC News (https://www.bbc.com/news)
4. NPR News (https://www.npr.org/sections/news/)

Starting crawl on https://www.bbc.com/news(max depth: 5)
Article found at depth 0: NewsNews...
Article found at depth 1: NewsNews...
Article found at depth 1: US envoy rejects Hamas claim that it has agreed to...
Article found at depth 1: BBC Sport...
Article found at depth 1: Business...
Article found at depth 1: Innovation...
Article found at depth 1: Culture...
Article found at depth 1: Arts...
Article found at depth 1: Travel...
Article found at depth 1: Earth...
Article found at depth 1: Audio...
Article found at depth 1: Video...
Article found at depth 1: Live Now...
Article found at depth 1: NewsNews...
Article found at depth 1: NewsNews...
Article found at depth 1: NewsNews...
Article found at depth 1: NewsNews...
Article found at depth 1: NewsNews...
Article found at depth

In [4]:
import json
import numpy as np
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
from collections import defaultdict
from statistics import mean

def load_data(file_path):
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            return json.load(f)
    except Exception as e:
        print(f"Error loading {file_path}: {str(e)}")
        return None

# Fine-tuned models (replace with your own or pre-trained alternatives)
SECTOR_MODEL = "yiyanghkust/finbert-tone"  # Domain-specific BERT
SENTIMENT_MODEL = "ProsusAI/finbert"  # Financially-tuned sentiment

# Initialize models
tokenizer = AutoTokenizer.from_pretrained(SECTOR_MODEL)
sector_model = AutoModelForSequenceClassification.from_pretrained(SECTOR_MODEL)
sentiment_analyzer = pipeline("text-classification", model=SENTIMENT_MODEL)

# Define stable sectors (customize as needed)
STABLE_SECTORS = [
    "Technology", "Healthcare", "Energy",
    "Finance", "Environment", "Industrial"
]

# Classify sectors using BERT logits (stable predictions)
def classify_sector(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    outputs = sector_model(**inputs)
    logits = outputs.logits.detach().numpy()
    return STABLE_SECTORS[np.argmax(logits)]

# Analyze sentiment with FinBERT (economic focus)
def analyze_sentiment(text):
    result = sentiment_analyzer(text)[0]
    return 1 if result["label"] == "positive" else -1

# Process news with rolling averages (reduces noise)
def analyze_news_stable(news_items, window_size=5):
    sector_scores = defaultdict(list)

    for i, item in enumerate(news_items):
        sector = classify_sector(item["content"])
        sentiment = analyze_sentiment(item["content"])

        # Apply rolling window smoothing
        if len(sector_scores[sector]) >= window_size:
            sector_scores[sector].pop(0)
        sector_scores[sector].append(sentiment)

    # Calculate growth (smoothed)
    growth_estimates = {}
    for sector, scores in sector_scores.items():
        smoothed_score = mean(scores) * 2  # Scale to %
        growth_estimates[sector] = f"{smoothed_score:.1f}%"

    return growth_estimates

# Example usage
if __name__ == "__main__":
    news_data = load_data("news_articles.json")  # Your 100-item dataset
    growth_report = analyze_news_stable(news_data)

    print("Stable Sector Growth Projections:")
    for sector, growth in growth_report.items():
        print(f"{sector}: {growth}")

config.json:   0%|          | 0.00/533 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/439M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model.safetensors:   0%|          | 0.00/439M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Device set to use cpu
Token indices sequence length is longer than the specified maximum sequence length for this model (1058 > 512). Running this sequence through the model will result in indexing errors


RuntimeError: The size of tensor a (1058) must match the size of tensor b (512) at non-singleton dimension 1

In [6]:
import json
import numpy as np
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
from collections import defaultdict
from statistics import mean

# Load data with proper error handling
def load_data(file_path="news_articles.json"):
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            return json.load(f)
    except Exception as e:
        print(f"Error loading {file_path}: {str(e)}")
        return None

# Initialize models with error handling
try:
    SECTOR_MODEL = "yiyanghkust/finbert-tone"
    SENTIMENT_MODEL = "ProsusAI/finbert"

    tokenizer = AutoTokenizer.from_pretrained(SECTOR_MODEL)
    sector_model = AutoModelForSequenceClassification.from_pretrained(SECTOR_MODEL)
    sentiment_analyzer = pipeline("text-classification", model=SENTIMENT_MODEL,
                               tokenizer=SENTIMENT_MODEL, device=-1)
except Exception as e:
    print(f"Error loading models: {str(e)}")
    raise

STABLE_SECTORS = ["Technology", "Healthcare", "Energy", "Finance", "Environment", "Industrial"]

def classify_sector(text):
    try:
        inputs = tokenizer(text[:512], return_tensors="pt", truncation=True, max_length=512)
        outputs = sector_model(**inputs)
        return STABLE_SECTORS[np.argmax(outputs.logits.detach().numpy())]
    except Exception as e:
        print(f"Sector classification error: {str(e)}")
        return "Unknown"

def analyze_sentiment(text):
    try:
        truncated_text = " ".join(text.split()[:500])
        result = sentiment_analyzer(truncated_text)[0]
        return 1 if result["label"] == "positive" else -1
    except Exception as e:
        print(f"Sentiment analysis error: {str(e)}")
        return 0  # Neutral if error occurs

def analyze_news_stable(news_items, window_size=5):
    sector_scores = defaultdict(list)

    for item in news_items:
        if not isinstance(item, dict) or "content" not in item:
            continue

        sector = classify_sector(item["content"])
        sentiment = analyze_sentiment(item["content"])

        if len(sector_scores[sector]) >= window_size:
            sector_scores[sector].pop(0)
        sector_scores[sector].append(sentiment)

    growth_estimates = {}
    for sector, scores in sector_scores.items():
        if scores:  # Only calculate if we have data
            smoothed_score = mean(scores) * 2  # Scale to %
            growth_estimates[sector] = f"{smoothed_score:.1f}%"

    return growth_estimates

if __name__ == "__main__":
    news_data = load_data()
    if news_data:
        growth_report = analyze_news_stable(news_data)
        print("Stable Sector Growth Projections:")
        for sector, growth in growth_report.items():
            print(f"{sector}: {growth}")

Device set to use cpu
Token indices sequence length is longer than the specified maximum sequence length for this model (637 > 512). Running this sequence through the model will result in indexing errors


Sentiment analysis error: The size of tensor a (637) must match the size of tensor b (512) at non-singleton dimension 1
Sentiment analysis error: The size of tensor a (637) must match the size of tensor b (512) at non-singleton dimension 1
Sentiment analysis error: The size of tensor a (623) must match the size of tensor b (512) at non-singleton dimension 1
Sentiment analysis error: The size of tensor a (746) must match the size of tensor b (512) at non-singleton dimension 1
Sentiment analysis error: The size of tensor a (623) must match the size of tensor b (512) at non-singleton dimension 1
Sentiment analysis error: The size of tensor a (594) must match the size of tensor b (512) at non-singleton dimension 1
Sentiment analysis error: The size of tensor a (657) must match the size of tensor b (512) at non-singleton dimension 1
Sentiment analysis error: The size of tensor a (618) must match the size of tensor b (512) at non-singleton dimension 1
Sentiment analysis error: The size of te

In [10]:
import json
import re
import math
from typing import Dict, List, Tuple
from dataclasses import dataclass
from collections import defaultdict, Counter

@dataclass
class SectorImpact:
    sector: str
    impact_percentage: float
    confidence_score: float
    key_factors: List[str]
    impact_type: str  # positive, negative, neutral

class NewsSectorAnalyzer:
    def __init__(self):
        # Comprehensive sector definitions with keywords and impact indicators
        self.sector_definitions = {
            'E-commerce/Online Retail': {
                'primary_keywords': ['e-commerce', 'online shopping', 'retail platform', 'marketplace', 'digital retail'],
                'secondary_keywords': ['consumer', 'shopping', 'discount', 'sales', 'online store', 'website'],
                'impact_indicators': {
                    'regulatory': ['regulation', 'law', 'compliance', 'fine', 'penalty', 'breach'],
                    'market': ['competition', 'market share', 'growth', 'decline', 'expansion'],
                    'operational': ['supply chain', 'logistics', 'delivery', 'customer service']
                }
            },
            'Fashion/Apparel': {
                'primary_keywords': ['fashion', 'clothing', 'apparel', 'textile', 'garment', 'fast fashion'],
                'secondary_keywords': ['design', 'manufacturing', 'brand', 'style', 'trend'],
                'impact_indicators': {
                    'manufacturing': ['factory', 'production', 'labor', 'working conditions', 'supply chain'],
                    'sustainability': ['environmental', 'sustainable', 'waste', 'carbon footprint'],
                    'market': ['sales', 'demand', 'consumer preference', 'pricing']
                }
            },
            'Technology/Digital Platforms': {
                'primary_keywords': ['technology', 'digital platform', 'software', 'app', 'website', 'AI'],
                'secondary_keywords': ['innovation', 'digital transformation', 'automation', 'data'],
                'impact_indicators': {
                    'regulatory': ['data protection', 'privacy', 'algorithm', 'content moderation'],
                    'market': ['user adoption', 'platform growth', 'digital services'],
                    'security': ['cybersecurity', 'data breach', 'privacy violation']
                }
            },
            'Financial Services': {
                'primary_keywords': ['banking', 'finance', 'payment', 'fintech', 'investment', 'lending'],
                'secondary_keywords': ['transaction', 'credit', 'insurance', 'wealth management'],
                'impact_indicators': {
                    'regulatory': ['financial regulation', 'compliance', 'central bank', 'monetary policy'],
                    'market': ['interest rates', 'credit risk', 'market volatility'],
                    'technology': ['digital banking', 'blockchain', 'cryptocurrency']
                }
            },
            'Healthcare': {
                'primary_keywords': ['healthcare', 'medical', 'pharmaceutical', 'hospital', 'health'],
                'secondary_keywords': ['patient', 'treatment', 'drug', 'medicine', 'clinical'],
                'impact_indicators': {
                    'regulatory': ['FDA approval', 'medical regulation', 'health policy'],
                    'innovation': ['medical technology', 'research', 'clinical trials'],
                    'access': ['healthcare access', 'cost', 'insurance coverage']
                }
            },
            'Manufacturing': {
                'primary_keywords': ['manufacturing', 'production', 'factory', 'industrial', 'assembly'],
                'secondary_keywords': ['automation', 'quality control', 'supply chain', 'raw materials'],
                'impact_indicators': {
                    'operational': ['production capacity', 'efficiency', 'cost reduction'],
                    'supply_chain': ['supplier', 'logistics', 'inventory', 'raw materials'],
                    'labor': ['workforce', 'employment', 'skills', 'automation']
                }
            },
            'Transportation/Logistics': {
                'primary_keywords': ['transportation', 'logistics', 'shipping', 'delivery', 'freight'],
                'secondary_keywords': ['supply chain', 'distribution', 'warehouse', 'fleet'],
                'impact_indicators': {
                    'infrastructure': ['roads', 'ports', 'airports', 'railways'],
                    'regulation': ['transportation policy', 'safety regulations', 'emissions'],
                    'technology': ['autonomous vehicles', 'route optimization', 'tracking']
                }
            },
            'Energy': {
                'primary_keywords': ['energy', 'power', 'electricity', 'renewable', 'oil', 'gas'],
                'secondary_keywords': ['solar', 'wind', 'nuclear', 'fossil fuel', 'grid'],
                'impact_indicators': {
                    'policy': ['energy policy', 'carbon tax', 'emissions regulations'],
                    'market': ['energy prices', 'demand', 'supply', 'capacity'],
                    'technology': ['clean energy', 'energy storage', 'smart grid']
                }
            },
            'Agriculture/Food': {
                'primary_keywords': ['agriculture', 'farming', 'food', 'crop', 'livestock'],
                'secondary_keywords': ['harvest', 'produce', 'agricultural', 'rural', 'farmer'],
                'impact_indicators': {
                    'climate': ['weather', 'drought', 'climate change', 'seasonal'],
                    'policy': ['agricultural policy', 'subsidies', 'trade agreements'],
                    'technology': ['precision farming', 'biotechnology', 'sustainable farming']
                }
            },
            'Real Estate': {
                'primary_keywords': ['real estate', 'property', 'housing', 'construction', 'development'],
                'secondary_keywords': ['residential', 'commercial', 'mortgage', 'rent', 'building'],
                'impact_indicators': {
                    'market': ['property prices', 'housing demand', 'construction activity'],
                    'policy': ['zoning laws', 'building codes', 'tax policy'],
                    'finance': ['mortgage rates', 'property investment', 'real estate financing']
                }
            }
        }

        # Impact calculation weights
        self.impact_weights = {
            'direct_mention': 0.4,
            'keyword_relevance': 0.3,
            'context_impact': 0.2,
            'sentiment_impact': 0.1
        }

        # Initialize stop words for text processing
        self.stop_words = {
            'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with',
            'by', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had',
            'do', 'does', 'did', 'will', 'would', 'could', 'should', 'may', 'might', 'must',
            'can', 'this', 'that', 'these', 'those', 'i', 'you', 'he', 'she', 'it', 'we', 'they'
        }

    def preprocess_text(self, text: str) -> str:
        """Clean and preprocess text for analysis"""
        # Remove extra whitespaces and normalize
        text = re.sub(r'\s+', ' ', text.strip())
        # Convert to lowercase for consistent analysis
        return text.lower()

    def calculate_keyword_relevance(self, text: str, sector: str) -> float:
        """Calculate relevance score based on keyword matching"""
        text_lower = text.lower()
        sector_def = self.sector_definitions[sector]

        # Primary keywords have higher weight
        primary_score = sum(2 for keyword in sector_def['primary_keywords']
                          if keyword in text_lower)

        # Secondary keywords have lower weight
        secondary_score = sum(1 for keyword in sector_def['secondary_keywords']
                            if keyword in text_lower)

        # Impact indicators add context-specific relevance
        impact_score = 0
        for category, indicators in sector_def['impact_indicators'].items():
            impact_score += sum(1.5 for indicator in indicators if indicator in text_lower)

        total_score = primary_score + secondary_score + impact_score

        # Normalize score (max possible score estimation)
        max_possible = len(sector_def['primary_keywords']) * 2 + \
                      len(sector_def['secondary_keywords']) + \
                      sum(len(indicators) * 1.5 for indicators in sector_def['impact_indicators'].values())

        return min(total_score / max_possible, 1.0) if max_possible > 0 else 0

    def calculate_tf_idf_similarity(self, text1: str, text2: str) -> float:
        """Calculate TF-IDF similarity between two texts without sklearn"""

        def get_word_frequencies(text):
            words = re.findall(r'\b[a-zA-Z]+\b', text.lower())
            words = [word for word in words if word not in self.stop_words and len(word) > 2]
            return Counter(words)

        def calculate_tf(word_freq, total_words):
            return {word: freq/total_words for word, freq in word_freq.items()}

        def calculate_idf(word, all_docs):
            containing_docs = sum(1 for doc in all_docs if word in doc)
            if containing_docs == 0:
                return 0
            return math.log(len(all_docs) / containing_docs)

        # Get word frequencies
        freq1 = get_word_frequencies(text1)
        freq2 = get_word_frequencies(text2)

        if not freq1 or not freq2:
            return 0.0

        # Calculate TF
        tf1 = calculate_tf(freq1, sum(freq1.values()))
        tf2 = calculate_tf(freq2, sum(freq2.values()))

        # Get all unique words
        all_words = set(tf1.keys()) | set(tf2.keys())
        docs = [freq1, freq2]

        # Calculate TF-IDF vectors
        tfidf1 = {}
        tfidf2 = {}

        for word in all_words:
            idf = calculate_idf(word, docs)
            tfidf1[word] = tf1.get(word, 0) * idf
            tfidf2[word] = tf2.get(word, 0) * idf

        # Calculate cosine similarity
        dot_product = sum(tfidf1[word] * tfidf2[word] for word in all_words)

        norm1 = math.sqrt(sum(val**2 for val in tfidf1.values()))
        norm2 = math.sqrt(sum(val**2 for val in tfidf2.values()))

        if norm1 == 0 or norm2 == 0:
            return 0.0

        return dot_product / (norm1 * norm2)

    def calculate_semantic_similarity(self, text: str, sector: str) -> float:
        """Calculate semantic similarity using custom TF-IDF implementation"""
        sector_def = self.sector_definitions[sector]

        # Create sector reference text
        sector_text = ' '.join(
            sector_def['primary_keywords'] +
            sector_def['secondary_keywords'] +
            [indicator for indicators in sector_def['impact_indicators'].values()
             for indicator in indicators]
        )

        return self.calculate_tf_idf_similarity(text, sector_text)

    def analyze_impact_sentiment(self, text: str, sector: str) -> Tuple[str, float]:
        """Analyze whether the impact is positive, negative, or neutral"""
        positive_indicators = [
            'growth', 'increase', 'expansion', 'success', 'improvement', 'benefit',
            'opportunity', 'boost', 'rise', 'gain', 'advance', 'progress'
        ]

        negative_indicators = [
            'decline', 'decrease', 'loss', 'breach', 'violation', 'fine', 'penalty',
            'crisis', 'problem', 'issue', 'concern', 'risk', 'threat', 'challenge',
            'drop', 'fall', 'cut', 'reduce', 'harm', 'damage'
        ]

        text_lower = text.lower()

        positive_count = sum(1 for indicator in positive_indicators if indicator in text_lower)
        negative_count = sum(1 for indicator in negative_indicators if indicator in text_lower)

        if negative_count > positive_count:
            return 'negative', min(negative_count / (negative_count + positive_count + 1), 1.0)
        elif positive_count > negative_count:
            return 'positive', min(positive_count / (negative_count + positive_count + 1), 1.0)
        else:
            return 'neutral', 0.5

    def extract_key_factors(self, text: str, sector: str) -> List[str]:
        """Extract key factors that contribute to sector impact"""
        factors = []
        text_lower = text.lower()
        sector_def = self.sector_definitions[sector]

        # Check impact indicators
        for category, indicators in sector_def['impact_indicators'].items():
            found_indicators = [indicator for indicator in indicators if indicator in text_lower]
            if found_indicators:
                factors.extend(found_indicators)

        # Extract sentences containing sector-relevant information
        sentences = text.split('.')
        relevant_sentences = []

        for sentence in sentences[:5]:  # Limit to first 5 sentences for key factors
            sentence_lower = sentence.lower()
            if any(keyword in sentence_lower for keyword in
                   sector_def['primary_keywords'] + sector_def['secondary_keywords']):
                relevant_sentences.append(sentence.strip())

        return factors + relevant_sentences

    def calculate_confidence_score(self, keyword_relevance: float, semantic_similarity: float,
                                 direct_mentions: int) -> float:
        """Calculate confidence score for the sector impact assessment"""
        # Base confidence from relevance and similarity
        base_confidence = (keyword_relevance * 0.6 + semantic_similarity * 0.4)

        # Boost confidence with direct mentions
        mention_boost = min(direct_mentions * 0.1, 0.3)

        # Final confidence score
        confidence = min(base_confidence + mention_boost, 1.0)

        return confidence

    def analyze_single_article(self, article_data: Dict) -> List[SectorImpact]:
        """Analyze a single article and return sector impacts"""
        title = article_data.get('title', '')
        content = article_data.get('content', '')

        # Combine title and content for analysis (title has more weight)
        full_text = f"{title} {title} {content}"  # Title repeated for emphasis
        processed_text = self.preprocess_text(full_text)

        sector_impacts = []

        for sector in self.sector_definitions.keys():
            # Calculate various relevance scores
            keyword_relevance = self.calculate_keyword_relevance(processed_text, sector)
            semantic_similarity = self.calculate_semantic_similarity(processed_text, sector)

            # Count direct mentions of sector-related terms
            sector_terms = (self.sector_definitions[sector]['primary_keywords'] +
                          self.sector_definitions[sector]['secondary_keywords'])
            direct_mentions = sum(1 for term in sector_terms if term in processed_text)

            # Calculate overall impact percentage
            impact_percentage = (
                keyword_relevance * self.impact_weights['keyword_relevance'] +
                semantic_similarity * self.impact_weights['context_impact'] +
                min(direct_mentions * 0.1, 0.4) * self.impact_weights['direct_mention']
            ) * 100

            # Only include sectors with meaningful impact (threshold: 10%)
            if impact_percentage >= 10:
                # Analyze sentiment and extract factors
                impact_type, sentiment_strength = self.analyze_impact_sentiment(processed_text, sector)
                key_factors = self.extract_key_factors(content, sector)
                confidence = self.calculate_confidence_score(keyword_relevance, semantic_similarity, direct_mentions)

                # Adjust impact percentage based on sentiment strength
                adjusted_impact = impact_percentage * (0.7 + sentiment_strength * 0.3)

                sector_impacts.append(SectorImpact(
                    sector=sector,
                    impact_percentage=round(adjusted_impact, 2),
                    confidence_score=round(confidence, 3),
                    key_factors=key_factors[:5],  # Top 5 factors
                    impact_type=impact_type
                ))

        # Sort by impact percentage and return top impacts
        sector_impacts.sort(key=lambda x: x.impact_percentage, reverse=True)
        return sector_impacts[:5]  # Return top 5 impacted sectors

    def analyze_articles_batch(self, articles: List[Dict]) -> Dict:
        """Analyze multiple articles and provide comprehensive sector impact analysis"""
        all_impacts = []
        sector_aggregate = defaultdict(list)

        for i, article in enumerate(articles):
            print(f"Analyzing article {i+1}/{len(articles)}: {article.get('title', 'Untitled')[:50]}...")

            impacts = self.analyze_single_article(article)
            all_impacts.append({
                'article_title': article.get('title', 'Untitled'),
                'article_url': article.get('url', ''),
                'impacts': impacts
            })

            # Aggregate sector impacts
            for impact in impacts:
                sector_aggregate[impact.sector].append(impact.impact_percentage)

        # Calculate sector-wise statistics
        sector_summary = {}
        for sector, percentages in sector_aggregate.items():
            sector_summary[sector] = {
                'average_impact': round(sum(percentages) / len(percentages), 2),
                'max_impact': round(max(percentages), 2),
                'frequency': len(percentages),
                'total_articles': len(articles),
                'coverage_percentage': round((len(percentages) / len(articles)) * 100, 2)
            }

        return {
            'individual_analyses': all_impacts,
            'sector_summary': sector_summary,
            'total_articles_analyzed': len(articles)
        }

    def generate_report(self, analysis_results: Dict) -> str:
        """Generate a comprehensive analysis report"""
        report = []
        report.append("=" * 80)
        report.append("NEWS ARTICLE SECTOR IMPACT ANALYSIS REPORT")
        report.append("=" * 80)
        report.append(f"\nTotal Articles Analyzed: {analysis_results['total_articles_analyzed']}\n")

        # Sector Summary
        report.append("SECTOR IMPACT SUMMARY")
        report.append("-" * 40)

        # Sort sectors by average impact
        sorted_sectors = sorted(analysis_results['sector_summary'].items(),
                              key=lambda x: x[1]['average_impact'], reverse=True)

        for sector, stats in sorted_sectors:
            report.append(f"\n{sector}:")
            report.append(f"  Average Impact: {stats['average_impact']}%")
            report.append(f"  Maximum Impact: {stats['max_impact']}%")
            report.append(f"  Frequency: {stats['frequency']} articles")
            report.append(f"  Coverage: {stats['coverage_percentage']}% of all articles")

        # Top Individual Impacts
        report.append(f"\n\nTOP INDIVIDUAL ARTICLE IMPACTS")
        report.append("-" * 40)

        top_impacts = []
        for article_analysis in analysis_results['individual_analyses']:
            for impact in article_analysis['impacts']:
                top_impacts.append({
                    'title': article_analysis['article_title'],
                    'sector': impact.sector,
                    'impact': impact.impact_percentage,
                    'confidence': impact.confidence_score,
                    'type': impact.impact_type
                })

        top_impacts.sort(key=lambda x: x['impact'], reverse=True)

        for i, impact in enumerate(top_impacts[:10]):  # Top 10
            report.append(f"\n{i+1}. {impact['title'][:60]}...")
            report.append(f"   Sector: {impact['sector']}")
            report.append(f"   Impact: {impact['impact']}% ({impact['type']})")
            report.append(f"   Confidence: {impact['confidence']}")

        return "\n".join(report)

    def load_articles_from_json(self, file_path: str) -> List[Dict]:
        """Load articles from JSON file"""
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                data = json.load(file)

            # Handle different JSON structures
            if isinstance(data, list):
                # Direct list of articles
                articles = data
            elif isinstance(data, dict):
                # Check common keys for article lists
                if 'articles' in data:
                    articles = data['articles']
                elif 'data' in data:
                    articles = data['data']
                elif 'items' in data:
                    articles = data['items']
                else:
                    # Assume the dict values contain articles
                    articles = list(data.values())
            else:
                raise ValueError("Unsupported JSON structure")

            print(f"Successfully loaded {len(articles)} articles from {file_path}")
            return articles

        except FileNotFoundError:
            print(f"Error: File {file_path} not found")
            return []
        except json.JSONDecodeError as e:
            print(f"Error: Invalid JSON format - {e}")
            return []
        except Exception as e:
            print(f"Error loading articles: {e}")
            return []

    def save_results_to_json(self, results: Dict, output_file: str):
        """Save analysis results to JSON file"""
        try:
            # Convert dataclass objects to dictionaries for JSON serialization
            serializable_results = {
                'total_articles_analyzed': results['total_articles_analyzed'],
                'sector_summary': results['sector_summary'],
                'individual_analyses': []
            }

            for analysis in results['individual_analyses']:
                serializable_analysis = {
                    'article_title': analysis['article_title'],
                    'article_url': analysis['article_url'],
                    'impacts': [
                        {
                            'sector': impact.sector,
                            'impact_percentage': impact.impact_percentage,
                            'confidence_score': impact.confidence_score,
                            'key_factors': impact.key_factors,
                            'impact_type': impact.impact_type
                        }
                        for impact in analysis['impacts']
                    ]
                }
                serializable_results['individual_analyses'].append(serializable_analysis)

            with open(output_file, 'w', encoding='utf-8') as file:
                json.dump(serializable_results, file, indent=2, ensure_ascii=False)

            print(f"Results saved to {output_file}")

        except Exception as e:
            print(f"Error saving results: {e}")

# Main execution function
def analyze_news_articles(json_file_path: str, output_file: str = None):
    """
    Main function to analyze news articles from JSON file

    Args:
        json_file_path (str): Path to your JSON file containing articles
        output_file (str): Optional path to save results as JSON
    """

    # Initialize analyzer
    analyzer = NewsSectorAnalyzer()

    # Load articles from JSON file
    print("Loading articles from JSON file...")
    articles = analyzer.load_articles_from_json(json_file_path)

    if not articles:
        print("No articles found or failed to load articles")
        return

    print(f"Found {len(articles)} articles to analyze")
    print("Starting sector impact analysis...\n")

    # Analyze all articles
    results = analyzer.analyze_articles_batch(articles)

    # Generate and print report
    print("\n" + "="*80)
    print("GENERATING COMPREHENSIVE REPORT")
    print("="*80)

    report = analyzer.generate_report(results)
    print(report)

    # Save results to JSON if output file specified
    if output_file:
        analyzer.save_results_to_json(results, output_file)

    # Print summary statistics
    print(f"\n\nSUMMARY STATISTICS")
    print("-" * 40)
    print(f"Total articles analyzed: {results['total_articles_analyzed']}")
    print(f"Total sectors identified: {len(results['sector_summary'])}")

    # Top 3 most impacted sectors
    top_sectors = sorted(results['sector_summary'].items(),
                        key=lambda x: x[1]['average_impact'], reverse=True)[:3]

    print(f"\nTop 3 Most Impacted Sectors:")
    for i, (sector, stats) in enumerate(top_sectors, 1):
        print(f"{i}. {sector}: {stats['average_impact']}% avg impact")

    return results

# Example usage functions
def main():
    """
    Example usage - Replace 'your_articles.json' with your actual file path
    """
    # Replace with your actual JSON file path
    json_file_path = "news_articles.json"  # <-- CHANGE THIS TO YOUR FILE PATH
    output_file = "sector_analysis_results.json"  # Optional: save results

    # Analyze articles
    results = analyze_news_articles(json_file_path, output_file)

    if results:
        print("\nAnalysis completed successfully!")
        print(f"Check '{output_file}' for detailed JSON results")

# Alternative: Direct analysis function for custom usage
def analyze_custom_articles():
    """
    Custom analysis function - modify as needed
    """
    # Option 1: Specify your JSON file path directly
    file_path = input("news_articles.json").strip()

    # Option 2: Save results option
    save_results = input("Save results to JSON file? (y/n): ").strip().lower()
    output_file = None
    if save_results == 'y':
        output_file = input("final_analysis.json").strip()

    # Run analysis
    analyze_news_articles(file_path, output_file)

if __name__ == "__main__":
    print("News Article Sector Impact Analyzer")
    print("="*50)
    print("1. Run with default settings (modify main() function)")
    print("2. Run with custom file path input")

    choice = input("Choose option (1 or 2): ").strip()

    if choice == "1":
        main()
    elif choice == "2":
        analyze_custom_articles()
    else:
        print("Invalid choice. Running default analysis...")
        main()

News Article Sector Impact Analyzer
1. Run with default settings (modify main() function)
2. Run with custom file path input
Loading articles from JSON file...
Successfully loaded 100 articles from news_articles.json
Found 100 articles to analyze
Starting sector impact analysis...

Analyzing article 1/100: NewsNews...
Analyzing article 2/100: NewsNews...
Analyzing article 3/100: US envoy rejects Hamas claim that it has agreed to...
Analyzing article 4/100: BBC Sport...
Analyzing article 5/100: Business...
Analyzing article 6/100: Innovation...
Analyzing article 7/100: Culture...
Analyzing article 8/100: Arts...
Analyzing article 9/100: Travel...
Analyzing article 10/100: Earth...
Analyzing article 11/100: Audio...
Analyzing article 12/100: Video...
Analyzing article 13/100: Live Now...
Analyzing article 14/100: NewsNews...
Analyzing article 15/100: NewsNews...
Analyzing article 16/100: NewsNews...
Analyzing article 17/100: NewsNews...
Analyzing article 18/100: NewsNews...
Analyzing art

In [9]:
!pip install sklearn

Collecting sklearn
  Downloading sklearn-0.0.post12.tar.gz (2.6 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'error'


  error: subprocess-exited-with-error
  
  × Getting requirements to build wheel did not run successfully.
  │ exit code: 1
  ╰─> [15 lines of output]
      The 'sklearn' PyPI package is deprecated, use 'scikit-learn'
      rather than 'sklearn' for pip commands.
      
      Here is how to fix this error in the main use cases:
      - use 'pip install scikit-learn' rather than 'pip install sklearn'
      - replace 'sklearn' by 'scikit-learn' in your pip requirements files
        (requirements.txt, setup.py, setup.cfg, Pipfile, etc ...)
      - if the 'sklearn' package is used by one of your dependencies,
        it would be great if you take some time to track which package uses
        'sklearn' instead of 'scikit-learn' and report it to their issue tracker
      - as a last resort, set the environment variable
        SKLEARN_ALLOW_DEPRECATED_SKLEARN_PACKAGE_INSTALL=True to avoid this error
      
      More information is available at
      https://github.com/scikit-learn/sklearn-

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
import json
import re
from typing import Dict, List, Tuple, Optional
from dataclasses import dataclass
import math

# Configuration
@dataclass
class ModelConfig:
    vocab_size: int = 30522
    hidden_size: int = 768
    num_attention_heads: int = 12
    num_hidden_layers: int = 6
    intermediate_size: int = 3072
    max_position_embeddings: int = 512
    dropout_prob: float = 0.1
    num_sectors: int = 8
    lora_rank: int = 16
    lora_alpha: int = 32
    lora_dropout: float = 0.1

# LoRA Layer for Parameter-Efficient Fine-tuning
class LoRALayer(nn.Module):
    def __init__(self, in_features: int, out_features: int, rank: int = 16, alpha: int = 32, dropout: float = 0.1):
        super().__init__()
        self.rank = rank
        self.alpha = alpha
        self.scaling = alpha / rank

        # LoRA parameters
        self.lora_A = nn.Parameter(torch.randn(in_features, rank) * 0.01)
        self.lora_B = nn.Parameter(torch.zeros(rank, out_features))
        self.dropout = nn.Dropout(dropout)

        # Original layer (frozen during LoRA training)
        self.original_layer = nn.Linear(in_features, out_features)

    def forward(self, x):
        # Original forward pass
        original_output = self.original_layer(x)

        # LoRA forward pass
        lora_output = self.dropout(x) @ self.lora_A @ self.lora_B * self.scaling

        return original_output + lora_output

    def freeze_original(self):
        """Freeze original layer parameters for LoRA training"""
        for param in self.original_layer.parameters():
            param.requires_grad = False

# Multi-Head Attention with LoRA
class MultiHeadAttentionLoRA(nn.Module):
    def __init__(self, config: ModelConfig):
        super().__init__()
        self.num_heads = config.num_attention_heads
        self.head_dim = config.hidden_size // config.num_attention_heads
        self.scale = math.sqrt(self.head_dim)

        # LoRA layers for Q, K, V projections
        self.query = LoRALayer(config.hidden_size, config.hidden_size, config.lora_rank, config.lora_alpha)
        self.key = LoRALayer(config.hidden_size, config.hidden_size, config.lora_rank, config.lora_alpha)
        self.value = LoRALayer(config.hidden_size, config.hidden_size, config.lora_rank, config.lora_alpha)
        self.output = LoRALayer(config.hidden_size, config.hidden_size, config.lora_rank, config.lora_alpha)

        self.dropout = nn.Dropout(config.dropout_prob)

    def forward(self, hidden_states, attention_mask=None):
        batch_size, seq_len, hidden_size = hidden_states.shape

        # Generate Q, K, V
        Q = self.query(hidden_states).view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        K = self.key(hidden_states).view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        V = self.value(hidden_states).view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)

        # Attention scores
        attention_scores = torch.matmul(Q, K.transpose(-2, -1)) / self.scale

        if attention_mask is not None:
            attention_scores += attention_mask

        attention_probs = F.softmax(attention_scores, dim=-1)
        attention_probs = self.dropout(attention_probs)

        # Apply attention to values
        context = torch.matmul(attention_probs, V)
        context = context.transpose(1, 2).contiguous().view(batch_size, seq_len, hidden_size)

        # Output projection
        output = self.output(context)
        return output, attention_probs

# Feed Forward Network with LoRA
class FeedForwardLoRA(nn.Module):
    def __init__(self, config: ModelConfig):
        super().__init__()
        self.dense1 = LoRALayer(config.hidden_size, config.intermediate_size, config.lora_rank, config.lora_alpha)
        self.dense2 = LoRALayer(config.intermediate_size, config.hidden_size, config.lora_rank, config.lora_alpha)
        self.dropout = nn.Dropout(config.dropout_prob)
        self.activation = nn.GELU()

    def forward(self, hidden_states):
        hidden_states = self.dense1(hidden_states)
        hidden_states = self.activation(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.dense2(hidden_states)
        return hidden_states

# Transformer Layer
class TransformerLayer(nn.Module):
    def __init__(self, config: ModelConfig):
        super().__init__()
        self.attention = MultiHeadAttentionLoRA(config)
        self.feed_forward = FeedForwardLoRA(config)
        self.norm1 = nn.LayerNorm(config.hidden_size)
        self.norm2 = nn.LayerNorm(config.hidden_size)
        self.dropout = nn.Dropout(config.dropout_prob)

    def forward(self, hidden_states, attention_mask=None):
        # Self-attention with residual connection
        attention_output, attention_probs = self.attention(self.norm1(hidden_states), attention_mask)
        hidden_states = hidden_states + self.dropout(attention_output)

        # Feed forward with residual connection
        ff_output = self.feed_forward(self.norm2(hidden_states))
        hidden_states = hidden_states + self.dropout(ff_output)

        return hidden_states, attention_probs

# News Sector Classification Model
class NewsSectorModel(nn.Module):
    def __init__(self, config: ModelConfig):
        super().__init__()
        self.config = config

        # Embeddings
        self.token_embeddings = nn.Embedding(config.vocab_size, config.hidden_size)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        self.norm = nn.LayerNorm(config.hidden_size)
        self.dropout = nn.Dropout(config.dropout_prob)

        # Transformer layers
        self.layers = nn.ModuleList([
            TransformerLayer(config) for _ in range(config.num_hidden_layers)
        ])

        # Classification heads
        self.sector_classifier = nn.Linear(config.hidden_size, config.num_sectors)
        self.confidence_head = nn.Sequential(
            nn.Linear(config.hidden_size, config.hidden_size // 2),
            nn.GELU(),
            nn.Dropout(config.dropout_prob),
            nn.Linear(config.hidden_size // 2, 1),
            nn.Sigmoid()
        )

        # Temperature scaling for calibration
        self.temperature = nn.Parameter(torch.ones(1))

        self.init_weights()

    def init_weights(self):
        """Initialize weights"""
        for module in self.modules():
            if isinstance(module, nn.Linear):
                nn.init.normal_(module.weight, std=0.02)
                if module.bias is not None:
                    nn.init.zeros_(module.bias)
            elif isinstance(module, nn.Embedding):
                nn.init.normal_(module.weight, std=0.02)

    def freeze_for_lora(self):
        """Freeze all parameters except LoRA parameters"""
        for name, param in self.named_parameters():
            if 'lora_' not in name and 'sector_classifier' not in name and 'confidence_head' not in name:
                param.requires_grad = False

        # Freeze original layers in LoRA components
        for layer in self.layers:
            layer.attention.query.freeze_original()
            layer.attention.key.freeze_original()
            layer.attention.value.freeze_original()
            layer.attention.output.freeze_original()
            layer.feed_forward.dense1.freeze_original()
            layer.feed_forward.dense2.freeze_original()

    def forward(self, input_ids, attention_mask=None, return_attention=False):
        batch_size, seq_len = input_ids.shape

        # Create position ids
        position_ids = torch.arange(seq_len, device=input_ids.device).expand((batch_size, -1))

        # Embeddings
        token_embeds = self.token_embeddings(input_ids)
        position_embeds = self.position_embeddings(position_ids)
        hidden_states = self.norm(token_embeds + position_embeds)
        hidden_states = self.dropout(hidden_states)

        # Create attention mask
        if attention_mask is not None:
            attention_mask = attention_mask[:, None, None, :] * -10000.0

        # Pass through transformer layers
        all_attention_probs = []
        for layer in self.layers:
            hidden_states, attention_probs = layer(hidden_states, attention_mask)
            if return_attention:
                all_attention_probs.append(attention_probs)

        # Global average pooling for classification
        if attention_mask is not None:
            mask = (attention_mask.squeeze() != -10000.0).float().unsqueeze(-1)
            pooled = (hidden_states * mask).sum(dim=1) / mask.sum(dim=1)
        else:
            pooled = hidden_states.mean(dim=1)

        # Classification outputs
        sector_logits = self.sector_classifier(pooled)
        confidence_score = self.confidence_head(pooled)

        # Temperature scaling for calibration
        calibrated_logits = sector_logits / self.temperature

        outputs = {
            'sector_logits': calibrated_logits,
            'confidence_score': confidence_score,
            'pooled_output': pooled
        }

        if return_attention:
            outputs['attention_probs'] = all_attention_probs

        return outputs

# Dataset for News Articles
class NewsDataset(Dataset):
    def __init__(self, texts: List[str], labels: List[int], tokenizer, max_length: int = 512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]

        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

# Training utilities
class NewsClassificationTrainer:
    def __init__(self, model, config, device='cuda' if torch.cuda.is_available() else 'cpu'):
        self.model = model.to(device)
        self.config = config
        self.device = device
        self.sector_names = [
            'Technology', 'Finance', 'Healthcare', 'Energy',
            'Real Estate', 'Consumer', 'Industrial', 'Other'
        ]

    def train_epoch(self, dataloader, optimizer, criterion):
        self.model.train()
        total_loss = 0
        correct = 0
        total = 0

        for batch in dataloader:
            input_ids = batch['input_ids'].to(self.device)
            attention_mask = batch['attention_mask'].to(self.device)
            labels = batch['label'].to(self.device)

            optimizer.zero_grad()

            outputs = self.model(input_ids, attention_mask)

            # Classification loss
            classification_loss = criterion(outputs['sector_logits'], labels)

            # Confidence regularization loss
            confidence_loss = torch.mean((outputs['confidence_score'] - 0.5) ** 2)

            total_loss_batch = classification_loss + 0.1 * confidence_loss
            total_loss_batch.backward()

            # Gradient clipping
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)

            optimizer.step()

            total_loss += total_loss_batch.item()
            _, predicted = torch.max(outputs['sector_logits'], 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

        return total_loss / len(dataloader), correct / total

    def evaluate(self, dataloader):
        self.model.eval()
        total_loss = 0
        correct = 0
        total = 0
        all_predictions = []
        all_confidences = []
        all_labels = []

        with torch.no_grad():
            for batch in dataloader:
                input_ids = batch['input_ids'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)
                labels = batch['label'].to(self.device)

                outputs = self.model(input_ids, attention_mask)

                # Get predictions and confidences
                probabilities = F.softmax(outputs['sector_logits'], dim=-1)
                _, predicted = torch.max(outputs['sector_logits'], 1)

                all_predictions.extend(predicted.cpu().numpy())
                all_confidences.extend(outputs['confidence_score'].cpu().numpy())
                all_labels.extend(labels.cpu().numpy())

                total += labels.size(0)
                correct += (predicted == labels).sum().item()

        accuracy = correct / total
        return accuracy, all_predictions, all_confidences, all_labels

    def predict_single(self, text: str, tokenizer) -> Dict:
        """Predict sector and confidence for a single news article"""
        self.model.eval()

        # Tokenize input
        encoding = tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.config.max_position_embeddings,
            return_tensors='pt'
        )

        input_ids = encoding['input_ids'].to(self.device)
        attention_mask = encoding['attention_mask'].to(self.device)

        with torch.no_grad():
            outputs = self.model(input_ids, attention_mask, return_attention=True)

            # Get sector probabilities
            probabilities = F.softmax(outputs['sector_logits'], dim=-1)
            confidence = outputs['confidence_score'].item()

            # Get predictions
            predicted_sector_idx = torch.argmax(probabilities, dim=-1).item()
            sector_percentages = probabilities.cpu().numpy()[0]

            result = {
                'predicted_sector': self.sector_names[predicted_sector_idx],
                'confidence_score': confidence,
                'sector_percentages': {
                    sector: float(percentage)
                    for sector, percentage in zip(self.sector_names, sector_percentages)
                },
                'top_3_sectors': [
                    {
                        'sector': self.sector_names[idx],
                        'percentage': float(sector_percentages[idx])
                    }
                    for idx in np.argsort(sector_percentages)[-3:][::-1]
                ]
            }

        return result

# Example usage and training setup
def create_sample_data():
    """Create sample news data for demonstration"""
    sample_texts = [
        "Apple announces new iPhone with advanced AI capabilities and improved camera technology",
        "Federal Reserve raises interest rates by 0.25% to combat inflation concerns",
        "New cancer treatment shows promising results in clinical trials",
        "Oil prices surge amid geopolitical tensions in the Middle East",
        "Housing market shows signs of cooling as mortgage rates climb",
        "Consumer spending drops as inflation impacts household budgets",
        "Manufacturing output increases for third consecutive month",
        "Local weather forecast predicts sunny skies for the weekend"
    ]

    sample_labels = [0, 1, 2, 3, 4, 5, 6, 7]  # Technology, Finance, Healthcare, Energy, Real Estate, Consumer, Industrial, Other

    return sample_texts, sample_labels

def main():
    """Main training and demonstration function"""
    # Initialize configuration
    config = ModelConfig()

    # Create model
    model = NewsSectorModel(config)

    # Initialize trainer
    trainer = NewsClassificationTrainer(model, config)

    # Create sample data
    texts, labels = create_sample_data()

    print("News Sector Classification Model initialized successfully!")
    print(f"Model has {sum(p.numel() for p in model.parameters())} total parameters")
    print(f"Trainable parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad)}")

    # Demonstrate LoRA fine-tuning setup
    model.freeze_for_lora()
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"Parameters after LoRA freeze: {trainable_params} ({trainable_params/sum(p.numel() for p in model.parameters())*100:.2f}%)")

    # Example prediction (requires tokenizer - using placeholder)
    sample_text = "Tech giant announces breakthrough in quantum computing research"
    print(f"\nSample prediction for: '{sample_text}'")
    print("Note: Full prediction requires proper tokenizer initialization")

    return model, trainer, config

if __name__ == "__main__":
    model, trainer, config = main()

In [23]:
import json
import spacy
from transformers import pipeline
from collections import defaultdict
from datetime import datetime
import argparse
import pandas as pd
import os
import sys

# Initialize NLP components
nlp = spacy.load("en_core_web_sm")
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# Define sectors (GICS classification)
SECTORS = [
    "Energy", "Materials", "Industrials", "Consumer Discretionary",
    "Consumer Staples", "Health Care", "Financials", "Information Technology",
    "Communication Services", "Utilities", "Real Estate", "Transportation"
]

def load_articles(json_file):
    """Load articles from JSON file"""
    if not os.path.isfile(json_file):
        print(f"ERROR: Input file '{json_file}' does not exist.")
        sys.exit(1)
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    if not isinstance(data, list):
        data = [data]
    return data

def analyze_article(article):
    """Analyze a single article"""
    content = article.get('content')
    if not content:
        print(f"Skipping article without content: {article.get('title', 'No title')}")
        return None

    doc = nlp(content)

    # Sector classification
    classification = classifier(
        content[:1000],  # Truncate to avoid max length issues
        candidate_labels=SECTORS,
        multi_label=True
    )

    # Sentiment analysis (simple version)
    positive_words = ["growth", "rise", "increase", "profit", "gain"]
    negative_words = ["fall", "decline", "loss", "drop", "explosion", "crash"]
    sentiment_score = 0
    for token in doc:
        if token.text.lower() in positive_words:
            sentiment_score += 1
        elif token.text.lower() in negative_words:
            sentiment_score -= 1

    # Extract entities
    entities = set()
    for ent in doc.ents:
        if ent.label_ in ['ORG', 'PRODUCT', 'GPE']:
            entities.add(ent.text)

    # Parse date safely
    date_str = article.get('date')
    if not date_str:
        print(f"Skipping article without date: {article.get('title', 'No title')}")
        return None
    try:
        date_parsed = datetime.fromisoformat(date_str.replace('Z', '+00:00'))
    except Exception as e:
        print(f"Skipping article with invalid date '{date_str}': {article.get('title', 'No title')}")
        return None

    return {
        'date': date_parsed,
        'top_sector': classification['labels'][0],
        'sector_confidence': classification['scores'][0],
        'sentiment': sentiment_score,
        'entities': list(entities),
        'title': article.get('title', 'No title')
    }

def calculate_growth_metrics(analyzed_articles):
    """Calculate growth metrics by sector"""
    sector_data = defaultdict(lambda: {
        'count': 0,
        'sentiments': [],
        'entities': set(),
        'titles': []
    })

    for article in analyzed_articles:
        sector = article['top_sector']
        sector_data[sector]['count'] += 1
        sector_data[sector]['sentiments'].append(article['sentiment'])
        sector_data[sector]['entities'].update(article['entities'])
        sector_data[sector]['titles'].append(article['title'])

    # Calculate time-based growth (last 3 days vs previous 3 days)
    now = datetime.now()
    recent_cutoff = now - pd.Timedelta(days=3)

    results = []
    for sector, data in sector_data.items():
        # Time-based analysis
        recent_count = sum(1 for article in analyzed_articles
                         if article['top_sector'] == sector
                         and article['date'] >= recent_cutoff)
        previous_count = data['count'] - recent_count

        # Growth rate calculation
        growth_rate = ((recent_count - previous_count) / (previous_count + 0.001)) * 100

        # Sentiment analysis
        avg_sentiment = sum(data['sentiments']) / len(data['sentiments']) if data['sentiments'] else 0

        # Entity diversity
        entity_diversity = len(data['entities'])

        # Recent article titles (for context)
        recent_titles = [title for article in analyzed_articles
                        if article['top_sector'] == sector
                        and article['date'] >= recent_cutoff
                        for title in [article['title']]]

        results.append({
            'Sector': sector,
            'Total Articles': data['count'],
            'Growth Rate (%)': round(growth_rate, 2),
            'Avg Sentiment': round(avg_sentiment, 2),
            'Unique Entities': entity_diversity,
            'Recent Titles': recent_titles[:3]  # Show top 3 recent titles
        })

    return sorted(results, key=lambda x: x['Growth Rate (%)'], reverse=True)

def main():
    parser = argparse.ArgumentParser(description='Analyze news articles for sector growth')
    parser.add_argument('input_file', help='news_articles.json')
    parser.add_argument('--output', help='results.json', default='sector_growth.csv')
    args, unknown = parser.parse_known_args()  # safe for Jupyter

    print(f"Loading articles from {args.input_file}...")
    articles = load_articles(args.input_file)
    if not articles:
        print("No articles loaded. Please check your input file.")
        sys.exit(1)

    print(f"Loaded {len(articles)} articles")
    print(f"Sample article keys: {list(articles[0].keys())}")

    print("Analyzing articles...")
    analyzed_articles = [analyze_article(article) for article in articles]
    analyzed_articles = [a for a in analyzed_articles if a is not None]

    if not analyzed_articles:
        print("No valid articles to analyze after filtering. Exiting.")
        sys.exit(1)

    print("Calculating growth metrics...")
    growth_analysis = calculate_growth_metrics(analyzed_articles)

    # Save results
    df = pd.DataFrame(growth_analysis)
    if args.output.endswith('.json'):
        df.to_json(args.output, orient='records', indent=2)
    else:
        df.to_csv(args.output, index=False)

    print(f"Analysis complete. Results saved to {args.output}")
    print("\nTop Growing Sectors:")
    print(df.head(5).to_string(index=False))

if __name__ == "__main__":
    main()


Device set to use cpu


Loading articles from C:\Users\Krinal\AppData\Roaming\jupyter\runtime\kernel-bda4135e-b98c-406e-8cf2-736b3cf48b30.json...
Loaded 1 articles
Sample article keys: ['shell_port', 'iopub_port', 'stdin_port', 'control_port', 'hb_port', 'ip', 'key', 'transport', 'signature_scheme', 'kernel_name', 'jupyter_session']
Analyzing articles...
Skipping article without content: No title
No valid articles to analyze after filtering. Exiting.


SystemExit: 1

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [19]:
!pip install hf_xet

Collecting hf_xet
  Downloading hf_xet-1.1.2-cp37-abi3-win_amd64.whl.metadata (883 bytes)
Downloading hf_xet-1.1.2-cp37-abi3-win_amd64.whl (2.7 MB)
   ---------------------------------------- 0.0/2.7 MB ? eta -:--:--
   ---------------------------------------- 2.7/2.7 MB 14.4 MB/s eta 0:00:00
Installing collected packages: hf_xet
Successfully installed hf_xet-1.1.2


In [9]:
import spacy
from spacy.matcher import PhraseMatcher
import json
from collections import defaultdict
import pandas as pd

# Load English language model
nlp = spacy.load("en_core_web_lg")

class SectorAnalyzer:
    def __init__(self):
        # Define sector keywords and related terms
        self.sector_keywords = {
            "Energy": ["oil", "gas", "renewable", "solar", "wind", "energy", "petroleum", "OPEC", "drilling"],
            "Technology": ["tech", "software", "AI", "artificial intelligence", "chip", "semiconductor", "cloud"],
            "Healthcare": ["pharma", "hospital", "medical", "biotech", "vaccine", "FDA", "healthcare"],
            "Finance": ["bank", "investment", "stock", "market", "crypto", "bitcoin", "interest rate"],
            "Manufacturing": ["factory", "manufacturing", "production", "supply chain", "automobile"],
            "Retail": ["retail", "e-commerce", "amazon", "walmart", "consumer", "shopping"],
            "Transportation": ["airline", "shipping", "logistics", "trucking", "aviation"],
            "Real Estate": ["housing", "real estate", "mortgage", "property", "REIT"],
            "Agriculture": ["farm", "agriculture", "crop", "grain", "livestock"]
        }

        # Initialize phrase matcher
        self.matcher = PhraseMatcher(nlp.vocab)
        for sector, patterns in self.sector_keywords.items():
            patterns = [nlp(text) for text in patterns]
            self.matcher.add(sector, None, *patterns)

    def identify_sectors(self, text):
        doc = nlp(text)
        matches = self.matcher(doc)

        sector_counts = defaultdict(int)
        for match_id, start, end in matches:
            sector = nlp.vocab.strings[match_id]
            sector_counts[sector] += 1

        return sector_counts

    def predict_growth(self, sector_counts, article_date, text=None):
    # Base growth rates (would normally come from economic models)
        base_growth = {
            "Energy": {"short": 2.5, "long": 15},
            "Technology": {"short": 8, "long": 45},
            "Healthcare": {"short": 5, "long": 30},
            "Finance": {"short": 3, "long": 20},
            "Manufacturing": {"short": 2, "long": 12},
            "Retail": {"short": 4, "long": 18},
            "Transportation": {"short": 3.5, "long": 22},
            "Real Estate": {"short": 1.5, "long": 10},
            "Agriculture": {"short": 2, "long": 15}
        }

        # Initialize sentiment factor (default neutral)
        sentiment_factor = 0
        if text:  # Only analyze sentiment if text is provided
            sentiment_factor = self.analyze_sentiment(text)

        results = {}
        for sector, count in sector_counts.items():
            if count > 1:  # Only consider sectors with multiple mentions
                base = base_growth.get(sector, {"short": 2, "long": 10})
                adjusted_short = base["short"] * (1 + sentiment_factor*0.1)
                adjusted_long = base["long"] * (1 + sentiment_factor*0.2)
                results[sector] = {
                    "confidence": min(100, count*15),  # Scale with mention count
                    "short_term_growth": round(adjusted_short, 1),
                    "long_term_growth": round(adjusted_long, 1),
                    "trend_indicator": "↑" if sentiment_factor > 0 else "↓" if sentiment_factor < 0 else "→"
                }

        return results

    def analyze_sentiment(self, text):
        """Simple sentiment analysis (would use better model in production)"""
        doc = nlp(text)
        positive_words = ["growth", "boom", "rise", "increase", "surge"]
        negative_words = ["decline", "fall", "drop", "crisis", "slump"]

        score = 0
        for token in doc:
            if token.text.lower() in positive_words:
                score += 1
            elif token.text.lower() in negative_words:
                score -= 1

        return score / len(doc) * 100 if doc else 0

# Example Usage
if __name__ == "__main__":
    analyzer = SectorAnalyzer()

    # Load your JSON data
    with open('NewsArticles.json', 'r', encoding='utf-8') as f:
        articles = json.load(f)

    sector_report = defaultdict(list)
# In your main processing loop:
    for article in articles:
        article_text = article["title"] + " " + article["content"]
        sectors = analyzer.identify_sectors(article_text)
        growth_predictions = analyzer.predict_growth(
            sectors,
            article["date"],
            text=article_text  # Pass the text for sentiment analysis
        )

        if growth_predictions:
            primary_sector = max(growth_predictions.items(), key=lambda x: x[1]["confidence"])[0]
            sector_report[primary_sector].append({
                "article_title": article["title"],
                "url": article["url"],
                "confidence": growth_predictions[primary_sector]["confidence"],
                "short_term": growth_predictions[primary_sector]["short_term_growth"],
                "long_term": growth_predictions[primary_sector]["long_term_growth"],
                "trend": growth_predictions[primary_sector]["trend_indicator"]
            })

    # Generate sector growth summary
    summary = {}
    for sector, articles in sector_report.items():
        avg_short = sum(a["short_term"] for a in articles) / len(articles)
        avg_long = sum(a["long_term"] for a in articles) / len(articles)
        summary[sector] = {
            "article_count": len(articles),
            "average_short_term_growth": round(avg_short, 1),
            "average_long_term_growth": round(avg_long, 1),
            "sample_articles": [a["article_title"] for a in articles[:3]]
        }

    print("\nSector Growth Potential Summary:")
    print(pd.DataFrame.from_dict(summary, orient="index"))

    # Save detailed report
    with open("SectorAnalysisReport.json", "w") as f:
        json.dump({
            "summary": summary,
            "detailed_analysis": sector_report
        }, f, indent=2)


Sector Growth Potential Summary:
               article_count  average_short_term_growth  \
Technology                 1                        8.0   
Manufacturing              1                        2.0   
Energy                     1                        2.5   
Finance                    2                        3.0   

               average_long_term_growth  \
Technology                         45.0   
Manufacturing                      12.4   
Energy                             14.7   
Finance                            20.0   

                                                 sample_articles  
Technology                               [Taiwan News & Opinion]  
Manufacturing                              [Hong Kong Originals]  
Energy                                                 [Opinion]  
Finance        [HKFP Lens, Hong Kong Free Press Transparency ...  


In [11]:
 import json
from transformers import pipeline
from collections import defaultdict

# Load Zero-Shot classifier
classifier = pipeline(
    "zero-shot-classification",
    model="facebook/bart-large-mnli",
    device="cpu"
)

# Define your sectors
sectors = [
    "Disasters & Accidents",
    "Politics & Government",
    "Entertainment & Celebrity",
    "Technology",
    "Health",
    "Energy & Environment",
    "Business & Finance",
    "Religion",
    "Social Issues"
]

def classify_article(text, confidence_threshold=0.7):
    """Classify article text into sectors"""
    result = classifier(text, sectors, multi_label=True)
    return [
        sector for sector, score in zip(result["labels"], result["scores"])
        if score > confidence_threshold
    ]

# Load your JSON data
with open('NewsArticles.json', 'r', encoding='utf-8') as f:
    articles = json.load(f)

# Categorize articles
sector_articles = defaultdict(list)

for article in articles:
    # Combine title and content for better classification
    text = f"{article['title']}. {article['content']}"

    # Get relevant sectors
    article_sectors = classify_article(text)

    # Add to each relevant sector bucket
    for sector in article_sectors:
        sector_articles[sector].append(article)

# Save results
with open('categorized_articles.json', 'w') as f:
    json.dump(sector_articles, f, indent=2)

print("Categorization complete. Results saved to categorized_articles.json")

Device set to use cpu


Categorization complete. Results saved to categorized_articles.json
