# Exploration on 
1) Sentiment analysis of financial news
2) Analysts' rating
3) LLM to interpret technical analysis

In [None]:
import requests
import pickle
import json
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datetime import datetime, timedelta
import yfinance as yf
import numpy as np
from scipy.stats import norm
import finnhub
from typing import Dict, Any
import random


import warnings
warnings.filterwarnings("ignore")

In [2]:
# Covers 9 major GICS sectors (IT, Communication Services, Consumer Discretionary, Financials, Health Care, Energy, Consumer Staples, Industrials, Utilities, Materials).
# Includes mega-cap leaders (AAPL, MSFT, AMZN) and sector representatives (DUK for utilities, NEM for materials).
# Balanced enough for cross-sectional hypothesis tests on sentiment vs. returns.

tickers = [
    'AAPL', 'MSFT', 'GOOGL', 'META', 'AMZN', 'TSLA', 'JPM', 'MS', 'UNH', 'PFE', \
    'XOM', 'CVX', 'PG', 'KO', 'CAT', 'UNP', 'NEE', 'DUK', 'LIN', 'NEM', \
    'NVDA', 'CSCO', 'NFLX', 'DIS', 'HD', 'MCD', 'BAC', 'WFC', 'JNJ', 'LLY', \
    'COP', 'SLB', 'PEP', 'WMT', 'GE', 'MMM', 'SO', 'AEP', 'SHW', 'FCX',\
    'AVGO', 'INTC', 'CHTR', 'TMUS', 'NKE', 'SBUX', 'AXP', 'BLK', 'MRK', 'ABBV',\
    'MPC', 'PSX', 'KMB', 'K', 'HON', 'LMT', 'D', 'EXC', 'DD', 'ALB',\
    'QCOM', 'ORCL', 'WBD', 'EA', 'LOW', 'BKNG', 'C', 'SCHW', 'BMY', 'GILD', \
    'EOG', 'HAL', 'KHC', 'ADM', 'RTX', 'DE', 'ETR', 'XEL', 'NUE', 'PPG',\
    'AMAT', 'ADBE', 'ROKU', 'TTWO', 'TGT', 'MAR', 'PRU', 'GS', 'AMGN', 'MRNA', \
    'COP', 'OXY', 'CL', 'TSN', 'FDX', 'ITW', 'SRE', 'PPL', 'STLD', 'MOS',\
    'USB', 'PYPL', 'HUM', 'CME', 'REGN', 'VLO', 'MDLZ', 'CAT', 'NOW', 'INTU', \
    'VZ', 'KR', 'CMG', 'CI', 'PH', 'BKR', 'NOC', 'IP', 'MLM', 'SNAP',\
    'ADI', 'KLAC', 'D', 'LYV', 'TJX', 'BBWI', 'PNC', 'ICE', 'DXCM', 'EW', \
    'APA', 'DVN', 'EL', 'BG', 'CMI', 'ROK', 'NI', 'ATO', 'AVY', 'BALL',\
    'MU', 'T', 'PINS', 'DLTR', 'EXPE', 'BAX', 'JCI', 'DOV', 'PEG', 'PKG', 'CF',\
    'STX', 'CDNS', 'WMG', 'LVS', 'ALL', 'AON', 'ZBH', 'EMN', 'ISRG', 'FOXA', 'YUM' \
    ]

# 1. Sentiment Analysis

In [None]:
# use a model from HuggingFace
model_name = "mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
model.eval()



RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-5): 6 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (

In [83]:
def sentiment_percentile(headline):
    # Run sentiment analysis
    # for headline in news_headlines:
    inputs = tokenizer(headline, return_tensors="pt")
    outputs = model(**inputs)
    # probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
    probs = torch.softmax(outputs.logits, dim=-1)[0].tolist()

    labels = ["negative", "neutral", "positive"]
    pred_idx = torch.argmax(outputs.logits, dim=-1).item()
    pred_label = labels[pred_idx]
    confidence = probs[pred_idx]

    # Map to percentile scale
    if pred_label == "negative":
        percentile = (1-confidence) * 0.3333
    elif pred_label == "neutral":
        if probs[0] > probs[2]:  # leaning negative
            percentile = 0.3334 + confidence/2 * (0.5 - 0.3334)
        else:  ## leaning positive
            percentile = 0.5 + confidence/2 * (0.6667 - 0.5)
    else:  # positive
        percentile = 0.6667 + confidence * (1.0 - 0.6667)

    return {
        "headline": headline,
        "sentiment": pred_label,
        "confidence": round(confidence, 4),
        "percentile": round(percentile, 4)
    }

with open("seekingalpha_session_id.pkl", "rb") as handle:
    sa_session_id = pickle.load(handle)
    
with open("seekingalpha_api_key.pkl", "rb") as handle:
    sa_api_key = pickle.load(handle)

# Replace with your RapidAPI key
API_KEY = sa_api_key
BASE_HOST = "seeking-alpha.p.rapidapi.com"

headers = {
    "X-RapidAPI-Key": API_KEY,
    "X-RapidAPI-Host": BASE_HOST
}

def fetch_news_list(symbol, page=1, size=5):
    url = f"https://{BASE_HOST}/news/v2/list-by-symbol"
    params = {"id": symbol, "page": str(page), "size": str(size)}
    r = requests.get(url, headers=headers, params=params)
    r.raise_for_status()
    return r.json()

def fetch_detail(item):
    item_id = item["id"]
    item_type = item.get("type")

    if item_type == "news":
        url = f"https://{BASE_HOST}/news/v2/get-details"
    elif item_type == "article":
        url = f"https://{BASE_HOST}/articles/v2/get-details"
    else:
        return None

    params = {"id": str(item_id)}
    r = requests.get(url, headers=headers, params=params)
    if r.status_code == 404:
        return None
    r.raise_for_status()
    return r.json()

In [125]:
def round_up_time(x, interval):
    if (x.minute//interval + 1)*interval == 60:
        return x.replace(hour = x.hour+1, minute = 0, second=0)
    else:
        return x.replace(minute = (x.minute//interval + 1)*interval, second = 0)

In [117]:
def get_date():
    
    """
    Returns today's date in the format year-month-day
    """
    
    return datetime.today().strftime('%Y-%m-%d')

def get_stock_data(stock_name, period='2y', interval='1h'):
    
    """
    Uses the yahoo finance API to extract historical stock price data for a given stock
    and a given starting date
    """
    
    data = yf.Ticker(stock_name).history(period=period, interval=interval)
    data.reset_index(inplace=True)
    data["Datetime"] = data["Datetime"].apply(lambda x: str(x)[:-6])
    data["Datetime"] = pd.to_datetime(data["Datetime"])
    data['timestamp'] = data['Datetime'].dt.to_pydatetime()
    data['target'] = data['Close'].pct_change()*10000
    data = data.dropna().reset_index(drop=True)
    return data

def get_stock_data_start_end(stock_name, start, end, interval='1h'):
    
    """
    Uses the yahoo finance API to extract historical stock price data for a given stock
    and a given starting date
    """
    
    data = yf.Ticker(stock_name).history(start = start, end = end, interval=interval)
    data.reset_index(inplace=True)
    data["Datetime"] = data["Datetime"].apply(lambda x: str(x)[:-6])
    data["Datetime"] = pd.to_datetime(data["Datetime"])
    data['timestamp'] = data['Datetime'].dt.to_pydatetime()
    data['target'] = data['Close'].pct_change()*10000
    data = data.dropna().reset_index(drop=True)
    return data

In [382]:
# Example usage
def get_sentiment_with_return(ticker, interval):
    resp = fetch_news_list(ticker, page=10, size=200)
    news_dict = {}

    for item in resp.get("data", []):
        title = item["attributes"].get("title")
        dt = item["attributes"].get("publishOn")
        sent_score = sentiment_percentile(title)['percentile']
        news_dict[dt] = [title, sent_score]
    news_df = pd.DataFrame(news_dict).transpose()
    news_df.columns = ['news', 'score']

    news_df.insert(0, 'Datetime', pd.to_datetime(news_df.index.str[:19]))
    news_df['Datetime'] = news_df['Datetime'].apply(lambda x: round_up_time(x, interval))
    news_df.sort_values('Datetime', inplace=True)

    if ((news_df['Datetime'].dt.date.max() - news_df['Datetime'].dt.date.min()).days>60):
        stock = get_stock_data_start_end(ticker, start = news_df['Datetime'].dt.date.max() + timedelta(days=-25), end = news_df['Datetime'].dt.date.max(), interval=str(interval)+'m')
    else:
        stock = get_stock_data_start_end(ticker, start = news_df['Datetime'].dt.date.min(), end = news_df['Datetime'].dt.date.max() + timedelta(days=1), interval=str(interval)+'m')

    df = news_df[['Datetime', 'news', 'score']].merge(stock[['Datetime', 'Close']], left_on='Datetime', right_on='Datetime', how='left')
    df.sort_values('Datetime', inplace=True)
    df.reset_index(drop=True, inplace=True)
    df.ffill(inplace=True)
    df['pos'] = df['score'].apply(lambda x: -1 if (x<0.3334) else (1 if x > 0.6666 else 0))
    df['cumu_ret'] = ((df['Close'].pct_change().shift(-1) * df['pos']).cumsum()+1).cumprod()-1

    return news_df, stock, df

## Hypothesis test: using sentiment score does yield positive return

In [None]:
# delta: effect, how far above 0 you want to detect
# sigma: variability of your data
# power = 1-beta: probability of detecting the effect if it exists (commonly 80%)
def required_n_one_sided(alpha=0.05, power=0.8, sigma=0.02, delta=0.0005):
    z_alpha = norm.ppf(1 - alpha)      # 1.645 for alpha=0.05
    z_beta  = norm.ppf(power)          # 0.842 for power=0.8
    return ((z_alpha + z_beta)**2 * sigma**2) / (delta**2)

def ticker_years_needed(n_required, p_positive=0.5, years=1, days_per_year=252):
    obs_per_ticker_year = p_positive * days_per_year
    return n_required / obs_per_ticker_year / years

In [348]:
# 
n = required_n_one_sided(alpha=0.05, power=0.8, sigma=0.008, delta=0.0005)
tickers_for_1yr = ticker_years_needed(n, p_positive=0.5, years=0.08)
print("Required observations:", int(np.ceil(n)))
print("Approx tickers for 22 trading days:", int(np.ceil(tickers_for_1yr)))

Required observations: 1583
Approx tickers for 22 trading days: 158


In [None]:
# 
stock_return = []
for tk in tickers:
    news_df, stock, df = get_sentiment_with_return(tk, 2)
    stock_return.append([tk, df['cumu_ret'].dropna().iloc[-1]])
stk_df = pd.DataFrame(stock_return)
stk_df.columns = ['Ticker', 'Return']

In [None]:
# Bootstrap with Hypothesis testing
n = len(stk_df['Return'])
observed_mean = np.mean(stk_df['Return'])

# Bootstrap
n_boot = 10000
boot_means = []
for _ in range(n_boot):
    sample = np.random.choice(stk_df['Return'], size=int(n*0.8), replace=True)
    boot_means.append(np.mean(sample))

boot_means = np.array(boot_means)

# One-sided p-value: probability mean <= 0
p_value = np.mean(boot_means <= 0)

print("Observed mean:", observed_mean)
print("Bootstrap p-value (H0: mean <= 0):", p_value)

Observed mean: 0.17664461679627103
Bootstrap p-value (H0: mean <= 0): 0.0063


Since p_value is less than 0.05, we could reject the null hypothesis, i.e. accpet that using sentiment to trade stocks does yield positive returns.

# 2. Analysts' Rating

In [40]:
with open("finnhub_api_key.pkl", "rb") as handle:
    finnhub_api_key = pickle.load(handle)
finnhub_client = finnhub.Client(api_key=finnhub_api_key)

def get_recommendation_trends(ticker):
    data = finnhub_client.recommendation_trends(ticker)

    df = pd.DataFrame(data)
    df.rename(columns={
        "period": "Date",
        "strongBuy": "Strong Buy",
        "buy": "Buy",
        "hold": "Hold",
        "sell": "Sell",
        "strongSell": "Strong Sell"
    }, inplace=True)

    df['Score'] = (df['Strong Buy'] * 2 + df['Buy'] + df['Sell'] * (-1) + df['Strong Sell'] * (-2))/df[['Strong Buy', 'Buy', 'Hold', 'Sell', 'Strong Sell']].sum(axis=1)
    df['Date'] = pd.to_datetime(df['Date']).dt.date
    
    return df[['Date', 'Strong Buy', 'Buy', 'Hold', 'Sell', 'Strong Sell', 'Score']]

## Hypothesis test (H0: Excess rerurn is smaller or equal to 0.)

In [None]:
ret = []
for tk in tickers:    
    rec = get_recommendation_trends(tk)
    stock = yf.Ticker(tk).history(start = rec['Date'].min(), end = rec['Date'].max() + timedelta(days = 31), interval='1mo')
    stock.index = pd.to_datetime(stock.index)
    stock.reset_index(drop=False, inplace=True)
    stock['Date'] = stock['Date'].dt.date

    df = stock[['Date','Close']].merge(rec[['Date', 'Score']], left_on='Date', right_on='Date', how='left')
    df['LongOnly_Ret'] = (df['Close'].pct_change().shift(-1) * abs(df['Score']).mean()+1).cumprod()-1
    df['Strat_Ret'] = (df['Close'].pct_change().shift(-1) * round(df['Score'],2)+1).cumprod()-1

    exc_ret = (df['Strat_Ret'].dropna().iloc[-1] - df['LongOnly_Ret'].dropna().iloc[-1]+1)**(12/(df.shape[0]-1))-1
    ret.append([tk, exc_ret])

ret_df = pd.DataFrame(ret)
ret_df.columns = ['Ticker', 'ExcessReturn']

In [58]:
n = len(ret_df['ExcessReturn'])
observed_mean = np.mean(ret_df['ExcessReturn'])

# Bootstrap
n_boot = 10000
boot_means = []
for _ in range(n_boot):
    sample = np.random.choice(ret_df['ExcessReturn'], size=n, replace=True)
    boot_means.append(np.mean(sample))

boot_means = np.array(boot_means)

# One-sided p-value: probability mean <= 0
p_value = np.mean(boot_means <= 0)

print("Observed mean:", observed_mean)
print("Bootstrap p-value (H0: mean <= 0):", p_value)

Observed mean: -0.0024268233959812077
Bootstrap p-value (H0: mean <= 0): 0.7281


Can't reject the null hypothesis, so will disregard this trading signal.

# 3. LLM on Technical Chart Analysis

In [3]:
from ta.momentum import RSIIndicator
from ta.trend import SMAIndicator
from openai import OpenAI    # or your preferred client

In [None]:
def make_caption(df):
    # compute simple indicators
    df = df.copy()
    df['sma20'] = SMAIndicator(df['Close'], window=20).sma_indicator()
    df['sma50'] = SMAIndicator(df['Close'], window=50).sma_indicator()
    df['rsi'] = RSIIndicator(df['Close'], window=14).rsi()

    last = df.iloc[-1]
    window_start = df['Date'].iloc[0]
    window_end = df['Date'].iloc[-1]
    ret = (last['Close']/df.iloc[0]['Close'] - 1) * 100

    caption = (
        f"Window: {window_start} to {window_end} ({len(df)} bars)\n"
        f"Price: open {df.iloc[0]['Open']:.2f}, close {last['Close']:.2f}, high {df['High'].max():.2f}, low {df['Low'].min():.2f}\n"
        f"Return: {ret:.2f}% over window\n"
        f"Trend: sma20 {last['sma20']:.2f} vs sma50 {last['sma50']:.2f}\n"
        f"Momentum: RSI(14)={last['rsi']:.1f}\n"
    )
    # add simple divergence detection (toy)
    caption += detect_simple_divergence(df)
    return caption

def detect_simple_divergence(df):
    # naive: compare last high vs previous high and RSI
    highs = df['High']
    rsi = df['rsi']
    idx_highs = highs.nlargest(2).index.tolist()
    if len(idx_highs) >= 2:
        latest, prev = idx_highs[0], idx_highs[1]
        if highs.iloc[latest] > highs.iloc[prev] and rsi.iloc[latest] < rsi.iloc[prev]:
            return "Anomaly: bearish divergence detected between price highs and RSI.\n"
    return ""

In [None]:
class DeepSeekTechnicalAnalyst:
    def __init__(self, api_key: str):
        self.api_key = api_key
        self.base_url = "https://api.deepseek.com/v1/chat/completions"  # Correct API endpoint
        self.system_prompt = """You are a technical analyst specializing in stock market analysis. 
        Analyze the provided structured caption data and provide a technical bias assessment.

        Respond with ONLY a JSON object in this exact format:
        {
            "score": -0.7,
            "label": "Bearish",
            "reasoning": "Brief technical reasoning here"
        }

        Scoring scale:
        - +1.0 to +0.7: Very Bullish
        - +0.7 to +0.3: Bullish  
        - +0.3 to -0.3: Neutral
        - -0.3 to -0.7: Bearish
        - -0.7 to -1.0: Very Bearish

        Analyze based on:
        - Price position relative to moving averages
        - RSI momentum readings
        - Trend structure
        - Support/resistance levels"""

    def call_deepseek_api(self, user_message: str) -> Dict[str, Any]:
        """Make API call to DeepSeek LLM"""
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json"
        }
        
        payload = {
            "model": "deepseek-chat",  # or "deepseek-coder" depending on your needs
            "messages": [
                {"role": "system", "content": self.system_prompt},
                {"role": "user", "content": user_message}
            ],
            "temperature": 0.1,
            "max_tokens": 500,
            "response_format": {"type": "json_object"}  # Force JSON response
        }
        
        try:
            response = requests.post(self.base_url, headers=headers, json=payload)
            response.raise_for_status()
            return response.json()
        except requests.exceptions.RequestException as e:
            print(f"API Error: {e}")
            if hasattr(e, 'response') and e.response is not None:
                print(f"Response status: {e.response.status_code}")
                print(f"Response body: {e.response.text}")
            return None

    def analyze_technical_data(self, caption: str) -> Dict[str, Any]:
        """Main analysis function using DeepSeek API"""
        user_prompt = f"""
        Analyze this technical data and provide bias assessment:

        {caption}
        """

        api_response = self.call_deepseek_api(user_prompt)
        
        if api_response is None:
            return {
                "score": 0.0,
                "label": "API Error",
                "reasoning": "Failed to get analysis from API"
            }
        
        try:
            # Extract the JSON response from the API
            llm_response = api_response["choices"][0]["message"]["content"]
            analysis = json.loads(llm_response)
            return analysis
        except (KeyError, IndexError, json.JSONDecodeError) as e:
            return {
                "score": 0.0,
                "label": "Parse Error",
                "reasoning": f"Could not parse API response: {e}"
            }
        
# Alternative: Simple function for quick use
def analyze_stock_caption(caption: str) -> Dict[str, Any]:
    with open("deepseek_api_key.pkl", "rb") as handle:
        api_key = pickle.load(handle)
    """One-line function to analyze stock technical data"""
    analyst = DeepSeekTechnicalAnalyst(api_key)
    return analyst.analyze_technical_data(caption)

In [None]:
sel_ticker = random.choices(tickers, k=3)
ta_dfs = []
for tk in sel_ticker:
    stock = yf.Ticker(tk).history(start = '2025-05-01', end = '2026-01-01', interval='1d')
    stock.drop(['Dividends', 'Stock Splits'], axis=1, inplace=True)
    stock.reset_index(drop=False, inplace=True)
    stock['Date'] = pd.to_datetime(stock['Date']).dt.date
    caption = make_caption(stock)

    score = []
    for i in range(55, len(stock)):
        stock_prior = stock.loc[:i-1].copy()
        caption = make_caption(stock_prior)
        result = analyze_stock_caption(caption)
        score.append([stock.loc[i, 'Date'], result['score']])
    score_df = pd.DataFrame(score)
    score_df.columns = ['Date', 'Score']
    score_df['Date'] = pd.to_datetime(score_df['Date']).dt.date

    df = score_df.merge(stock[['Date', 'Close']], left_on='Date', right_on='Date', how='left')
    df['Ret'] = (df['Close'].pct_change() * df['Score']+1).cumprod()-1
    df['LongOnly_Ret'] = (df['Close'].pct_change() * abs(df['Score']).mean() + 1).cumprod()-1

    ta_dfs.append(df)

## Hypothesis test (H0: the excess return of technical analysis is smaller or equal to 0.)

In [None]:
ta_df = pd.DataFrame(columns = ['Ret', 'LongOnly_Ret'])
for i in range(len(ta_dfs)):
    ta_df = pd.concat([ta_df, ta_dfs[i][['Ret', 'LongOnly_Ret']]], axis=0)
ta_df['ExcessReturn'] = ta_df['Ret'] - ta_df['LongOnly_Ret']

In [90]:
n = len(ta_df['ExcessReturn'])
observed_mean = np.mean(ta_df['ExcessReturn'])

# Bootstrap
n_boot = 10000
boot_means = []
for _ in range(n_boot):
    sample = np.random.choice(ta_df['ExcessReturn'], size=int(n*0.6), replace=True)
    boot_means.append(np.mean(sample))

boot_means = np.array(boot_means)

# One-sided p-value: probability mean <= 0
p_value = np.mean(boot_means <= 0)

print("Observed mean:", observed_mean)
print("Bootstrap p-value (H0: mean <= 0):", p_value)

Observed mean: -0.035843172515338396
Bootstrap p-value (H0: mean <= 0): 0.1603


Fail to reject H0, i.e. using LLM for technical does NOT improve the return.