In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import nltk
# Uncomment the line below if you don't have the 'punkt' tokenizer data
# nltk.download('punkt')
# from nltk.tokenize import word_tokenize
# from sklearn.feature_extraction.text import TfidfVectorizer

# --- 1. Data Loading (Mock Data Generation) ---
# With a real dataset, the mock frame built below would be replaced by a load such as:
# df = pd.read_csv('your_financial_news_data.csv')

# Mock stand-in for that CSV: 15 consecutive days (2023-01-01 .. 2023-01-15),
# three articles per day, one per entity, always in the same entity order.
mock_dates = [f'2023-01-{day:02d}' for day in range(1, 16) for _ in range(3)]
mock_entities = ['Apple', 'ExxonMobil', 'Teladoc'] * 15

# One headline per (day, entity) pair, listed three-per-day in entity order.
mock_titles = [
    # 2023-01-01 .. 2023-01-05
    "Tech giant announces record profits", "New regulations hit energy sector", "Healthcare startup secures funding",
    "Market optimistic on tech stocks", "Oil prices surge on supply concerns", "Biotech breakthrough unveiled",
    "Tech stocks show slight dip", "Energy sector faces headwinds", "Healthcare mergers on the rise",
    "Tech innovation drives growth", "OPEC+ decision impacts oil", "Pharma company in clinical trials",
    "Major tech acquisition hinted", "Renewable energy gains traction", "Healthcare policy changes discussed",
    # 2023-01-06 .. 2023-01-10
    "Tech earnings surprise positively", "Gas prices stabilize", "Medical device approval expected",
    "Tech sector sees further gains", "Energy stocks rebound", "Healthcare costs under scrutiny",
    "Another tech acquisition", "Oil production cut", "New drug approval",
    "Tech valuations questioned", "Geopolitical tensions in oil markets", "Hospital chain expands",
    "Chip shortage impacts tech", "Green energy investments soar", "Telemedicine booming",
    # 2023-01-11 .. 2023-01-15
    "Big Tech faces antitrust probe", "Coal industry struggles", "Vaccine trials show promise",
    "AI advancements in tech", "Solar energy capacity grows", "Drug pricing debates",
    "Social media company under fire", "Oil price volatility continues", "Mental health services grow",
    "Metaverse focus for tech", "Natural gas demand spikes", "Genomic research progress",
    "Cloud computing strong", "Commodity prices soften", "Insurance company reports loss",
]

# Sentiment per article, on a scale from -1 (very negative) to 1 (very positive).
mock_sentiments = [
    0.8, -0.7, 0.5, 0.6, 0.4, 0.7, 0.1, -0.2, 0.6, 0.7, -0.1, 0.8,
    0.9, 0.6, 0.3, 0.95, 0.0, 0.85, 0.8, 0.5, -0.1, 0.9, -0.8, 0.9,
    -0.3, -0.6, 0.7, -0.9, 0.8, 0.9, -0.7, -0.9, 0.95, 0.8, 0.7, -0.2,
    -0.6, 0.1, 0.5, 0.7, 0.6, 0.85, 0.8, 0.3, -0.4,
]

# Column order here fixes the column order of the resulting DataFrame.
data = {
    'date': pd.to_datetime(mock_dates),
    'title_article': mock_titles,
    'named_entity': mock_entities,
    'sentiment': mock_sentiments,
}
df = pd.DataFrame(data)

# Quick look at the article-level frame.
print("Original DataFrame (first 5 rows):")
print(df.head())
print("\n" + "=" * 50 + "\n")



Original DataFrame (first 5 rows):
        date                        title_article named_entity  sentiment
0 2023-01-01  Tech giant announces record profits        Apple        0.8
1 2023-01-01    New regulations hit energy sector   ExxonMobil       -0.7
2 2023-01-01   Healthcare startup secures funding      Teladoc        0.5
3 2023-01-02     Market optimistic on tech stocks        Apple        0.6
4 2023-01-02  Oil prices surge on supply concerns   ExxonMobil        0.4




In [2]:
# --- 2. Detecting a "Sentiment Shift" ---

# Collapse the article-level rows to one mean sentiment per (date, entity),
# then order by entity and date so each entity's rows form a contiguous,
# chronologically sorted run with a fresh 0..n-1 index.
daily_sentiment = (
    df.groupby(['date', 'named_entity'])['sentiment']
    .mean()
    .reset_index()
    .sort_values(by=['named_entity', 'date'], ignore_index=True)
)

print("Daily Average Sentiment per Entity (first 5 rows):")
print(daily_sentiment.head())
print("\n" + "=" * 50 + "\n")

# Per-entity rolling statistics describe the typical sentiment level and its variability.
# Both windows count rows, not calendar days: 7 rows for the mean and 30 rows for the
# standard deviation. They match "7 days" / "30 days" only while every entity has exactly
# one row per day. min_periods leaves the statistic as NaN until enough rows are available.
# Note that each window includes the current row itself.
sentiment_by_entity = daily_sentiment.groupby('named_entity')['sentiment']

# groupby().rolling() returns a (named_entity, original index) MultiIndex; dropping the
# entity level restores the original row index so the assignments below align row-for-row.
rolling_mean_7 = sentiment_by_entity.rolling(window=7, min_periods=5).mean().droplevel(0)
rolling_std_30 = sentiment_by_entity.rolling(window=30, min_periods=10).std().droplevel(0)

daily_sentiment['rolling_avg_sentiment_7d'] = rolling_mean_7
daily_sentiment['rolling_std_sentiment_30d'] = rolling_std_30

# Shift rule: a day is flagged when its sentiment sits more than
# `std_dev_threshold` rolling standard deviations away from the rolling average.
# 1.5 is a demonstration value.
std_dev_threshold = 1.5

# Absolute distance of each day's sentiment from its rolling average.
daily_sentiment['deviation'] = (
    daily_sentiment['sentiment'] - daily_sentiment['rolling_avg_sentiment_7d']
).abs()

# Rows without a rolling std yet (NaN) are never flagged.
exceeds_band = daily_sentiment['deviation'] > (
    std_dev_threshold * daily_sentiment['rolling_std_sentiment_30d']
)
has_rolling_std = daily_sentiment['rolling_std_sentiment_30d'].notna()

# 1 marks a shift, 0 marks no shift.
daily_sentiment['sentiment_shift'] = np.where(exceeds_band & has_rolling_std, 1, 0)

print("Daily Sentiment with Rolling Averages, Std Dev, and Shift Flags (first 15 rows for one entity):")
# Restrict the preview to one entity so the rolling columns are easy to follow.
apple_rows = daily_sentiment['named_entity'] == 'Apple'
print(daily_sentiment[apple_rows].head(15))
print("\n" + "=" * 50 + "\n")



Daily Average Sentiment per Entity (first 5 rows):
        date named_entity  sentiment
0 2023-01-01        Apple        0.8
1 2023-01-02        Apple        0.6
2 2023-01-03        Apple        0.1
3 2023-01-04        Apple        0.7
4 2023-01-05        Apple        0.9


Daily Sentiment with Rolling Averages, Std Dev, and Shift Flags (first 15 rows for one entity):
         date named_entity  sentiment  rolling_avg_sentiment_7d  \
0  2023-01-01        Apple       0.80                       NaN   
1  2023-01-02        Apple       0.60                       NaN   
2  2023-01-03        Apple       0.10                       NaN   
3  2023-01-04        Apple       0.70                       NaN   
4  2023-01-05        Apple       0.90              6.200000e-01   
5  2023-01-06        Apple       0.95              6.750000e-01   
6  2023-01-07        Apple       0.80              6.928571e-01   
7  2023-01-08        Apple       0.90              7.071429e-01   
8  2023-01-09        Apple

In [3]:
# --- 4. Integrating the Signal into a "Risk Indicator" ---

# Conceptual model: risk rises when sentiment is low and when sentiment moves abruptly.
# A flagged shift is treated as added risk in either direction, because the
# 'sentiment_shift' flag is built from an absolute deviation and carries no sign.
# A real risk model would also bring in financial metrics, market volatility, etc.

# Step 1 - rescale sentiment from [-1, 1] onto [0, 1] (0 = most negative, 1 = most positive).
raw_sentiment = daily_sentiment['sentiment']
daily_sentiment['normalized_sentiment'] = (raw_sentiment + 1) / 2

# Step 2 - base risk is the inverted normalized sentiment:
#   sentiment -1 -> normalized 0 -> component 1 (maximum base risk)
#   sentiment  1 -> normalized 1 -> component 0 (no base risk)
normalized = daily_sentiment['normalized_sentiment']
daily_sentiment['risk_sentiment_component'] = 1 - normalized

# Step 3 - shift risk: a flagged day (sentiment_shift == 1) contributes a fixed
# increment of `shift_weight`; an unflagged day contributes 0.
shift_weight = 0.5
shift_flags = daily_sentiment['sentiment_shift']
daily_sentiment['risk_shift_component'] = shift_flags * shift_weight

# Step 4 - total indicator is the plain sum of both components, so it grows
# when sentiment is low and/or when a shift was flagged.
base_risk = daily_sentiment['risk_sentiment_component']
shift_risk = daily_sentiment['risk_shift_component']
daily_sentiment['risk_indicator'] = base_risk + shift_risk

print("Risk Indicator Calculation (first 15 rows for 'Apple'):")
preview_columns = ['date', 'sentiment', 'rolling_avg_sentiment_7d', 'sentiment_shift', 'risk_indicator']
is_apple = daily_sentiment['named_entity'] == 'Apple'
print(daily_sentiment.loc[is_apple, preview_columns].head(15))
print("\n" + "=" * 50 + "\n")

# --- Further Considerations for a Real-World Scenario ---

print("Further Considerations for a Real-World Scenario:")
# Closing notes printed verbatim; the text is Markdown-flavoured but is emitted as plain stdout.
print("""
1.  **Data Quality & Granularity:** Real financial news data requires robust cleaning, deduplication, and often involves much higher frequency (e.g., intraday).
2.  **Advanced NLP:**
    * **Named Entity Recognition (NER) refinement:** Ensuring 'named_entity' accurately maps to publicly traded companies/sectors.
    * **Contextual Sentiment:** Differentiating sentiment about a company vs. general market sentiment within an article. Models like FinBERT or other financial-specific LLMs would be crucial.
    * **Event Detection:** Identifying specific events (e.g., earnings calls, product launches, lawsuits) that trigger sentiment changes.
3.  **Sophisticated Shift Detection:**
    * Statistical tests (e.g., CUSUM, EWMA charts) to detect statistically significant changes in time series data.
    * Machine learning models trained to classify "shift" events based on various features.
4.  **Risk Indicator Refinement:**
    * **Integration with Financial Data:** Incorporating stock price volatility, trading volume, credit default swap (CDS) spreads, implied volatility from options, etc.
    * **Sector/Industry Averages:** Comparing a company's sentiment shift to its peers or sector average.
    * **Forward-Looking Measures:** Attempting to predict future sentiment or risk based on current shifts.
    * **Backtesting:** Rigorous backtesting of the risk indicator against actual market outcomes (e.g., drawdowns, large price movements).
    * **Thresholds:** Dynamically adjusting thresholds for 'shift' detection based on market conditions or entity-specific characteristics.
5.  **Causality:** Understanding *why* a sentiment shift occurred. Is it due to fundamental news or market noise?
6.  **Human-in-the-Loop:** For high-stakes decisions, human analysts often review flagged sentiment shifts.
""")


Risk Indicator Calculation (first 15 rows for 'Apple'):
         date  sentiment  rolling_avg_sentiment_7d  sentiment_shift  \
0  2023-01-01       0.80                       NaN                0   
1  2023-01-02       0.60                       NaN                0   
2  2023-01-03       0.10                       NaN                0   
3  2023-01-04       0.70                       NaN                0   
4  2023-01-05       0.90              6.200000e-01                0   
5  2023-01-06       0.95              6.750000e-01                0   
6  2023-01-07       0.80              6.928571e-01                0   
7  2023-01-08       0.90              7.071429e-01                0   
8  2023-01-09      -0.30              5.785714e-01                0   
9  2023-01-10      -0.90              4.357143e-01                1   
10 2023-01-11      -0.70              2.357143e-01                0   
11 2023-01-12       0.80              2.214286e-01                0   
12 2023-01-13      -0