In [2]:
import matplotlib.pyplot as plt
import torch
import json
import pandas as pd
from tqdm import tqdm
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, AutoModelForSequenceClassification
from transformers.pipelines.pt_utils import KeyDataset
import os
from datasets import Dataset

In [1]:
class Event:
    def __init__(self, NEWS_PATH, STAKES_PATH, FOLDER_PATH):
        self.NEWS_PATH = NEWS_PATH
        self.STAKES_PATH = STAKES_PATH
        self.FOLDER_PATH = FOLDER_PATH
        self.news = self._load_news()
        self.stakes = self._load_stakes()

        self._save_price_chart()

    def _load_news(self):
        news = pd.read_csv(self.NEWS_PATH)
        news['date'] = pd.to_datetime(news['date']).dt.tz_localize(None)
        return news

    def _load_stakes(self):
        with open(self.STAKES_PATH, 'r', encoding='utf-8') as f:
            data = json.load(f)

        self.event_id = data["event"]["id"]
        self.event_title = data["event"]["title"]

        stakes = pd.DataFrame({
            'Date': data['price_data']['times'],
            'Price': data['price_data']['prices']
        })

        stakes['Date'] = pd.to_datetime(stakes['Date']).dt.tz_localize(None)
        stakes['Date'] = stakes['Date'].dt.date

        stakes = stakes.groupby('Date', as_index=False)['Price'].mean()

        return stakes

    def _save_price_chart(self):
        plt.figure(figsize=(10, 6))

        plt.plot(self.stakes['Date'], self.stakes['Price'],
                 linestyle='-', color='blue', label='Ціна')

        plt.title('Price change for event')
        plt.xlabel('Date')
        plt.ylabel('Price')
        plt.grid(True)
        plt.legend()

        plt.xticks(rotation=45)
        plt.tight_layout()

        save_path = os.path.join(self.FOLDER_PATH, 'Price_Chart.png')
        plt.savefig(save_path)
        plt.close()


In [4]:
Events = [
    Event(
        NEWS_PATH='DATA/UA_war_end/news_ukraine_13551.csv',
        STAKES_PATH='DATA/UA_war_end/trump-wins-ends-ukraine-war-in-90-days-2c482.json',
        FOLDER_PATH = 'DATA/UA_war_end',
    ),
    Event(
        NEWS_PATH='DATA/Israel_Hamas/news_israel_21257.csv',
        STAKES_PATH='DATA/Israel_Hamas/israel-x-hamas-ceasefire-before-july-2025-79c05.json',
        FOLDER_PATH = 'DATA/Israel_Hamas',
    ),
    Event(
        NEWS_PATH='DATA/TikTok_ban/news_tiktok_12641.csv',
        STAKES_PATH='DATA/TikTok_ban/tiktok-banned-in-the-us-before-may-2025-52cc0.json',
        FOLDER_PATH = 'DATA/TikTok_ban',
    ),
    Event(
        NEWS_PATH='DATA/US_president_Trump/news_trump_903193.csv',
        STAKES_PATH='DATA/US_president_Trump/presidential-election-winner-2024-Trump.json',
        FOLDER_PATH = 'DATA/US_president_Trump',
    ),
    Event(
        NEWS_PATH='DATA/US_president_Kamala/news_kamala_903193.csv',
        STAKES_PATH='DATA/US_president_Kamala/presidential-election-winner-2024-Kamala.json',
        FOLDER_PATH = 'DATA/US_president_Kamala',
    ),
]

In [5]:
model_name = "ProsusAI/finbert"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device: {device}")

Device: cuda


In [6]:
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    chat_template=None
)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [7]:
def predict_sentiment_finbert(text):
    inputs = tokenizer(
        text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512
    ).to(device)

    with torch.no_grad():
        outputs = model(**inputs)
        predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)

    predicted_class_idx = torch.argmax(predictions, dim=1).item()

    return model.config.id2label[predicted_class_idx]

In [8]:
def compute_sentiment_for_event(event, predict_func):
    results = []

    for title in tqdm(event.news['title'], desc=f"Sentiment for {event.event_title}"):
        try:
            res = predict_func(str(title))
        except Exception:
            res = "error"
        results.append(res)

    news = event.news.copy()
    news['sentiment_predicted_FinBERT'] = results

    news = news.sort_values(by='date')

    sentiment_map = {
        'positive': 1,
        'neutral': 0,
        'negative': -1
    }
    news['numeric_sentiment_FinBERT'] = news['sentiment_predicted_FinBERT'].map(sentiment_map)

    news['sentiment_trend_FinBERT'] = news['numeric_sentiment_FinBERT'].cumsum()

    event.news = news

    save_path = os.path.join(event.FOLDER_PATH, str(event.event_id)+"_with_sentiment.csv")
    news.to_csv(save_path, index=False)

    return news


In [9]:
for e in Events:
    compute_sentiment_for_event(e, predict_sentiment_finbert)

Sentiment for Trump ends Ukraine war in first 90 days?: 100%|██████████| 1462/1462 [00:08<00:00, 179.96it/s]
Sentiment for Israel x Hamas ceasefire before July?: 100%|██████████| 201/201 [00:01<00:00, 192.53it/s]
Sentiment for TikTok banned in the US before May 2025?: 100%|██████████| 247/247 [00:01<00:00, 186.85it/s]
Sentiment for Presidential Election Winner 2024: 100%|██████████| 1088/1088 [00:05<00:00, 187.77it/s]
Sentiment for Presidential Election Winner 2024: 100%|██████████| 1726/1726 [00:09<00:00, 184.22it/s]


In [10]:
def load_qwen_model(model_name="Qwen/Qwen2.5-14B-Instruct"):
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map="auto"
    )

    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "left"

    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        return_full_text=False
    )

    return model, tokenizer, pipe

In [11]:
def analyze_news_with_qwen(news, model, tokenizer, pipe, event):
    system_prompt = f"""
You are a strict event-impact classifier.

Your task is to evaluate ONLY how the given news headline affects the probability of THIS EXACT EVENT:

    EVENT: "{event.event_title}"

You must ignore all other political actors, candidates, elections, or indirect implications.

=====================================================
SCORING RULE (STRICT AND EXCLUSIVE):
-----------------------------------------------------
Return one of exactly three values:

1  = The headline INCREASES the probability that the EVENT happens.
0  = The headline does NOT change the probability (irrelevant, unclear, mixed, or no directional impact).
-1 = The headline DECREASES the probability that the EVENT happens.

=====================================================
VERY IMPORTANT:
- The score MUST strictly follow the explanation (reason).
- The explanation MUST strictly explain WHY the score logically follows.
- Legal challenges, ballot removal, indictments, or voter suppression are directional ONLY if they clearly help or hurt the candidate relevant to THIS EVENT.
- If uncertain → ALWAYS return 0.


EVENT-SPECIFIC GUIDANCE:
1. "Presidential Election Winner 2024 – how likely Kamala wins":
    - +1 if headline helps Kamala or harms Trump.
    - -1 if headline harms Kamala or helps Trump.
    - 0 if unrelated to election dynamics.

2. "Presidential Election Winner 2024 – how likely Trump wins":
    - +1 if headline helps Trump or harms Kamala.
    - -1 if headline harms Trump or helps Kamala.
    - 0 if unrelated.

3. "Trump ends Ukraine war in first 90 days?":
    - +1 if headline suggests higher likelihood that Trump can/will end the war quickly.
    - -1 if headline shows obstacles, escalation, or reduced ability for Trump to end the war.
    - 0 if unrelated to Ukraine war, negotiations, or Trump's influence.

4. "Israel × Hamas ceasefire before July 2025":
    - +1 if headline indicates progress toward ceasefire.
    - -1 if headline indicates escalation or breakdown of negotiations.
    - 0 if unrelated to the conflict.

5. "TikTok banned in the US before May 2025?":
    - +1 if headline shows progress toward a ban (legal, political, regulatory momentum).
    - -1 if headline shows obstacles, delays, vetoes, or weakening of the ban effort or negotiations of sell of TikTok to US company.
    - 0 if unrelated.

GENERAL RULES:
- Base your judgment ONLY on direct implications of the headline.
- DO NOT infer long-term or speculative outcomes beyond what is reasonable.
- DO NOT add extra information not present in the headline.
- Explanation must be short, factual, and tied only to the headline content.

OUTPUT FORMAT: JSON only.
{{"score": -1|0|1, "reason": "short explanation"}}

"""

    prompts = []
    for title in news['title']:
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": f"Headline: {str(title)}"}
        ]
        prompts.append(tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True))

    BATCH_SIZE = 8
    qwen_scores = []
    qwen_reasons = []

    dataset = Dataset.from_dict({"text": prompts})

    results_iterator = pipe(
        KeyDataset(dataset, "text"),
        batch_size=BATCH_SIZE,
        max_new_tokens=120,
        do_sample=False,
        temperature=0.2,
        pad_token_id=tokenizer.pad_token_id
    )

    for out in tqdm(results_iterator, total=len(prompts), desc="Processing Batches"):
        try:
            result_text = out[0]['generated_text'].strip()

            start = result_text.find('{')
            end = result_text.rfind('}') + 1

            if start != -1 and end != -1:
                data = json.loads(result_text[start:end])
                qwen_scores.append(data.get("score", 0))
                qwen_reasons.append(data.get("reason", "No reason"))
            else:
                qwen_scores.append(0)
                qwen_reasons.append("JSON Error (Format)")

        except Exception as e:
            qwen_scores.append(0)
            qwen_reasons.append(f"Error: {e}")

    news['sentiment_predicted_QWEN'] = pd.to_numeric(pd.Series(qwen_scores), errors='coerce').fillna(0).astype(int)
    news['qwen_reason'] = qwen_reasons

    news = news.sort_values(by='date')
    news['sentiment_trend_QWEN'] = news['sentiment_predicted_QWEN'].cumsum()

    return news


In [12]:
model, tokenizer, pipe = load_qwen_model()

for event in Events:
    news_with_sentiment = analyze_news_with_qwen(event.news, model, tokenizer, pipe, event)

    event.news = news_with_sentiment

    save_path = os.path.join(event.FOLDER_PATH, str(event.event_id)+"news_with_sentiment_QWEN.csv")
    news_with_sentiment.to_csv(save_path, index=False)


Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

Device set to use cuda:0
Processing Batches:   0%|          | 0/1462 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Processing Batches: 100%|██████████| 1462/1462 [21:05<00:00,  1.15it/s]
Processing Batches: 100%|██████████| 201/201 [02:27<00:00,  1.36it/s]
Processing Batches: 100%|██████████| 247/247 [03:13<00:00,  1.28it/s]
Processing Batches: 100%|██████████| 1088/1088 [16:08<00:00,  1.12it/s]
Processing Batches: 100%|██████████| 1726/1726 [26:10<00:00,  1.10it/s]


In [13]:
def aggregate_daily_sentiment(event):
    news = event.news.copy()
    news['date_only'] = pd.to_datetime(news['date']).dt.date

    daily_finbert = (
        news.groupby('date_only', as_index=False)['numeric_sentiment_FinBERT']
            .sum()
            .rename(columns={'numeric_sentiment_FinBERT': 'daily_sentiment_FinBERT'})
    )
    daily_finbert['sentiment_trend_FinBERT'] = daily_finbert['daily_sentiment_FinBERT'].cumsum()

    daily_qwen = (
        news.groupby('date_only', as_index=False)['sentiment_predicted_QWEN']
            .sum()
            .rename(columns={'sentiment_predicted_QWEN': 'daily_sentiment_QWEN'})
    )
    daily_qwen['sentiment_trend_QWEN'] = daily_qwen['daily_sentiment_QWEN'].cumsum()

    daily = daily_finbert.merge(
        daily_qwen[['date_only', 'sentiment_trend_QWEN']],
        on='date_only',
        how='outer'
    ).sort_values('date_only')

    daily_price = event.stakes.copy()
    daily_price['date'] = pd.to_datetime(daily_price['Date'])
    daily.dropna(subset=['date_only'], inplace=True)
    daily['date_only'] = pd.to_datetime(daily['date_only'])

    merged = daily_price.merge(
        daily,
        left_on='date',
        right_on='date_only',
        how='left'
    )

    merged = merged.drop(columns=['date_only', 'Date'], errors='ignore')

    merged['sentiment_trend_FinBERT'] = merged['sentiment_trend_FinBERT'].ffill().fillna(0)
    merged['sentiment_trend_QWEN'] = merged['sentiment_trend_QWEN'].ffill().fillna(0)

    event.daily_merged = merged

    return merged


In [14]:
for e in Events:
    aggregate_daily_sentiment(e)

In [19]:
def plot_price_vs_sentiment(event):
    df = event.daily_merged.copy()

    fig, ax1 = plt.subplots(figsize=(12, 6))

    ax1.set_xlabel('Date')
    ax1.set_ylabel('Price')
    line1 = ax1.plot(df['date'], df['Price'], label='Price', color='blue')
    ax1.tick_params(axis='y')

    ax2 = ax1.twinx()
    ax2.set_ylabel('Sentiment Trend')
    line2 = ax2.plot(df['date'], df['sentiment_trend_QWEN'], color='orange', label='Sentiment Trend Qwen2.5-14B-Instruct')
    line3 = ax2.plot(df['date'], df['sentiment_trend_FinBERT'], color='green', label='Sentiment Trend FinBERT')
    ax2.tick_params(axis='y')

    lines = line1 + line2 + line3
    labels = [l.get_label() for l in lines]
    plt.legend(lines, labels, loc='upper left')

    plt.title(f'Chance of event happening(polymarket estimate) vs Sentiment Trend Over Time: {event.event_title}')
    plt.xticks(rotation=45)
    plt.tight_layout()
    save_path = os.path.join(event.FOLDER_PATH, 'Price_vs_Sentiment_Trend.png')
    plt.savefig(save_path)
    plt.close(fig)


In [20]:
for e in Events:
    plot_price_vs_sentiment(e)


In [21]:
def compute_price_sentiment_correlations(event):
    df = event.daily_merged.copy()

    corr_qwen = df['Price'].corr(df['sentiment_trend_QWEN'])
    corr_finbert = df['Price'].corr(df['sentiment_trend_FinBERT'])


    spearman_qwen = df['Price'].corr(df['sentiment_trend_QWEN'], method='spearman')
    spearman_finbert = df['Price'].corr(df['sentiment_trend_FinBERT'], method='spearman')

    results = {
        'event_title': event.event_title,
        'pearson_QWEN': corr_qwen,
        'pearson_FinBERT': corr_finbert,
        'spearman_QWEN': spearman_qwen,
        'spearman_FinBERT': spearman_finbert
    }

    return results


In [22]:
correlations = []

for e in Events:
    corr = compute_price_sentiment_correlations(e)
    correlations.append(corr)


correlations_df = pd.DataFrame(correlations)
correlations_df

Unnamed: 0,event_title,pearson_QWEN,pearson_FinBERT,spearman_QWEN,spearman_FinBERT
0,Trump ends Ukraine war in first 90 days?,0.788006,0.820602,0.729183,0.733059
1,Israel x Hamas ceasefire before July?,0.756916,0.792276,0.732376,0.733058
2,TikTok banned in the US before May 2025?,0.681422,-0.792101,0.695208,-0.737724
3,Presidential Election Winner 2024,0.257913,-0.296768,0.287798,-0.277895
4,Presidential Election Winner 2024,0.951197,-0.933479,0.728471,-0.719701
