# Biotech Earnings NLP Exploration
Quick-start notebook to explore sentiment and event returns with saved plots/tables.

In [None]:
from pathlib import Path

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from src.analysis.models import ensure_beat_miss_flag, run_linear_regression, summarize_regression, run_logistic_downdrift_model
from src.analysis.eda import load_events_with_features

# Location of the project configuration handed to the event loader.
CONFIG_PATH = Path('config') / 'config.yaml'

# Load the events-with-features frame, then pass it through
# ensure_beat_miss_flag before any analysis cell touches it.
df = ensure_beat_miss_flag(load_events_with_features(CONFIG_PATH))

# Last expression of the cell: the notebook renders the first rows.
df.head()


In [None]:
# Columns included in the descriptive-statistics table.
summary_cols = [
    'prep_sent_score',
    'qa_sent_score',
    'tone_shift',
    'abn_ret_1d',
    'abn_ret_5d',
    'beat_miss_flag',
]

# describe() yields one column per variable; transposing gives one row per
# variable with the statistics laid out as columns.
summary = df.loc[:, summary_cols].describe().transpose()
summary


In [None]:
# Directory that receives every figure written by this cell; created on
# demand so a fresh checkout can run the notebook without manual setup.
plots_dir = Path('assets/plots')
plots_dir.mkdir(parents=True, exist_ok=True)

# For each return column present in df: a histogram (with KDE overlay) of
# the non-missing values, then a scatter of qa_sent_score against that column.
# Return columns absent from df are skipped rather than raising.
for col in ['abn_ret_1d', 'abn_ret_5d']:
    if col not in df:
        continue
    plt.figure(figsize=(5, 4))
    sns.histplot(df[col].dropna(), kde=True)
    plt.title(col)
    plt.tight_layout()
    plt.savefig(plots_dir / f'hist_{col}.png', dpi=150)
    plt.show()

    plt.figure(figsize=(5, 4))
    sns.scatterplot(data=df, x='qa_sent_score', y=col)
    # Dashed zero lines split the plane into sign quadrants.
    plt.axhline(0, color='grey', linestyle='--', linewidth=1)
    plt.axvline(0, color='grey', linestyle='--', linewidth=1)
    plt.title(f'qa_sent_score vs {col}')
    plt.tight_layout()
    plt.savefig(plots_dir / f'scatter_qa_sent_vs_{col}.png', dpi=150)
    plt.show()

# Box plots of each return column by qa_sent_score tercile, restricted to
# rows that actually have a score.
df_box = df.dropna(subset=['qa_sent_score']).copy()
if not df_box.empty:
    try:
        df_box['sent_bucket'] = pd.qcut(df_box['qa_sent_score'], 3, labels=['Low', 'Mid', 'High'])
    except ValueError as err:
        # A non-empty frame is not enough for terciles: with a single row or
        # heavily tied scores the quantile edges coincide and pd.qcut raises
        # ValueError. Report it and skip the box plots instead of aborting
        # the whole cell (and any "Run All" in progress).
        print(f'Skipping sentiment-tercile box plots: {err}')
    else:
        for col in ['abn_ret_1d', 'abn_ret_5d']:
            if col not in df_box:
                continue
            plt.figure(figsize=(5, 4))
            sns.boxplot(data=df_box, x='sent_bucket', y=col)
            plt.title(f'{col} by sentiment tercile')
            plt.tight_layout()
            plt.savefig(plots_dir / f'box_sent_bucket_{col}.png', dpi=150)
            plt.show()


In [None]:
# Predictor sets for the two linear specifications fitted below; the second
# is a three-column subset of the first.
predictors = [
    'prep_sent_score',
    'qa_sent_score',
    'tone_shift',
    'qa_hedge_rate',
    'qa_risk_rate',
    'beat_miss_flag',
]
qa_predictors = [
    'qa_sent_score',
    'tone_shift',
    'beat_miss_flag',
]

# Fit and report each specification in turn, both with abn_ret_5d as the outcome.
ols_full = run_linear_regression(df, predictors=predictors, outcome='abn_ret_5d')
print(summarize_regression(ols_full))

ols_qa = run_linear_regression(df, predictors=qa_predictors, outcome='abn_ret_5d')
print(summarize_regression(ols_qa))

# NOTE(review): log_res is presumably dict-like (it is queried with .get) —
# confirm against run_logistic_downdrift_model. The cell displays its
# 'metrics' entry when present, otherwise the whole result.
log_res = run_logistic_downdrift_model(df)
log_res.get('metrics', log_res)
