# Trader Behavior vs Market Sentiment

This notebook performs end-to-end analysis of trader behavior (Hyperliquid historical trades) versus Bitcoin market sentiment (Fear–Greed index). It includes data loading, cleaning, aggregation, merging with sentiment, EDA, statistical tests, a baseline predictive model, and saving outputs.

Files expected (already uploaded):
- `/mnt/data/historical_data.csv`
- `/mnt/data/fear_greed_index.csv`

Run the cells sequentially. If you are using Colab, upload the two CSVs to the same path or modify paths accordingly.

In [None]:
# Setup: imports and output directories
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, confusion_matrix

# Create both output folders up front; exist_ok makes re-running this cell harmless.
for out_dir in ("/mnt/data/outputs", "/mnt/data/csv_files"):
    os.makedirs(out_dir, exist_ok=True)

print("Ready. Outputs will be saved under /mnt/data/outputs and /mnt/data/csv_files")

In [None]:
# Load datasets (paths may be adjusted if you keep files elsewhere)
trades_path, sent_path = "/mnt/data/historical_data.csv", "/mnt/data/fear_greed_index.csv"

# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk.
trades, sent = (pd.read_csv(csv_path, low_memory=False) for csv_path in (trades_path, sent_path))

for label, frame in (("Trades", trades), ("Sentiment", sent)):
    print(f"{label} shape:", frame.shape)

# Rich preview of the first rows of each frame, trades first.
display(trades.head(), sent.head())

In [None]:
# Parse datetime columns and basic cleaning
def _parse_trade_times(raw):
    """Convert a raw trade-time column to pandas datetimes.

    Numeric columns are treated as Unix epochs. ``pd.to_datetime`` reads bare
    numbers as nanoseconds by default, which would place second/millisecond
    epochs in 1970, so the unit is chosen from the magnitude of the values.

    Text columns are parsed with pandas' default settings first. If that
    leaves otherwise non-null values unparsed, a day-first parse is attempted
    and kept only when it recovers more rows; this avoids silently dropping
    ``dd-mm-yyyy`` timestamps.

    Returns a Series of datetimes with ``NaT`` where parsing failed.
    """
    if pd.api.types.is_numeric_dtype(raw):
        magnitude = raw.abs().median()
        if magnitude >= 1e17:
            unit = 'ns'
        elif magnitude >= 1e14:
            unit = 'us'
        elif magnitude >= 1e11:
            unit = 'ms'
        else:
            unit = 's'
        return pd.to_datetime(raw, unit=unit, errors='coerce')

    parsed = pd.to_datetime(raw, errors='coerce')
    if parsed.isna().sum() > raw.isna().sum():
        day_first = pd.to_datetime(raw, errors='coerce', dayfirst=True)
        if day_first.isna().sum() < parsed.isna().sum():
            print("Default date parsing left values unparsed; using day-first parsing instead.")
            parsed = day_first
    return parsed


def _standard_name(column):
    """Return the standard name for a trade column, or None if it has none.

    Later rules override earlier ones when a column name matches several.
    """
    low = column.lower()
    name = None
    if 'account' in low:
        name = 'account'
    if 'coin' in low or 'symbol' in low:
        name = 'symbol'
    if 'size' in low and 'usd' not in low:
        name = 'size'
    if 'leverage' in low:
        name = 'leverage'
    return name


# Identify time / pnl columns heuristically (first matching column wins).
# Any name containing 'timestamp' also contains 'time', so one check suffices.
time_col = next((c for c in trades.columns if 'time' in c.lower()), None)
if time_col is None:
    raise ValueError("No time column found. Columns: " + ", ".join(trades.columns))
trades[time_col] = _parse_trade_times(trades[time_col])
trades['date'] = trades[time_col].dt.normalize()

# Find pnl-like column
pnl_col = next((c for c in trades.columns if 'pnl' in c.lower() or 'closed' in c.lower()), None)
if pnl_col is None:
    raise ValueError("No closed PnL column found. Columns: " + ", ".join(trades.columns))

# Standardize common columns.
# - The detected time/PnL columns are never renamed, so `time_col` and `pnl_col`
#   keep pointing at real columns.
# - Each standard name is assigned at most once (and not at all if a column with
#   that name already exists), so the rename cannot create duplicate labels.
cols_map = {}
claimed = set(trades.columns)
for c in trades.columns:
    if c in (time_col, pnl_col):
        continue
    target = _standard_name(c)
    if target is None or target in claimed:
        continue
    cols_map[c] = target
    claimed.add(target)
trades = trades.rename(columns=cols_map)

# Ensure numeric; unparseable entries become NaN.
for col in [pnl_col, 'size', 'leverage']:
    if col in trades.columns:
        trades[col] = pd.to_numeric(trades[col], errors='coerce')

# Rows without a usable date or PnL cannot be aggregated, so drop them and report how many.
before = len(trades)
trades = trades.dropna(subset=['date', pnl_col])
after = len(trades)
print(f"Dropped {before-after} rows without date or closedPnL. Remaining rows: {after}")

display(trades.head())

In [None]:
# Aggregate per account-date
def _spec(column, func):
    """Named-aggregation tuple for `column`.

    When `column` is absent from `trades`, fall back to a placeholder on the
    PnL column that yields NaN for every group, so the output schema is stable.
    """
    if column in trades.columns:
        return (column, func)
    return (pnl_col, lambda x: np.nan)


# Group by account and day when an account column exists, otherwise by day only.
group_cols = ['date']
if 'account' in trades.columns:
    group_cols = ['account', 'date']

# Output columns are created in the order listed here.
agg_spec = {
    'total_trades': (pnl_col, 'count'),
    'total_pnl': (pnl_col, 'sum'),
    'win_trades': (pnl_col, lambda x: (x>0).sum()),
    # Note: trades with exactly zero PnL are counted here.
    'loss_trades': (pnl_col, lambda x: (x<=0).sum()),
    'avg_leverage': _spec('leverage', 'mean'),
    'max_leverage': _spec('leverage', 'max'),
    'avg_size': _spec('size', 'mean'),
    'unique_symbols': _spec('symbol', pd.Series.nunique),
}
agg = trades.groupby(group_cols).agg(**agg_spec).reset_index()

# Derived columns: share of winning trades, and a 0/1 flag for a net-positive day.
agg['win_rate'] = agg['win_trades'] / agg['total_trades']
agg['profitable_day'] = (agg['total_pnl'] > 0).astype(int)

daily_agg_path = "/mnt/data/csv_files/daily_agg_by_account.csv"
agg.to_csv(daily_agg_path, index=False)
print("Saved /mnt/data/csv_files/daily_agg_by_account.csv")
display(agg.head())

In [None]:
# Prepare sentiment and merge
# The first column whose name mentions 'date' is used as the sentiment date.
date_col_sent = next((c for c in sent.columns if 'date' in c.lower()), None)
if date_col_sent is None:
    raise ValueError("No date column in sentiment file. Columns: " + ", ".join(sent.columns))
sent[date_col_sent] = pd.to_datetime(sent[date_col_sent], errors='coerce')
sent['date'] = sent[date_col_sent].dt.normalize()

# Identify the classification column (the last matching column wins).
sent_class_col = None
for c in sent.columns:
    if 'class' in c.lower():
        sent_class_col = c
if sent_class_col is None:
    raise ValueError("No classification column found in sentiment file.")

# Identify the score column (the last matching column wins).
# The date and classification columns are excluded: a label column such as
# 'value_classification' also contains 'value', and picking it as the score
# would leave the merged frame without a usable numeric 'score'.
sent_score_col = None
for c in sent.columns:
    if c in ('date', date_col_sent, sent_class_col):
        continue
    low = c.lower()
    if 'score' in low or 'value' in low or 'index' in low:
        sent_score_col = c

keep_cols = ['date', sent_class_col] + ([sent_score_col] if sent_score_col else [])
sent_small = sent[keep_cols].copy()
sent_small = sent_small.rename(columns={sent_class_col: 'classification'})
if sent_score_col:
    sent_small = sent_small.rename(columns={sent_score_col: 'score'})
    # Later cells average this column, so force it to numeric (bad entries -> NaN).
    sent_small['score'] = pd.to_numeric(sent_small['score'], errors='coerce')
else:
    # No numeric score available: derive a coarse one from the label.
    # Substring matching is used so that labels such as "Extreme Greed" or
    # "Extreme Fear" are mapped too; anything else (e.g. neutral) stays NaN.
    labels = sent_small['classification'].astype(str).str.lower()
    sent_small['score'] = np.select(
        [labels.str.contains('greed'), labels.str.contains('fear')],
        [1.0, 0.0],
        default=np.nan,
    )

# Keep exactly one sentiment row per calendar day; otherwise the left merge
# below would duplicate account-day rows and inflate every downstream count.
# When a day appears more than once, the last row in file order is kept.
sent_small = sent_small.dropna(subset=['date']).drop_duplicates(subset='date', keep='last')

merged = agg.merge(sent_small, on='date', how='left', validate='many_to_one')

unmatched = int(merged['classification'].isna().sum())
if unmatched:
    print(f"Note: {unmatched} of {len(merged)} rows have no sentiment label after the merge.")

merged.to_csv("/mnt/data/csv_files/merged_daily_with_sentiment.csv", index=False)
print("Saved /mnt/data/csv_files/merged_daily_with_sentiment.csv")
display(merged.head())

In [None]:
# Quick EDA plots (saved to /mnt/data/outputs)
# matplotlib.pyplot is already imported as `plt` in the setup cell.

# 1) Histogram of per-row total PnL.
fig, ax = plt.subplots(figsize=(8, 5))
ax.hist(merged['total_pnl'].dropna(), bins=60)
ax.set_title("Distribution of daily total_pnl (per account-day)")
ax.set_xlabel("total_pnl")
ax.set_ylabel("count")
fig.tight_layout()
fig.savefig("/mnt/data/outputs/total_pnl_hist.png")
plt.show()

# 2) Boxplot of total PnL for each sentiment label present in the merged data.
classes = merged['classification'].dropna().unique().tolist()
data_to_plot = [merged.loc[merged['classification']==c, 'total_pnl'].dropna() for c in classes]
if classes:
    fig, ax = plt.subplots(figsize=(8, 5))
    ax.boxplot(data_to_plot, showfliers=False)
    # Boxes are drawn at positions 1..N by default. Labelling the ticks directly
    # avoids the `labels=` keyword, which newer Matplotlib releases deprecate.
    ax.set_xticks(range(1, len(classes) + 1))
    ax.set_xticklabels(classes)
    ax.set_title("Total PnL by Sentiment Classification (boxplot)")
    ax.set_ylabel("total_pnl")
    fig.tight_layout()
    fig.savefig("/mnt/data/outputs/total_pnl_by_class_boxplot.png")
    plt.show()
else:
    print("No sentiment labels available; skipping the PnL-by-classification boxplot")

# 3) Daily platform PnL against the daily mean sentiment score.
# Decide whether a score exists BEFORE aggregating, so a missing column is
# skipped instead of raising inside .agg().
has_score = 'score' in merged.columns
ts_spec = {'platform_total_pnl': ('total_pnl', 'sum')}
if has_score:
    ts_spec['mean_score'] = ('score', 'mean')
ts = merged.groupby('date').agg(**ts_spec).reset_index()

fig, ax_pnl = plt.subplots(figsize=(10, 5))
ax_pnl.plot(ts['date'], ts['platform_total_pnl'], label='platform_total_pnl')
ax_pnl.set_xlabel("date")
ax_pnl.set_ylabel("platform_total_pnl")
handles, labels = ax_pnl.get_legend_handles_labels()
if has_score:
    # PnL and sentiment score have unrelated units; a secondary y-axis keeps the
    # series with the smaller magnitude from being flattened into a straight line.
    ax_score = ax_pnl.twinx()
    ax_score.plot(ts['date'], ts['mean_score'], color='tab:orange', label='mean_sentiment_score')
    ax_score.set_ylabel("mean_sentiment_score")
    score_handles, score_labels = ax_score.get_legend_handles_labels()
    handles, labels = handles + score_handles, labels + score_labels
ax_pnl.set_title("Platform total PnL vs mean sentiment score (by date)")
ax_pnl.legend(handles, labels)
fig.tight_layout()
fig.savefig("/mnt/data/outputs/platform_pnl_vs_sentiment.png")
plt.show()

print('Saved plots to /mnt/data/outputs')

In [None]:
# Statistical tests: compare total_pnl on Fear vs Greed days
from scipy import stats


def _pnl_where_label_contains(keyword):
    """total_pnl values for rows whose lower-cased classification contains `keyword`.

    Rows with a missing classification are excluded (na=False), and missing
    PnL values are dropped.
    """
    label_matches = merged['classification'].str.lower().str.contains(keyword, na=False)
    return merged.loc[label_matches, 'total_pnl'].dropna()


fear_pnl = _pnl_where_label_contains('fear')
greed_pnl = _pnl_where_label_contains('greed')

if len(fear_pnl) == 0 or len(greed_pnl) == 0:
    print('Not enough data for fear vs greed tests')
else:
    # equal_var=False selects Welch's t-test; Mann-Whitney U is the rank-based counterpart.
    t_res = stats.ttest_ind(fear_pnl, greed_pnl, equal_var=False)
    mw_res = stats.mannwhitneyu(fear_pnl, greed_pnl, alternative='two-sided')
    print('T-test:', t_res)
    print('Mann-Whitney U:', mw_res)

In [None]:
# Baseline model: predict profitable_day
# NOTE(review): `win_rate` is computed from the same rows that define
# `profitable_day`, so this model describes same-day association; it is not a
# forecast of future profitability.
model_df = merged.copy()
features = ['total_trades','avg_leverage','avg_size','win_rate','unique_symbols','score']
for f in features:
    if f not in model_df.columns:
        model_df[f] = np.nan

# Rows need a label and the two core activity features; every other gap is
# filled with 0 below.
# NOTE(review): a missing sentiment score therefore becomes 0 — confirm this
# is the intended imputation.
model_df = model_df.dropna(subset=['profitable_day', 'total_trades', 'win_rate'])
model_df[features] = model_df[features].fillna(0)

X = model_df[features].values
y = model_df['profitable_day'].values

# Logistic regression cannot be fit on a single class. Fail here with a clear
# message instead of letting the split or the solver raise a less obvious error.
class_counts = pd.Series(y).value_counts()
if len(class_counts) < 2:
    raise ValueError(
        "Cannot fit the baseline model: 'profitable_day' needs both classes, "
        f"found {class_counts.to_dict()}"
    )

# Stratify so both splits keep the class balance; this needs at least two rows
# of every class, otherwise fall back to a plain random split.
stratify_on = y if class_counts.min() >= 2 else None
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=stratify_on
)

# Fit the scaler on the training split only, then apply it to both splits.
scaler = StandardScaler().fit(X_train)
X_train_s = scaler.transform(X_train)
X_test_s = scaler.transform(X_test)

clf = LogisticRegression(max_iter=1000)
clf.fit(X_train_s, y_train)
y_pred = clf.predict(X_test_s)
# Column 1 of predict_proba is the probability of the positive class.
y_proba = clf.predict_proba(X_test_s)[:, 1]

print('Accuracy:', accuracy_score(y_test, y_pred))
try:
    print('ROC AUC:', roc_auc_score(y_test, y_proba))
except ValueError as exc:
    # Only the "metric undefined" case is handled; any other error should surface.
    print('ROC AUC not available', f'({exc})')

print('\nClassification report:')
print(classification_report(y_test, y_pred, zero_division=0))
print('\nConfusion matrix:')
print(confusion_matrix(y_test, y_pred))

# Coefficients are on the standardized feature scale.
coef_df = pd.DataFrame({'feature': features, 'coefficient': clf.coef_.flatten()})
coef_df.to_csv('/mnt/data/outputs/logreg_coefficients.csv', index=False)
display(coef_df)

## Save outputs and final notes
All CSVs and plots are saved to `/mnt/data/csv_files` and `/mnt/data/outputs` respectively.

Next steps you may want to do locally:
- Convert this notebook to PDF for submission (`File -> Print` in Colab or use nbconvert).
- Push the notebook and the `csv_files` and `outputs` folders to your GitHub repo.
- Attach the repo link in the application email.

Good luck!