Importing Libraries


In [None]:
import zipfile
import os
import pandas as pd
import numpy as np

Extraction from Zip File


In [None]:
# Make sure the target folder exists before anything is unpacked into it.
extract_dir = "data"
os.makedirs(extract_dir, exist_ok=True)

# Unpack both archives, in the same order as before, into the shared folder.
for archive_name in ("fear_greed_index.zip", "historical_data.zip"):
    with zipfile.ZipFile(archive_name, 'r') as archive:
        archive.extractall(extract_dir)

print("Files extracted successfully.")

# Print the path of every file now under the folder so the CSV names used by the
# loading cell can be checked by eye.
for folder, _subfolders, filenames in os.walk(extract_dir):
    for filename in filenames:
        print(os.path.join(folder, filename))

Loading the dataset

In [None]:
# Paths point into the folder the archives were extracted to.
fear_greed_path = "data/fear_greed_index.csv"
historical_path = "data/historical_data.csv"

fear_greed = pd.read_csv(fear_greed_path)
historical = pd.read_csv(historical_path)

Missing values / duplicates

In [None]:
def inspect_dataset(df, name):
    """Print a short data-quality report for one DataFrame.

    The report contains a banner with ``name``, the frame's shape, the number of
    null values per column, and the number of fully duplicated rows.

    Args:
        df: DataFrame to inspect.
        name: Label shown in the banner line.

    Returns:
        None. Everything is written to standard output.
    """
    # Compute the two summaries up front; printing order below is unchanged.
    missing_per_column = df.isnull().sum()
    duplicate_count = df.duplicated().sum()

    banner = f"========== {name} =========="
    print(banner)
    print("Shape:", df.shape)
    print("\nMissing Values:")
    print(missing_per_column)
    print("\nDuplicate Rows:", duplicate_count)
    # Trailing blank lines separate consecutive reports.
    print("\n")

# Run the same report on both raw tables, sentiment first.
for frame, label in (
    (fear_greed, "Fear & Greed Index"),
    (historical, "Historical Trading Data"),
):
    inspect_dataset(frame, label)

Converting timestamps and aligning the datasets by date

In [None]:
# Parse the sentiment dates; anything unparseable becomes NaT instead of raising.
fear_greed['date'] = pd.to_datetime(fear_greed['date'], errors='coerce')

# Trade timestamps: force to numeric first, then read the values as milliseconds.
timestamp_ms = pd.to_numeric(historical['Timestamp'], errors='coerce')
historical['Timestamp'] = timestamp_ms
historical['datetime'] = pd.to_datetime(timestamp_ms, unit='ms', errors='coerce')

# Day-level key (time of day stripped) used to join trades to the sentiment table.
historical['date'] = pd.to_datetime(historical['datetime'].dt.date)

print("Timestamp conversion completed successfully.")

# Left join keeps every trade row; trades whose date has no sentiment row get NaN
# in 'value' and 'classification'.
sentiment_columns = fear_greed[['date', 'value', 'classification']]
merged_df = historical.merge(sentiment_columns, on='date', how='left')

print("Merged Dataset Shape:", merged_df.shape)
display(merged_df.head())

Daily PnL per trader (or per account)

In [None]:
# Coerce Closed PnL to numbers; unparseable or missing entries are counted as 0.
merged_df['Closed PnL'] = (
    pd.to_numeric(merged_df['Closed PnL'], errors='coerce')
    .fillna(0)
)

print("PnL cleaned.")

# Sum of Closed PnL for every (account, day) pair.
pnl_by_account_day = merged_df.groupby(['Account', 'date'])['Closed PnL']
daily_pnl = pnl_by_account_day.sum().reset_index()

display(daily_pnl.head())


Win rate, average trade size

In [None]:
# A row counts as a win (1) when its Closed PnL is strictly positive, otherwise 0.
merged_df['is_win'] = (merged_df['Closed PnL'] > 0).astype(int)

trades_by_account = merged_df.groupby('Account')

# Win rate per account = winning rows / all rows with a Closed PnL value.
win_rate = trades_by_account.agg(
    total_trades=('Closed PnL', 'count'),
    winning_trades=('is_win', 'sum'),
).reset_index()
win_rate['win_rate'] = win_rate['winning_trades'].div(win_rate['total_trades'])
display(win_rate.head())

# Mean trade size per account, in tokens and in USD.
avg_trade_size = trades_by_account.agg(
    avg_size_tokens=('Size Tokens', 'mean'),
    avg_size_usd=('Size USD', 'mean'),
).reset_index()
display(avg_trade_size.head())

Leverage Distribution

In [None]:
# NOTE(review): 'Size USD' / 'Execution Price' is what this notebook uses as its
# leverage figure; confirm that this ratio is the intended proxy.
raw_leverage = merged_df['Size USD'] / merged_df['Execution Price']

# A zero execution price makes the division return +/-inf. An infinite value turns
# every downstream mean/median of this column into inf and is rejected outright by
# the scikit-learn estimators fitted later, so record such rows as missing instead.
merged_df['leverage'] = raw_leverage.replace([np.inf, -np.inf], np.nan)

# Per-account summary statistics (count, mean, std, quartiles, min/max).
leverage_stats = (
    merged_df
    .groupby('Account')['leverage']
    .describe()
    .reset_index()
)

display(leverage_stats.head())

Number of trades per day

In [None]:
# Number of rows (trades) recorded on each date.
rows_per_date = merged_df.groupby('date').size()
trades_per_day = rows_per_date.rename('number_of_trades').reset_index()

display(trades_per_day.head())

Long/Short Ratio

In [None]:
# Count rows per account and side, then spread the sides into one column each
# (accounts with no rows for a side get 0).
side_counts = merged_df.groupby(['Account', 'Side']).size()
long_short = side_counts.unstack(fill_value=0).reset_index()

# The ratio is only added when both sides occur in the data. The small constant
# in the denominator avoids dividing by zero for accounts without SELL rows.
if {'BUY', 'SELL'}.issubset(long_short.columns):
    long_short['long_short_ratio'] = long_short['BUY'].div(long_short['SELL'] + 1e-6)

display(long_short.head())

Combining all account metrics

In [None]:
# Attach the average trade sizes to the win-rate table; every account in
# win_rate is kept (left join on 'Account').
final_metrics = pd.merge(win_rate, avg_trade_size, on='Account', how='left')

display(final_metrics.head())

Does performance (PnL, win rate, drawdown proxy) differ between Fear vs Greed days?

In [None]:

fear_labels = ['Fear', 'Extreme Fear']
greed_labels = ['Greed', 'Extreme Greed']

classification = merged_df['classification']
is_fear = classification.isin(fear_labels)
is_greed = classification.isin(greed_labels)

# Collapse the detailed labels into Fear / Greed / Neutral, exactly as before for
# rows that have a classification.
grouped_labels = pd.Series(
    np.where(is_fear, 'Fear', np.where(is_greed, 'Greed', 'Neutral')),
    index=merged_df.index,
)

# Trades whose date had no sentiment row carry a missing classification (the
# sentiment table was left-joined). Labelling them 'Neutral' would mix "unknown
# sentiment" into the Neutral group, so leave their group missing instead; the
# groupby below then excludes them automatically.
merged_df['sentiment_group'] = grouped_labels.where(classification.notna())

# dropna=False keeps the unmatched trades visible in the distribution.
sentiment_distribution = merged_df['sentiment_group'].value_counts(dropna=False)
display(sentiment_distribution)

#Performance Comparison
performance_comparison = (
    merged_df
    .groupby('sentiment_group')
    .agg(
        avg_pnl=('Closed PnL', 'mean'),
        median_pnl=('Closed PnL', 'median'),
        win_rate=('is_win', 'mean'),
        trade_count=('Closed PnL', 'count')
    )
    .reset_index()
)

display(performance_comparison)


import matplotlib.pyplot as plt
import seaborn as sns

# One bar per sentiment group, height = mean Closed PnL of that group.
fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(data=performance_comparison, x='sentiment_group', y='avg_pnl', ax=ax)
ax.set_title("Average PnL by Sentiment Group")
ax.set_ylabel("Average PnL")
ax.set_xlabel("Sentiment")
plt.show()

Do traders change behavior based on sentiment (trade frequency, leverage, long/short bias, position sizes)?

In [None]:

def _buy_share(sides):
    """Return the fraction of entries in ``sides`` that equal 'BUY'."""
    return (sides == 'BUY').mean()


# Behavioural averages per sentiment group: leverage, USD size, row count and
# the share of BUY rows.
behavior_analysis = merged_df.groupby('sentiment_group').agg(
    avg_leverage=('leverage', 'mean'),
    avg_trade_size=('Size USD', 'mean'),
    total_trades=('Account', 'count'),
    long_ratio=('Side', _buy_share),
).reset_index()

display(behavior_analysis)

# Two side-by-side bar charts: leverage on the left, trade size on the right.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
panels = [
    ('avg_leverage', "Average Leverage by Sentiment"),
    ('avg_trade_size', "Average Trade Size by Sentiment"),
]
for axis, (metric, panel_title) in zip(axes, panels):
    sns.barplot(data=behavior_analysis, x='sentiment_group', y=metric, ax=axis)
    axis.set_title(panel_title)

plt.tight_layout()
plt.show()

In [None]:
# High vs Low Leverage Traders
# Mean leverage per account, split at the median of those per-account means.
mean_leverage_by_account = merged_df.groupby('Account')['leverage'].mean()
account_leverage = mean_leverage_by_account.reset_index()

threshold = account_leverage['leverage'].median()
at_or_above_median = account_leverage['leverage'] >= threshold

# Accounts exactly at the median fall into the 'High Leverage' bucket.
account_leverage['leverage_segment'] = np.where(
    at_or_above_median, 'High Leverage', 'Low Leverage'
)
display(account_leverage.head())

In [None]:
#Frequent vs Infrequent Traders
# Row count per account, split at the median count.
rows_per_account = merged_df.groupby('Account').size()
trade_counts = rows_per_account.rename('total_trades').reset_index()

freq_threshold = trade_counts['total_trades'].median()
trades_at_or_above_median = trade_counts['total_trades'] >= freq_threshold

# Accounts exactly at the median are labelled 'Frequent'.
trade_counts['frequency_segment'] = np.where(
    trades_at_or_above_median, 'Frequent', 'Infrequent'
)
display(trade_counts.head())

In [None]:
# Consistent Traders
# Per-account win rate (mean of the 0/1 is_win flag), split at the median.
win_share_by_account = merged_df.groupby('Account')['is_win'].mean()
account_win = win_share_by_account.rename('win_rate').reset_index()

win_threshold = account_win['win_rate'].median()
win_rate_at_or_above_median = account_win['win_rate'] >= win_threshold

# Accounts exactly at the median are labelled 'Consistent'.
account_win['consistency_segment'] = np.where(
    win_rate_at_or_above_median, 'Consistent', 'Inconsistent'
)

display(account_win.head())

In [None]:
# Segment vs Performance
# Tag every trade row with its account's leverage and consistency labels.
leverage_labels = account_leverage[['Account', 'leverage_segment']]
consistency_labels = account_win[['Account', 'consistency_segment']]

segment_analysis = pd.merge(merged_df, leverage_labels, on='Account')
segment_analysis = pd.merge(segment_analysis, consistency_labels, on='Account')

# Trade-level averages for each (leverage, consistency) combination.
segment_performance = segment_analysis.groupby(
    ['leverage_segment', 'consistency_segment']
).agg(
    avg_pnl=('Closed PnL', 'mean'),
    win_rate=('is_win', 'mean'),
    avg_leverage=('leverage', 'mean'),
).reset_index()

display(segment_performance)

In [None]:
# Visualization - Segment Performance
# Bars grouped by leverage segment, coloured by consistency segment.
fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(
    data=segment_performance,
    x='leverage_segment',
    y='avg_pnl',
    hue='consistency_segment',
    ax=ax,
)
ax.set_title("Segment Performance Comparison")
ax.set_ylabel("Average PnL")
plt.show()

In [None]:
#Drawdown Proxy
# Daily Closed PnL per account; the groupby returns rows ordered by account, date.
daily_account_pnl = (
    merged_df.groupby(['Account', 'date'], as_index=False)['Closed PnL'].sum()
)

# Running total of daily PnL within each account.
daily_pnl_by_account = daily_account_pnl.groupby('Account')['Closed PnL']
daily_account_pnl['cumulative_pnl'] = daily_pnl_by_account.cumsum()

# Highest running total reached so far within each account.
cumulative_by_account = daily_account_pnl.groupby('Account')['cumulative_pnl']
daily_account_pnl['rolling_max'] = cumulative_by_account.cummax()

# Drawdown = distance below that running peak (0 at a new peak, negative otherwise).
daily_account_pnl['drawdown'] = daily_account_pnl['cumulative_pnl'].sub(
    daily_account_pnl['rolling_max']
)

display(daily_account_pnl.head())

Part C

In [None]:
# Account-level metrics
account_aggregations = {
    'avg_leverage': ('leverage', 'mean'),
    'win_rate': ('is_win', 'mean'),
    'total_trades': ('Account', 'count'),
}
account_metrics = (
    merged_df.groupby('Account').agg(**account_aggregations).reset_index()
)

# Define thresholds: the median across accounts of each metric.
leverage_threshold = account_metrics['avg_leverage'].median()
win_threshold = account_metrics['win_rate'].median()

# Accounts at or above a median land in the 'High Leverage' / 'Consistent' bucket.
leverage_at_or_above = account_metrics['avg_leverage'] >= leverage_threshold
win_at_or_above = account_metrics['win_rate'] >= win_threshold

account_metrics['leverage_segment'] = np.where(
    leverage_at_or_above, 'High Leverage', 'Low Leverage'
)
account_metrics['consistency_segment'] = np.where(
    win_at_or_above, 'Consistent', 'Inconsistent'
)

account_metrics.head()

In [None]:
segment_columns = ['leverage_segment', 'consistency_segment']

# Drop segment columns left over from an earlier run of this cell before merging.
# Without this, a second execution merges onto a frame that already has them and
# pandas renames the clashing columns to *_x / *_y, so every later lookup of
# 'leverage_segment' or 'consistency_segment' fails with a KeyError.
merged_df = (
    merged_df
    .drop(columns=segment_columns, errors='ignore')
    .merge(
        account_metrics[['Account'] + segment_columns],
        on='Account',
        how='left'
    )
)

Strategy 1 - Sentiment-Based Leverage Control

In [None]:
# STRATEGY 1: Adjust leverage
def strategy1_adjust_leverage(row):
    """Return the Strategy 1 leverage for a single trade row.

    Rules:
        * 'Fear' rows: scale leverage by 0.7 when the account is 'High Leverage'
          or 'Inconsistent'; otherwise leave it unchanged.
        * 'Greed' rows: scale leverage by 1.1 when the account is 'Consistent';
          otherwise leave it unchanged.
        * Any other sentiment group: leave it unchanged.

    Args:
        row: Mapping-like record with the keys 'leverage', 'sentiment_group',
            'leverage_segment' and 'consistency_segment'.

    Returns:
        The adjusted leverage value for that row.
    """
    base = row['leverage']
    regime = row['sentiment_group']

    if regime == 'Fear':
        needs_cut = (
            row['leverage_segment'] == 'High Leverage'
            or row['consistency_segment'] == 'Inconsistent'
        )
        return base * 0.7 if needs_cut else base

    if regime == 'Greed' and row['consistency_segment'] == 'Consistent':
        return base * 1.1

    return base

# Evaluate the Strategy 1 rule once per trade row (axis=1 hands each row to the function).
adjusted_leverage_s1 = merged_df.apply(strategy1_adjust_leverage, axis=1)
merged_df['adjusted_leverage_s1'] = adjusted_leverage_s1

In [None]:
# Adjusted / original leverage is the factor by which Strategy 1 scales a trade;
# the simulation scales that trade's Closed PnL by the same factor.
leverage_ratio_s1 = merged_df['adjusted_leverage_s1'] / merged_df['leverage']

# The ratio is NaN wherever leverage is 0 or missing (0/0, NaN/NaN). Left as NaN,
# those trades vanish from the Strategy 1 averages while the baseline still counts
# them, so the two figures would be computed over different sets of trades.
# Treat such trades as unadjusted (factor 1) to keep the comparison like-for-like.
merged_df['adjusted_pnl_s1'] = merged_df['Closed PnL'] * leverage_ratio_s1.fillna(1.0)

Strategy 2 - High-Risk Trader Controls

In [None]:
# 'Tier 3' = accounts that are both high-leverage and inconsistent; everyone
# else shares the 'Tier 1/2' label.
tier3_high_leverage = merged_df['leverage_segment'] == 'High Leverage'
tier3_inconsistent = merged_df['consistency_segment'] == 'Inconsistent'

merged_df['risk_tier'] = np.where(
    tier3_high_leverage & tier3_inconsistent, 'Tier 3', 'Tier 1/2'
)

In [None]:
def strategy2_adjust(row):
    """Return the Strategy 2 leverage for a single trade row.

    Rows whose 'risk_tier' is 'Tier 3' have their leverage multiplied by 0.6
    (a 40% reduction); all other rows keep their leverage unchanged.

    Args:
        row: Mapping-like record with the keys 'leverage' and 'risk_tier'.

    Returns:
        The adjusted leverage value for that row.
    """
    current = row['leverage']
    in_tier_3 = row['risk_tier'] == 'Tier 3'
    return current * 0.6 if in_tier_3 else current

# Evaluate the Strategy 2 rule once per trade row.
merged_df['adjusted_leverage_s2'] = merged_df.apply(strategy2_adjust, axis=1)

# Adjusted / original leverage is the factor by which Strategy 2 scales a trade;
# the simulation scales that trade's Closed PnL by the same factor.
leverage_ratio_s2 = merged_df['adjusted_leverage_s2'] / merged_df['leverage']

# The ratio is NaN wherever leverage is 0 or missing (0/0, NaN/NaN). Left as NaN,
# those trades vanish from the Strategy 2 averages while the baseline still counts
# them. Treat such trades as unadjusted (factor 1) to keep the comparison
# like-for-like.
merged_df['adjusted_pnl_s2'] = merged_df['Closed PnL'] * leverage_ratio_s2.fillna(1.0)

In [None]:
# One-row table: mean trade-level PnL under the baseline and each strategy.
average_pnl_by_scenario = {
    'Original Avg PnL': merged_df['Closed PnL'].mean(),
    'Strategy 1 Avg PnL': merged_df['adjusted_pnl_s1'].mean(),
    'Strategy 2 Avg PnL': merged_df['adjusted_pnl_s2'].mean(),
}
comparison = pd.DataFrame([average_pnl_by_scenario])

comparison

In [None]:
def calculate_drawdown(series):
    """Return the deepest drawdown of a PnL series.

    The series is accumulated in its existing order; at each step the drawdown
    is the running total minus the highest running total seen so far.

    Args:
        series: pandas Series of per-step PnL values, in the order they occur.

    Returns:
        The minimum (most negative) drawdown; 0 if the running total never
        falls below an earlier peak.
    """
    running_total = series.cumsum()
    peak_so_far = running_total.cummax()
    return (running_total - peak_so_far).min()

# Drawdown depends on the order in which PnL accumulates, so walk the trades in
# chronological order rather than in whatever order the frame currently holds.
# A stable sort keeps the existing relative order of trades sharing a timestamp.
trades_in_time_order = merged_df.sort_values('datetime', kind='stable')

drawdown_comparison = pd.DataFrame({
    'Original DD': [calculate_drawdown(trades_in_time_order['Closed PnL'])],
    'Strategy1 DD': [calculate_drawdown(trades_in_time_order['adjusted_pnl_s1'])],
    'Strategy2 DD': [calculate_drawdown(trades_in_time_order['adjusted_pnl_s2'])]
})

drawdown_comparison

In [None]:
# Bar label -> column holding that scenario's trade-level PnL.
pnl_column_by_label = {
    'Original': 'Closed PnL',
    'Strategy 1': 'adjusted_pnl_s1',
    'Strategy 2': 'adjusted_pnl_s2',
}
labels = list(pnl_column_by_label)
values = [merged_df[column].mean() for column in pnl_column_by_label.values()]

fig, ax = plt.subplots(figsize=(7, 5))
ax.bar(labels, values)
ax.set_title("Average PnL Comparison")
ax.set_ylabel("Average PnL")
plt.show()

Bonus Part 1 - Predictive Model

In [None]:
# Create daily trader-level dataset: one row per (account, date).
# Ensure sorted
merged_df = merged_df.sort_values(['Account', 'date'])

daily_aggregations = {
    'daily_pnl': ('Closed PnL', 'sum'),
    'avg_leverage': ('leverage', 'mean'),
    'trades': ('Account', 'count'),
    'win_rate': ('is_win', 'mean'),
    'avg_trade_size': ('Size USD', 'mean'),
    # Share of that day's rows whose side is 'BUY'.
    'long_ratio': ('Side', lambda sides: (sides == 'BUY').mean()),
    'sentiment_value': ('value', 'mean'),
}

daily_df = (
    merged_df.groupby(['Account', 'date']).agg(**daily_aggregations).reset_index()
)

daily_df.head()

In [None]:
# Create next-day target
daily_df = daily_df.sort_values(['Account', 'date'])

# "Next day" is the same account's next row in date order, i.e. the next date
# on which that account has any rows.
next_pnl = daily_df.groupby('Account')['daily_pnl'].shift(-1)
daily_df['next_day_pnl'] = next_pnl

# 1 when the following row's PnL is strictly positive, else 0.
daily_df['next_day_profitable'] = next_pnl.gt(0).astype(int)

# Remove last-day NaNs (this also drops rows with a NaN in any other column).
daily_df = daily_df.dropna()

daily_df.head()

In [None]:
# Train logistic regression model
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score

from sklearn.preprocessing import StandardScaler

features = [
    'avg_leverage',
    'trades',
    'win_rate',
    'avg_trade_size',
    'long_ratio',
    'sentiment_value'
]

X = daily_df[features]
y = daily_df['next_day_profitable']

# Stratified 70/30 split with a fixed seed so the class balance and the split
# itself are reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=42,
    stratify=y
)

# The features live on very different scales (ratios in [0, 1] next to USD
# sizes and trade counts). Unscaled, the regularised fit is dominated by the
# large-valued columns and the coefficients cannot be compared with each other.
# Standardise using statistics from the training split only, so nothing about
# the test rows leaks into the fit.
feature_scaler = StandardScaler()
X_train_scaled = feature_scaler.fit_transform(X_train)
X_test_scaled = feature_scaler.transform(X_test)

model = LogisticRegression(max_iter=1000)
model.fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled)
y_prob = model.predict_proba(X_test_scaled)[:,1]

print("ROC-AUC:", roc_auc_score(y_test, y_prob))
print(classification_report(y_test, y_pred))

In [None]:
# Model interpretation
# Pair each feature name with its fitted coefficient, largest coefficient first.
coefficient_rows = list(zip(features, model.coef_[0]))
coefficients = pd.DataFrame(coefficient_rows, columns=['Feature', 'Coefficient'])
coefficients = coefficients.sort_values(by='Coefficient', ascending=False)

coefficients

Bonus Part 2 - Trader Clustering

In [None]:
# Create account-level summary: one row per account.
summary_aggregations = {
    'avg_leverage': ('leverage', 'mean'),
    'win_rate': ('is_win', 'mean'),
    'avg_trade_size': ('Size USD', 'mean'),
    'total_trades': ('Account', 'count'),
    # Share of the account's rows whose side is 'BUY'.
    'long_ratio': ('Side', lambda sides: (sides == 'BUY').mean()),
}

account_summary = (
    merged_df.groupby('Account').agg(**summary_aggregations).reset_index()
)

account_summary.head()

In [None]:
# Clustering traders
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

cluster_features = [
    'avg_leverage',
    'win_rate',
    'avg_trade_size',
    'total_trades',
    'long_ratio'
]

# k-means is distance based, so put every feature on a comparable scale first.
scaler = StandardScaler()
scaled = scaler.fit_transform(account_summary[cluster_features])

# n_init is set explicitly: its default differs between scikit-learn releases,
# so leaving it out makes the cluster assignment depend on the installed version
# even with a fixed random_state.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
account_summary['cluster'] = kmeans.fit_predict(scaled)

account_summary.head()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Accounts plotted by mean leverage against win rate, coloured by cluster label.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    data=account_summary,
    x='avg_leverage',
    y='win_rate',
    hue='cluster',
    palette='Set2',
    ax=ax,
)
ax.set_title("Trader Behavioral Clusters")
plt.show()

In [None]:
# Mean of every clustering feature within each cluster.
features_by_cluster = account_summary.groupby('cluster')[cluster_features]
cluster_profile = features_by_cluster.mean()

cluster_profile

Bonus Part 3 - Streamlit Dashboard

In [None]:
!pip install streamlit


In [None]:
import streamlit as st
import pandas as pd

st.title("Trader Behavior Insights Dashboard")

uploaded_file = st.file_uploader("Upload merged dataset (CSV)")

# Test for "no file yet" explicitly instead of relying on the truthiness of the
# returned upload object.
if uploaded_file is not None:
    df = pd.read_csv(uploaded_file)

    # Every chart below indexes these columns. Report a readable message rather
    # than a raw KeyError when the uploaded CSV is not the merged dataset.
    required_columns = ['sentiment_group', 'Closed PnL', 'leverage', 'Account']
    missing_columns = [col for col in required_columns if col not in df.columns]

    if missing_columns:
        st.error(
            "Uploaded file is missing required columns: " + ", ".join(missing_columns)
        )
    else:
        st.subheader("Sentiment Distribution")
        st.bar_chart(df['sentiment_group'].value_counts())

        st.subheader("Average PnL by Sentiment")
        st.bar_chart(df.groupby('sentiment_group')['Closed PnL'].mean())

        # Shows the mean leverage of each sentiment group.
        st.subheader("Leverage Distribution")
        st.bar_chart(df.groupby('sentiment_group')['leverage'].mean())

        st.subheader("Top 10 Traders by PnL")
        top_traders = df.groupby('Account')['Closed PnL'].sum().sort_values(ascending=False).head(10)
        st.bar_chart(top_traders)