<a href="https://colab.research.google.com/github/kotianbipin/ds_Bipin_Kotian/blob/main/notebook_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Web3 Trading Team Assignment - notebook_1.ipynb
Candidate: bipin kotian

In [1]:
import os
os.makedirs("csv_files", exist_ok=True)
os.makedirs("outputs", exist_ok=True)


In [2]:
# Install required packages
!pip install --quiet gdown matplotlib seaborn scikit-learn fpdf


  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for fpdf (setup.py) ... [?25l[?25hdone


In [3]:
import gdown

sentiment_file_id = "1PgQC0tO8XN-wqkNyghWc_-mnrYv_nhSf"
trades_file_id    = "1IAfLZwu6rJzyWKgBToqwSmmVYU6VbjVs"

gdown.download(f"https://drive.google.com/uc?id={sentiment_file_id}", "sentiment.csv", quiet=False)
gdown.download(f"https://drive.google.com/uc?id={trades_file_id}", "trades.csv", quiet=False)


Downloading...
From: https://drive.google.com/uc?id=1PgQC0tO8XN-wqkNyghWc_-mnrYv_nhSf
To: /content/sentiment.csv
100%|██████████| 90.8k/90.8k [00:00<00:00, 43.8MB/s]
Downloading...
From: https://drive.google.com/uc?id=1IAfLZwu6rJzyWKgBToqwSmmVYU6VbjVs
To: /content/trades.csv
100%|██████████| 47.5M/47.5M [00:00<00:00, 225MB/s]


'trades.csv'

In [4]:
import pandas as pd
import numpy as np

trades = pd.read_csv("trades.csv")
sent = pd.read_csv("sentiment.csv")

print("Trades columns:", trades.columns.tolist())
print("Sentiment columns:", sent.columns.tolist())

Trades columns: ['Account', 'Coin', 'Execution Price', 'Size Tokens', 'Size USD', 'Side', 'Timestamp IST', 'Start Position', 'Direction', 'Closed PnL', 'Transaction Hash', 'Order ID', 'Crossed', 'Fee', 'Trade ID', 'Timestamp']
Sentiment columns: ['timestamp', 'value', 'classification', 'date']


In [7]:
# -------------------------------
#  Preprocessing: trades datetime
# -------------------------------
time_cols = [c for c in trades.columns if 'time' in c.lower() or 'date' in c.lower()]
if not time_cols:
    raise ValueError("No time/date column found in trades dataset")
time_col = time_cols[0]
trades['date'] = pd.to_datetime(trades[time_col], errors='coerce').dt.date


In [6]:
# -------------------------------
# Preprocessing: sentiment dataset
# -------------------------------
sent.columns = [c.strip() for c in sent.columns]

# Detect date column
date_cols = [c for c in sent.columns if 'date' in c.lower()]
if not date_cols:
    raise ValueError("No date column found in sentiment dataset")
date_col = date_cols[0]
sent['date'] = pd.to_datetime(sent[date_col], errors='coerce').dt.date

# Detect sentiment/classification column
class_cols = [c for c in sent.columns if 'class' in c.lower() or 'fear' in c.lower() or 'greed' in c.lower()]
if not class_cols:
    raise ValueError("No Classification column found in sentiment dataset")
class_col = class_cols[0]
sent['Classification'] = sent[class_col].astype(str)

print(sent[['date','Classification']].head())


         date Classification
0  2018-02-01           Fear
1  2018-02-02   Extreme Fear
2  2018-02-03           Fear
3  2018-02-04   Extreme Fear
4  2018-02-05   Extreme Fear


In [8]:
# -------------------------------
# Aggregate daily trader metrics
# -------------------------------
agg = trades.groupby('date').agg(
    num_trades=('Account','count'),
    total_volume=('Size USD', lambda x: abs(x).sum() if pd.api.types.is_numeric_dtype(x) else np.nan),
    total_closed_pnl=('Closed PnL','sum'),
    avg_closed_pnl=('Closed PnL','mean'),
    win_rate=('Closed PnL', lambda x: (x>0).mean()),
    avg_trade_size=('Size USD','mean')
).reset_index()

# Merge with sentiment
daily = agg.merge(sent[['date','Classification']], on='date', how='left').sort_values('date')
daily['sent_code'] = daily['Classification'].map({'Fear':0,'Greed':1})


In [9]:
daily.to_csv("csv_files/daily_features.csv", index=False)

In [10]:
# -------------------------------
# Exploratory Data Analysis (EDA)
# -------------------------------
import matplotlib.pyplot as plt
import seaborn as sns

# Daily total PnL by sentiment
plt.figure(figsize=(12,6))
sns.lineplot(data=daily, x='date', y='total_closed_pnl', hue='Classification')
plt.title("Daily Total PnL vs Market Sentiment")
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig("outputs/daily_pnl.png")
plt.close()

# Avg trade size vs Avg closed PnL
plt.figure(figsize=(6,4))
sns.scatterplot(x='avg_trade_size', y='avg_closed_pnl', hue='Classification', data=daily)
plt.title("Avg Trade Size vs Avg Closed PnL by Sentiment")
plt.savefig("outputs/scatter_trade_pnl.png")
plt.close()

# Avg leverage vs sentiment (if exists)
if 'leverage' in trades.columns:
    plt.figure(figsize=(6,4))
    sns.boxplot(x='Classification', y='avg_leverage', data=daily)
    plt.title("Average Leverage by Sentiment")
    plt.savefig("outputs/boxplot_leverage.png")
    plt.close()

# Number of trades per day
plt.figure(figsize=(12,6))
sns.lineplot(data=daily, x='date', y='num_trades', hue='Classification')
plt.title("Number of Trades per Day vs Market Sentiment")
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig("outputs/daily_trades.png")
plt.close()


In [11]:
# -------------------------------
# Statistical test: Fear vs Greed PnL
# -------------------------------
from scipy.stats import ttest_ind

fear_pnl = daily[daily['Classification']=="Fear"]['total_closed_pnl'].dropna()
greed_pnl = daily[daily['Classification']=="Greed"]['total_closed_pnl'].dropna()

if len(fear_pnl) > 0 and len(greed_pnl) > 0:
    tstat, pval = ttest_ind(fear_pnl, greed_pnl, equal_var=False)
    print("T-test Fear vs Greed Total PnL:")
    print("t-stat:", round(tstat,3), "p-value:", round(pval,4))


T-test Fear vs Greed Total PnL:
t-stat: 1.654 p-value: 0.1059


In [12]:
# -------------------------------
# Simple predictive modeling
# -------------------------------
from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

daily = daily.sort_values('date')
daily['next_pnl_pos'] = (daily['total_closed_pnl'].shift(-1) > 0).astype(int)
features = ['total_volume','win_rate','avg_trade_size','sent_code']

df = daily.dropna(subset=features+['next_pnl_pos']).copy()
if df.shape[0] > 20:
    X = df[features].values
    y = df['next_pnl_pos'].values
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    tscv = TimeSeriesSplit(n_splits=5)
    model = LogisticRegression(max_iter=500)
    accs=[]
    for train_idx, test_idx in tscv.split(X_scaled):
        model.fit(X_scaled[train_idx], y[train_idx])
        accs.append(model.score(X_scaled[test_idx], y[test_idx]))
    print("Logistic Regression mean CV accuracy:", round(np.mean(accs),3))


Logistic Regression mean CV accuracy: 0.653


In [13]:
# -------------------------------
# Save model-ready dataset
# -------------------------------
daily.to_csv("csv_files/daily_with_labels.csv", index=False)

In [14]:
# -------------------------------
# Create placeholder ds_report.pdf
# -------------------------------
from fpdf import FPDF

pdf = FPDF()
pdf.add_page()
pdf.set_font("Arial","B",16)
pdf.cell(0,10,"Web3 Trading Team - Data Science Report", ln=True, align="C")
pdf.ln(10)

pdf.set_font("Arial","",12)
pdf.multi_cell(0,8,"This report includes dataset description, EDA charts, insights, observations from Fear vs Greed, and modeling results.\n\n")

# Add images (charts saved previously)
chart_files = ["outputs/daily_pnl.png", "outputs/scatter_trade_pnl.png"]
for chart in chart_files:
    pdf.add_page()
    pdf.image(chart, x=10, y=20, w=180)  # adjust width/position as needed

pdf.output("ds_report.pdf")



''