# 📒 train_risk_model.ipynb (Notebook Template - Rootstock - Sovryn - Risk Model w/ Blockscout)


## 📌 Objective
Train a risk classification model (0 = safe, 1 = risky) for Rootstock vaults using on-chain activity data from Blockscout.


In [1]:
## 📦 Setup
!pip install pandas requests scikit-learn joblib

Collecting pandas
  Using cached pandas-2.2.3-cp311-cp311-macosx_11_0_arm64.whl.metadata (89 kB)
Collecting requests
  Using cached requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.6.1-cp311-cp311-macosx_12_0_arm64.whl.metadata (31 kB)
Collecting joblib
  Downloading joblib-1.5.1-py3-none-any.whl.metadata (5.6 kB)
Collecting numpy>=1.23.2 (from pandas)
  Downloading numpy-2.2.6-cp311-cp311-macosx_14_0_arm64.whl.metadata (62 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting python-dateutil>=2.8.2 (from pandas)
  Using cached python_dateutil-2.9.0.post0-py2.py3-none-any.whl.metadata (8.4 kB)
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting charset-normalizer<4,>=2

In [5]:

## 🔧 Configuration

import requests
import pandas as pd
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

BLOCKSCOUT_API = "https://rootstock.blockscout.com/api/v2"
ROOTSTOOCK_CONTRACT = "0x2bEE6167f91d10Db23252e03dE039dA6B9047D49"



In [25]:

## 📊 Fetch Transaction Data
def fetch_txns(contract):
    url = f"{BLOCKSCOUT_API}/addresses/{contract}/transactions"
    r = requests.get(url)
    r.raise_for_status()
    return pd.DataFrame(r.json()["items"])

transactions = fetch_txns(ROOTSTOOCK_CONTRACT)
transactions['timestamp'] = pd.to_datetime(transactions['timestamp'])
transactions['value_btc'] = pd.to_numeric(transactions['value'], errors='coerce') / 1e18

print(transactions['timestamp'], "test timepstampt")
print(transactions['value_btc'], "Value BTC")

transactions['method'] = transactions['method'].fillna("unknown")
method_counts = transactions['method'].value_counts().to_dict()

print(transactions['method'], "test the transaction method")
print(method_counts, "method_counts tests")

transactions['from'] = transactions['from'].apply(lambda x: x['hash'] if isinstance(x, dict) else x)

0    2025-05-31 20:25:39+00:00
1    2025-05-31 19:02:47+00:00
2    2025-05-31 16:06:13+00:00
3    2025-05-31 16:00:39+00:00
4    2025-05-31 15:46:35+00:00
5    2025-05-31 15:40:01+00:00
6    2025-05-31 12:25:54+00:00
7    2025-05-31 12:24:28+00:00
8    2025-05-31 12:22:55+00:00
9    2025-05-31 11:12:00+00:00
10   2025-05-31 11:09:22+00:00
11   2025-05-31 10:15:22+00:00
12   2025-05-31 10:11:16+00:00
13   2025-05-31 09:29:34+00:00
14   2025-05-31 09:07:46+00:00
15   2025-05-31 09:04:36+00:00
16   2025-05-31 08:48:35+00:00
17   2025-05-31 07:03:41+00:00
18   2025-05-31 07:00:00+00:00
19   2025-05-31 06:48:40+00:00
20   2025-05-31 06:46:19+00:00
21   2025-05-31 02:39:48+00:00
22   2025-05-31 02:34:39+00:00
23   2025-05-31 02:20:46+00:00
24   2025-05-31 02:09:58+00:00
25   2025-05-30 23:44:17+00:00
26   2025-05-30 23:41:41+00:00
27   2025-05-30 23:39:03+00:00
28   2025-05-30 23:36:45+00:00
29   2025-05-30 21:11:05+00:00
30   2025-05-30 18:17:00+00:00
31   2025-05-30 18:13:26+00:00
32   202

  transactions['timestamp'] = pd.to_datetime(transactions['timestamp'])


In [26]:
## 🧠 Feature Engineering
features = {
    "total_txns": len(transactions),
    "unique_users": transactions['from'].nunique(),
    "avg_value": transactions['value_btc'].mean(),

}

features.update({
    "deposits": method_counts.get("deposit", 0),
    "withdraws": method_counts.get("withdraw", 0),
    "approvals": method_counts.get("approve", 0),
    "unique_methods": transactions['method'].nunique(),
    "failed_txns": len(transactions[transactions['status'] == 'failed']),
})

features["label"] = 1 if features["avg_value"] < 0.001 else 0


In [None]:
## 🏷️ Labeling (Manual or Rule-Based for MVP)

features["label"] = 1 if features["avg_value"] < 0.001 else 0


In [20]:
## 🏷️ Save method analysis extracted from blockscout api rest v2

with open("data/method_analysis.txt", "w") as f:
    f.write("📜 All Transaction Methods:\n")
    f.write(transactions['method'].to_string(index=False))
    f.write("\n\n📊 Method Counts:\n")
    for method, count in method_counts.items():
        f.write(f"{method}: {count}\n")

print("✅ Saved method analysis to data/method_analysis.txt")

✅ Saved method analysis to ml-risk/method_analysis.txt


In [22]:
with open("data/timestamp_analysis.txt", "w") as f:
    f.write("📜 timestamps:\n")
    f.write(transactions['timestamp'].to_string(index=False))
    f.write("\n\n📊 Total timestamp Counts:\n")
    for timestamp, count in twransactions['timestamp'].items():
        f.write(f"{timestamp}: {count}\n")

print("✅ Saved timestamp analysis to data/timestamp_analysis.txt")

✅ Saved timestamp analysis to data/timestamp_analysis.txt


In [27]:
with open("data/value_btc_analysis.txt", "w") as f:
    f.write("📜 value_btc:\n")
    f.write(transactions['value_btc'].to_string(index=False))
    f.write("\n\n📊 Total value_btc Counts:\n")
    for timestamp, count in transactions['value_btc'].items():
        f.write(f"{timestamp}: {count}\n")

print("✅ Saved timestamp analysis to data/value_btc_analysis.txt")

✅ Saved timestamp analysis to data/value_btc_analysis.txt


In [21]:
## Save the rootstock sovryn.app Data

pd.DataFrame([features]).to_csv("rootstock_lending_historical.csv", index=False)

print("✅ Feature set saved to rootstock_lending_historical.csv")


✅ Feature set saved to rootstock_lending_historical.csv


In [None]:
## 🧠 Advanced Feature Engineering
def engineer_features(transactions):
    # Time-based features
    transactions['hour'] = transactions['timestamp'].dt.hour
    transactions['day_of_week'] = transactions['timestamp'].dt.dayofweek
    transactions['is_weekend'] = transactions['day_of_week'].isin([5, 6]).astype(int)
    
    # Transaction value features
    transactions['value_category'] = pd.qcut(transactions['value_btc'], q=5, labels=['very_low', 'low', 'medium', 'high', 'very_high'])
    
    # User behavior features
    user_stats = transactions.groupby('from').agg({
        'value_btc': ['count', 'mean', 'std', 'sum'],
        'status': lambda x: (x == 'failed').mean()
    }).reset_index()
    
    # Method-based features
    method_stats = transactions.groupby('method').agg({
        'value_btc': ['count', 'mean', 'std'],
        'status': lambda x: (x == 'failed').mean()
    }).reset_index()
    
    # Time window features (last 24h, 7d)
    now = transactions['timestamp'].max()
    last_24h = transactions[transactions['timestamp'] > (now - pd.Timedelta(days=1))]
    last_7d = transactions[transactions['timestamp'] > (now - pd.Timedelta(days=7))]
    
    features = {
        # Basic metrics
        'total_txns': len(transactions),
        'unique_users': transactions['from'].nunique(),
        'avg_value': transactions['value_btc'].mean(),
        'std_value': transactions['value_btc'].std(),
        
        # Time-based patterns
        'weekend_ratio': transactions['is_weekend'].mean(),
        'peak_hour_txns': transactions.groupby('hour')['value_btc'].count().max(),
        
        # Recent activity
        'txns_24h': len(last_24h),
        'txns_7d': len(last_7d),
        'volume_24h': last_24h['value_btc'].sum(),
        'volume_7d': last_7d['value_btc'].sum(),
        
        # Risk indicators
        'failed_txns_ratio': (transactions['status'] == 'failed').mean(),
        'high_value_txns_ratio': (transactions['value_btc'] > transactions['value_btc'].quantile(0.95)).mean(),
        
        # User behavior
        'avg_txns_per_user': user_stats[('value_btc', 'count')].mean(),
        'user_value_std': user_stats[('value_btc', 'std')].mean(),
        
        # Method patterns
        'unique_methods': transactions['method'].nunique(),
        'method_concentration': (transactions['method'].value_counts() / len(transactions)).max()
    }
    
    return features


In [None]:
## 🎯 Model Training
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

# Generate features
features = engineer_features(transactions)

# Create feature matrix
X = pd.DataFrame([features])
y = np.array([1 if features['failed_txns_ratio'] > 0.1 or features['high_value_txns_ratio'] > 0.2 else 0])

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train model
model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42
)

In [None]:
# Cross-validation
cv_scores = cross_val_score(model, X_train_scaled, y_train, cv=5)
print(f"Cross-validation scores: {cv_scores}")
print(f"Average CV score: {cv_scores.mean():.3f} (+/- {cv_scores.std() * 2:.3f})")

# Train final model
model.fit(X_train_scaled, y_train)

# Evaluate
y_pred = model.predict(X_test_scaled)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Feature importance
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': model.feature_importances_
}).sort_values('importance', ascending=False)

print("\nFeature Importance:")
print(feature_importance)

## 💾 Save Model and Scaler
import joblib

# Save model and scaler
joblib.dump(model, 'models/risk_model.joblib')
joblib.dump(scaler, 'models/risk_scaler.joblib')

print("\n✅ Model and scaler saved to models/ directory")