<a href="https://colab.research.google.com/github/leman-cap13/Machine_Learning/blob/main/user_wallet_transactions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# data downloading and preprocessing

In [None]:
from google.colab import files
files.upload()

In [None]:
json_path="/content/user-wallet-transactions.json"

In [None]:
import pandas as pd
import json

# JSON text is UTF-8 by specification; naming the encoding keeps the load
# independent of the platform's default locale encoding.
with open(json_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Assumes the top-level JSON value is a list of transaction records -- TODO confirm.
df = pd.DataFrame(data)


In [None]:
pd.set_option('display.max_columns', None)

In [None]:
df

In [None]:
df.dtypes

In [None]:
df.columns

In [None]:
df['_id'][0]

In [None]:
df['actionData'][0]

In [None]:
def extract_from_actionData(row, decimals=6):
    """Extract amount, asset symbol, USD price and action type from one ``actionData`` record.

    Args:
        row: Mapping-like object exposing ``.get`` (one ``actionData`` entry).
            Anything without ``.get`` (e.g. ``None`` or NaN) is treated as an
            empty record so a single bad row cannot abort the whole ``apply``.
        decimals: Number of decimal places the raw integer ``amount`` is
            scaled by. Defaults to 6, the original assumption (valid if the
            asset uses 6 decimals, as USDC does).

    Returns:
        ``pd.Series`` holding ``[amount, asset, price, type_]``. Unparseable
        amounts or prices become 0; missing symbol/type become ``'UNKNOWN'``.
    """
    if not hasattr(row, 'get'):
        row = {}

    try:
        amount = int(row.get('amount', 0)) / float(10 ** decimals)
    except (TypeError, ValueError, OverflowError):
        amount = 0

    # The price gets the same best-effort treatment as the amount: a None or
    # malformed value falls back to 0 instead of raising.
    try:
        price = float(row.get('assetPriceUSD', 0))
    except (TypeError, ValueError):
        price = 0.0

    asset = row.get('assetSymbol', 'UNKNOWN')
    type_ = row.get('type', 'UNKNOWN')
    return pd.Series([amount, asset, price, type_])

df[['amount', 'asset', 'price_usd', 'action_type']] = df['actionData'].apply(extract_from_actionData)


In [None]:
df['amount_usd'] = df['amount'] * df['price_usd']


In [None]:
df.head()

In [None]:
df['updatedAt'][33]

In [None]:
df['createdAt'][33]

In [None]:
df['_id'] = df['_id'].apply(lambda x: x['$oid'])

In [None]:
df['userWallet'][0]

In [None]:
import pandas as pd

def parse_mongo_date(d):
    """Convert a ``{'$date': ...}`` wrapper into a pandas timestamp.

    Args:
        d: Object subscriptable with the key ``'$date'``.

    Returns:
        The result of ``pd.to_datetime`` on the wrapped value, or ``pd.NaT``
        when the key is missing, ``d`` is not subscriptable, or the value
        cannot be parsed.
    """
    try:
        return pd.to_datetime(d['$date'])
    except (KeyError, TypeError, ValueError, OverflowError):
        # Best-effort parse: only lookup/parsing failures map to NaT, so
        # KeyboardInterrupt and SystemExit still propagate.
        return pd.NaT

df['createdAt'] = df['createdAt'].apply(parse_mongo_date)


In [None]:
df['createdAt']

In [None]:
df.columns

In [None]:
print(df['action'].unique())


In [None]:
df['is_deposit'] = (df['action'].str.lower() == 'deposit').astype(int)
df['is_deposit']

In [None]:
df['is_borrow'] = (df['action'].str.lower() == 'borrow').astype(int)
df['is_borrow']

In [None]:
df['is_repay'] = (df['action'].str.lower() == 'repay').astype(int)
df['is_repay']

In [None]:
df['is_redeem'] = (df['action'].str.lower() == 'redeemunderlying').astype(int)
df['is_redeem']

In [None]:
df['is_liquidation'] = (df['action'].str.lower() == 'liquidationcall').astype(int)
df['is_liquidation']

In [None]:
df['timestamp'] = pd.to_datetime(df['timestamp'], unit='s')

In [None]:
df['timestamp'][0]

In [None]:
print(df['timestamp'].dtype)

In [None]:
df['date'] = df['timestamp'].dt.date

In [None]:
df['date'].dtype

#Feature Engineering

In [None]:
# One row per wallet: activity counts, first/last activity timestamps and
# USD volume, expressed as output-column -> (source column, aggregation).
features = df.groupby('userWallet').agg(**{
    'total_txn_count': ('action', 'count'),
    'active_days': ('date', 'nunique'),
    'first_txn_date': ('timestamp', 'min'),
    'last_txn_date': ('timestamp', 'max'),
    'total_deposit_count': ('is_deposit', 'sum'),
    'total_borrow_count': ('is_borrow', 'sum'),
    'total_repay_count': ('is_repay', 'sum'),
    'total_liquidation_count': ('is_liquidation', 'sum'),
    'total_redeem_count': ('is_redeem', 'sum'),
    'total_amount_usd': ('amount_usd', 'sum'),
    'avg_amount_usd': ('amount_usd', 'mean'),
})

# Whole days elapsed between a wallet's first and last transaction.
features['days_between_first_last'] = (
    features['last_txn_date'].sub(features['first_txn_date']).dt.days
)


In [None]:
features['days_between_first_last'].unique()

In [None]:
print(features['days_between_first_last'].value_counts().sort_index())

print(features[['first_txn_date', 'last_txn_date']].head(10))


In [None]:
active_days = df.groupby('userWallet')['date'].nunique()
print(active_days.head())


In [None]:
# USD volume per wallet for each action type. Wallets with no rows of a given
# type are absent from that Series and come out of the join as NaN.
deposit_usd = (
    df.loc[df['is_deposit'].eq(1)]
    .groupby('userWallet')['amount_usd']
    .sum()
    .rename('total_deposit_usd')
)
borrow_usd = (
    df.loc[df['is_borrow'].eq(1)]
    .groupby('userWallet')['amount_usd']
    .sum()
    .rename('total_borrow_usd')
)
repay_usd = (
    df.loc[df['is_repay'].eq(1)]
    .groupby('userWallet')['amount_usd']
    .sum()
    .rename('total_repay_usd')
)

features = features.join([deposit_usd, borrow_usd, repay_usd])


In [None]:
features

In [None]:
features.fillna(0, inplace=True)


In [None]:
features['borrow_deposit_ratio'] = features['total_borrow_usd'] / (features['total_deposit_usd'] + 1e-6)


In [None]:
features['borrow_deposit_ratio']

In [None]:
features['repay_borrow_ratio'] = features['total_repay_usd'] / (features['total_borrow_usd'] + 1e-6)
features['repay_borrow_ratio']

In [None]:
features['has_liquidation'] = (features['total_liquidation_count'] > 0).astype(int)
features['has_liquidation']

In [None]:
# Ratio features: the small epsilon keeps the division defined when the
# denominator is zero.
features['txns_per_active_day'] = features['total_txn_count'].div(features['active_days'] + 1e-9)
features['liquidation_ratio'] = features['total_liquidation_count'].div(features['total_txn_count'] + 1e-9)
# Difference features: borrows not matched by repays, and deposits minus borrows in USD.
features['borrow_repay_diff'] = features['total_borrow_count'].sub(features['total_repay_count'])
features['net_usd_flow'] = features['total_deposit_usd'].sub(features['total_borrow_usd'])


#Scoring function

In [None]:
def min_max_norm(x):
    """Rescale ``x`` by its own minimum and range.

    A 1e-9 term is added to the range so a constant input yields zeros
    instead of a division by zero.
    """
    lowest = x.min()
    spread = x.max() - lowest
    return (x - lowest) / (spread + 1e-9)

In [None]:
def score_wallets(features_df):
    """Combine normalised wallet features into an integer score on a 0-1000 scale.

    Every component is rescaled with ``min_max_norm``. The liquidation count,
    the liquidation ratio and the (non-negative) borrow-minus-repay difference
    are inverted so that larger raw values lower the score. The components are
    multiplied by their weights (which sum to 1), added up, scaled by 1000 and
    rounded.

    Args:
        features_df: Frame containing the columns ``total_txn_count``,
            ``total_deposit_count``, ``total_borrow_count``,
            ``total_repay_count``, ``total_liquidation_count``,
            ``total_amount_usd``, ``txns_per_active_day``,
            ``liquidation_ratio``, ``borrow_repay_diff`` and ``net_usd_flow``.

    Returns:
        Integer Series of scores, indexed like ``features_df``.
    """
    def scaled(column):
        return min_max_norm(features_df[column])

    def scaled_non_negative(column):
        # Negative values are floored at 0 before normalising.
        return min_max_norm(features_df[column].clip(lower=0))

    # (normalised component, weight) pairs, kept in summation order.
    weighted_components = [
        (scaled('total_txn_count'), 0.15),
        (scaled('total_deposit_count'), 0.1),
        (scaled('total_borrow_count'), 0.1),
        (scaled('total_repay_count'), 0.1),
        (1 - scaled('total_liquidation_count'), 0.15),
        (scaled('total_amount_usd'), 0.1),
        # Newer features:
        (scaled('txns_per_active_day'), 0.1),
        (1 - scaled('liquidation_ratio'), 0.1),
        (1 - scaled_non_negative('borrow_repay_diff'), 0.05),
        (scaled_non_negative('net_usd_flow'), 0.05),
    ]

    total = 0
    for component, weight in weighted_components:
        total = total + component * weight

    return (total * 1000).round().astype(int)


In [None]:
features['credit_score'] = score_wallets(features)
print(features[['credit_score']].head())


In [None]:
features['credit_score'] = score_wallets(features)
print(features[['credit_score']].describe())


#Visualization

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(12,5))

plt.subplot(1,2,1)
sns.histplot(features['credit_score'], bins=50, kde=True)
plt.title('Credit Score Paylanması (Histogram)')

plt.subplot(1,2,2)
sns.boxplot(x=features['credit_score'])
plt.title('Credit Score Paylanması (Boxplot)')

plt.show()


Left plot (Histogram):
This shows how credit scores are spread across all wallets. The bars represent how many wallets fall into each score range, and the smooth line (KDE) estimates the overall shape of the distribution. It helps us see where most scores concentrate and if there are any unusual peaks or gaps.

Right plot (Boxplot):
This summarizes the credit score distribution using five key statistics: minimum, first quartile (25%), median (50%), third quartile (75%), and maximum. It also helps identify any outliers. The boxplot gives a quick overview of the central tendency and variability of the scores.

In [None]:
low_score = features[features['credit_score'] <= 200]
high_score = features[features['credit_score'] >= 500]

print("Number of wallets with low score:", len(low_score))
print("Number of wallets with high score:", len(high_score))

print("Average transaction count for low score wallets:", low_score['total_txn_count'].mean())
print("Average transaction count for high score wallets:", high_score['total_txn_count'].mean())

print("Average liquidation count for low score wallets:", low_score['total_liquidation_count'].mean())
print("Average liquidation count for high score wallets:", high_score['total_liquidation_count'].mean())


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

corr = features[['credit_score', 'total_txn_count', 'total_liquidation_count']].corr()

plt.figure(figsize=(6,5))
sns.heatmap(corr, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Correlation Matrix Heatmap')
plt.show()


The correlation heatmap shows how strongly these variables relate to each other. The colors indicate the direction and strength of the relationship — with the `coolwarm` colormap and the −1 to 1 range used here, red means a strong positive correlation, blue means a strong negative correlation, and pale colors near the middle of the scale mean little or no correlation.

For example, we see a strong positive correlation between total transaction count and credit score, meaning as transactions increase, the score tends to increase too. On the other hand, total liquidation count has a negative correlation with credit score, showing that more liquidations usually lead to a lower score.

In [None]:
plt.figure(figsize=(12,5))

plt.subplot(1,2,1)
sns.scatterplot(data=features, x='total_txn_count', y='credit_score')
plt.title('Credit Score vs Total Transaction Count')

plt.subplot(1,2,2)
sns.scatterplot(data=features, x='total_liquidation_count', y='credit_score')
plt.title('Credit Score vs Total Liquidation Count')

plt.show()


On the left graph, you can see that wallets with more transactions usually have higher credit scores. This makes sense because wallets that interact more often with the protocol tend to be more reliable or valuable.

On the right graph, the opposite happens. Wallets with more liquidations tend to have lower credit scores. That’s expected since liquidations show risky or problematic behavior, so those wallets get penalized in their score.

So basically, more activity usually means better trustworthiness, while more liquidations mean higher risk and a lower score.

#Model preparation

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, r2_score
import seaborn as sns
import matplotlib.pyplot as plt


In [None]:
print(features.columns.tolist())


In [None]:
numeric_features = [
    'total_txn_count', 'active_days', 'total_deposit_count',
    'total_borrow_count', 'total_repay_count', 'total_liquidation_count',
    'total_redeem_count', 'total_amount_usd', 'avg_amount_usd',
    'days_between_first_last', 'total_deposit_usd', 'total_borrow_usd',
    'total_repay_usd', 'borrow_deposit_ratio', 'repay_borrow_ratio',
    'has_liquidation', 'txns_per_active_day', 'liquidation_ratio',
    'borrow_repay_diff', 'net_usd_flow'
]


categorical_features = []

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
from xgboost import XGBRegressor

# Data preparation
# NOTE(review): credit_score is itself computed from several of these feature
# columns by score_wallets, so the regressor is fitting that formula.
X = features[numeric_features]
y = features['credit_score']

# Hold out 20% of the wallets for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

# Building the pipeline: standardise the features, then fit gradient-boosted trees.
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('xgb', XGBRegressor(
        learning_rate=0.05,
        n_estimators=200,
        max_depth=5,
        colsample_bytree=0.8,
        subsample=0.8,
        verbosity=1,
        random_state=42,
    )),
])

# Training the model
pipeline.fit(X_train, y_train)



In [None]:
y_pred = pipeline.predict(X_test)


print("MAE:", mean_absolute_error(y_test, y_pred))
print("R2:", r2_score(y_test, y_pred))


In [None]:
model = pipeline.named_steps['xgb']

importances = model.feature_importances_
indices = np.argsort(importances)[::-1]
feature_names = X.columns


plt.figure(figsize=(10, 6))
plt.title("Feature Importances (Gain)")
plt.bar(range(len(importances)), importances[indices])
plt.xticks(range(len(importances)), feature_names[indices], rotation=90)
plt.show()


In [None]:
all_scores = pipeline.predict(X)

# Rescale the predictions to 0-1000. If every prediction is identical the
# range is zero and the division would turn every score into NaN, so fall
# back to all zeros in that case.
score_min = np.min(all_scores)
score_range = np.max(all_scores) - score_min
if score_range > 0:
    scores_scaled = 1000 * (all_scores - score_min) / score_range
else:
    scores_scaled = np.zeros_like(all_scores)

features['credit_score_pred'] = scores_scaled


In [None]:
all_scores

In [None]:
scores_scaled

In [None]:
features['credit_score_pred']

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Ten 100-point buckets over the 0-1000 range. pd.cut closes intervals on the
# right by default, which would put a score of exactly 100 into the bucket
# labelled "0-99". Left-closed intervals (right=False) make the edges agree
# with the labels; the top edge is 1001 so a score of exactly 1000 (or one a
# float rounding step above it) still lands in the last bucket.
bins = list(range(0, 1000, 100)) + [1001]
labels = [f"{i}-{i+99}" for i in bins[:-2]] + ["900-1000"]

features['score_group'] = pd.cut(features['credit_score_pred'], bins=bins, labels=labels, right=False)

score_distribution = features['score_group'].value_counts().sort_index()

print(score_distribution)


plt.figure(figsize=(10,6))
score_distribution.plot(kind='bar')
plt.title("Credit Score Distribution (0-1000)")
plt.xlabel("Score Range")
plt.ylabel("Number of Wallets")
plt.show()


In [None]:

# Left-closed buckets (right=False) so the edges match the labels: 500 falls
# in '500-749' and 750 in '750-1000'. With pd.cut's default right-closed
# intervals those scores would land one bucket too low. The top edge is 1001
# so a score of exactly 1000 is kept in the last bucket.
bins = [0, 500, 750, 1001]
labels = ['0-499', '500-749', '750-1000']
features['score_group_broad'] = pd.cut(features['credit_score_pred'], bins=bins, labels=labels, right=False)

print(features['score_group_broad'].value_counts())


low_score_wallets = features[features['score_group_broad'] == '0-499']
print(low_score_wallets.describe())

high_score_wallets = features[features['score_group_broad'] == '750-1000']
print(high_score_wallets.describe())


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns


plt.figure(figsize=(8,5))
sns.histplot(features['credit_score'], bins=30, kde=True)
plt.title("Credit Score Distribution")
plt.xlabel("Credit Score")
plt.ylabel("Count")
plt.show()


plt.figure(figsize=(8,5))
sns.scatterplot(x=features['total_txn_count'], y=features['credit_score'])
plt.title("Total Transactions vs Credit Score")
plt.xlabel("Total Transaction Count")
plt.ylabel("Credit Score")
plt.show()


plt.figure(figsize=(10,6))
sns.boxplot(x=features['score_group_broad'], y=features['total_txn_count'])
plt.title("Total Transactions by Credit Score Group")
plt.xlabel("Score Group")
plt.ylabel("Total Transactions")
plt.show()


# Model interpretation (SHAP)

In [None]:
!pip install shap


In [None]:
import shap


# `model` is the XGBoost step taken out of `pipeline`, so it was trained on
# StandardScaler output, not on the raw feature values. It must be explained
# on inputs transformed the same way; otherwise the SHAP values describe
# inputs the model never saw during training.
explainer = shap.Explainer(model)

X_test_scaled = pd.DataFrame(
    pipeline.named_steps['scaler'].transform(X_test),
    columns=X_test.columns,
    index=X_test.index,
)

shap_values = explainer(X_test_scaled)

# Pass the unscaled frame as the display features so the colour axis of the
# summary plot reflects the original feature values.
shap.summary_plot(shap_values, X_test)


We are using SHAP (SHapley Additive exPlanations) to interpret our XGBoost credit scoring model. SHAP helps us understand how each feature impacts the model’s predictions for each wallet.


First, we create a SHAP explainer by giving it our trained model. This explainer figures out how each feature affects the prediction for every wallet.

Then, we run the explainer on our test data, and it calculates something called SHAP values for each feature. These values basically tell us: “How much did this feature push the score up or pull it down?”

Finally, we visualize these SHAP values using a summary plot. On this plot, each feature is listed from most to least important. The points show the effect of that feature on the prediction for all wallets, and the color shows whether the feature value was high or low.

For example, if a wallet has a lot of transactions (total_txn_count), SHAP will show that this feature usually increases the credit score. Conversely, if a wallet has many liquidations (total_liquidation_count), SHAP will reveal that this feature tends to decrease the score.

In short, SHAP lets us see exactly why our model gave a certain score — which features were driving that decision — and helps us trust and explain the model better.

#Hyperparameter tuning (Optuna)

In [None]:
!pip install optuna


In [None]:
import optuna
from sklearn.model_selection import cross_val_score

def objective(trial):
    """Optuna objective: mean 3-fold cross-validated R^2 of an XGBRegressor.

    Args:
        trial: Optuna trial used to sample one hyperparameter combination.

    Returns:
        Mean R^2 over the three folds of ``X_train`` / ``y_train``.
    """
    params = {
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        # suggest_float replaces suggest_loguniform / suggest_uniform, which
        # Optuna has deprecated; log=True samples on a log scale.
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
    }
    model = XGBRegressor(**params, random_state=42)
    scores = cross_val_score(model, X_train, y_train, cv=3, scoring='r2')
    return scores.mean()

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

print("Best params:", study.best_params)
print("Best R2:", study.best_value)


This code uses a tool called Optuna to find the best settings for our XGBoost model, so we don't have to guess or try every possible combination ourselves.

Basically, we write a little function — that’s the objective — which tells Optuna:
“Here’s how you test one set of parameters. Train the model with these, check how well it does, and give me the score.”

Inside that function, we say:

Try tree depths between 3 and 10.

Try learning rates somewhere between 0.01 and 0.3.

Try different numbers of trees — from 100 to 1000.

Also tweak what fraction of the data and features the model uses for each tree.

For each set, the function builds a model and checks how well it predicts using cross-validation. Cross-validation just means we split the training data into parts to make sure the model really generalizes well, not just memorizes.

Then Optuna runs this function 50 times — each time with different parameters — and keeps track of which combination gives the best R-squared score (basically how well the model fits the data).

At the end, it tells us the best parameters it found and the best score.

So instead of manually trying different values and hoping for the best, Optuna does the heavy lifting — smartly exploring the options and giving us the best model setup.