In [None]:
# Smoke-test cell: prints a fixed string.
print("hi")

hi


In [None]:

!unzip /content/preprocessed.zip

Archive:  /content/preprocessed.zip
  inflating: preprocessed_token_terminal_active_addresses_daily.csv  
  inflating: preprocessed_token_terminal_unique_transacting_wallets.csv  
  inflating: preprocessed_token_terminal_transaction_count.csv  
  inflating: preprocessed_token_terminal_total_transaction_fees.csv  
  inflating: preprocessed_token_terminal_market_cap_circulating.csv  
  inflating: preprocessed_token_terminal_transaction_volume.csv  
  inflating: preprocessed_token_terminal_gas_used.csv  
  inflating: preprocessed_token_terminal_tokenholder_revenue.csv  
  inflating: preprocessed_token_terminal_token_incentives.csv  
  inflating: preprocessed_token_terminal_user_dau.csv  
  inflating: preprocessed_token_terminal_expenses.csv  
  inflating: preprocessed_token_terminal_user_wau.csv  
  inflating: preprocessed_token_terminal_gross_profit.csv  
  inflating: preprocessed_token_terminal_market_cap_fully_diluted.csv  
  inflating: preprocessed_token_terminal_active_developers.csv

In [None]:
# prompt: read all csv files and print their name along with their columns

import glob
import pandas as pd

# Sorted so the listing order is stable across runs (glob returns files in arbitrary order).
csv_files = sorted(glob.glob('*.csv'))

for csv_file in csv_files:
    print(f"File: {csv_file}")
    try:
        # nrows=0 parses only the header row; the column names are all this cell prints,
        # so there is no need to load every data row of every file.
        columns = pd.read_csv(csv_file, nrows=0).columns.tolist()
        print("Columns:", columns)
    except Exception as e:
        # Best-effort listing: report the unreadable file and move on to the next one.
        print(f"Error reading {csv_file}: {e}")
    print("-" * 20)

File: preprocessed_token_terminal_tokenholders.csv
Columns: ['project_id', 'year_month', 'timestamp', 'project_name', 'metric_id', 'value']
--------------------
File: preprocessed_token_terminal_market_cap_fully_diluted.csv
Columns: ['project_id', 'year_month', 'timestamp', 'project_name', 'metric_id', 'value']
--------------------
File: preprocessed_token_terminal_tokenholder_revenue.csv
Columns: ['project_id', 'year_month', 'value']
--------------------
File: preprocessed_token_terminal_user_mau.csv
Columns: ['project_id', 'year_month', 'timestamp', 'project_name', 'metric_id', 'value']
--------------------
File: preprocessed_token_terminal_transaction_count.csv
Columns: ['project_id', 'year_month', 'value']
--------------------
File: preprocessed_token_terminal_market_cap_circulating.csv
Columns: ['project_id', 'year_month', 'timestamp', 'project_name', 'metric_id', 'value']
--------------------
File: preprocessed_token_terminal_trade_count.csv
Columns: ['project_id', 'year_month', 

In [None]:
import pandas as pd
import numpy as np
import os

# Directory containing preprocessed files.
# Defaults to "/content"; set the PREPROCESSED_DIR environment variable to point
# the notebook at a different directory without editing this cell.
folder = os.environ.get("PREPROCESSED_DIR", "/content")

# Load all relevant files into a dictionary
def load_csv(metric_name):
    """Read one preprocessed metric CSV into a DataFrame.

    The file is looked up as ``preprocessed_token_terminal_<metric_name>.csv``
    inside the module-level ``folder`` directory. A progress line naming the
    metric and the resolved path is printed before the file is read.

    Args:
        metric_name: Metric identifier used to build the file name.

    Returns:
        The DataFrame produced by ``pd.read_csv`` for that file.
    """
    filename = f"preprocessed_token_terminal_{metric_name}.csv"
    path = os.path.join(folder, filename)
    print(f"📥 Loading {metric_name} from {path}")
    return pd.read_csv(path)

# Load required dataframes
metric_names = [
    "market_cap_circulating",
    "market_cap_fully_diluted",
    "revenue",
    "tokenholders",
    "user_dau",
    "user_mau",
    "fees",
    "fees_supply_side",
    "transaction_volume",
    "gross_profit",
    "cost_of_revenue",
    "gas_used",
    "transaction_count",
    "total_transaction_fees",
    "trading_volume",
]

# One DataFrame per metric, keyed by metric name, loaded in the order listed above.
dfs = {}
for name in metric_names:
    dfs[name] = load_csv(name)

# Clean function
def clean(df):
    """Return an independent frame holding only the key and value columns.

    Args:
        df: DataFrame that contains at least the columns ``project_id``,
            ``year_month`` and ``value``.

    Returns:
        A new DataFrame with exactly those three columns, in that order.

    Raises:
        KeyError: If any of the three columns is missing from ``df``.
    """
    # .copy() detaches the result from `df`: without it pandas may flag the
    # column subset as a copy-of-slice and emit SettingWithCopyWarning as soon
    # as a caller adds or overwrites a column on the returned frame.
    return df[["project_id", "year_month", "value"]].copy()

# Clean all DataFrames: swap each loaded frame for its cleaned version under the same key.
for k, raw_frame in list(dfs.items()):
    dfs[k] = clean(raw_frame)
    print(f"✅ Cleaned: {k}, shape = {dfs[k].shape}")

# Join helper
def merge(left, right, suffixes=("", "_r")):
    """Inner-join two frames on ``project_id`` and ``year_month``.

    The shapes of both inputs are printed before joining.

    Args:
        left: Left-hand DataFrame.
        right: Right-hand DataFrame.
        suffixes: Suffix pair applied to overlapping non-key column names;
            with the default, the right frame's ``value`` becomes ``value_r``.

    Returns:
        The joined DataFrame, containing only key pairs present in both inputs.
    """
    join_keys = ["project_id", "year_month"]
    print(f"🔗 Merging: {left.shape} + {right.shape}")
    merged = pd.merge(
        left,
        right,
        how="inner",
        on=join_keys,
        suffixes=suffixes,
    )
    return merged

# Derived Metrics: metric name -> DataFrame
derived = dict()

# 1. Circulating P/S Ratio = market_cap_circulating / revenue
df = merge(dfs["market_cap_circulating"], dfs["revenue"])
# Zero revenue is swapped for NaN first, so those rows come out as NaN.
revenue_or_nan = df["value_r"].replace(0, np.nan)
df["value"] = df["value"] / revenue_or_nan
derived["circulating_ps_ratio"] = df

# 2. Fully Diluted P/S Ratio = market_cap_fully_diluted / revenue
fdv_ps = merge(dfs["market_cap_fully_diluted"], dfs["revenue"])
# Zero revenue is treated as missing so the ratio is NaN instead of +/-inf,
# the same way the circulating P/S ratio guards its denominator.
fdv_ps["value"] = fdv_ps["value"] / fdv_ps["value_r"].replace(0, np.nan)
derived["fdv_ps_ratio"] = fdv_ps
print("📊 Derived: fdv_ps_ratio")

# 3. Tokenholder Growth: fractional change versus the project's previous row
#    (0.10 means +10%; the value is not multiplied by 100).
df_tok = dfs["tokenholders"].sort_values(by=["project_id", "year_month"])
# Previous row's value within each project; NaN on a project's first row.
df_tok["value_prev"] = df_tok.groupby("project_id")["value"].shift(1)
# A zero previous value is treated as missing so growth is NaN instead of +/-inf.
# NOTE(review): shift(1) compares against the previous available row, which is the
# previous calendar month only if a project has no gaps in year_month — confirm.
df_tok["value"] = (df_tok["value"] - df_tok["value_prev"]) / df_tok["value_prev"].replace(0, np.nan)
derived["tokenholder_growth"] = df_tok.drop(columns=["value_prev"])
print("📊 Derived: tokenholder_growth")

# 4. DAU / MAU
dau_mau = merge(dfs["user_dau"], dfs["user_mau"])
# Zero MAU is treated as missing so the ratio is NaN instead of +/-inf,
# consistent with the other ratio metrics in this cell that guard their denominator.
dau_mau["value"] = dau_mau["value"] / dau_mau["value_r"].replace(0, np.nan)
derived["dau_mau_ratio"] = dau_mau
print("📊 Derived: dau_mau_ratio")

# 5. Protocol Take Rate = (fees - supply_side_fees) / transaction_volume
# Step 1: fees minus supply-side fees for each (project_id, year_month) pair.
df_take = merge(dfs["fees"], dfs["fees_supply_side"])
net_fees = df_take["value"] - df_take["value_r"]
df_take["net_fees"] = net_fees

# Step 2: join transaction volume; its "value" column comes in as "value_tx".
df_take = merge(df_take, dfs["transaction_volume"], suffixes=("", "_tx"))
volume_or_nan = df_take["value_tx"].replace(0, np.nan)  # zero volume -> NaN result
df_take["value"] = df_take["net_fees"] / volume_or_nan

num_missing = df_take["value"].isna().sum()
print(f"📊 Derived: protocol_take_rate — {num_missing} rows with NaN (likely due to 0 transaction_volume)")
# Keep only the key columns and the final rate.
derived["protocol_take_rate"] = df_take.loc[:, ["project_id", "year_month", "value"]]

# 6. Gross Margin = gross_profit / revenue
gross_margin = merge(dfs["gross_profit"], dfs["revenue"])
# Zero revenue is treated as missing so the margin is NaN instead of +/-inf,
# consistent with the other ratio metrics in this cell that guard their denominator.
gross_margin["value"] = gross_margin["value"] / gross_margin["value_r"].replace(0, np.nan)
derived["gross_margin"] = gross_margin
print("📊 Derived: gross_margin")

# 7. Net Revenue = revenue - cost_of_revenue
net_rev = merge(dfs["revenue"], dfs["cost_of_revenue"])
# After the merge, "value" is revenue and "value_r" is cost_of_revenue.
cost = net_rev["value_r"]
net_rev["value"] = net_rev["value"] - cost
derived["net_revenue"] = net_rev
print("📊 Derived: net_revenue")

# 8. Gas Used per Transaction = gas_used / transaction_count
gas_df = merge(dfs["gas_used"], dfs["transaction_count"])
# Zero transaction counts become NaN so the ratio is NaN rather than +/-inf.
gas_df["gas_per_tx"] = gas_df["value"] / gas_df["value_r"].replace(0, np.nan)
# The save loop exports only the "value" column, so the ratio has to live there.
# Previously "value" still held raw gas_used, and that is what ended up in
# gas_per_tx.csv. The named column is kept for anything that reads it directly.
gas_df["value"] = gas_df["gas_per_tx"]
derived["gas_per_tx"] = gas_df
print("📊 Derived: gas_per_tx")

# 9. Fees per Transaction = fees / transaction_count
fees_df = merge(dfs["fees"], dfs["transaction_count"])
# Zero transaction counts become NaN so the ratio is NaN rather than +/-inf.
fees_df["fees_per_tx"] = fees_df["value"] / fees_df["value_r"].replace(0, np.nan)
# The save loop exports only the "value" column, so the ratio has to live there.
# Previously "value" still held raw fees, and that is what ended up in
# fees_per_tx.csv. The named column is kept for anything that reads it directly.
fees_df["value"] = fees_df["fees_per_tx"]
derived["fees_per_tx"] = fees_df
print("📊 Derived: fees_per_tx")

# 10. Trading Volume per User (MAU) = trading_volume / user_mau
vol_per_user = merge(dfs["trading_volume"], dfs["user_mau"])
# Zero MAU is swapped for NaN first, so those rows come out as NaN.
mau_or_nan = vol_per_user["value_r"].replace(0, np.nan)
vol_per_user["value"] = vol_per_user["value"] / mau_or_nan
derived["volume_per_user"] = vol_per_user
print("📊 Derived: volume_per_user")

# Save all derived metrics: one CSV per entry of `derived`, named after the metric.
output_folder = "derived_metrics"
os.makedirs(output_folder, exist_ok=True)  # no error if the folder already exists

for metric in derived:
    df = derived[metric]
    # Only the key columns and "value" are exported; helper columns are dropped.
    df_out = df.loc[:, ["project_id", "year_month", "value"]]
    path = os.path.join(output_folder, metric + ".csv")
    df_out.to_csv(path, index=False)
    print(f"✅ Saved: {path}, shape = {df_out.shape}")


📥 Loading market_cap_circulating from /content/preprocessed_token_terminal_market_cap_circulating.csv
📥 Loading market_cap_fully_diluted from /content/preprocessed_token_terminal_market_cap_fully_diluted.csv
📥 Loading revenue from /content/preprocessed_token_terminal_revenue.csv
📥 Loading tokenholders from /content/preprocessed_token_terminal_tokenholders.csv
📥 Loading user_dau from /content/preprocessed_token_terminal_user_dau.csv
📥 Loading user_mau from /content/preprocessed_token_terminal_user_mau.csv
📥 Loading fees from /content/preprocessed_token_terminal_fees.csv
📥 Loading fees_supply_side from /content/preprocessed_token_terminal_fees_supply_side.csv
📥 Loading transaction_volume from /content/preprocessed_token_terminal_transaction_volume.csv
📥 Loading gross_profit from /content/preprocessed_token_terminal_gross_profit.csv
📥 Loading cost_of_revenue from /content/preprocessed_token_terminal_cost_of_revenue.csv
📥 Loading gas_used from /content/preprocessed_token_terminal_gas_used.

In [None]:
import random

# Spot-check each derived metric: print the last rows of a few randomly chosen projects.
for metric_name, df in derived.items():
    print(f"\n📈 Metric: {metric_name}           ***************")

    # Drop NaN values to avoid selecting projects with all nulls
    non_null_df = df.dropna(subset=["value"])

    # Get list of unique project_ids with at least one non-null value
    projects = non_null_df["project_id"].unique()

    # Only skip when there is nothing to show. Metrics with just one or two
    # populated projects used to be skipped entirely, hiding the sparse metrics.
    if len(projects) == 0:
        print(f"⚠️ Not enough non-null projects to sample from: {len(projects)} found.")
        continue

    # Pick up to 3 random projects (fewer when fewer are available);
    # random.sample raises ValueError if asked for more items than exist.
    sampled_projects = random.sample(list(projects), min(3, len(projects)))

    for project_id in sampled_projects:
        print(f"\n🔹 Project: {project_id}")
        print(non_null_df[non_null_df["project_id"] == project_id].tail(3))



📈 Metric: circulating_ps_ratio           ***************

🔹 Project: ramses
     project_id year_month       value       value_r
5530     ramses    2025-04   78.603520  14858.387646
5531     ramses    2025-05   42.427612  29561.292656
5532     ramses    2025-06  204.001065  10256.469617

🔹 Project: thales
     project_id year_month         value    value_r
6586     thales    2025-04  8.179889e+05   8.638647
6587     thales    2025-05  2.846032e+05  20.932744
6588     thales    2025-06  2.057838e+06   4.146801

🔹 Project: pocket-network
          project_id year_month        value       value_r
5283  pocket-network    2024-07  9652.975063  11735.204683
5284  pocket-network    2024-08  6687.780320   9271.528892
5293  pocket-network    2025-05  1311.807677  22151.412243

📈 Metric: fdv_ps_ratio           ***************

🔹 Project: aethir
    project_id year_month       value       value_r
272     aethir    2025-04  214.226401  5.980069e+06
273     aethir    2025-05  244.497588  5.377085e

In [None]:
# # prompt: read /content/derived_metrics folder
# # find out all csvs
# # read all csvs
# # find out project_ids having at least 1  non-null entry and save them in list for each csv
# # use the non-null list to randomly select 3 project_ids for each csv
# # print df.tail for each of the selected project_ids

# import pandas as pd
# derived_csv_files = glob.glob(os.path.join("derived_metrics", "*.csv"))

# for csv_file_path in derived_csv_files:
#     print(f"\n--- Processing File: {os.path.basename(csv_file_path)} ---")
#     try:
#         df = pd.read_csv(csv_file_path)

#         # Find project_ids with at least one non-null value in the 'value' column
#         non_null_projects = df.dropna(subset=['value'])['project_id'].unique().tolist()

#         if len(non_null_projects) == 0:
#             print("No projects with non-null values found.")
#             continue

#         print(f"Found {len(non_null_projects)} projects with non-null values.")

#         # Randomly select up to 3 project_ids (or fewer if less than 3 available)
#         num_to_sample = min(3, len(non_null_projects))
#         sampled_project_ids = random.sample(non_null_projects, num_to_sample)

#         print(f"Randomly selected {num_to_sample} projects: {sampled_project_ids}")

#         # Print df.tail for each selected project_id
#         for project_id in sampled_project_ids:
#             print(f"\nTail of data for project_id: {project_id}")
#             project_df = df[df['project_id'] == project_id].dropna(subset=['value'])
#             if not project_df.empty:
#                 print(project_df.tail())
#             else:
#                 print("No non-null data points found for this project_id.")

#     except Exception as e:
#         print(f"Error processing {csv_file_path}: {e}")
#     print("-" * 30)

In [None]:
# # prompt: print latest gas_fee for all project_ids in derived

# # Select the gas_per_tx DataFrame
# gas_per_tx_df = derived["gas_per_tx"].copy()

# # Sort by project and year_month to get the latest entry easily
# gas_per_tx_df = gas_per_tx_df.sort_values(by=["project_id", "year_month"])

# # Get the latest entry for each project_id
# latest_gas_fee = gas_per_tx_df.groupby("project_id").tail(1)

# print("\nLatest Gas Used per Transaction (gas_per_tx) for all project_ids:")
# latest_gas_fee

In [None]:
# # prompt: print latest gas_fee for all project_ids in derived and print all project_ids for which there is even 1 non null non zero gas_fee

# # Find the gas_fees DataFrame in the dfs dictionary
# gas_fees_df = dfs.get("fees")

# if gas_fees_df is not None:
#     print("\n--- Latest Gas Fees for all project_ids ---")
#     # Sort by project_id and year_month to easily get the latest entry for each project
#     gas_fees_df_sorted = gas_fees_df.sort_values(by=['project_id', 'year_month'])

#     # Get the last entry for each project_id
#     latest_gas_fees = gas_fees_df_sorted.groupby('project_id').tail(1)

#     # Print the latest gas fees
#     if not latest_gas_fees.empty:
#         print(latest_gas_fees[['project_id', 'year_month', 'value']])
#     else:
#         print("No gas fees data found.")

#     print("\n--- Project_ids with at least one non-null, non-zero gas fee ---")
#     # Filter for rows where 'value' is not null and not zero
#     non_null_zero_gas_fees = gas_fees_df[(gas_fees_df['value'].notna()) & (gas_fees_df['value'] != 0)]

#     # Get unique project_ids from the filtered DataFrame
#     projects_with_non_null_zero_gas_fees = non_null_zero_gas_fees['project_id'].unique().tolist()

#     if projects_with_non_null_zero_gas_fees:
#         print(projects_with_non_null_zero_gas_fees)
#     else:
#         print("No project_ids found with a non-null, non-zero gas fee.")
# else:
#     print("Gas fees DataFrame not found in the loaded data.")

In [None]:

# uniswap_fees_per_tx = derived["fees_per_tx"][derived["fees_per_tx"]["project_id"] == "uniswap"]

# print("\nUniswap Fees per Transaction (fees_per_tx):")
# if not uniswap_fees_per_tx.empty:
#     print(uniswap_fees_per_tx.tail())
# else:
#     print("No fees_per_tx data found for Uniswap.")

#     # prompt: from derived show gas_fees, transaction_count for uniswap

# # Filter derived gas_per_tx for 'uniswap'
# uniswap_gas_per_tx = derived["gas_per_tx"][derived["gas_per_tx"]["project_id"] == "uniswap"]

# # Filter dfs transaction_count for 'uniswap'
# uniswap_transaction_count = dfs["transaction_count"][dfs["transaction_count"]["project_id"] == "uniswap"]

# print("\nUniswap Gas Used per Transaction (gas_per_tx):")
# if not uniswap_gas_per_tx.empty:
#     print(uniswap_gas_per_tx.tail())
# else:
#     print("No gas_per_tx data found for Uniswap.")

# print("\nUniswap Transaction Count:")
# if not uniswap_transaction_count.empty:
#     print(uniswap_transaction_count.tail())
# else:
#     print("No transaction_count data found for Uniswap.")

# # Find the gas_fees DataFrame in the dfs dictionary and filter for 'uniswap'
# uniswap_gas_fees = dfs.get("fees")
# if uniswap_gas_fees is not None:
#     uniswap_gas_fees = uniswap_gas_fees[uniswap_gas_fees["project_id"] == "uniswap"]
#     print("\nUniswap Gas Fees (derived from 'fees'):")
#     if not uniswap_gas_fees.empty:
#         print(uniswap_gas_fees.tail())
#     else:
#         print("No 'fees' data found for Uniswap.")
# else:
#     print("Gas fees DataFrame (derived from 'fees') not found in the loaded data.")


In [None]:
# # prompt: dfs["market_cap_fully_diluted"] for kwenta project_id

# kwenta_market_cap_fd = dfs["market_cap_fully_diluted"][dfs["market_cap_fully_diluted"]["project_id"] == "kwenta"]

# print("\nKwenta Market Cap Fully Diluted:")
# if not kwenta_market_cap_fd.empty:
#     print(kwenta_market_cap_fd.tail())
# else:
#     print("No 'market_cap_fully_diluted' data found for Kwenta.")
# # prompt: dfs["revenue"] for kwenta project_id

# kwenta_market_cap_fd = dfs["revenue"][dfs["revenue"]["project_id"] == "kwenta"]

# print("\nKwenta revenue Fully Diluted:")
# if not kwenta_market_cap_fd.empty:
#     print(kwenta_market_cap_fd.tail())
# else:
#     print("No 'revenue' data found for Kwenta.")

In [None]:
# prompt: zip derived_metrics
# Recursively (-r) archive the derived_metrics directory into derived_metrics.zip.

!zip -r derived_metrics.zip derived_metrics

  adding: derived_metrics/ (stored 0%)
  adding: derived_metrics/protocol_take_rate.csv (deflated 72%)
  adding: derived_metrics/tokenholder_growth.csv (deflated 67%)
  adding: derived_metrics/fees_per_tx.csv (deflated 63%)
  adding: derived_metrics/dau_mau_ratio.csv (deflated 69%)
  adding: derived_metrics/volume_per_user.csv (deflated 63%)
  adding: derived_metrics/gas_per_tx.csv (deflated 64%)
  adding: derived_metrics/net_revenue.csv (deflated 63%)
  adding: derived_metrics/gross_margin.csv (deflated 68%)
  adding: derived_metrics/circulating_ps_ratio.csv (deflated 66%)
  adding: derived_metrics/fdv_ps_ratio.csv (deflated 66%)
