# Project Inscriptions -- Data set parser

**[Johnnatan Messias](https://johnnatan-messias.github.io/), February 2025**


In [None]:
import polars as pl
import pandas as pd
from tqdm.notebook import tqdm

In [None]:
import sys
import os
# Locate the sibling "src" directory (one level above the current working
# directory) and append it to the import search path.
code_dir = os.path.join(os.getcwd(), "..", "src")
code_dir = os.path.realpath(code_dir)
sys.path.append(code_dir)

In [None]:
# Absolute path of the sibling "data" directory. The trailing separator is
# kept so that names can be appended with plain string concatenation.
data_dir = os.path.join(os.getcwd(), "..", "data")
data_dir = os.path.realpath(data_dir) + os.sep
print(data_dir)

/Users/johnnatanmessias/research-projects/research-proj-inscriptions/data/


In [4]:
# Column names and polars dtypes of the frames produced by `to_polars`.
# The key order is the column order of the resulting frame.
schema = pl.Schema({
    'block_number': pl.Int64,
    'tx_hash': pl.String,
    'tx_input_data': pl.String,
    'issuer': pl.String,
    'receiver': pl.String,
    'timestamp': pl.Datetime(time_unit='us', time_zone=None),
    'gas_used': pl.Int64,
    'gas_effective_price': pl.Int64,
    'fees': pl.Float64,
    'tx_status': pl.Int64,
})

In [5]:
# Mapping from the column names found in the raw CSV exports to the names
# used throughout this notebook. Several source names map to the same target
# ("gas_effective_price"); source names that are absent from a given file are
# simply ignored by `DataFrame.rename`.
columns_renamed = {
    # Transaction / block identity.
    "tx_hash": "tx_hash",
    "block_number": "block_number",
    "tx_input": "tx_input_data",
    # Sender and recipient.
    "tx_from_address": "issuer",
    "tx_to_address": "receiver",
    # Receipt and timing information.
    "receipt_gas_used": "gas_used",
    "block_timestamp": "timestamp",
    "effective_gas_price": "gas_effective_price",
    "receipt_status": "tx_status",
    "value": "amount",
    "receipt_effective_gas_price": "gas_effective_price",
}

# Columns (in output order) kept when converting a pandas frame to polars.
collumns_selected = [
    "block_number",
    "tx_hash",
    "tx_input_data",
    "issuer",
    "receiver",
    "timestamp",
    "gas_used",
    "gas_effective_price",
    "fees",
    "tx_status",
]


def rename_columns(df):
    """Return a copy of *df* with its columns renamed to the notebook's names.

    The module-level ``columns_renamed`` mapping is applied first. If the
    result still has no ``gas_effective_price`` column, a ``gas_price``
    column (when present) is renamed to take its place.
    """
    renamed = df.rename(columns=columns_renamed)
    if 'gas_effective_price' in renamed.columns:
        return renamed
    # Fall back to the plain gas price when no effective price was exported.
    return renamed.rename(columns={'gas_price': 'gas_effective_price'})


def preprocess_dataframe(df):
    """Add a ``fees`` column and parse ``timestamp``, modifying *df* in place.

    ``fees`` is ``gas_used * gas_effective_price / 1e18``. The product is
    computed in float64 rather than int64: an int64 multiplication wraps
    around silently once the product exceeds ~9.22e18, which yields wrong
    (even negative) fees for expensive transactions. For products that fit
    in int64 the float result is the same as before.

    Args:
        df: pandas DataFrame with ``gas_used``, ``gas_effective_price`` and
            ``timestamp`` columns.

    Returns:
        The same DataFrame object, with ``fees`` added and ``timestamp``
        converted by ``pd.to_datetime``.
    """
    gas_used = df['gas_used'].astype('float64')
    gas_price = df['gas_effective_price'].astype('float64')
    df['fees'] = gas_used * gas_price / 1e18
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    return df


def to_polars(df):
    """Convert a pandas DataFrame to a polars DataFrame.

    Only the columns listed in ``collumns_selected`` are kept, and the
    module-level ``schema`` is passed to the conversion.
    """
    # NOTE(review): `_from_pandas` is a private polars constructor; consider
    # switching to the public `pl.from_pandas` once equivalence is confirmed.
    selected = df[collumns_selected]
    return pl.DataFrame._from_pandas(selected, schema=schema)


def persitst_to_parquet(df, filepath):
    """Write *df* to *filepath* as a gzip-compressed parquet file."""
    codec = "gzip"
    df.write_parquet(filepath, compression=codec)


def decode_input_data(input_data):
    """Decode a hex string into UTF-8 text.

    Returns ``None`` when *input_data* is falsy (``None`` or ``""``), is not
    valid hexadecimal, is not a string, or does not decode as UTF-8.
    """
    if not input_data:
        return None
    try:
        raw_bytes = bytes.fromhex(input_data)
        text = raw_bytes.decode('utf-8')
    except (ValueError, TypeError, UnicodeDecodeError):
        text = None
    return text

In [6]:
# Directory expected to hold one sub-directory per chain.
chains_path = os.path.realpath(os.path.join(data_dir, "chains")) + os.sep

# Chains deliberately left out of processing. They are filtered out rather
# than removed with `list.remove`, which raises ValueError when the
# directory is not present.
excluded_chains = {'avalanche'}

# Sorted names of every chain sub-directory that is not excluded.
chains = sorted(
    chain for chain in os.listdir(chains_path)
    if chain not in excluded_chains
    and os.path.isdir(os.path.join(chains_path, chain))
)
chains

['arbitrum', 'base', 'bsc', 'ethereum', 'fantom', 'optimism', 'zksync']

In [7]:
# Convert each chain's gzipped CSV exports into a single parquet file
# "<chain>_inscriptions.parquet" next to the chain directories.
# Chains whose parquet file already exists are skipped; delete the file to
# force a chain to be re-processed.
for chain in tqdm(chains):
    print(chain)
    output_path = chains_path + chain + "_inscriptions.parquet"
    if os.path.exists(output_path):
        print("Already processed", chain)
        continue

    chain_dir = os.path.join(chains_path, chain)
    # Sorted so the row order of the output does not depend on the
    # arbitrary order returned by os.listdir.
    filenames = sorted(name for name in os.listdir(chain_dir)
                       if name.endswith(".csv.gz"))

    dfs = []
    for filename in filenames:
        df = pd.read_csv(os.path.join(chain_dir, filename))
        df = rename_columns(df)
        df = preprocess_dataframe(df)
        if df.shape[0] > 0:
            dfs.append(df)

    # pd.concat raises ValueError on an empty list, so skip chains that
    # have no non-empty CSV file instead of aborting the whole run.
    if not dfs:
        print("No data found for", chain)
        continue

    df_polars = to_polars(pd.concat(dfs))
    df_polars = df_polars.with_columns(
        pl.col('tx_input_data')
        # strip_prefix removes only a literal leading "0x". strip_chars("0x")
        # would strip every leading/trailing '0' and 'x' character and so
        # corrupt hex payloads that start or end with the digit 0.
        .str.strip_prefix("0x")
        .map_elements(decode_input_data, return_dtype=pl.Utf8)
        .alias("decoded_input_data")
    )
    persitst_to_parquet(df_polars, filepath=output_path)

  0%|          | 0/7 [00:00<?, ?it/s]

arbitrum
Already processed arbitrum
base
Already processed base
bsc
Already processed bsc
ethereum
Already processed ethereum
fantom
Already processed fantom
optimism
Already processed optimism
zksync
Already processed zksync
