# About the Data
This analysis is based on a JSONL dump of all events from one of the largest Nostr relays. The full dataset is 175 GB, but for this analysis it has been reduced to include only zap receipts (kind 9735).

This dataset was prefiltered using grep commands to only contain kind 9735 (Zap Receipt Events on Nostr). 

* id: the id of the event
* pubkey: the zap sender
* created_at: "Should" be set to the invoice paid_at date
* kind:
* tags:
  * Must Include:
     * p: zap recipient
     * bolt11: description hash bolt11 invoice
     * description: JSON-encoded zap request (has all the zap request data), i.e. Kind 9734
        * content: optional message sent with payment
        * tags:
           * relays: list of relays the recipients wallet should publish its zap receipt to
           * amount: payment amount in millisats (recommended but optional)
           * lnurl: lnurl pay url of recipient (recommended but optional)
           * p: pubkey of recipient
           * e: event id (if zapping to event rather than person this is required)
           * a: event coordinate
  * May Include:
     * e: event id if zapping to an event rather than a person
     * a: event coordinate
     * preimage: match against payment hash of bolt11 invoice (not proof of payment, there isn't any way to prove invoice is real or has been paid)
     * P: pubkey of zap sender
* content: empty field per nostr-protocol/nips
* sig:

Note: There is no way to verify on Nostr that a zap payment is real. A zap receipt only indicates that a Nostr user fetched an invoice and implies that the invoice was paid. This data can be faked.

More info here: https://github.com/nostr-protocol/nips/blob/master/57.md

In [1]:
import pandas as pd
import datetime as dt
import numpy as np
import hvplot.pandas #this automatically loads hvplot 'Bokeh' extension
hvplot.extension('bokeh') #for clarity
import panel as pn
import param
import requests #get current btc to usd rate
import json

#HTML 
from IPython.display import Image, HTML, display

#identify and remove extreme outliers
from scipy import stats

#pie chart
from bokeh.palettes import BuPu, Category20
from bokeh.plotting import figure
from bokeh.transform import cumsum
from math import pi



In [2]:
def get_data(jsonl_file,chunksize):
    '''
    Load a JSON Lines file into one dataframe, reading it chunk by chunk.

    input: jsonl_file - path to the jsonl file (one JSON object per line)
           chunksize  - number of lines to read per chunk
    output: df holding every chunk stacked together with a fresh 0..n-1 index
    '''
    reader = pd.read_json(jsonl_file, lines=True, encoding='utf-8', chunksize=chunksize)
    # Pull every chunk out of the reader before stacking them
    frames = list(reader)
    return pd.concat(frames, ignore_index=True)

In [3]:
#function to perform initial clean
def initial_clean(df):
    '''
    input: df from get_data function (nostr data) with a unix 'created_at' column
    output: copy of the rows dated 2021 or later (Nostr created Nov 2020) with
            'created_dt' (datetime), 'year', 'month', 'time', 'hour',
            'minute' and 'date' columns
    note: 'created_dt', 'year' and 'month' are also written onto the df passed in
    '''
    # Unix seconds -> datetime; values that cannot be converted become NaT
    stamps = pd.to_datetime(df['created_at'], unit='s', errors='coerce')
    df['created_dt'] = stamps
    df['year'] = stamps.dt.year
    df['month'] = stamps.dt.month

    # NaT rows compare False here, so they are dropped together with pre-2021 rows
    recent = df[stamps.dt.year >= 2021].copy()

    # Remaining time parts are only added to the filtered copy
    for part in ('time', 'hour', 'minute', 'date'):
        recent[part] = getattr(recent['created_dt'].dt, part)
    return recent

In [4]:
def zap_tags_to_columns(df):
    '''
    Expand the nested `tags` list of every zap receipt into columns and derive
    the zap amount, message and target type.

    input: df from get_data & initial_clean function (nostr data); needs a
           'tags' column of [name, value, ...] lists that contains a
           'description' tag
    output: df with
            - 'tags' converted to a {name: value} dict (also on the df passed in)
            - one column per tag name, concatenated to the original columns
            - 'amount' (millisats, int64) and 'zap_content' extracted from the
              description JSON using regex
            - 'e_flag' = 'post' when the receipt has an 'e' tag, else 'person'
            - zero value zaps (amount missing or 0) removed and index reset
    side effect: prints how many zero value zaps were removed
    '''
    def nested_list_to_dict(nested_list):
        # Map tag name -> first value. Tags without a value (one element) are
        # skipped instead of raising IndexError; if a name repeats, the last
        # occurrence wins.
        return {tag[0]: tag[1] for tag in nested_list if len(tag) >= 2}

    #convert tags column (nested list) to dict
    df['tags'] = df['tags'].apply(nested_list_to_dict)

    #expand tags dict to separate columns and combine with the og data
    df_tags = df['tags'].apply(pd.Series)
    df = pd.concat([df, df_tags], axis=1)

    #extract amount from description field to get amount sent over zap (amount paid)
    #\s* tolerates JSON that was serialized with spaces after ',' / ':'
    amount_pattern = r'"amount"\s*,\s*"(\d+)"'
    amounts = df['description'].str.extract(amount_pattern, expand=False)
    #rows without a parsable amount become 0 and are removed below
    df['amount'] = pd.to_numeric(amounts, errors='coerce').fillna(0).astype(np.int64)

    #extract content field from description (what was said when zapped)
    content_pattern = r'"content"\s*:\s*"([^"]*)"'
    df['zap_content'] = df['description'].str.extract(content_pattern, expand=False)
    df['zap_content'] = df['zap_content'].replace('', pd.NA)

    #'e' is an optional tag: when no row carries it the column does not exist
    if 'e' in df.columns:
        is_post = df['e'].notna()
    else:
        is_post = pd.Series(False, index=df.index)
    df['e_flag'] = is_post.map({True: 'post', False: 'person'})

    #find out how many zero value "zaps" there are and remove them from df.
    zero_amount_count = (df['amount'] == 0).sum()
    print(zero_amount_count)
    df = df[df['amount'] != 0]
    # Reset index after removing rows
    df = df.reset_index(drop=True)
    return df

In [5]:
#conversions (amount is in millisats)
def millisats_to_btc(millisats):
    '''
    input: amount in millisats
    output: the same amount expressed in btc
    '''
    # 1 btc = 100,000,000 sats and 1 sat = 1,000 millisats
    millisats_per_btc = 100_000_000_000
    return millisats / millisats_per_btc

def millisats_to_sats(millisats):
    '''
    input: amount in millisats
    output: the same amount expressed in sats (1 sat = 1,000 millisats)
    '''
    return millisats / 1000

def btc_to_usd_rate():
    '''
    Get the current btc to usd rate from the BitPay rates API.

    output: USD price of 1 BTC as a float, or None when the request fails or
            the response cannot be parsed (the error is printed)
    '''
    url = 'https://bitpay.com/api/rates'
    try:
        # timeout: without one a stalled connection would block forever
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        rates = response.json()

        if not isinstance(rates, list):
            raise ValueError('Unexpected API response format - not a list')

        for rate in rates:
            if rate.get('code') == 'USD':
                return float(rate.get('rate'))
        raise ValueError('USD rate not found in BitPay API response')

    except requests.exceptions.RequestException as req_err:
        print(f"Request error occurred: {req_err}")
    except (AttributeError, KeyError, TypeError, ValueError) as e:
        # TypeError: USD entry has no usable 'rate'; AttributeError: a list
        # entry is not a dict
        print(f"Error parsing data: {e}")
    # Every failure path ends up here
    return None

def btc_to_usd(btc, usd_btc_rate):
    '''
    input: btc - amount in btc
           usd_btc_rate - usd price of 1 btc
    output: amount in usd
    '''
    return btc * usd_btc_rate



In [6]:
def amount_conversions(df):
    '''
    Add btc, sats and usd versions of the millisat 'amount' column.

    input: df with a numeric 'amount' column (millisats)
    output: the same df with 'amount_btc', 'amount_sats' and 'amount_usd'
            columns added (the df passed in is modified)
    raises: RuntimeError when the btc to usd rate could not be fetched
    side effect: prints the rate used for the usd conversion
    '''
    usd_btc_rate = btc_to_usd_rate()
    if usd_btc_rate is None:
        # btc_to_usd_rate() signals failure with None; fail with a clear
        # message rather than an obscure TypeError inside the multiplication
        raise RuntimeError('BTC to USD rate is unavailable; cannot compute amount_usd')
    print(f'1 BTC is currently {usd_btc_rate} USD')
    # The conversion helpers are plain arithmetic, so they are applied to the
    # whole column at once instead of row by row
    df['amount_btc'] = millisats_to_btc(df['amount'])
    df['amount_sats'] = millisats_to_sats(df['amount'])
    df['amount_usd'] = btc_to_usd(df['amount_btc'], usd_btc_rate)
    return df

In [7]:
###identify purple to use for graphs/theme
# Hex colour reused by the plot functions and the dashboard styling in this file
purple_color = "#9119a6"

def get_decimal_places(amount_col):
    '''
    input: amount col such as 'amount_usd'
    output: format spec (e.g. '.2f') to use for graphs/dashboard uniformity
    raises: ValueError when amount_col is not one of the known amount columns
    '''
    formats = {
        'amount_usd': '.2f',
        'amount_btc': '.6f',
        'amount_sats': '.0f',
        'amount': '.0f',
    }
    if amount_col not in formats:
        raise ValueError(f"Unknown amount_selected: {amount_col}")
    return formats[amount_col]

#-------plot functions
def generate_hvplot(df, x, y):
    '''
    Scatter plot of an amount column over time.

    input: df - dataframe to plot
           x  - 'date/time' column such as 'created_dt'
           y  - 'amount' column such as 'amount', 'amount_usd'
    output: hvplot scatter whose y axis ticks use the format for the chosen amount column
    '''
    tick_format = '%' + get_decimal_places(amount_col=y)
    scatter_options = dict(xlabel='Date', ylabel='Amount', title='Amounts Over Time',
                           grid=True, responsive=True, color=purple_color)
    scatter = df.hvplot.scatter(x=x, y=y, **scatter_options)
    scatter.opts(yformatter=tick_format)
    return scatter


def generate_hvplot_pareto(df, x, y, person):
    '''
    input: dataframe grouped by either pubkey or p, calc sum of an amount column,
            already sorted descending (only the first 10 rows are plotted)
            x = 'pubkey' AKA Sender or 'p' AKA recipient
            y = one of the amount columns: 'amount', 'amount_usd', 'amount_btc'
            person = specify 'Sender' or 'Recipient' for the chart title label for clarity;
                     also the column whose value is used as the hover image url
    output: hvplot descending barchart
    note: hvplot doesn't currently support dual axis or true pareto graphs
    '''
    # Determine decimal places based on y input
    decimal_places = '%' + get_decimal_places(amount_col=y)

    # Limit to top 10 values
    df_top10 = df.head(10)
    
    # Define hover tooltip using HTML
    h_tooltips = """
    <div>
        <div>
            <img
                src="@{%s}" height="100" alt="robo-pubkey" width="100"
                style="float: left; margin: 0px 15px 15px 0px;"
                border="2"
            ></img>
        </div>

    </div>
    """ % (person)
    
    # Plot using HoloViews hvplot
    pareto_plot = df_top10.hvplot.bar(x=x, y=y, xlabel='hover reveals robo "pubkey"', ylabel=y.capitalize(),
                                      title=f"Top 10 total {y.capitalize()} by {person} (Pareto)",
                                      width=400, height=400, alpha=0.6, color=purple_color,
                                      line_color=None, hover_cols=[person])
    pareto_plot.opts(tools=['hover'], hover_tooltips=h_tooltips, xaxis='bare')
    # Customize y-axis tick format
    pareto_plot.opts(yformatter=decimal_places)
    return pareto_plot

# Create a pie chart
def make_pie_chart(df):
    '''
    input: dataframe with pubkey sender and p recipient columns
    output: bokeh pie chart of unique users split into
            only recipients / only senders / both
    '''
    recipient_count = df['p'].nunique()
    sender_count = df['pubkey'].nunique()
    combined_count = pd.concat([df['p'], df['pubkey']]).nunique()
    # Keys counted once in the combined total but once in each group are the overlap
    both = (recipient_count + sender_count) - combined_count

    # Group sizes, in legend order
    x = {
        'only recipients': recipient_count - both,
        'only senders': sender_count - both,
        'both': both,
    }
    data = pd.Series(x).reset_index(name='value')
    data = data.rename(columns={'index':'unique'})
    # Each wedge gets its share of the full circle
    data['angle'] = data['value']/data['value'].sum() * 2*pi
    data['color'] = BuPu[len(x)]

    p = figure(height=300, title='# of Unique Senders/Recipients', toolbar_location=None,
               tools='hover', tooltips='@unique: @value', x_range=(-0.5, 1.0))

    p.wedge(x=0, y=1, radius=0.4,
            start_angle=cumsum('angle', include_zero=True), end_angle=cumsum('angle'),
            line_color='white', fill_color='color', legend_field='unique', source=data)

    # Plain pie: no axes, no grid, legend tucked into the corner
    p.axis.axis_label=None
    p.axis.visible=False
    p.grid.grid_line_color = None
    p.legend.location = 'bottom_right'
    return p

def path_to_image_html(path):
    '''
    input: link to an image (string)
    output: html <img> tag, 30 wide, pointing at that link
    '''
    pieces = ['<img src="', path, '" width="30" >']
    return ''.join(pieces)


def display_transaction(row, amount_selected):
    '''
    input: dataframe row containing columns created in other functions
            columns = sender, recipient, zap content, amount
           amount_selected = name of the amount column to display
    output: html for table layout containing:
            sender (left), amount & message (center), recipient (right)
    note: zap messages are free text written by other users, so everything
          interpolated into the markup is html-escaped; a missing message is
          shown as an empty string
    '''
    # Local import: `html` is not among this file's top-level imports
    from html import escape

    #get decimal places for formatting based amount type
    decimal_places = get_decimal_places(amount_selected)

    # Format the amount based on amount_selected
    formatted_amount = f"{row[amount_selected]:{decimal_places}}"

    # Escape untrusted text so it is displayed, never interpreted as markup
    content = row["zap_content"]
    content_text = '' if pd.isna(content) else escape(str(content))
    sender_src = escape(str(row["sender"]), quote=True)
    recipient_src = escape(str(row["recipient"]), quote=True)

    parts = [
        '<div style="display: flex; align-items: stretch; border: 1px solid #ccc; padding: 10px; row-gap:10px;">',
        # Display sender image
        f'<div><img src="{sender_src}" style="width: 50px; height: 50px; object-fit: contain;"></div>',
        # Display amount and content
        f'<div style="margin-left: auto;"><p><b>Amount:</b> {formatted_amount}</p><p><b>Content:</b> {content_text}</p></div>',
        # Display recipient image
        f'<div style="margin-left: auto;"><img src="{recipient_src}" style="width: 50px; height: 50px; object-fit: contain;"></div>',
        '</div>',
    ]
    return ''.join(parts)
    

In [8]:
def pubkey_to_robohash(pubkeys):
    '''
    input: iterable of pubkeys
    output: dict mapping each pubkey to its robohash url
    '''
    url_template = "https://robohash.org/{}.png"
    return {key: url_template.format(key) for key in pubkeys}


def senders_and_recipients_columns(df):
    '''
    input: df containing 'p', 'pubkey'
    Note: uses function call to get robohash 'pubkey_to_robohash'
    output: same df with
            'sender' / 'recipient'         - robohash url for pubkey / p
            'zap_sender' / 'zap_recipient' - that url wrapped in an html image tag
    '''
    # One robohash url per distinct key appearing as sender or recipient
    all_keys = pd.concat([df['p'], df['pubkey']]).drop_duplicates(keep='last')
    robo_dict = pubkey_to_robohash(all_keys)

    # url columns first, then the html versions built from them
    for target, source in (('sender', 'pubkey'), ('recipient', 'p')):
        df[target] = df[source].apply(lambda key: robo_dict[key])
    for target, source in (('zap_sender', 'sender'), ('zap_recipient', 'recipient')):
        df[target] = df[source].apply(path_to_image_html)
    return df

def remove_outliers(df, threshold=3):
    '''
    input: df with 'amount_btc', 'amount' and 'amount_usd' columns
           threshold - largest |Z-score| a row may have in every amount
                       column and still be kept (default 3)
    output: df minus amount col outlier rows
    note: a Z-score is undefined (NaN) when a column has no variation or
          contains NaN; such scores are not treated as outliers, so a frame
          whose amounts are all equal is returned unchanged instead of empty
    '''
    amount_cols = ['amount_btc', 'amount', 'amount_usd']

    # Calculate Z-score for amount columns (column-wise)
    z_scores = pd.DataFrame(
        stats.zscore(df[amount_cols].to_numpy(dtype=float), axis=0),
        index=df.index,
        columns=amount_cols,
    )

    # Keep a row only if every amount column is within the threshold
    within = z_scores.abs().le(threshold) | z_scores.isna()
    keep = within.all(axis=1).to_numpy()
    return df[keep]


# Panel App & Data Load

In [9]:
#--------get data using previously defined functions---------
@pn.cache #to speed up dashboard
def df_creator_chain(jsonl_file, chunksize):
    '''
    Run the whole load-and-clean pipeline on one jsonl file.

    input: jsonl_file, chunksize - passed straight to get_data
    output: df after every cleaning/enrichment step, in this order:
            initial_clean, zap_tags_to_columns, amount_conversions,
            senders_and_recipients_columns, remove_outliers
    '''
    frame = get_data(jsonl_file, chunksize)
    pipeline = (
        initial_clean,
        zap_tags_to_columns,
        amount_conversions,
        senders_and_recipients_columns,
        remove_outliers,
    )
    # Each step takes the frame produced by the previous one
    for step in pipeline:
        frame = step(frame)
    return frame
# Dataset to load; the commented-out paths are alternative files to switch to
jsonl_file = 'datasets/zaps_20k.jsonl'
#jsonl_file = 'datasets/zaps_1m.jsonl'
#jsonl_file = 'datasets/all_zaps.jsonl'
# Second argument is the chunksize forwarded to get_data
df = df_creator_chain(jsonl_file, 10000)
df.keys()
# Last expression of the cell: the notebook echoes the number of rows kept
len(df)

628272
1 BTC is currently 62971.51 USD


371538

In [10]:
#Building the Panel Dashboard
pn.extension('tabulator',sizing_mode="stretch_width", design="fast")

#----------define columns to use-----------------------------------------
datetime_col = 'created_dt'
sender_col = 'sender'
recipient_col = 'recipient'
event_flag_col = 'e_flag'
default_amount_col = 'amount_sats'
df_pane_cols = ['zap_sender', 'amount', 'zap_recipient', 'zap_content']
amount_cols = [col for col in df.columns if 'amount' in col]

#------define styles-------------------------------------------------------
ACCENT = purple_color
stats_styles={'border':'solid 1px', 'min-height':'50px',
              'background-color': 'transparent', 'padding': '30px',
              'box-shadow': '10px 10px 5px lightgray'}
about_styles={'color': purple_color, 'font-size': '15px',
              'min-height':'50px'}
other_styles={'color': 'black', 'font-size': '14px'}

#-----define sizes for uniformity-------------------------------------------
default_height = 300
default_width = 300


#-----Nostr zap image------------------------------------------------------------
png_pane = pn.pane.Image('https://scrapbox.io/files/6482a5c14f4efb001ca406a7.png')

#----define widgets to add interactivity------------------------------------------
e_flag_widget = pn.widgets.CheckBoxGroup(name="Zapping a Post=1, Person=0",
                                         options=['post','person'],
                                         value=['post','person'],
                                         inline=True)
start_date_range = df[datetime_col].min()
end_date_range = df[datetime_col].max()
datetime_slider = pn.widgets.DatetimeRangeSlider(name='Date Range',
                                                 start=start_date_range,
                                                 end=end_date_range,
                                                 value=(start_date_range,
                                                        end_date_range))
amount_widget = pn.widgets.Select(name="Amount Type",
                                  options=amount_cols,
                                  value=default_amount_col)

###########################function to handle updates#######################################
def update_data(event):
    '''
    Widget callback: re-filter the data and refresh every KPI, plot and the
    transaction list.

    input: event - the event object passed by param.watch (not used; the
                   current widget values are read directly)
    output: None - the indicators and panes are updated in place
    '''
    #updates data when user interacts with widgets
    start_date, end_date = datetime_slider.value
    selected_e_flag = list(e_flag_widget.value)
    selected_amount_type = amount_widget.value

    # Determine format based on amount type
    format_string = '{value:,'+ get_decimal_places(selected_amount_type) + '}'

    #filter dataframe based on user selection
    filtered_df = df[(df[datetime_col]>= start_date) &
                    (df[datetime_col] <= end_date) &
                    (df[event_flag_col].isin(selected_e_flag))]
    amounts = filtered_df[selected_amount_type]

    # Update indicators based on filtered data (plain floats, matching how
    # the indicators are first initialised)
    total_zaps.value = float(amounts.sum())
    total_zaps.format = format_string
    average_zaps.value = float(amounts.mean())
    average_zaps.format = format_string
    max_zaps.value = float(amounts.max())
    max_zaps.format = format_string
    # mode() is empty when nothing matches the filters (e.g. both check boxes
    # unticked); indexing it blindly would raise IndexError
    modes = amounts.mode()
    mode_zaps.value = float(modes.iloc[0]) if not modes.empty else None
    mode_zaps.format = format_string

    #get top senders/receivers based on filtered_df
    #reset_index turns the grouped Series into a dataframe with the grouping
    #key as a column, the same shape used for the initial pareto plots
    top_senders = filtered_df.groupby(sender_col)[selected_amount_type].sum().sort_values(ascending=False).reset_index()
    top_recipients = filtered_df.groupby(recipient_col)[selected_amount_type].sum().sort_values(ascending=False).reset_index()

    #update sender/receiver plot panes
    updated_senders_pareto = generate_hvplot_pareto(top_senders,
                                                    x=sender_col,
                                                    y=selected_amount_type,
                                                    person=sender_col)
    updated_receivers_pareto = generate_hvplot_pareto(top_recipients,
                                                      x=recipient_col,
                                                      y=selected_amount_type,
                                                      person=recipient_col)
    senders_plot_pane.object = updated_senders_pareto
    receivers_plot_pane.object = updated_receivers_pareto

    #update Dataframe pane (show top 10 by zap amount only)
    sorted_df = filtered_df.sort_values(by=selected_amount_type, ascending=False).head(10)
    html_content = ''.join([display_transaction(row, amount_selected=selected_amount_type) for index, row in sorted_df.iterrows()])
    df_pane.object = HTML(html_content)

    #update plot pane
    updated_plot = generate_hvplot(filtered_df,
                                   x=datetime_col,
                                   y=selected_amount_type)
    plot_pane.object = updated_plot

    #update pie plot pane
    updated_pie_plot = make_pie_chart(filtered_df)
    pie_pane.object = updated_pie_plot
###############################################################################################    

#------------Update callback for widgets-------------------------------------------
# Re-run update_data whenever the 'value' of any of the three widgets changes
datetime_slider.param.watch(update_data,'value')
e_flag_widget.param.watch(update_data, 'value')
amount_widget.param.watch(update_data, 'value')

# Initial filtered DataFrame based on default values of sliders and selector
initial_filtered_df = df[(df[datetime_col]>= datetime_slider.value[0]) & 
                         (df[datetime_col] <= datetime_slider.value[1]) & 
                         (df[event_flag_col].isin([val for val in e_flag_widget.value]))]


#-----initialize plot, df, KPIs-------------------------------------------------------
#time series plot
initial_plot = generate_hvplot(initial_filtered_df, x=datetime_col, y=default_amount_col)
#--pareto plots
# Sum the default amount per sender/recipient, largest first; reset_index keeps the key as a column
top_senders = initial_filtered_df.groupby(sender_col)[default_amount_col].sum().sort_values(ascending=False).reset_index()
top_recipients = initial_filtered_df.groupby(recipient_col)[default_amount_col].sum().sort_values(ascending=False).reset_index()
initial_senders_pareto = generate_hvplot_pareto(top_senders,
                                                x=sender_col,
                                                y=default_amount_col,
                                                person=sender_col)
initial_receivers_pareto = generate_hvplot_pareto(top_recipients,
                                                  x=recipient_col,
                                                  y=default_amount_col,
                                                  person=recipient_col)

#------initialize dataframe pane using HTML----------------------------------------
# Ten largest zaps by the default amount column, one html card per row
initial_sorted_df = initial_filtered_df.sort_values(by=default_amount_col, ascending=False).head(10)
html_content = ''.join([display_transaction(row, amount_selected=default_amount_col) for index, row in initial_sorted_df.iterrows()])
df_pane = pn.pane.HTML(HTML(html_content), height=default_height, sizing_mode="stretch_width")

#------initialize all Plot Panes----------------------------------------------------
plot_pane = pn.pane.HoloViews(initial_plot,
                              sizing_mode="stretch_width",
                              height=default_height)
senders_plot_pane = pn.pane.HoloViews(initial_senders_pareto,
                                      sizing_mode="stretch_width",
                                      height=default_height)
receivers_plot_pane = pn.pane.HoloViews(initial_receivers_pareto,
                                        sizing_mode="stretch_width",
                                        height=default_height)
pie_pane = pn.pane.Bokeh(make_pie_chart(initial_filtered_df),
                          sizing_mode="stretch_both",
                          max_height=default_height)

#-------initialize KPIs for KPI pane------------------------
total_zaps_initial = float(initial_filtered_df[default_amount_col].sum())
average_zaps_initial = float(initial_filtered_df[default_amount_col].mean())
max_zaps_initial = float(initial_filtered_df[default_amount_col].max())
# mode() can hold several values; the first one is shown
# NOTE(review): .iloc[0] raises IndexError if the initial selection is empty
mode_zaps_initial = float(initial_filtered_df[default_amount_col].mode().iloc[0])

font_size_indicators = '12pt'
title_size_indicators = '14pt'

# Determine format based on amount type
format_string = '{value:,'+ get_decimal_places(default_amount_col) + '}'

total_zaps = pn.indicators.Number(name='Total Zaps: ',
                                  value=total_zaps_initial,
                                  format=format_string,
                                  font_size=font_size_indicators,
                                  title_size=title_size_indicators,
                                  styles=stats_styles)
average_zaps = pn.indicators.Number(name='Avg Zaps: ',
                                    value=average_zaps_initial,
                                    format=format_string,
                                    font_size=font_size_indicators,
                                    title_size=title_size_indicators,
                                    styles=stats_styles)
max_zaps = pn.indicators.Number(name='Max Zaps: ',
                                value=max_zaps_initial,
                                format=format_string,
                                font_size=font_size_indicators,
                                title_size=title_size_indicators,
                                styles=stats_styles)
mode_zaps = pn.indicators.Number(name='Mode Zaps: ',
                                 value=mode_zaps_initial,
                                 format=format_string,
                                 font_size=font_size_indicators,
                                 title_size=title_size_indicators,
                                 styles=stats_styles)

#-----initialize gridspec for stats & piechart---------------------------
# Two rows of KPIs in grid columns 0-3, pie chart spanning both rows in columns 5-8
stats_gridspec = pn.GridSpec(sizing_mode='stretch_width', max_height=default_height)
stats_gridspec[0,   0:2] = total_zaps
stats_gridspec[0,   2:4] = average_zaps
stats_gridspec[1,   0:2] = max_zaps
stats_gridspec[1,   2:4] = mode_zaps
stats_gridspec[0:2, 5:9] = pie_pane

#-------define sidebar widget column--------------------------------------
e_flag_widget_txt = pn.widgets.StaticText(name='',
                                          value='Include zaps to person/post or both?: ',
                                          styles=other_styles)
gridspec = pn.GridSpec(sizing_mode='stretch_both', max_height=default_height)

# NOTE(review): chained assignment - the same Column is stored in gridspec[3,3]
# and bound to user_selections; only user_selections is used further down
user_selections = gridspec[3,3]=pn.Column(datetime_slider,
                                          e_flag_widget_txt,
                                          e_flag_widget,
                                          amount_widget)
#--------------Set up row layouts---------------------------------------
r1 = pn.Row(stats_gridspec)
r2 = pn.Row(plot_pane, df_pane)
r3 = pn.Row(senders_plot_pane, receivers_plot_pane)


#---------define what the project is about-----------------------------------
about1 = pn.widgets.StaticText(name='Whats the tipping culture on Nostr?',
                               value='data is a subset of all data from one of the most popular nostr relays',
                               styles=about_styles)
about2 = pn.widgets.StaticText(name='About Nostr Zaps',
                               value='Nostr is a new social network that has a tipping option called zaps, a user can zap another user which pays their bitcoin wallet', 
                               styles=about_styles)

#----Instantiate template with sidebar widgets-------------------------------
template = pn.template.FastListTemplate(
    title='Nostr Zap Explorer',
    logo='https://scrapbox.io/files/6482a5c14f4efb001ca406a7.png',
    sidebar=[about1, png_pane, about2, user_selections],
    accent=ACCENT,
)
# Append row layouts to main area (list-like api)
template.main.append(r1)
template.main.append(r2)
template.main.append(r3)

#serve template
# trailing ';' keeps the notebook from echoing the return value
template.servable();
