#### Import required modules


In [1138]:
import pandas as pd
import plotly.express as px
import numpy as np
from datetime import datetime, timezone
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from ast import literal_eval
from scipy import stats
import difflib
from scipy.stats import zscore

#### Load all dataframes


In [1139]:
zkp_repos = pd.read_csv('zkp_repos.csv', sep=';')
tool_commits = pd.read_csv('tool_commits.csv')
application_commits = pd.read_csv('application_commits.csv')
tool_issues = pd.read_csv('tool_issues.csv')
repo_contributors = pd.read_csv('repo_contributors.csv')
contributor_data = pd.read_csv('contributor_data.csv')
branches_data = pd.read_csv('branches_data.csv')
# application_authors = pd.read_csv('application_authors.csv')
application_authors = pd.read_csv('application_contributors.csv')

In [1140]:
application_authors = application_authors[application_authors['Type'] == 'User']
remove_tools = ['cairo/starkware-libs', 'noir/noir-lang', 'starknet-rs/xjonathanlei', 'zokrates/zokrates', 'circom-compat/arkworks-rs', 'snarky/o1-labs']
tool_commits = tool_commits[~tool_commits['UniqueID'].isin(remove_tools) ]
tool_issues = tool_issues[~tool_issues['UniqueID'].isin(remove_tools) ]
repo_contributors = repo_contributors[~repo_contributors['UniqueID'].isin(remove_tools)]

#### Calculate AppCount

In [1141]:
zkp_repos.loc[zkp_repos["Type"] == "Application", "Tool"] = zkp_repos["Tool"].str.replace('[', '').str.replace(']', '').str.replace('\'', '').str.split(', ')
zkp_repos_exploded = zkp_repos.explode('Tool')

In [1142]:
tool_counts = zkp_repos_exploded["Tool"].value_counts().reset_index()

tool_counts.columns = ["Tool", "AppCount"]
tool_counts = tool_counts.merge(
    zkp_repos[zkp_repos["Type"] == "Tool"], left_on="Tool", right_on="Name", how="outer"
)[["UniqueID", "AppCount"]]

#### Application Author Count By Bin

In [1143]:
application_committers = application_commits.merge(application_authors, left_on='Author', right_on='Login', how='left')
application_committers.dropna(subset=['Login'], inplace=True)
application_committers = application_committers.groupby('UniqueID')['Author'].nunique().reset_index()
application_committers.rename(columns={'Author': 'AuthorCount'}, inplace=True)
application_committers.sort_values(by=['AuthorCount'], ascending=False, inplace=True)
bins = np.logspace(np.log10(0.5), np.log10(max(application_committers['AuthorCount'])), num=10)
bins = np.round(bins)
application_committers['AuthorGroup'] =  pd.cut(application_committers['AuthorCount'], bins=bins)
bin_counts = application_committers['AuthorGroup'].value_counts().sort_index().reset_index()
bin_counts['AuthorGroup'] = bin_counts['AuthorGroup'].astype(str)
bin_counts.rename(columns={'count': 'Count'}, inplace=True)

fig = px.bar(bin_counts,  
             x='AuthorGroup', 
             y='Count',
             title='Application Author Count Distribution per Bin',
             template='plotly_dark', 
             )

fig.show()

From the graph above it is clear that the first bin (0,1] contains the most (361) applications. This means that 361 out of the 905 applications have 1 author. 

The second largest bin is (2, 4] which contains 162 applications. The third largest bin is (1,2] which contains 149 applications. 

These bins can be joined to form the bin (0, 5] which contains 707 applications. This accounts for about ~77% of all applications. This shows that most applications have 5 or less authors. 

##### AuthorCount Outliers

In [1144]:
author_outliers = application_committers
z_scores = stats.zscore(application_committers['AuthorCount'])
outliers = (np.abs(z_scores) > 3)
author_outliers['Outliers'] = outliers
author_outliers = author_outliers[author_outliers['Outliers'] == True]
author_outliers.rename(columns={'Outliers': 'AuthorOutliers'}, inplace=True)
author_outliers.sort_values('AuthorCount')



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,UniqueID,AuthorCount,AuthorGroup,AuthorOutliers
322,kakarot/kkrt-labs,52,"(38.0, 79.0]",True
554,snarkos/aleohq,55,"(38.0, 79.0]",True
820,zkevm-circuits/kroma-network,55,"(38.0, 79.0]",True
654,starknet.js/0xs34n,60,"(38.0, 79.0]",True
356,madara/keep-starknet-strange,66,"(38.0, 79.0]",True
666,starknetbook/starknet-edu,69,"(38.0, 79.0]",True
116,cairo/starkware-libs,76,"(38.0, 79.0]",True
35,baseline/eea-oasis,76,"(38.0, 79.0]",True
373,mina/minaprotocol,88,"(79.0, 162.0]",True
609,starknet-ecosystem.com/419labs,137,"(79.0, 162.0]",True


#### Application Author Count


In [1145]:
fig = px.scatter(application_committers,  x='UniqueID', y='AuthorCount',
                   color='AuthorGroup',
                   title='Application Author Count Distribution',
                   template='plotly_dark', 
                )
fig.show()

The graph above shows the individual author counts for each application. 


`bitcoinprivate-legacy/btcprivate` has an author count of 455. Apart for this repo, the range from author counts is betweeen 1 and 133. 

Why does `bitcoinprivate-legacy/btcprivate` have such a high author count compared to other repositories? [`bitcoinprivate-legacy/btcprivate`](https://github.com/BTCPrivate/BitcoinPrivate-legacy) seems to be a popular repository based in the # of forks and # of stars. This may be why there are many authors contributing to the project. 

`bitcoinprivate-legacy/btcprivate` is (was) an open-source project where contributions are encouraged and welcome by the creation of code reviews. The workflow seems to be a combination of the Fork-and-Pull Workflow (where contributors create branches to implement a feature or fix an issue which is them merged to the main branch) and the Centralized Workflow (where some contributors commit directly to the main repository). 

#### Application Commit Count by Bin

In [1146]:
commit_counts = application_commits.groupby('UniqueID')['CommitHash'].count().reset_index()
commit_counts.rename(columns={'CommitHash': 'CommitCount'}, inplace=True)
bins = [min(commit_counts['CommitCount']) - 0.5, *np.logspace(np.log10(min(commit_counts['CommitCount'])), np.log10(max(commit_counts['CommitCount'])), num=20)]
bins = np.round(bins)
commit_counts['CommitGroup'] =  pd.cut(commit_counts['CommitCount'], bins=bins)
bin_commit_counts = commit_counts['CommitGroup'].value_counts().sort_index().reset_index()
bin_commit_counts.rename(columns={'count': 'Count'}, inplace=True)
bin_commit_counts['CommitGroup'] = bin_commit_counts['CommitGroup'].astype(str)

fig = px.bar(bin_commit_counts,  x='CommitGroup', y='Count',
                   title='Application Commit Count by Bins',
                   template='plotly_dark', 
                )

fig.show()

The distribution of commit counts seems to be highest towards the middle bins (skewed slightly towards the right). 

Higher commit counts could indicate projects that are maintained over time, whereas lower commit counts could indicate projects that existed for a limited time (or are no longer being maintained). 

##### CommitCount Outliers

In [1147]:
commit_outliers = commit_counts
z_scores = stats.zscore(commit_counts['CommitCount'])
outliers = (np.abs(z_scores) > 3)
commit_counts['Outliers'] = outliers
commit_outliers = commit_counts[commit_counts['Outliers'] == True]
commit_outliers.rename(columns={'Outliers': 'CommitOutliers'}, inplace=True)
commit_outliers.sort_values('CommitCount')



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,UniqueID,CommitCount,CommitGroup,CommitOutliers
117,cairo/starkware-libs,3998,"(3188.0, 5459.0]",True
906,zokrates/scrypt-inc,4389,"(3188.0, 5459.0]",True
415,o1js/o1-labs,4397,"(3188.0, 5459.0]",True
25,argent-x/argentlabs,4581,"(3188.0, 5459.0]",True
907,zokrates/zokrates,4591,"(3188.0, 5459.0]",True
436,pathfinder/eqlabs,4598,"(3188.0, 5459.0]",True
200,darkfi/darkrenaissance,7001,"(5459.0, 9347.0]",True
561,snarkos/aleohq,8567,"(5459.0, 9347.0]",True
563,snarkvm/aleohq,9691,"(9347.0, 16005.0]",True
48,bitcoinprivate-legacy/btcprivate,11051,"(9347.0, 16005.0]",True


#### Application Lifespan by AgeGroup

In [1148]:
zkp_applications = zkp_repos[zkp_repos['Type'] == 'Application']

first_commit = application_commits.groupby('UniqueID')['AuthorDate'].min().reset_index()
first_commit.rename(columns={'AuthorDate': 'FirstCommitDate'}, inplace=True)
first_commit = zkp_applications.merge(first_commit[['UniqueID', 'FirstCommitDate']], left_on='UniqueID', right_on='UniqueID', how='left').reset_index()
zkp_applications = zkp_applications.merge(first_commit[['UniqueID', 'FirstCommitDate']], left_on='UniqueID', right_on='UniqueID', how='left').reset_index()

last_commit = application_commits.groupby('UniqueID')['AuthorDate'].max().reset_index()
last_commit.rename(columns={'AuthorDate': 'LastCommitDate'}, inplace=True)
last_commit = zkp_applications.merge(last_commit[['UniqueID', 'LastCommitDate']], left_on='UniqueID', right_on='UniqueID', how='left').reset_index()
zkp_applications = zkp_applications.merge(last_commit[['UniqueID', 'LastCommitDate']], left_on='UniqueID', right_on='UniqueID', how='left').reset_index()

zkp_applications['FirstCommitDate'] = pd.to_datetime(zkp_applications['FirstCommitDate'], utc=True)
zkp_applications['LastCommitDate'] = pd.to_datetime(zkp_applications['LastCommitDate'], utc=True)
zkp_applications['Age'] = (zkp_applications['LastCommitDate'] - zkp_applications['FirstCommitDate']).dt.days
zkp_applications.loc[zkp_applications['Age'] == 0,'Age'] = 0.5
lifespan_bins = np.logspace(np.log10(min(zkp_applications['Age'])), np.log10(max(zkp_applications['Age'])), num=10)
lifespan_bins = np.round(lifespan_bins)
zkp_applications['AgeGroup'] =  pd.cut(zkp_applications['Age'], bins=lifespan_bins, duplicates='drop')
lifespan_bin_counts = zkp_applications['AgeGroup'].value_counts().sort_index().reset_index()
lifespan_bin_counts.rename(columns={'count': 'Count'}, inplace=True)
lifespan_bin_counts['AgeGroup'] = lifespan_bin_counts['AgeGroup'].astype(str)

fig = px.bar(lifespan_bin_counts,  x='AgeGroup', y='Count',
                   title='Application Lifespan (in days) by Bins',
                   template='plotly_dark', 
                )

fig.show()

By looking at the lifespans, the application purpose could potentially be deduced - short lifespans could indicate smaller projects, experimentation, academic research whereas longer lifespans could indicate ongoing projects. 

The majority of applications existed for less than a year (365 days). This could give some indication about the type of these applications - repositories created for experimentation, research or academic purposes would have a shorter lifespan than those intended for industry projects. 

The bin (0,1] contains a large amount of applications compared to the other bins - 83 applications existed for a day or less. 

##### Application Lifespan Outliers

In [1149]:
lifespan_outliers = zkp_applications.merge(application_committers, left_on='UniqueID', right_on='UniqueID', how='left')
z_scores = stats.zscore(lifespan_outliers['Age'].dropna())
outliers = (np.abs(z_scores) > 3)
lifespan_outliers['Outliers'] = outliers
lifespan_outliers
lifespan_outliers = lifespan_outliers[lifespan_outliers['Outliers'] == True][['UniqueID', 'Age', 'AuthorCount']]
lifespan_outliers.rename(columns={'Outliers': 'LifespanOutliers'}, inplace=True)
lifespan_outliers.sort_values(by=['Age', 'AuthorCount'])

Unnamed: 0,UniqueID,Age,AuthorCount
557,snarkos/aleohq,1532.0,55.0
134,celo-bls-snark-rs/celo-org,1576.0,9.0
6,abpr22/baghery,1599.0,14.0
530,semaphore/semaphore-protocol,1623.0,18.0
337,lattice-snarg/dwu4,1646.0,1.0
758,zeth/clearmatics,1651.0,6.0
738,xjsnark/akosba,1655.0,1.0
402,noah/findoranetwork,1729.0,37.0
27,ark-bulletproofs/findoranetwork,1925.0,24.0
802,zk-swap-libsnark/eyblockchain,2003.0,23.0


In [1150]:
authors_vs_lifespan = zkp_applications.merge(application_committers, left_on='UniqueID', right_on='UniqueID')

fig = px.scatter(authors_vs_lifespan, 
            x='Age', 
            y='AuthorCount',
            title='Relationship between the Lifespan & AuthorCount',
            template='plotly_dark', 
            hover_name='UniqueID', 
            trendline='ols'
            )

fig.show()

As seen in the graph above, there is a weak correlation (R^2 = 0.297) between the Age of an application and its AuthorCount.

`btcprivate-legacy` being an outlier as it has a high author count and age

#### Combine Metrics Age, AuthorCounts and CommitCount

In [1151]:
combined = application_committers.merge(zkp_applications[['UniqueID', 'Age', 'LastCommitDate', 'Tool']], left_on='UniqueID', right_on='UniqueID')
combined = combined.merge(commit_counts, left_on='UniqueID', right_on='UniqueID')
combined['Active'] = (combined['LastCommitDate'].dt.year > 2022).astype(int)

#### Tool Usage Among Single-Author, Short-Lived and Inactive Applications

In [1156]:
single_apps = combined[combined['AuthorCount'] == 1]
single_apps = single_apps[single_apps['Age'] < 366]
single_apps = single_apps[single_apps['Active'] == 0] # TODO: decide about this metric 

single_apps = single_apps.explode('Tool')
single_apps['Tool'] = single_apps['Tool'].str.replace('‘', '')

single_tools = single_apps.groupby('Tool')['UniqueID'].count().reset_index()
single_tools.rename(columns={'UniqueID': 'Count'}, inplace=True)

single_tools['Percentage'] = np.round(((single_tools['Count'] / single_apps['UniqueID'].nunique()) * 100), 2) 

single_tools[['Tool', 'Percentage']].sort_values('Percentage', ascending=False)
fig = px.bar(single_tools.sort_values('Count', ascending=False),  
            x='Tool', 
            y='Count',
            title='Tool Usage Among Single-Author, Short-Lived and Inactive Applications',
            template='plotly_dark', 
            text='Percentage'
            )

fig.show()

#### Tool Usage Among Applications with Moderate Metric Ranges

In [1158]:
average_apps = combined[combined['AuthorCount'] > 1]
average_apps = average_apps[average_apps['AuthorCount'] < 53]
average_apps = average_apps[average_apps['CommitCount'] < 3998]
average_apps = average_apps[average_apps['Age'] > 365]
average_apps = average_apps[average_apps['Age'] < 1532]
average_app_tools = average_apps.explode('Tool')
average_app_tools['Tool'] = average_app_tools['Tool'].str.replace('‘', '')
average_app_tools = average_app_tools.groupby('Tool')['UniqueID'].count().reset_index()
average_app_tools.rename(columns={'UniqueID': 'Count'}, inplace=True)

average_app_tools['Percentage'] = np.round(((average_app_tools['Count'] / average_apps['UniqueID'].nunique()) * 100), 2)

average_app_tools[['Tool', 'Percentage']].sort_values('Percentage', ascending=False)
fig = px.bar(average_app_tools.sort_values('Count', ascending=False),  
            x='Tool', 
            y='Count',
            title='Tool Usage Among Application Repositories with Moderate Metric Values',
            template='plotly_dark', 
            text='Percentage'
            )

fig.show()

#### Tool Usage Among Applications with Outlying Metric Values

In [1159]:
# compute outliers 
z_scores = zscore(combined[['Age', 'AuthorCount', 'CommitCount']])
threshold = 3
outlier_indices = (z_scores > threshold).any(axis=1)
outliers = combined[outlier_indices]

outliers = outliers[['UniqueID', 'Age', 'AuthorCount', 'CommitCount', 'Tool', 'Active']]

outliers = outliers.explode('Tool')
outliers['Tool'] = outliers['Tool'].str.replace('‘', '')
outlier_tools = outliers.groupby('Tool')['UniqueID'].count().reset_index()
outlier_tools.rename(columns={'UniqueID': 'Count'}, inplace=True)

total_outliers = outliers['UniqueID'].nunique()
outlier_tools['Percentage'] = np.round(((outlier_tools['Count'] / total_outliers) * 100), 2)

outlier_tools[['Tool', 'Percentage']].sort_values('Percentage', ascending=False)
fig = px.bar(outlier_tools.sort_values('Count', ascending=False),  
            x='Tool', 
            y='Count',
            title='Tool Usage Among Outlying Applications',
            template='plotly_dark', 
            text='Percentage'
            )
fig.show()


In [1155]:
# SINGLE & AVERAGE
set(single_apps['Tool'].to_list()) - set(average_apps['Tool'].to_list())
# = 'arkworks/gemini', 'openzkp'
set(average_apps['Tool'].to_list()) - set(single_apps['Tool'].to_list())
# = 'arkworks/marlin','arkworks/poly-commit','arkworks/sponge','leo','miden-vm','pysnark','risc0'

# SINGLE & OUTLIER
set(single_apps['Tool'].to_list()) - set(outlier_tools['Tool'].to_list())
# = 'arkworks/gemini','arkworks/nonnative','bulletproofs','gnark','openzkp','winterfell','zksync'
set(outlier_tools['Tool'].to_list()) - set(single_apps['Tool'].to_list())
# = ' arkworks/gm17', 'arkworks/marlin', 'arkworks/poly-commit', 'leo'

# AVERAGE & OUTLIER
set(average_apps['Tool'].to_list()) - set(outlier_tools['Tool'].to_list())
#  = 'arkworks/nonnative','arkworks/sponge','bulletproofs','gnark','miden-vm','pysnark','risc0','winterfell','zksync'
set(outlier_tools['Tool'].to_list()) - set(average_apps['Tool'].to_list())
# = 'arkworks/gm17'

set(single_apps['Tool'].to_list()) & set(average_apps['Tool'].to_list()) & set(outlier_tools['Tool'].to_list())

# 17 tools 
#  'arkworks/algebra',
#  'arkworks/crypto-primitives',
#  'arkworks/curves',
#  'arkworks/gm17',
#  'arkworks/groth16',
#  'arkworks/r1cs-std',
#  'arkworks/snark',
#  'arkworks/std',
#  'bellman',
#  'cairo-lang',
#  'circom',
#  'circomlib',
#  'halo2',
#  'libsnark',
#  'merlin',
#  'plonky2',
#  'snarkjs'

# len(set(single_apps['Tool'].to_list())) # 24
# len(set(average_apps['Tool'].to_list())) # 29
# len(set(outlier_tools['Tool'].to_list())) # 21

TypeError: unhashable type: 'list'

#### Repository Sizes of Short-lived Applications

In [None]:
shortlived_apps = zkp_applications[zkp_applications['Age'] <= 1]
shortlived_apps.sort_values(by='Size', inplace=True)

fig = px.bar(shortlived_apps, 
            x='UniqueID', 
            y='Size',
            title='Application Size of Projects with Lifespan <= 1 Day',
            template='plotly_dark', 
            text='Size'
            )

fig.show()



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



From the graph above it can be seen that a large portion of short-lived applications have small repos sizes. This could suggest that a project was started and neglected (or contains a small example implementation from a tutorial). Larger repository sizes could indicate that a project was created and developed prior and then only pushed to GitHub. 

#### Application Lifespan

In [None]:
zkp_applications.sort_values(by='Age', ascending=True, inplace=True)
fig = px.scatter(zkp_applications, 
                x='UniqueID', 
                y='Age',
                color='AgeGroup',
                title='Application Lifespan',
                template='plotly_dark', 
                category_orders={'UniqueID': zkp_applications['UniqueID']}, 
                height=1000
                )
fig.update_layout(showlegend=False)
fig.show()

This graph illustrates the distribution of the lifespans of the individual applications, coloured by their group. The last group (3376, 4057] contains a singular repository `bitcoinprivate-legacy/btcprivate`.

The last few groups appear to have a greater difference in lifespans, compared to the steady increase of shorter lifespan groups.  

Here are some of the applications with the highest lifespans: 

|     | UniqueID                                |   Age | LastCommitDate            |   Stars |   Forks |   Watchers |
|----:|:----------------------------------------|------:|:--------------------------|--------:|--------:|-----------:|
|  48 | bitcoinprivate-legacy/btcprivate        |  4057 | 2020-10-08 08:28:52+00:00 |     274 |     130 |        274 |
|  45 | bellpepper/lurk-lab                     |  2859 | 2023-10-22 15:37:43+00:00 |      18 |       2 |         18 |
| 320 | jsnark/akosba                           |  2565 | 2022-12-17 22:57:29+00:00 |     192 |      85 |        192 |
| 916 | zokrates/zokrates                       |  2425 | 2023-09-19 09:20:24+00:00 |    1632 |     335 |       1632 |
| 344 | libsnark/lattice-based-zksnarks         |  2349 | 2020-11-06 20:37:38+00:00 |       1 |       1 |          1 |
| 901 | zokrates/scrypt-inc                     |  2269 | 2023-04-16 16:53:26+00:00 |      14 |       1 |         14 |
| 477 | protocols/loopring                      |  2258 | 2023-08-29 01:45:54+00:00 |     288 |      83 |        288 |
| 374 | mina/minaprotocol                       |  2135 | 2023-10-23 11:17:58+00:00 |    1799 |     457 |       1799 |
| 437 | pequin/pepper-project                   |  2125 | 2022-04-05 22:32:30+00:00 |     119 |      46 |        119 |
| 803 | zk-swap-libsnark/eyblockchain           |  2003 | 2019-11-27 15:24:32+00:00 |       1 |       1 |          1 |
|  27 | ark-bulletproofs/findoranetwork         |  1925 | 2023-05-12 20:49:46+00:00 |       5 |       1 |          5 |
| 402 | noah/findoranetwork                     |  1729 | 2023-08-03 03:58:30+00:00 |      85 |      26 |         85 |
| 739 | xjsnark/akosba                          |  1655 | 2022-11-27 07:04:51+00:00 |     173 |      31 |        173 |
| 759 | zeth/clearmatics                        |  1651 | 2022-12-07 18:05:49+00:00 |      52 |      23 |         52 |
| 337 | lattice-snarg/dwu4                      |  1646 | 2022-04-14 17:04:03+00:00 |       7 |       2 |          7 |
| 531 | semaphore/semaphore-protocol            |  1623 | 2023-10-18 12:11:30+00:00 |     756 |     130 |        756 |
|   6 | abpr22/baghery                          |  1599 | 2023-08-19 07:50:49+00:00 |       4 |       2 |          4 |
| 134 | celo-bls-snark-rs/celo-org              |  1576 | 2023-09-17 08:23:32+00:00 |      82 |      24 |         82 |
| 558 | snarkos/aleohq                          |  1532 | 2023-10-22 22:04:23+00:00 |    2714 |    1901 |       2714 |
| 346 | libsnark_gadgetlib3/matter-labs-archive |  1511 | 2018-07-23 18:40:46+00:00 |       4 |       2 |          4 |


It makes sense that applications with higher lifespans to be popular and active projects. 

This holds for some applications, such as `mina/minaprotocol`, `snarkos/aleohq`, `protocols/loopring` and `semaphore/semaphore-protocol`. 

Not all active application with high lifespans are popular, as seen in `bellpepper/lurk-lab`, `abpr22/baghery`, `ark-bulletproofs/findoranetwork` or `zokrates/scrypt-inc`. 

Some applications have a high lifespan, but are not active (have not been updated in 2023). For instance, `bitcoinprivate-legacy/btcprivate`. This repo was popular (it is a Bitcoin and Zclassic fork which are some of the earlier projects of the space), but is no longer being maintained. It might be because the project was moved to a elsewhere, as [this article](https://bitcoinprivate.medium.com/the-dawn-of-a-new-age-for-bitcoin-private-5e34d8497f6) explains and as the README states, *"10/8/2020 - Current chain offline - New code and chain coming soon"*.


In [None]:
lifespan_commits = zkp_applications.merge(commit_counts[['UniqueID', 'CommitCount']], left_on='UniqueID', right_on='UniqueID', how='left')

fig = px.scatter(lifespan_commits,  
                x='Age', 
                y='CommitCount',
                # color='UniqueID',
                hover_name='UniqueID',
                title='Relationship Between Application Lifespan and No. of Commits',
                template='plotly_dark', 
                trendline='ols',
                # color_discrete_sequence=['pink']
                )

fig.show()

An outlier observed in the graph above is `mina/minaprotocol` which is due to the application's extremely high CommitCount of 27404 (the second highest commit count is 11051). 

Mina's high *CommitCount* could be as a result of multiple reasons, such as the Continuous Integration/Continuous Deployment system of the application (repositories implementing CI/CD practices often have a higher commit count due to automated testing, building, and deployment processes triggering commits) or the project's version control standards (repositories following version control best practices, such as making small, atomic commits, could accumulate a higher commit count). Looking at the commits, there are many which are as a result of merging different PRs from different branches. Mina follows a fork-and-pull development strategy which gives anyone access to contribute. 

Another outlier, mostly due to the *Age* axis, is `bitcoinprivate-legacy/btcprivate`, which is the application with the highest lifespan.

There is weak correlation (R^2 = 0.201) between the Age (lifespan) of an application and its CommitCount.

In [None]:
authorcount_commitcount = zkp_applications.merge(application_committers, left_on='UniqueID', right_on='UniqueID', how='left')
authorcount_commitcount = authorcount_commitcount.merge(commit_counts[['UniqueID', 'CommitCount']], left_on='UniqueID', right_on='UniqueID', how='left')

fig = px.scatter(authorcount_commitcount,  
                x='AuthorCount', 
                y='CommitCount',
                # color='UniqueID',
                hover_name='UniqueID',
                title='Relationship Between Application Author Count and No. of Commits',
                template='plotly_dark', 
                trendline='ols',
                )

fig.show()

The outliers with AuthorCount > 100 are `bitcoinprivate-legacy/btcprivate`, `mina/minaprotocol` and `starknet-ecosystem.com/419labs`.

- `bitcoinprivate-legacy/btcprivate`: has the highest author count and a high commit count. This may be because it is an open-source project with popularity and encouragement to contribute. The README states, *"Code review is welcome!"*. 
- `starknet-ecosystem.com/419labs`: has a high author count and a commit count that is within normal range. This repo seems to follow a Community Driven development model, where contributions are encouraged, as indicated in their README. Contributors are not limited to a fixed/regulated set of users. 
- `mina/minaprotocol`: has the highest commit count and high author count. Mina seems to follow the Fork and Pull Request Model development model. As indicated in their [Contribution](https://github.com/MinaProtocol/mina/blob/develop/CONTRIBUTING.md) docs, anyone can clone the repo, fork and implement code changes. Developers have freedom to contribute to this application, which may account for the high author count. 

There is moderate correlation (R^2 = 0.337) between the AuthorCount and CommitCount of an application.

#### Application Author Count and Application Age

In [None]:
author_lifespan = zkp_applications.merge(application_committers, left_on='UniqueID', right_on='UniqueID', how='left')

fig = px.scatter(author_lifespan,  
                x='AuthorCount', 
                y='Age',
                # color='UniqueID',
                hover_name='UniqueID',
                title='Relationship Between Application Author Count and Application Age',
                template='plotly_dark', 
                trendline='ols'
                )

fig.show()

There is a weak to moderate correlation (R^2 = 0.297) between the AuthorCount and Age of an application. 

Outliers include: 

- `bitcoinprivate-legacy/btcprivate`: has the highest author count and highest age
- `starknet-ecosystem.com/419labs`: has a high author count and an age that is within normal range
- `mina/minaprotocol`: has a high author count and moderate to high age

These repos were outliers discussed after the previous graph. 


#### Application Lifespan & No. of Stars

In [None]:

fig = px.scatter(zkp_applications,  
                x='Age', 
                y='Stars',
                # color='UniqueID',
                hover_name='UniqueID',
                title='Relationship Between Application Lifespan and No. of Stars',
                template='plotly_dark', 
                trendline='ols'
                )

fig.show()


From the graph above, it can be observed that there is a weak correlation (R^2 = 0.012) between the No. of Stars and Lifespan of an application. 

`sismo-badges` has a low age but a high amount of stars. Although the repository has a young age, it seems popular based on the #stars, #forks, #watchers and its Twitter account. 

#### Application Lifespan & No. of Forks

In [None]:

fig = px.scatter(zkp_applications,  
                x='Age', 
                y='Forks',
                # color='UniqueID',
                hover_name='UniqueID',
                title='Relationship Between Application Lifespan and No. of Forks',
                template='plotly_dark', 
                trendline='ols'
                )

fig.show()


From the graph above, it can be observed that there is a weak correlation (R^2 = 0.074) between the No. of Forks and Lifespan of an application. 

#### Application Start Date

In [None]:
zkp_repos['Created'] = pd.to_datetime(zkp_repos['Created'], utc=True)
zkp_repos['MonthYear'] = zkp_repos['Created'].dt.to_period('M')

zkp_tools = zkp_repos[zkp_repos['Type'] == 'Tool']
tool_monthly_counts = zkp_tools.groupby('MonthYear').size().reset_index()
tool_monthly_counts.rename(columns={0: 'Count'}, inplace=True)
tool_monthly_counts['MonthYear'] = tool_monthly_counts['MonthYear'].astype(str)

# zkp_applications = zkp_repos[zkp_repos['Type'] == 'Application']
start_date_df = zkp_repos[zkp_repos['Type'] == 'Application']
app_monthly_counts = start_date_df.groupby('MonthYear').size().reset_index()
app_monthly_counts.rename(columns={0: 'Count'}, inplace=True)
app_monthly_counts['MonthYear'] = app_monthly_counts['MonthYear'].astype(str)

fig = px.bar(app_monthly_counts,  
            x='MonthYear',
            y='Count',
            title='Start Date of Application Repositories',
            template='plotly_dark', 
            )

fig.show()

fig = px.bar(tool_monthly_counts,  
            x='MonthYear',
            y='Count',
            title='Start Date of Tool Repositories',
            template='plotly_dark', 
            )

fig.show()

google_analytics = pd.read_csv('zkp_google_search.csv', sep=';')
google_analytics['Month'] = google_analytics['Month'].astype(str)
google_analytics.rename(columns={'zero knowledge proof: (Worldwide)': 'Count'}, inplace=True)

fig = px.line(google_analytics,  
            x='Month',
            y='Count',
            title='Google Search Term Popularity Metrics for term="zero knowledge proof"',
            template='plotly_dark', 
            )

fig.show()


Converting to PeriodArray/Index representation will drop timezone information.



From the first graph, it is clear that there was an increase in applications created over time - the number of applications picking up from ~2019.

This may be due to the rise in popularity of ZKPs, especially in the blockchain space, following their use in popular projects such as Ethereum. 

A similar pattern can be seen in the start dates of the ZKP Tool repositories with an increase from 2018 onwards. 

For tool repositories, the counts tend to stay within the same range from the years 2018 - 2023, with higher counts in Sep 2020 and Oct 2020. 


Comparing the application counts to the Google Search Term popularity, there is a similar trend in uptake from the end of 2021 onwards. 


(**Google Analytics**: Numbers represent search interest relative to the highest point on the chart for the given region and time. A value of 100 is the peak popularity for the term. A value of 50 means that the term is half as popular. A score of 0 means there was not enough data for this term.)

### Active

#### Number of Active Applications

In [None]:
application_commits['AuthorDate'] = pd.to_datetime(application_commits['AuthorDate'])
active_applications = application_commits[application_commits['AuthorDate'].dt.year == 2023]
active_applications = active_applications['UniqueID'].unique()

inactive_applications = application_commits['UniqueID'].unique()
inactive_applications =  set(inactive_applications) - set(active_applications)

apps = {
    'Inactive Applications': [len(inactive_applications)],
    'Active Applications': [len(active_applications)]
}

active_vs_inactive = pd.DataFrame(apps)
active_vs_inactive = active_vs_inactive.melt(var_name='Column', value_name='Length')


fig = px.pie(active_vs_inactive, 
             names='Column', 
             values='Length', 
             title='Active vs. Inactive Applications',
             template='plotly_dark',
             )
fig.show()

The divide between active and inactive applications is near to equal. **Why**?

#### Number of Active Authors

In [None]:
application_commits['AuthorDate'] = pd.to_datetime(application_commits['AuthorDate'], utc=True)
active_authors = application_commits[application_commits['AuthorDate'].dt.year == 2023]
active_authors = active_authors.groupby('UniqueID')['Author'].nunique().reset_index()
active_authors.rename(columns={'Author': 'ActiveAuthorCount'}, inplace=True)
active_authors.sort_values(by='ActiveAuthorCount', inplace=True)
active_authors = active_authors.merge(zkp_applications[['UniqueID']], left_on='UniqueID', right_on='UniqueID', how='right')
active_authors = active_authors.fillna(0)
active_author_bins = np.logspace(np.log10(active_authors[active_authors['ActiveAuthorCount'] > 0]['ActiveAuthorCount'].min()-0.5), np.log10(active_authors['ActiveAuthorCount'].max()), num=9)
active_author_bins = np.round(active_author_bins)
active_author_bins = np.insert(active_author_bins, 0, -np.inf)
active_authors['ActiveAuthorCountGroup'] =  pd.cut(active_authors['ActiveAuthorCount'], bins=active_author_bins, duplicates='drop')
active_author_bin_counts = active_authors['ActiveAuthorCountGroup'].value_counts().sort_index().reset_index()
active_author_bin_counts.rename(columns={'count': 'Count'}, inplace=True)
active_author_bin_counts['ActiveAuthorCountGroup'] = active_author_bin_counts['ActiveAuthorCountGroup'].astype(str)


fig = px.bar(active_author_bin_counts,  
            x='ActiveAuthorCountGroup',
            y='Count',
            title='No. of Active Authors by Bins',
            template='plotly_dark', 
            )

fig.show()

fig = px.bar(bin_counts,  
             x='AuthorGroup', 
             y='Count',
             title='Application Author Count Distribution per Bin',
             template='plotly_dark', 
             )

fig.show()


The largest bin is (0-inf,0] which indicates that a large amount of repositories (452) have no active authors. The second largest bin is (0,1] with count 164. Fewer repositories repositories have large active author counts. 

By comparing the ActiveAuthorCounts the total AuthorCount (all authors, active and inactive), seen previously, a decrease in the count range can be seen. 

This could indicate that not all authors of a repository are active. 

##### No. of Active Authors Outliers

In [None]:
z_scores = stats.zscore(active_authors['ActiveAuthorCount'])
outliers = (np.abs(z_scores) > 3)
active_authors['Outliers'] = outliers
active_authors[active_authors['Outliers'] == True].sort_values(by='ActiveAuthorCount', ascending=False)


Unnamed: 0,UniqueID,ActiveAuthorCount,ActiveAuthorCountGroup,Outliers
756,cairo/starkware-libs,75.0,"(40.0, 75.0]",True
786,starknet-ecosystem.com/419labs,72.0,"(40.0, 75.0]",True
606,madara/keep-starknet-strange,69.0,"(40.0, 75.0]",True
764,starknetbook/starknet-edu,66.0,"(40.0, 75.0]",True
620,lambdaworks/lambdaclass,45.0,"(40.0, 75.0]",True
874,noir/noir-lang,43.0,"(40.0, 75.0]",True
766,cairo-vm/lambdaclass,43.0,"(40.0, 75.0]",True
633,dojo/dojoengine,42.0,"(40.0, 75.0]",True
909,mina/minaprotocol,41.0,"(40.0, 75.0]",True
836,zkevm-circuits/kroma-network,38.0,"(21.0, 40.0]",True


In [None]:
active_authors.sort_values(by='ActiveAuthorCount', inplace=True)

fig = px.scatter(active_authors,  
            x='UniqueID',
            y='ActiveAuthorCount',
            title='No. of Active Authors',
            template='plotly_dark', 
            height=1000
            )

fig.show()


A similar trend is seem in the graph when looking at higher AuthorCounts as seen in the total AuthorCounts previously. 

- `cairo` has the highest number of active authors with a count of 75. This is very close to its total author count of 79.
- `btcprivate-legacy` has the highest number of total AuthorCount as seen previously, however, it has no active authors
- `mina/minaprotocol` had one of the previously high AuthorCounts of 108, but has only 41 active authors
- `starknet-ecosystem.com/419labs` had one of the previously high AuthorCounts of 141, now has  72 active authors (the 2nd highest ActiveAuthorCount)
- `madara/keep-starknet-strange` has the highest ActiveAuthorCount with 69 active authors (all authors are active)
- `starknetbook/starknet-edu`	has the 4th highest ActiveAuthorCount with 66 active authors (with a AuthorCount of 70)


#### Ratio Between Total Authors and Active Authors

In [None]:
active_comparison = application_committers[['UniqueID', 'AuthorCount']].merge(active_authors[['UniqueID', 'ActiveAuthorCount']], left_on='UniqueID', right_on='UniqueID', how='left')
active_comparison['ActiveInActive'] = active_comparison['ActiveAuthorCount'] / active_comparison['AuthorCount']
active_comparison = active_comparison.fillna(0)
active_comparison.sort_values(by='ActiveInActive', inplace=True)
bins = np.linspace(0, 1, 11)
bins = np.insert(bins, 0, -np.inf)
active_comparison['ActiveInActiveGroup'] =  pd.cut(active_comparison['ActiveInActive'], bins=bins)
active_inactive_bin_counts = active_comparison['ActiveInActiveGroup'].value_counts().sort_index().reset_index()
active_inactive_bin_counts.rename(columns={'count': 'Count'}, inplace=True)
active_inactive_bin_counts['ActiveInActiveGroup'] = active_inactive_bin_counts['ActiveInActiveGroup'].astype(str)

fig = px.bar(active_inactive_bin_counts,  
            x='ActiveInActiveGroup',
            y='Count',
            title='Graph Showing the Ratio of Active Authors to Total Author for a Repository by Bins',
            template='plotly_dark', 
            )

fig.show() 


The largest bin is (-inf, 0) which indicates that 450 repositories have 0 active authors, ensuring the the ActiveAuthors to TotalAuthors ratio is 0. 

Interestingly, the second largest bin is (0.9, 1) with count 270. This bin indicates that most authors in these repositories are active authors. 

Low ActiveAuthor/Authors ratios could indicate repositories which are inactive (especially where it is 0). 

High ActiveAuthor/Authors ratios could indicate repositories which are new and/or popular. 

### Author Company

#### Get Companies and Author Count for All Applications

In [None]:
application_authors = application_authors.drop_duplicates(['Login'])
authors = application_authors[application_authors['Type'] == 'User']
author_commits = application_commits.merge(authors, left_on='Author', right_on='Login', how='left')
author_commits = author_commits.loc[:, ~author_commits.columns.str.contains('^Unnamed')]
author_commits.drop_duplicates(inplace=True)
author_commits.dropna(subset=['Login'], inplace=True)
authors['Company'] = authors['Company'].str.strip()
authors['Company'] = authors['Company'].str.lower()
authors['Company'] = authors['Company'].str.split(',')

authors_exp = authors.explode('Company')
authors_exp['Company'] = authors_exp['Company'].str.replace('@', '')
authors_exp = authors_exp.drop_duplicates(subset=['Login', 'Company']) 

company_counts = authors_exp['Company'].value_counts().reset_index()
company_counts.rename(columns={'count': 'Count'}, inplace=True)

single_occurrence_companies = company_counts[company_counts['Count'] == 1]['Company'].to_list()
matching_companies = company_counts[company_counts['Count'] > 1]['Company'].to_list()

for company in single_occurrence_companies:
    matching_company = difflib.get_close_matches(company, matching_companies, n=1)

    if matching_company:
        authors_exp.loc[authors_exp['Company'] == company, 'Company'] = matching_company[0]

company_counts = authors_exp['Company'].value_counts().reset_index()
company_counts.rename(columns={'count': 'Count'}, inplace=True)

fig = px.bar(company_counts[company_counts['Count'] > 1],  
            x='Company',
            y='Count',
            title='Graph showing No. of Authors belonging to companies where count > 1',
            template='plotly_dark', 
            )

fig.show() 

The graph above shows the frequency of companies listed in the GitHub "bio"'s of the authors. 

The companies with a frequency of > 1 are shown to reduce noise and help illustrate commonly mentioned companies.  

Note that some companies are educational institutions, such as 'Peking University'.

In [None]:
authors_exp = authors_exp.drop_duplicates(subset=['Login']) 
authors_exp['IsUniversity'] = authors_exp['Company'].str.contains('university')
university_authors_count = authors_exp['IsUniversity'].sum()
company_authors = authors_exp[authors_exp['Company'].notna()]
company_authors = company_authors[company_authors['IsUniversity'] == False]
company_authors = company_authors['Login'].nunique()
non_company_authors = authors.shape[0] - company_authors

non_vs_company = {
    'Unspecified Authors': [non_company_authors],
    'Company Authors': [company_authors],
    'University Authors': [university_authors_count]
}

non_vs_company = pd.DataFrame(non_vs_company)
non_vs_company = non_vs_company.melt(var_name='AuthorType', value_name='Count')

fig = px.pie(non_vs_company, 
             names='AuthorType', 
             values='Count', 
             title='Graph showing counts of University, Company, and Unspecified Authors',
             template='plotly_dark',
             )
fig.show()

This graph above attempts to illustrate the proportion of authors which are associated with a company, a university or unspecified. 

The majority (68.5%) of authors have not specified a company in their bio. This could mean that they do no work for a company, or they have simply not specified it. 

It is important to note, that some authors listed multiple companies/affiliations in their bios. 

A further exploration was done to see if there are a particular toolset related to users that have a company in their bio, a university in their bio or unspecified. This is shown in the following few graphs. 

#### Tool Used by Company Authors

In [None]:
company_authors = authors_exp[authors_exp['Company'].notna()]
company_authors = company_authors[company_authors['IsUniversity'] == False]
company_authors = application_commits.merge(company_authors, left_on='Author', right_on='Login')

zkp_applications_exp = zkp_applications.explode('Tool')
company_tools = zkp_applications_exp.merge(company_authors, left_on='UniqueID', right_on='UniqueID')
company_tools = company_tools.groupby('Tool')['UniqueID'].nunique().reset_index()
company_tools.rename(columns={'UniqueID': 'ToolCount'}, inplace=True)
company_tools.sort_values(by='ToolCount', inplace=True, ascending=False)

fig = px.bar(company_tools,  
            x='Tool',
            y='ToolCount',
            title='Tools used by Company Applications',
            template='plotly_dark', 
            text='ToolCount'
            )

fig.show() 

These are the ToolCounts related to the repos authored by authors who have a company listed in their bio. 

`arkworks/algebra` is the most commonly occurring tool, closely followed by `arkworks/std` and `arkworks/curves`. The high tool counts for these tools can be attributed to many repos using the Cairo DSL which is built using these tools. [Lambaclass](https://lambdaclass.com/) most commonly occurring company. Lambaclass works on multiple open-source projects, one of them being [Cairo-VM](https://github.com/lambdaclass/cairo-vm). This repository has a relatively high ActiveAuthorCount of  41.

StarkWare is also one of the more commonly occuring companies. StarkWare is the company behind the Cairo DSL. This could attribute to the high use of these tools. 

#### Tools Used by University Authors

In [None]:
university_authors = authors_exp[authors_exp['IsUniversity'] == True]
university_authors = application_commits.merge(university_authors, left_on='Author', right_on='Login')
university_authors = university_authors.drop_duplicates(['UniqueID','Login'])

zkp_applications_exp = zkp_applications.explode('Tool')
university_tools = zkp_applications_exp.merge(university_authors, left_on='UniqueID', right_on='UniqueID')
university_tools = university_tools.groupby('Tool')['UniqueID'].nunique().reset_index()
university_tools.rename(columns={'UniqueID': 'ToolCount'}, inplace=True)
university_tools.sort_values(by='ToolCount', inplace=True, ascending=False)

fig = px.bar(university_tools,  
            x='Tool',
            y='ToolCount',
            title='Tools used by University Applications',
            template='plotly_dark', 
            text='ToolCount'
            )

fig.show() 

A similar breakdown is seem for authors who have a university in their bio. The exception is `libsnark` which is more commonly used under university-associated authors than by company-associated authors. 

#### Tool Usage Across Company & University Authors

In [None]:
all_tools = sorted(zkp_repos[zkp_repos['Type'] == 'Tool']['Name'].to_list())
comp_tools = sorted(company_tools['Tool'].to_list())
uni_tools = sorted(university_tools['Tool'].to_list())
only_company = sorted(list(set(comp_tools) - set(uni_tools)))
not_used = sorted(list(set(all_tools) - set(comp_tools)))

data = dict( AllTools = np.array(all_tools), CompanyTools = np.array(comp_tools), UniversityTools = np.array(uni_tools), CompanyOnly = np.array(only_company), NotUsed = np.array(not_used))
    
tools_df = pd.DataFrame(dict([ (k,pd.Series(v)) for k,v in data.items() ]))
tools_df = tools_df.fillna('')

The table below summarises the tools used in applications by authors that have a company or university in their bio. 

`bulletproofs (sdiehl)` and `openzkp` are not used by either company-associated authors nor university-associated authors. These two tools are not commonly used tools in general. 

Company-associated authors use an additional 10 tools compared to university-associated authors.

|    | AllTools                   | CompanyTools               | UniversityTools            | CompanyOnly            | NotUsed               |
|---:|:---------------------------|:---------------------------|:---------------------------|:-----------------------|:----------------------|
|  0 | arkworks/algebra           | arkworks/gm17              | arkworks/algebra           | arkworks/gm17          | bulletproofs (sdiehl) |
|  1 | arkworks/circom-compat     | arkworks/algebra           | arkworks/crypto-primitives | arkworks/circom-compat | openzkp               |
|  2 | arkworks/crypto-primitives | arkworks/circom-compat     | arkworks/curves            | arkworks/gemini        |                       |
|  3 | arkworks/curves            | arkworks/crypto-primitives | arkworks/gm17              | arkworks/marlin        |                       |
|  4 | arkworks/gemini            | arkworks/curves            | arkworks/groth16           | arkworks/nonnative     |                       |
|  5 | arkworks/gm17              | arkworks/gemini            | arkworks/poly-commit       | plonky                 |                       |
|  6 | arkworks/groth16           | arkworks/gm17              | arkworks/r1cs-std          | plonky2                |                       |
|  7 | arkworks/marlin            | arkworks/groth16           | arkworks/snark             | plonky3                |                       |
|  8 | arkworks/nonnative         | arkworks/marlin            | arkworks/sponge            | zksync                 |                       |
|  9 | arkworks/poly-commit       | arkworks/nonnative         | arkworks/std               |                        |                       |
| 10 | arkworks/r1cs-std          | arkworks/poly-commit       | bellman                    |                        |                       |
| 11 | arkworks/snark             | arkworks/r1cs-std          | bulletproofs               |                        |                       |
| 12 | arkworks/sponge            | arkworks/snark             | cairo-lang                 |                        |                       |
| 13 | arkworks/std               | arkworks/sponge            | circom                     |                        |                       |
| 14 | bellman                    | arkworks/std               | circomlib                  |                        |                       |
| 15 | bulletproofs               | bellman                    | gnark                      |                        |                       |
| 16 | bulletproofs (sdiehl)      | bulletproofs               | halo2                      |                        |                       |
| 17 | cairo-lang                 | cairo-lang                 | leo                        |                        |                       |
| 18 | circom                     | circom                     | libsnark                   |                        |                       |
| 19 | circomlib                  | circomlib                  | merlin                     |                        |                       |
| 20 | gnark                      | gnark                      | miden-vm                   |                        |                       |
| 21 | halo2                      | halo2                      | pysnark                    |                        |                       |
| 22 | leo                        | leo                        | risc0                      |                        |                       |
| 23 | libsnark                   | libsnark                   | snarkjs                    |                        |                       |
| 24 | merlin                     | merlin                     | snarky                     |                        |                       |
| 25 | miden-vm                   | miden-vm                   | winterfell                 |                        |                       |
| 26 | openzkp                    | plonky                     |                            |                        |                       |
| 27 | plonky                     | plonky2                    |                            |                        |                       |
| 28 | plonky2                    | plonky3                    |                            |                        |                       |
| 29 | plonky3                    | pysnark                    |                            |                        |                       |
| 30 | pysnark                    | risc0                      |                            |                        |                       |
| 31 | risc0                      | snarkjs                    |                            |                        |                       |
| 32 | snarkjs                    | snarky                     |                            |                        |                       |
| 33 | snarky                     | winterfell                 |                            |                        |                       |
| 34 | winterfell                 | zksync                     |                            |                        |                       |
| 35 | zksync                     |                            |                            |                        |                       |

### Hireable

#### Hireable vs Non-Hirable Authors

In [None]:
hireable_counts = {
    'Hireable Authors': [len(authors[authors['Hireable'] == True]['Login'].unique())],
    'Non-Hireable Authors': [len(authors[authors['Hireable'].isna()]['Login'].unique())]
}

hire_vs_non = pd.DataFrame(hireable_counts)
hire_vs_non = hire_vs_non.melt(var_name='AuthorGroup', value_name='Amount')

fig = px.pie(hire_vs_non, 
             names='AuthorGroup', 
             values='Amount', 
             title='Hireable vs. Non-Hireable (Unspecified) Authors',
             template='plotly_dark',
             )
fig.show()

The majority of authors are non-hireable authors. The limitation with this metric is that this could mean that authors are (a) truly not available for hire or (b) they have not explicitly specified their Hireable status on GitHub. If Hireable, the flag is set to True for the profile. If not Hireable, the flag is not set (not set to False, which would have been useful for creating a distinction between Hireable, Non-Hireable and Unspecified.)

Assuming that if users *really* were hireable, they would have set the flag and have otherwise left it unchecked. In the case of Non-Hireable Authors, the user's application repository is (a) side-project or (b) the industry project they are working on. 

A high non-hirable could indicate that majority of authors work on side-projects related to ZKPs or work on these repos as part of their job. 




In [None]:
company_hirable = {
    'Hireable Authors': [len(company_authors[company_authors['Hireable'] == True]['Login'].unique())],
    'Non-Hireable Authors': [len(company_authors[company_authors['Hireable'].isna()]['Login'].unique())]
}

company_hirable = pd.DataFrame(company_hirable)
company_hirable = company_hirable.melt(var_name='AuthorGroup', value_name='Amount')

fig = px.pie(company_hirable, 
             names='AuthorGroup', 
             values='Amount', 
             title='Hireable vs. Non-Hireable Count of Authors with a Company in Bio',
             template='plotly_dark',
             )
fig.show()

Out out the company-affiliated authors, 24.1% of them are hireable and 75.9% are unspecified (as mentioned above, they could be unspecified or *actually* not hireable).

#### No. of Hirable Users per Application

In [None]:
hirable_authors = author_commits.drop_duplicates(subset=['UniqueID', 'Author', 'Hireable'])
hirable_authors = hirable_authors.groupby('UniqueID')['Hireable'].sum().astype(int).reset_index()
hirable_authors.rename(columns={'Hireable':'HireableCount'}, inplace=True)
hirable_authors.sort_values(by='HireableCount', inplace=True)

fig = px.scatter(hirable_authors[hirable_authors['HireableCount'] > 0],  
            x='UniqueID',
            y='HireableCount',
            title='Hireable Authors per Application Repository (Where No. of Hireable Authors > 1)',
            template='plotly_dark', 
            )

fig.show() 


`bitcoinprivate-legacy/btcprivate` has the greatest HireableCount of 66.

A high HireableCount for a repository could mean that the repository is a side-project for most authors that contribute. 

In the case of `bitcoinprivate-legacy/btcprivate`, this repository is inactive. This may indicate that at some point the authors may have contributed full-time to the repository but are looking for employment since it changed status to inactive. 

#### AuthorCount & HireableCount

In [None]:
author_counts_hirable = hirable_authors.merge(application_committers[['UniqueID', 'AuthorCount']], left_on='UniqueID', right_on='UniqueID', how='right')
author_counts_hirable = author_counts_hirable.fillna(0)

fig = px.scatter(author_counts_hirable,  
            x='AuthorCount',
            y='HireableCount',
            title='Hireable Authors per Application Repository',
            template='plotly_dark', 
            trendline='ols', 
            hover_name='UniqueID'
            )

fig.show() 



From the graph above,  it can be seen that there is a strong correlation (R^2 = 0.89) between the AuthorCount for a repository and the HireableCount. 

### Ages

In [None]:
authors = application_authors[application_authors['Type'] == 'User']
authors['CreatedAt'] = pd.to_datetime(authors['CreatedAt'], utc=True)
authors['AuthorAge'] = (datetime.now(timezone.utc) - authors['CreatedAt']).dt.days
age_bins = np.logspace(np.log10(authors['AuthorAge'].min()-1), np.log10(authors['AuthorAge'].max()), num=10)
age_bins = np.round(age_bins)
authors['AuthorAgeGroup'] =  pd.cut(authors['AuthorAge'], bins=age_bins)
age_bin_counts = authors['AuthorAgeGroup'].value_counts().sort_index().reset_index()
age_bin_counts.rename(columns={'count': 'Count'}, inplace=True)
age_bin_counts['AuthorAgeGroup'] = age_bin_counts['AuthorAgeGroup'].astype(str)

fig = px.bar(age_bin_counts,  
            x='AuthorAgeGroup',
            y='Count',
            title='Author Age by Bins',
            template='plotly_dark', 
            text='Count'
            )

fig.show()



From the graph above, it can be seen the larger bins tend to contain more Age counts. This shows that the authors contributing to the repositories are older GitHub users (i.e. the user has had their account for some time, such as a couple of years in the case of the bin (2790, 5716] which contains 1333 authors). 

#### Repository Age vs. Author Age

In [None]:
repo_author_age = application_commits.merge(authors, left_on='Author', right_on='Login', how='left')
repo_author_age = repo_author_age.dropna(subset=['Login'])
repo_author_age = repo_author_age.drop_duplicates(subset=['UniqueID', 'Login'])
mean_author_age = repo_author_age.groupby('UniqueID')['AuthorAge'].mean().reset_index()
mean_author_age.rename(columns={'AuthorAge': 'MeanAuthorAge'}, inplace=True)
mean_author_age = zkp_applications.merge(mean_author_age, left_on='UniqueID', right_on='UniqueID', how='left')

fig = px.scatter(mean_author_age,  
            x='MeanAuthorAge',
            y='Age',
            title='Graph showing the MeanAuthorAge and Age of Repository',
            template='plotly_dark', 
            trendline='ols'
            )

fig.show()

From the graph above, it can be seen that there is a weak correlation (R^2 = 0.03) between the MeanAuthorAge of application and the age of the application. 

### Tool Usage by Applications

This section focuses on understanding the types of applications that use a specific tool. 


#### Tool Usage & Application Author Count

In [None]:
# zkp_applications = zkp_repos[zkp_repos['Type'] == 'Application']
author_counts = application_commits.groupby('UniqueID')['Author'].nunique().reset_index()
author_counts.rename(columns={'Author': 'AuthorCount'}, inplace=True)
tools_and_authors = zkp_applications.merge(author_counts, left_on='UniqueID', right_on='UniqueID', how='left')
tools_and_authors_exp = tools_and_authors.explode('Tool')
tools_and_authors_exp['Tool'] = tools_and_authors_exp['Tool'].astype(str)
tools_and_authors_exp['Tool'] = tools_and_authors_exp['Tool'].str.replace("‘", '')
tools_by_app_authors = tools_and_authors_exp.groupby('Tool')['AuthorCount'].mean().reset_index()
tools_by_app_authors.rename(columns={'AuthorCount': 'MeanAuthorCount'}, inplace=True)
tools_by_app_authors.sort_values(by='MeanAuthorCount', inplace=True, ascending=False)
tools_by_app_authors

fig = px.scatter(tools_by_app_authors,  
            x='Tool',
            y='MeanAuthorCount',
            title='Average Application AuthorCount per Tool',
            template='plotly_dark'
            )

fig.show()



The graph above shows the mean number of authors for an application using the specific tool. `arkworks/gm17` has the highest MeanAuthorCount being 33.5. This implies that applications using `arkworks/gm17` tend to have a high number of authors.  This may be skewed since only 2 applications use `arkworks/gm17`

Although `arkworks/algebra` it a commonly used tool among many applications, the applications that use `arkworks/algebra` tend to have a fairly low MeanAuthorCount of 7.78. 

#### Tool Usage & Application Commit Count 

In [None]:
# zkp_applications = zkp_repos[zkp_repos['Type'] == 'Application']
app_commits = application_commits.groupby('UniqueID')['CommitHash'].nunique().reset_index()
app_commits.rename(columns={'CommitHash': 'CommitCount'}, inplace=True)
tools_and_commits = zkp_applications.merge(app_commits, left_on='UniqueID', right_on='UniqueID', how='left')
tools_and_commits_exp = tools_and_commits.explode('Tool')
tools_by_app_commits = tools_and_commits_exp.groupby('Tool')['CommitCount'].mean().reset_index()
tools_by_app_commits.rename(columns={'CommitCount': 'MeanCommitCount'}, inplace=True)

tools_by_app_commits = zkp_tools[['Name', 'ToolType']].merge(tools_by_app_commits, left_on='Name', right_on='Tool', how='left')
tools_by_app_commits.sort_values(by='MeanCommitCount', ascending=False, inplace=True)

fig = px.scatter(tools_by_app_commits,  
            x='Tool',
            y='MeanCommitCount',
            title='Average Application Commit Count per Tool',
            template='plotly_dark',
            color='ToolType',
            category_orders={'Tool': tools_by_app_commits['Tool']}
            )

fig.show()

The graph above shows the mean number of commits made my applications using the specific tool. 

Here `leo` has the highest MeanCommitCount. `leo` also had a fairly high MeanAuthorCount as shown above. This could indicate that applications using `leo` have a community development approach. 

#### Tool Usage & Application Author Followers

In [None]:
commit_author_info = application_commits.merge(application_authors, left_on='Author', right_on='Login', how='right')
commit_author_info.drop_duplicates(['UniqueID', 'Author'], inplace=True)
avg_followers = commit_author_info.groupby('UniqueID')['Followers'].mean().reset_index()
avg_followers.rename(columns={'Followers': 'MeanFollowers'}, inplace=True)

app_followers = zkp_applications.merge(avg_followers, left_on='UniqueID', right_on='UniqueID', how='left')
app_followers_exp = app_followers.explode('Tool')
tool_app_followers = app_followers_exp.groupby('Tool')['MeanFollowers'].mean().reset_index()

tool_app_followers = zkp_tools[['Name', 'ToolType']].merge(tool_app_followers, left_on='Name', right_on='Tool', how='left')
tool_app_followers.sort_values(by='MeanFollowers', ascending=False, inplace=True)

fig = px.scatter(tool_app_followers,  
            x='Tool',
            y='MeanFollowers',
            title='Average Application Author Follower Count per Tool',
            template='plotly_dark', 
            color='ToolType',
            category_orders={'Tool': tool_app_followers['Tool']}
            )

fig.show()


The graph above shows the mean followers of authors contributing to applications using the specific tool. 

`bulletproofs (sdiehl)` has the highest MeanFollower count with 1085. This result may be skewed since there is only 1 application using this tool. The application using this tool created by the author of this tool (sdiehl). `bulletproofs (sdiehl)` is a popular tool, hence the author has a fairly large following. The MeanAuthor and MeanCommit counts for this tool is low. 

`snarkjs` has the second highest MeanFollower count with 330.27.

`circom` has a similar MeanFollowr count than `snarky`. These two tools are often used in combination. 


#### Tool Usage & Application Author PublicGists

In [None]:
avg_public_gists = commit_author_info.groupby('UniqueID')['PublicGists'].mean().reset_index()
avg_public_gists.rename(columns={'PublicGists': 'MeanPublicGists'}, inplace=True)

app_public_gists = zkp_applications.merge(avg_public_gists, left_on='UniqueID', right_on='UniqueID', how='left')
app_public_gists_exp = app_public_gists.explode('Tool')
tool_app_gists = app_public_gists_exp.groupby('Tool')['MeanPublicGists'].mean().reset_index()

tool_app_gists = zkp_tools[['Name', 'ToolType']].merge(tool_app_gists, left_on='Name', right_on='Tool', how='left')
tool_app_gists.sort_values(by='MeanPublicGists', ascending=False, inplace=True)

fig = px.scatter(tool_app_gists,  
            x='Tool',
            y='MeanPublicGists',
            title='Average Application Author Public Gist Count per Tool',
            template='plotly_dark', 
            color='ToolType',
            category_orders={'Tool': tool_app_gists['Tool']}
            )

fig.show()


The graph above shows the mean public gists count for the authors contributing to applications which use a specific tool. 

`bulletproofs (sdiehl)` has the highest MeanPublicGists count and MeanFollower count. 

`winterfell` has the second highest MeanPublicGists count. 

It could be authors with a high follower count tend to have a high public gist count. 


#### Tool Usage & Application Author PublicRepos

In [None]:
avg_public_repos = commit_author_info.groupby('UniqueID')['PublicRepos'].mean().reset_index()
avg_public_repos.rename(columns={'PublicRepos': 'MeanPublicRepos'}, inplace=True)

app_public_repos = zkp_applications.merge(avg_public_repos, left_on='UniqueID', right_on='UniqueID', how='left')
app_public_repos_exp = app_public_repos.explode('Tool')
tool_app_repos = app_public_repos_exp.groupby('Tool')['MeanPublicRepos'].mean().reset_index()

tool_app_repos = zkp_tools[['Name', 'ToolType']].merge(tool_app_repos, left_on='Name', right_on='Tool', how='left')
tool_app_repos.sort_values(by='MeanPublicRepos', ascending=False, inplace=True)

fig = px.scatter(tool_app_repos,  
            x='Tool',
            y='MeanPublicRepos',
            title='Average Application Author Repo Count per Tool',
            template='plotly_dark', 
            color='ToolType',
            category_orders={'Tool': tool_app_repos['Tool']}
            )

fig.show()




The graph above shows the mean public repository count for authors of the applications that use the specific tool. 

`arkworks/marlin` has the highest MeanPublicRepo count.


#### Tool Usage & Application Author Age

In [None]:
application_authors['CreatedAt'] = pd.to_datetime(application_authors['CreatedAt'],  utc=True)
application_authors['AuthorAge'] = (datetime.now(timezone.utc) - application_authors['CreatedAt']).dt.days

commit_author_info = application_commits.merge(application_authors, left_on='Author', right_on='Login', how='right')
commit_author_info.drop_duplicates(['UniqueID', 'Author'], inplace=True)

avg_age = commit_author_info.groupby('UniqueID')['AuthorAge'].mean().reset_index()
avg_age.rename(columns={'AuthorAge': 'MeanAuthorAge'}, inplace=True)

app_avg_age = zkp_applications.merge(avg_age, left_on='UniqueID', right_on='UniqueID', how='left')
app_avg_age_exp = app_avg_age.explode('Tool')
tool_app_age = app_avg_age_exp.groupby('Tool')['MeanAuthorAge'].mean().reset_index()

tool_app_age = zkp_tools[['Name', 'ToolType']].merge(tool_app_age, left_on='Name', right_on='Tool', how='left')
tool_app_age.sort_values(by='MeanAuthorAge', ascending=False, inplace=True)

fig = px.scatter(tool_app_age,  
            x='Tool',
            y='MeanAuthorAge',
            title='Average Application Author Age per Tool',
            template='plotly_dark', 
            color='ToolType',
            category_orders={'Tool': tool_app_age['Tool']}
            )

fig.show()

The graph above shows the mean author age for authors contributing to applications that use the specific tool. 

`plonky3` has the highest MeanAuthorAge, indicating that author accounts are "older" GitHub accounts. 

`plonky` has the lowest MeanAuthorAge, indicating that its author accounts are fairly young. 

`arkworks/gm17` and `bulletproofs (sdiehl)` have high mean MeanAuthorAge values - these repos have had high value for previously mentioned analytics too. 

#### Tool Usage & Application Repo Forks


In [None]:
zkp_applications_exp = zkp_applications.explode('Tool')
avg_forks = zkp_applications_exp.groupby('Tool')['Forks'].mean().reset_index()
avg_forks.rename(columns={'Forks': 'MeanForks'}, inplace=True)

avg_forks = zkp_tools[['Name', 'ToolType']].merge(avg_forks, left_on='Name', right_on='Tool', how='left')
avg_forks.sort_values(by='MeanForks', ascending=False, inplace=True)

fig = px.scatter(avg_forks,  
            x='Tool',
            y='MeanForks',
            title='Average Application Forks per Tool',
            template='plotly_dark', 
            color='ToolType',
            category_orders={'Tool': avg_forks['Tool']}
            )

fig.show()

The graph above shows the mean number of forks of the applications using the specific tool. 

`leo` has the highest MeanForks count at 501.28 which is much larger than the 2nd largest which is 30.45. Three of the applications that use `leo`, `sdk/aleohq`, `snarkos/aleohq` and `snarkvm/aleohq` have fork counts > 1000. These applications are created by the creators of `leo`, `aleohq`, which can explain the popularity and development behind these applications. These applications are part of the `leo` ecosystem. 



#### Tool Usage & Application Repo Stars


In [None]:
avg_stars = zkp_applications_exp.groupby('Tool')['Stars'].mean().reset_index()
avg_stars.rename(columns={'Stars': 'MeanStars'}, inplace=True)

avg_stars = zkp_tools[['Name', 'ToolType']].merge(avg_stars, left_on='Name', right_on='Tool', how='left')
avg_stars.sort_values(by='MeanStars', ascending=False, inplace=True)

fig = px.scatter(avg_stars,  
            x='Tool',
            y='MeanStars',
            title='Average Application Stars per Tool',
            template='plotly_dark', 
            color='ToolType',
            category_orders={'Tool': avg_stars['Tool']}
            )

fig.show()

The graph above shows the mean number of stars for applications which use a certain tool. 

As with the MeanForks graph, `leo` contains the highest MeanStars value of 516.14, which is much larger than the 2nd MeanStars value of 139. This can be explained by the applications, `aleohq/snarkvm`, `aleohq/snarkos` and `aleohq/sdk` which form part of the `leo` ecosystem. 

#### Tool Usage & Application Repo Age

In [None]:
zkp_applications_exp['Created'] = pd.to_datetime(zkp_applications_exp['Created'], utc=True)
zkp_applications_exp['Updated'] = pd.to_datetime(zkp_applications_exp['Updated'], utc=True)
zkp_applications_exp['Age'] = (zkp_applications_exp['Updated'] - zkp_applications_exp['Created']).dt.days
avg_app_age = zkp_applications_exp.groupby('Tool')['Age'].mean().reset_index()
avg_app_age.rename(columns={'Age': 'MeanApplicationAge'}, inplace=True)

avg_app_age = zkp_tools[['Name', 'ToolType']].merge(avg_app_age, left_on='Name', right_on='Tool', how='left')
avg_app_age.sort_values(by='MeanApplicationAge', ascending=False, inplace=True)

fig = px.scatter(avg_app_age,  
            x='Tool',
            y='MeanApplicationAge',
            title='Average Application Age per Tool',
            template='plotly_dark', 
            color='ToolType',
            category_orders={'Tool': avg_app_age['Tool']}
            )

fig.show()


The graph above shows the mean application age of the applications using a specific tool. 

`libsnark` has the highest MeanApplicationAge of 1503. `libsnark` is the oldest tools and for a while it was the only ZKP tool which can account for the high MeanApplicationAge. 

#### Tool Usage & Application Repo Start Date

In [None]:
zkp_applications_exp['Created'] = pd.to_datetime(zkp_applications_exp['Created'], utc=True)
avg_app_age = zkp_applications_exp.groupby('Tool')['Created'].mean().reset_index()
avg_app_age.rename(columns={'Created': 'MeanStartDate'}, inplace=True)

avg_app_age = zkp_tools[['Name', 'ToolType']].merge(avg_app_age, left_on='Name', right_on='Tool', how='left')
avg_app_age.sort_values(by='MeanStartDate', ascending=True, inplace=True)

fig = px.scatter(avg_app_age,  
            x='Tool',
            y='MeanStartDate',
            title='Average Application Age per Tool',
            template='plotly_dark', 
            color='ToolType',
            category_orders={'Tool': avg_app_age['Tool']}
            )

fig.show()

The graph above shows the MeanStartDate of applications using a specific tool. 

`libsnark` has the earliest MeanStartDate. As discussed previously, `libsnark` is the oldest tool, and so it accounts for having the oldest applications. 

On the other side, `plonky3` has the latest MeanStartDate. `plonky3` is the youngest tool. 


#### Tool Usage & Application Hireable Count

In [None]:
hireable_authors = application_commits.merge(application_authors, left_on='Author', right_on='Login', how='left')
hireable_authors.drop_duplicates(['UniqueID', 'Author'], inplace=True)
hireable_authors = hireable_authors.groupby('UniqueID')['Hireable'].count().reset_index()
hireable_authors.rename(columns={'Hireable': 'HireableCount'}, inplace=True)
hireable_authors.sort_values(by='HireableCount', ascending=False, inplace=True)
hireable_authors = hireable_authors.merge(zkp_applications_exp, left_on='UniqueID', right_on='UniqueID', how='left')
hireable_authors = hireable_authors.groupby('Tool')['HireableCount'].mean().reset_index()
hireable_authors.sort_values(by='HireableCount', inplace=True, ascending=False)
hireable_authors.rename(columns={'HireableCount': 'MeanHireableCount'}, inplace=True)

hireable_authors = zkp_tools[['Name', 'ToolType']].merge(hireable_authors, left_on='Name', right_on='Tool', how='left')
hireable_authors.sort_values(by='MeanHireableCount', ascending=False, inplace=True)

fig = px.scatter(hireable_authors,  
            x='Tool',
            y='MeanHireableCount',
            title='Average Application Hireable Authors Count per Tool',
            template='plotly_dark', 
            color='ToolType',
            category_orders={'Tool': hireable_authors['Tool']}
            )

fig.show()

The graph above shows the mean hirable authors count of applications using a specific tool. 

`arkworks/marlin` has the highest MeanHireableCount. This could indicate the most applications using this tool are side-projects, or made by users aside from their industry projects.

#### Tool Usage & Active Applications

In [None]:
zkp_applications_exp['Updated'] = pd.to_datetime(zkp_applications_exp['Updated'], utc=True)
zkp_applications_exp['Active'] = zkp_applications_exp['Updated'].dt.year == 2023
active_apps = zkp_applications_exp.groupby('Tool')['Active'].count().reset_index()
active_apps.rename(columns={'Active': 'ActiveAppCount'}, inplace=True)
active_apps.sort_values(by='ActiveAppCount', inplace=True, ascending=False)

active_apps = zkp_tools[['Name', 'ToolType']].merge(active_apps, left_on='Name', right_on='Tool', how='left')
active_apps.sort_values(by='ActiveAppCount', ascending=False, inplace=True)

fig = px.scatter(active_apps,  
            x='Tool',
            y='ActiveAppCount',
            title='Average Active Application Count per Tool',
            template='plotly_dark', 
            color='ToolType',
            category_orders={'Tool': active_apps['Tool']}
            )

fig.show()

The graph above shows the mean amount of active applications that use a specific tool. 

`arkworks/algebra` and `arkworks/std` have the largest MeanActiveAppCount of 595 and 588. These are the two most commonly used tool, often use in combination.

`circom` and `snarkjs` both had high MeanActiveAppCount values. These two tools are often used in combination since they are created by the same team and are used in an ecosystem to write circuits (`circom`) and generate proofs (`snarkjs`).


#### Author Commit Count

In [None]:
app_authors =  application_commits.merge(application_authors, left_on='Author', right_on='Login', how='left')
author_commit_count = app_authors.groupby(['UniqueID', 'Author'])['CommitHash'].count().reset_index()
author_commit_count = author_commit_count.groupby('UniqueID')['CommitHash'].mean().reset_index()
author_commit_count.rename(columns={'CommitHash': 'MeanAuthorCommits'}, inplace=True)

mean_commit_bins = np.logspace(np.log10(author_commit_count['MeanAuthorCommits'].min()-0.5), np.log10(author_commit_count['MeanAuthorCommits'].max()), num=10)
mean_commit_bins = np.round(mean_commit_bins)
author_commit_count['CommitCountGroup'] =  pd.cut(author_commit_count['MeanAuthorCommits'], bins=mean_commit_bins, duplicates='drop')
author_commit_count[author_commit_count['CommitCountGroup'].isna()]

commit_bin_count = author_commit_count.groupby('CommitCountGroup')['UniqueID'].count().reset_index()
commit_bin_count.rename(columns={'UniqueID':'Count'}, inplace=True)
commit_bin_count['CommitCountGroup'] = commit_bin_count['CommitCountGroup'].astype(str)


fig = px.bar(commit_bin_count,  
            x='CommitCountGroup',
            y='Count',
            title='Average Commits Count per Author by bins',
            template='plotly_dark', 
            )

fig.show()

The graph above shows the mean commits made by the authors of the applications. 

The four largest bins range around the middle values. These bins span the mean commit count from 6-64 commits. These are relatively few commits, since the largest mean commit count is 732. 

High  mean commit counts are rare. 

#### Languages Used By Applications

In [None]:
fig = px.bar(zkp_applications['Language'].value_counts().reset_index(),  
            x='count', 
            y='Language',
            title='Frequency of Languages used by Applications',
            template='plotly_dark', 
            height=1000
            )

fig.show()

As seen above, Rust is the most commonly uses language, followed by Cairo and TypeScript which have similar counts. 

#### Languages Used By Tools

In [None]:
fig = px.bar(zkp_tools['Language'].value_counts().reset_index(),  
            x='count', 
            y='Language',
            title='Frequency of Languages used by Applications',
            template='plotly_dark', 
            )

fig.show()

Rust is the most commonly used language among tools, followed by Python, JavaScript and C++. As seen previously, Rust is also most commonly used among the applications.

##### Cluster Applications Based on Repository Development Features

In [None]:
# REMOVE SINGLE AUTHORS 
development_features = apps[apps['AuthorCount'] > 1]

cols = [
        # 'Size', 
        'Age',
        # 'ActiveAuthorCount',
        # 'Active',
        'AuthorCount',	
        'CommitCount',	

        # 'Stars',
        # 'Forks', 
        # 'Watchers', 
        # 'Issues',

]


nr_components = 2
X = development_features[cols]
scaler = StandardScaler()
df_standardized = scaler.fit_transform(X)
pca = PCA(n_components=2)
df_pca = pca.fit_transform(df_standardized)
kmeans = KMeans(n_clusters=3, random_state=42)
development_features['Cluster'] = kmeans.fit_predict(df_standardized)
components = pca.fit_transform(X)
pca.explained_variance_ratio_





A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



array([0.9315144 , 0.06840354])

In [None]:
pd.DataFrame(pca.components_.T, columns=['PC1', 'PC2'], index=cols)

Unnamed: 0,PC1,PC2
Age,0.146717,0.98909
AuthorCount,0.006879,0.012321
CommitCount,0.989155,-0.146793


In [None]:
fig = px.scatter(
    development_features,
    x=df_pca[:, 0],
    y=df_pca[:, 1],
    color='Cluster', 
    hover_name='UniqueID',
    labels={'color': 'Cluster'},
    title='PCA Scatter Plot with Clusters',
    template='plotly_dark',
    color_continuous_scale='rainbow'
)

fig.show()

In [None]:
features_to_plot = cols

for feature in features_to_plot:
    fig = px.box(development_features, 
                 x='Cluster', 
                 y=feature, 
                 points='all', 
                 title=f'Distribution of {feature} by Cluster',
                 template='plotly_dark',
                 hover_name='UniqueID'
                 )
    fig.show()

In [None]:
popularity_features = pd.get_dummies(apps, columns=['Language'], prefix='Language')

cols = [
        # 'UniqueID'
        # 'Tool', 
        
        # 'Language', 

        'Stars',
        'Forks', 
        'Watchers', 
        'Issues', 

        # 'FirstCommitDate', 
        # 'LastCommitDate',


        # 'Size', 
        # 'Age',
        # 'ActiveAuthorCount',
        # 'Active',
        'AuthorCount',	
        # 'CommitCount',	

        # 'Language_Assembly', 
        # 'Language_C', 
        # 'Language_C++', 
        # 'Language_CMake',
        # 'Language_CSS', 
        # 'Language_Cairo', 
        # 'Language_Circom', 
        # 'Language_Cuda',
        # 'Language_Dart', 
        # 'Language_Go', 
        # 'Language_HTML', 
        # 'Language_Haskell',
        # 'Language_Java', 
        # 'Language_JavaScript', 
        # 'Language_JetBrains MPS',
        # 'Language_Jupyter Notebook', 
        # 'Language_Kotlin', 
        # 'Language_Lean',
        # 'Language_Makefile', 
        # 'Language_OCaml', 
        # 'Language_Python',
        # 'Language_Racket', 
        # 'Language_Roff', 
        # 'Language_Rust', 
        # 'Language_Shell',
        # 'Language_Solidity', 
        # 'Language_Swift', 
        # 'Language_TypeScript',
        # 'Language_Vim script', 
        # 'Language_Vue'     

]


nr_components = 2
X = popularity_features[cols]
scaler = StandardScaler()
df_standardized = scaler.fit_transform(X)
pca = PCA(n_components=2)
df_pca = pca.fit_transform(df_standardized)
kmeans = KMeans(n_clusters=3, random_state=42)
popularity_features['Cluster'] = kmeans.fit_predict(df_standardized)
components = pca.fit_transform(X)
pca.explained_variance_ratio_





array([0.98546014, 0.00983094])

In [None]:
pd.DataFrame(pca.components_.T, columns=['PC1', 'PC2'], index=cols)


Unnamed: 0,PC1,PC2
Stars,0.706079,-0.03691
Forks,0.053374,0.949744
Watchers,0.70604,-0.039476
Issues,0.010224,0.299933
AuthorCount,0.002451,0.07145


In [None]:
fig = px.scatter(
    popularity_features,
    x=df_pca[:, 0],
    y=df_pca[:, 1],
    color='Cluster', 
    hover_name='UniqueID',
    labels={'color': 'Cluster'},
    title='PCA Scatter Plot with Clusters',
    template='plotly_dark', 
    color_continuous_scale='rainbow'
)

fig.show()

In [None]:
features_to_plot = cols

for feature in features_to_plot:
    fig = px.box(popularity_features, 
                 x='Cluster', 
                 y=feature, 
                 points='all', 
                 title=f'Distribution of {feature} by Cluster',
                 template='plotly_dark',
                 hover_name='UniqueID'
                 )
    # fig.show()


### Tool Lifespan

#### Tool Lifespans

In [None]:
zkp_tools = zkp_repos[zkp_repos['Type'] == 'Tool'].drop(columns=['MonthYear'])

first_commit = tool_commits.groupby('UniqueID')['AuthorDate'].min().reset_index()
first_commit.rename(columns={'AuthorDate': 'FirstCommitDate'}, inplace=True)
first_commit = zkp_tools.merge(first_commit[['UniqueID', 'FirstCommitDate']], left_on='UniqueID', right_on='UniqueID', how='left').reset_index()
zkp_tools = zkp_tools.merge(first_commit[['UniqueID', 'FirstCommitDate']], left_on='UniqueID', right_on='UniqueID', how='left').reset_index()

last_commit = tool_commits.groupby('UniqueID')['AuthorDate'].max().reset_index()
last_commit.rename(columns={'AuthorDate': 'LastCommitDate'}, inplace=True)
last_commit = zkp_tools.merge(last_commit[['UniqueID', 'LastCommitDate']], left_on='UniqueID', right_on='UniqueID', how='left').reset_index()
zkp_tools = zkp_tools.merge(last_commit[['UniqueID', 'LastCommitDate']], left_on='UniqueID', right_on='UniqueID', how='left').reset_index()

zkp_tools['FirstCommitDate'] = pd.to_datetime(zkp_tools['FirstCommitDate'], utc=True)
zkp_tools['LastCommitDate'] = pd.to_datetime(zkp_tools['LastCommitDate'], utc=True)
zkp_tools['Age'] = (zkp_tools['LastCommitDate'] - zkp_tools['FirstCommitDate']).dt.days
zkp_tools.sort_values(by='Age', ascending=False, inplace=True)

fig = px.bar(zkp_tools,  x='UniqueID', y='Age',
                   title='Tool Lifespan (in days)',
                   template='plotly_dark', 
                )

fig.show()

From the graph above, it can be seen that `bellman` has the longest lifespan. 

#### Tool Start Date

In [None]:
zkp_tools.sort_values(by='FirstCommitDate', inplace=True)
# zkp_tools[zkp_tools['Name'] == 'cairo'] # 2022-05-19
fig = px.scatter(zkp_tools,  x='Name', y='FirstCommitDate',
                   title='Tool Start Date',
                   template='plotly_dark', 
                )

fig.show()

#### Tool End-Date

In [None]:
zkp_tools.sort_values(by='LastCommitDate', inplace=True)

fig = px.scatter(zkp_tools,  x='Name', y='LastCommitDate',
                   title='Last Tool Commit Date',
                   template='plotly_dark', 
                )

fig.show()

### Tool Counts

#### Tool Counts

In [None]:
zkp_tools =  zkp_tools.drop(columns=['Tool'])
tool_counts = zkp_applications_exp['Tool'].value_counts().reset_index()
tool_counts = tool_counts.merge(zkp_tools, left_on='Tool', right_on='Name', how='left')
tool_counts.drop(columns=['Name'], inplace=True)
tool_counts.rename(columns={'count':'Count'}, inplace=True)

fig = px.bar(tool_counts, 
            x='Tool', 
            y='Count',
            template='plotly_dark', 
            title='Distribution of Tool Usage Frequency',
            text='Count',
            color='ToolType',
            category_orders={'Tool': tool_counts['Tool']}
            )
fig.show()



### Tool Type

#### Tool-Type Count

In [None]:
tool_type = zkp_applications_exp[['UniqueID', 'Tool']].merge(zkp_tools[['Name', 'ToolType']], left_on='Tool', right_on='Name', how='left')
type_counts = tool_type['ToolType'].value_counts().reset_index()
type_counts.rename(columns={'count':'Count'}, inplace=True)

fig = px.bar(type_counts, 
            x='ToolType', 
            y='Count',
            template='plotly_dark', 
            title='Distribution of ToolType Usage Frequency',
            text='Count', 
            )
fig.show()

As seen above, low-level zkp development appears to be the most common tool type. Many applications use the DSL Cairo, which is built using these low-level tools. 

#### Tool Lifespan & Tool Count

In [None]:
tool_counts.sort_values(by='Count', inplace=True)

fig = px.scatter(tool_counts, 
            x='Count', 
            y='Age',
            template='plotly_dark', 
            title='Relationship Between Tool Count & Tool Age',
            trendline='ols',
            hover_name='Tool'
            )
fig.show()

There is a weak correlation (R^2 = 0.042) between the Lifespan and tool usage. 

### Cluster Tools

#### Cluster Tool Features

In [None]:
tools = zkp_tools[['Name','Description', 'URL', 'Tool Resources','Size', 'Age', 'Language', 'Stars', 'Forks', 'Watchers', 'Issues', 'Created', 'Updated', 'UniqueID' ]].merge(hireable_authors, left_on='Name', right_on='Name', how='left')
tools = tools.merge(tools_by_app_authors, left_on='Tool', right_on='Tool', how='left')
tools = tools.merge(tools_by_app_commits.drop(columns=['ToolType', 'Tool']), left_on='Name', right_on='Name', how='left')
tools = tools.merge(tool_app_followers.drop(columns=['ToolType', 'Tool']), left_on='Name', right_on='Name', how='left')
tools = tools.merge(tool_app_gists.drop(columns=['ToolType', 'Tool']), left_on='Name', right_on='Name', how='left')
tools = tools.merge(tool_app_repos.drop(columns=['ToolType', 'Tool']), left_on='Name', right_on='Name', how='left')
tools = tools.merge(tool_app_age.drop(columns=['ToolType', 'Tool']), left_on='Name', right_on='Name', how='left')
tools = tools.merge(avg_forks.drop(columns=['ToolType', 'Tool']), left_on='Name', right_on='Name', how='left')
tools = tools.merge(avg_stars.drop(columns=['ToolType', 'Tool']), left_on='Name', right_on='Name', how='left')
tools = tools.merge(avg_app_age.drop(columns=['ToolType', 'Tool']), left_on='Name', right_on='Name', how='left')
tools = tools.merge(active_apps.drop(columns=['ToolType', 'Tool']), left_on='Name', right_on='Name', how='left')
last_commit = tool_commits.groupby('UniqueID')['CommitterDate'].max().reset_index()
tools = tools.merge(last_commit, left_on='UniqueID', right_on='UniqueID', how='left')
tools['CommitterDate'] = pd.to_datetime(tools['CommitterDate'], utc=True)
tools['Active'] = tools['CommitterDate'].dt.year == 2023
tools['Active'] = tools['Active'].astype(int)

In [None]:
tool_commits['CommitterDate'] = pd.to_datetime(tool_commits['CommitterDate'], utc=True)
active_tool_commits = tool_commits[tool_commits['CommitterDate'].dt.year == 2023]
active_tool_commits = active_tool_commits.merge(repo_contributors[['Contributor']], left_on='Author', right_on='Contributor', how='left')
active_tool_commits = active_tool_commits.dropna(subset=['Contributor'])
active_tool_commits = active_tool_commits.groupby('UniqueID')['Contributor'].nunique().reset_index()
active_tool_commits.rename(columns={'Contributor': 'ActiveContributors'}, inplace=True)
tools = tools.merge(active_tool_commits, left_on='UniqueID', right_on='UniqueID', how='left')
tools['ActiveContributors'] = tools['ActiveContributors'].fillna(0)


In [None]:
tool_features = pd.get_dummies(tools, columns=['ToolType'], prefix='ToolType')

cols = [
    # 'Size',
    'Age',
    'Stars', 
    'Forks', 
    'Watchers', 
    'Issues', 
    'ActiveAppCount',
    'Active',
    'ActiveContributors',
    

    # 'MeanHireableCount', 
    # 'MeanAuthorCount',
    'MeanCommitCount', 
    'MeanFollowers', 
    # 'MeanPublicGists',
    # 'MeanPublicRepos', 
    'MeanAuthorAge', 
    'MeanForks', 
    'MeanStars',



    'ToolType_DSL',
    'ToolType_Library', 
    'ToolType_Low-level ZK Development',
    'ToolType_Proof System', 
    'ToolType_zkEVM', 
    'ToolType_zkVM' 

]


nr_components = 2
X = tool_features[cols]
scaler = StandardScaler()
df_standardized = scaler.fit_transform(X)
pca = PCA(n_components=2)
df_pca = pca.fit_transform(df_standardized)
kmeans = KMeans(n_clusters=3, random_state=42)
tool_features['Cluster'] = kmeans.fit_predict(df_standardized)
components = pca.fit_transform(X)
pca.explained_variance_ratio_





array([0.38822358, 0.22749925])

In [None]:
pd.DataFrame(pca.components_.T, columns=['PC1', 'PC2'], index=cols)

Unnamed: 0,PC1,PC2
Age,0.372241,-0.290119
Stars,0.754343,0.005849
Forks,0.407459,0.082609
Watchers,0.306369,-0.296022
Issues,0.022797,0.004619
ActiveAppCount,0.003147,0.045122
Active,7.2e-05,-4e-05
ActiveContributors,0.000183,-0.000133
MeanCommitCount,-0.024967,-0.106992
MeanFollowers,-0.004148,-0.142594


In [None]:
fig = px.scatter(
    tool_features,
    x=df_pca[:, 0],
    y=df_pca[:, 1],
    color='Cluster', 
    hover_name='UniqueID',
    labels={'color': 'Cluster'},
    title='PCA Scatter Plot with Clusters',
    template='plotly_dark',
    color_continuous_scale='Rainbow',
)

fig.show()


In [None]:
features_to_plot = cols

for feature in features_to_plot:
    fig = px.box(tool_features, 
                 x='Cluster', 
                 y=feature, 
                 points='all', 
                 title=f'Distribution of {feature} by Cluster',
                 template='plotly_dark',
                 hover_name='UniqueID'
                 )
    fig.show()