In [None]:
# !pip install pydriller
import pandas as pd
import os
from pathlib import Path
import plotly.express as px

TODO:

1. Unpack arkworks into its individual crates and treat each crate as a tool
2. Unpack starknet into its packages 

```
{'arkwork',
 'arkworks',
 'artworks',
 'bellman, halo2',
 'nan',
 'none',
 'snarks', 🤷‍♂️
 'starknet'}
```

In [None]:
# Semicolon-separated repository inventory; later cells read its
# 'Type', 'Tool', 'Name' and 'URL' columns.
ZKP_REPOS_CSV = './zkp_repos.csv'
df = pd.read_csv(ZKP_REPOS_CSV, sep=';')

# Lookup table from a raw (lower-cased, stripped) tool name to the name it
# should be counted under. Names that are not listed are used unchanged.
tools_mappings = {
    # spellings of "no tool"
    "none.": "none",
    "": "none",
    "none..": "none",
    # alternative names folded into one entry
    "bellpepper": "bellman",
    "bellperson": "bellman",
    "aleo": "leo",
    "aztec": "noir",
    "dusk plonk": "merlin",
    "libstark": "libsnark",
    "miden": "miden-vm",
    "mina": "snarky",
    # note: this value names two tools in a single comma-separated string
    "nova": "bellman, halo2",
    "risc zero": "risc0",
    "snarksjs": "snarkjs",
    "zerosync": "cairo",
    "zokcrates": "zokrates",
}

In [None]:
# Partition the inventory by its 'Type' column: tool rows vs. application rows.
df_tools = df.loc[df['Type'].eq('Tool')]
df_apps = df.loc[df['Type'].eq('Application')]

In [None]:
def _split_tool_names(raw):
    """Split a comma-separated tool string into lower-cased, stripped names."""
    return [part.strip() for part in str(raw).lower().split(',')]


# Distinct raw names mentioned by applications. Missing values are dropped
# first: str(NaN) would otherwise show up as a tool literally called 'nan'.
raw_tool_names = {
    name
    for cell in df_apps['Tool'].dropna().unique()
    for name in _split_tool_names(cell)
}

# Map aliases/typos to their canonical name. A mapping may expand to several
# tools (e.g. 'nova' -> 'bellman, halo2'), so the mapped value is split again
# instead of being kept as one comma-joined pseudo-tool.
tools_used = sorted({
    tool
    for name in raw_tool_names
    for tool in _split_tool_names(tools_mappings.get(name, name))
})
tools_used

In [None]:
# Lower-cased, de-duplicated names of the rows classified as tools.
lowercase_tool_names = df_tools['Name'].str.lower()
tools = lowercase_tool_names.unique()
tools

In [None]:
# Tool names referenced by applications that have no matching row among the tools.
set(tools_used).difference(tools)

In [None]:
# Inspect the applications whose whole 'Tool' value is 'snarks' (case-insensitive).
df_apps.loc[df_apps['Tool'].str.lower().eq('snarks')]

In [None]:
def clone_repo(url, dst_dir='./repos'):
    """Clone the git repository at ``url`` into ``dst_dir/<repo name>``.

    The repository name is the last path segment of ``url`` with a trailing
    slash and a ``.git`` suffix removed, e.g. ``.../zkcrypto/bellman.git``
    becomes ``bellman``. Dots inside the name are kept so that differently
    named repositories cannot collapse onto the same directory.

    Parameters
    ----------
    url : str
        Clone URL. Non-string or blank values (such as NaN coming from a
        DataFrame column) are skipped.
    dst_dir : str or Path, optional
        Parent directory that holds all checkouts.

    Returns
    -------
    None
        Cloning is best effort: a failing ``git clone`` is reported with a
        printed message instead of raising, so one bad URL does not abort a
        batch of clones.
    """
    import subprocess  # local import: keeps this cell runnable on its own

    if not isinstance(url, str) or not url.strip():
        return  # nothing to clone

    url = url.strip()
    repo_name = url.rstrip('/').split('/')[-1]
    if repo_name.endswith('.git'):
        repo_name = repo_name[:-len('.git')]
    if not repo_name:
        return

    target = Path(dst_dir) / repo_name
    if target.is_dir() and any(target.iterdir()):
        return  # already cloned on a previous run; git would refuse anyway

    # Only the parent is created; git creates (or fills an empty) target itself.
    target.parent.mkdir(parents=True, exist_ok=True)

    # Argument list instead of a shell string: the URL is never parsed by a
    # shell, and '--' stops git from reading it as an option.
    result = subprocess.run(['git', 'clone', '--', url, str(target)])
    if result.returncode != 0:
        print(f'git clone failed for {url} (exit code {result.returncode})')

In [None]:
# Clone every distinct tool repository. clone_repo is called only for its side
# effect, so a plain loop is used instead of .apply (which would display a
# Series of None). Rows without a URL are skipped.
for repo_url in df_tools['URL'].dropna().unique():
    clone_repo(repo_url)

In [None]:
from pydriller import Repository

# Local checkout whose commit history the following cells walk through.
BELLMAN_REPO_PATH = "./repos/bellman/"
repo = Repository(BELLMAN_REPO_PATH)

In [None]:
# Commit attributes collected below, in the column order of the resulting table
# (types and descriptions as listed in the original field notes):
#   hash (str)                     hash of the commit
#   msg (str)                      commit message
#   author (Developer)             commit author (name, email) -- only the name is stored
#   author_date (datetime)         authored date
#   committer_date (datetime)      commit date
#   branches (List[str])           branches that contain this commit
#   in_main_branch (bool)          True if the commit is in the main branch
#   merge (bool)                   True if the commit is a merge commit
#   modified_files                 list of ModifiedFile objects changed by the commit
#   parents (List[str])            list of the commit parents
#   project_name (str)             project name
#   project_path (str)             project path
#   deletions (int)                deleted lines (as shown by --shortstat)
#   insertions (int)               added lines (as shown by --shortstat)
#   lines (int)                    added + deleted lines (as shown by --shortstat)
#   files (int)                    files changed (as shown by --shortstat)
#   dmm_unit_size (float)          DMM metric value for the unit size property
#   dmm_unit_complexity (float)    DMM metric value for the unit complexity property
#   dmm_unit_interfacing (float)   DMM metric value for the unit interfacing property
# Listed in the original notes but not collected here:
#   committer, author_timezone, committer_timezone
commit_fields = (
    "hash",
    "msg",
    "author",
    "author_date",
    "committer_date",
    "branches",
    "in_main_branch",
    "merge",
    "modified_files",
    "parents",
    "project_name",
    "project_path",
    "deletions",
    "insertions",
    "lines",
    "files",
    "dmm_unit_size",
    "dmm_unit_complexity",
    "dmm_unit_interfacing",
)

# One plain dict per commit, ready to be turned into a DataFrame.
commits = []
for commit in repo.traverse_commits():
    record = {}
    for field in commit_fields:
        if field == "author":
            # keep just the author's name rather than the whole author object
            record[field] = commit.author.name
        else:
            record[field] = getattr(commit, field)
    commits.append(record)

In [None]:
# One row per collected commit; preview the first two rows transposed so that
# every field is readable.
df_commits = pd.DataFrame(data=commits)
df_commits.head(2).transpose()

In [None]:
# Distinct author names found in the commit history.
author_names = df_commits['author']
author_names.unique()

In [None]:
# Number of commits per author.
commit_authors = df_commits['author']
commit_authors.value_counts()

In [None]:
# Length of each commit message in characters, via the vectorised string
# accessor rather than a per-row Python lambda.
df_commits['len_msg'] = df_commits['msg'].str.len()

In [None]:
# Each 'branches' entry is a collection of branch names, which is unhashable,
# so calling value_counts() on the raw column raises TypeError. Collapse every
# entry to a sorted, comma-joined string first so that identical branch
# combinations are counted together.
df_commits['branches'].map(lambda names: ', '.join(sorted(names))).value_counts()

In [None]:
# Commit-message length against authoring date, one colour per author.
scatter_options = dict(x="author_date", y="len_msg", color="author", template="plotly_dark")
px.scatter(df_commits, **scatter_options)

In [None]:
# Distribution of commit-message length per author, with a box overlay and
# every individual point drawn.
px.violin(
    df_commits,
    y='len_msg',
    color='author',
    box=True,
    points='all',
    template='plotly_dark',
)

In [None]:
# Inserted lines per commit over time, per author, drawn as connected markers.
fig_insertions = px.line(
    df_commits,
    x="author_date",
    y="insertions",
    color="author",
    template="plotly_dark",
)
fig_insertions.update_traces(mode="markers+lines")

In [None]:
# Normalise the authoring timestamps to timezone-aware UTC datetimes.
df_commits['author_date'] = pd.to_datetime(df_commits['author_date'], utc=True)

# Count rows per calendar-month bin; the 'hash' column of the result is the
# number of commits in that month.
monthly_bins = pd.Grouper(freq='M')
df_commits_per_month = (
    df_commits
    .set_index('author_date')
    .groupby(monthly_bins)
    .count()
    .reset_index()
)
px.line(df_commits_per_month, x='author_date', y='hash', template='plotly_dark')

In [None]:
# Weekday order for the heatmap axis (Monday first).
WEEKDAY_ORDER = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

# Derive weekday name and hour from the authoring timestamp. 'author_date'
# must already be datetime-typed (an earlier cell converts it with utc=True).
author_timestamps = df_commits['author_date']
df_commits['DOW'] = author_timestamps.dt.day_name()
df_commits['hour_of_day'] = author_timestamps.dt.hour

# Commits per (weekday, hour) pair, busiest slot first.
df_commit_times = (
    df_commits
    .groupby(['DOW', 'hour_of_day'])['hash']
    .count()
    .sort_values(ascending=False)
    .reset_index()
)
px.density_heatmap(
    df_commit_times,
    x='DOW',
    y='hour_of_day',
    z='hash',
    template='plotly_dark',
    category_orders={'DOW': WEEKDAY_ORDER},
)