### Understanding what drives tool usage

In [None]:
import pandas as pd
import plotly.express as px
import numpy as np
from datetime import datetime, timezone
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from ast import literal_eval


In [None]:
# Repository catalogue; the only input read with a semicolon separator.
zkp_repos = pd.read_csv("zkp_repos.csv", sep=";")

# Remaining inputs are read with pandas' default (comma) separator, in the
# same order as the names on the left-hand side.
(
    tool_commits,
    application_commits,
    tool_issues,
    repo_contributors,
    contributor_data,
    branches_data,
    application_authors,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        "tool_commits.csv",
        "application_commits.csv",
        "tool_issues.csv",
        "repo_contributors.csv",
        "contributor_data.csv",
        "branches_data.csv",
        "application_authors.csv",
    )
)

##### Get AppCount

In [None]:
# Application rows store their tools as a stringified list (e.g. "['a', 'b']").
# Turn those strings into real Python lists so the column can be exploded.
# Only cells that are still strings are parsed, so re-running this cell after
# the column has been converted leaves the existing lists untouched.
needs_parsing = (zkp_repos["Type"] == "Application") & zkp_repos["Tool"].map(
    lambda value: isinstance(value, str)
)
zkp_repos.loc[needs_parsing, "Tool"] = (
    zkp_repos.loc[needs_parsing, "Tool"]
    # regex=False: the brackets and quote are literal characters, not patterns.
    .str.replace("[", "", regex=False)
    .str.replace("]", "", regex=False)
    .str.replace("'", "", regex=False)
    .str.split(", ")
)

# One row per (repository, tool) pair.
zkp_repos_exploded = zkp_repos.explode("Tool")

In [None]:
# How many exploded rows reference each tool name.
tool_counts = (
    zkp_repos_exploded["Tool"]
    .value_counts()
    .rename_axis("Tool")
    .reset_index(name="AppCount")
)

# Outer join on the tool's name so tools without any usage row are kept
# (AppCount is NaN for them); only the identifier and the count are retained.
tool_counts = pd.merge(
    tool_counts,
    zkp_repos.loc[zkp_repos["Type"] == "Tool"],
    how="outer",
    left_on="Tool",
    right_on="Name",
)[["UniqueID", "AppCount"]]
tool_counts

## Issues over time

In [None]:
# Count issues (non-null `Name` values) per tool and creation month.
tool_issues_over_time = (
    tool_issues.groupby(["UniqueID", "CreatedYearMonth"])["Name"]
    .count()
    .reset_index(name="IssueCount")
)
# Cast the month label to a datetime so the x-axis is a real time axis.
tool_issues_over_time = tool_issues_over_time.assign(
    CreatedYearMonth=tool_issues_over_time["CreatedYearMonth"].astype(
        "datetime64[ns]"
    )
)

px.line(
    tool_issues_over_time,
    x="CreatedYearMonth",
    y="IssueCount",
    color="UniqueID",
    title="Issues over time",
    template="plotly_dark",
)

From the above graph we see that some of the tools have a big surge in issues in the early days of the repo (usually the first year or so), after which the rate seems to stabilize. We can potentially use the ratio of first-year issues to total issues as a normalised metric to understand how a tool's issue volume changes over time.

#### App Count & Issue Resolution

In [None]:
# Issues per tool, one column per issue state.
issue_counts = (
    tool_issues.groupby(["UniqueID", "State"])
    .size()
    .unstack(fill_value=0)
    .reset_index()
)
# Positional rename — assumes `State` has exactly two values whose sorted
# order is closed, open. TODO confirm against the data.
issue_counts.columns = ["UniqueID", "Closed", "Open"]
issue_counts["IssueResolutionRate"] = issue_counts["Closed"] / issue_counts[
    ["Closed", "Open"]
].sum(axis=1)
issue_counts = issue_counts.sort_values("IssueResolutionRate", ascending=False)

# Outer join keeps tools that appear in only one of the two frames.
merged_tools_issues = (
    issue_counts[["UniqueID", "IssueResolutionRate"]]
    .merge(tool_counts, on="UniqueID", how="outer")
    .sort_values("AppCount")
)

fig = px.scatter(
    merged_tools_issues,
    x="IssueResolutionRate",
    y="AppCount",
    color="UniqueID",
    title="Application Usage and Issue Resolution Rate",
    template="plotly_dark",
)

fig.show()

Why is the resolution rate so low for: 


|    | UniqueID                  |   IssueResolutionRate |   AppCount |
|---:|:--------------------------|----------------------:|-----------:|
| 38 | libsnark/scipr-lab        |              0.423645 |         64 |
| 39 | cairo-lang/starkware-libs |              0.341176 |        110 |
| 37 | circomlib/iden3           |              0.446602 |        167 |


In [None]:
# Same data as the coloured scatter in the previous cell, so reuse
# `merged_tools_issues` from there instead of recomputing it; this version
# adds an ordinary-least-squares trendline and shows the tool on hover.
fig = px.scatter(
    merged_tools_issues,
    x="IssueResolutionRate",
    y="AppCount",
    template="plotly_dark",
    title="Application Usage and Issue Resolution Rate",
    trendline="ols",
    hover_data=["UniqueID"],
)

fig.show()

From the graph we can see that there is no clear correlation between the issue resolution rate and the number of apps that use it.

#### App Count & Issues Opened per Month

In [None]:
# Month in which each issue was created. Computed on the side instead of being
# written back into `tool_issues`, so the raw frame keeps its loaded columns.
issue_created_month = pd.to_datetime(tool_issues["CreatedAt"]).dt.to_period("M")

monthly_opened_issues = (
    tool_issues[["UniqueID"]]
    .assign(YearMonth=issue_created_month)
    # issues opened per tool per month
    .groupby(["UniqueID", "YearMonth"])
    .size()
    .reset_index(name="OpenedCount")
    # Average over the months present for each tool; months with no issues
    # have no row here, so they do not pull the mean down.
    .groupby("UniqueID")["OpenedCount"]
    .mean()
    .reset_index(name="AverageOpenedPerMonth")
    .merge(tool_counts, on="UniqueID", how="left")
)

fig = px.scatter(
    monthly_opened_issues,
    x="AverageOpenedPerMonth",
    y="AppCount",
    color="UniqueID",
    template="plotly_dark",
    title="Application Usage and Issues Opened per Month",
)

fig.show()

In [None]:
# Same scatter as above without per-tool colours, plus an OLS trendline;
# the tool identifier is available on hover.
fig = px.scatter(
    monthly_opened_issues,
    x="AverageOpenedPerMonth",
    y="AppCount",
    trendline="ols",
    hover_data=["UniqueID"],
    title="Application Usage and Issues Opened per Month",
    template="plotly_dark",
)
fig.show()
                

Here we can see that tools with a low number of issues tend to be used in quite a few apps, except for `cairo/starkware-libs`. Why is that?

In [None]:
# Month in which each issue was closed. Computed on the side instead of being
# written back into `tool_issues`, so the raw frame keeps its loaded columns.
issue_closed_month = pd.to_datetime(tool_issues["ClosedAt"]).dt.to_period("M")

monthly_closed_issues = (
    tool_issues[["UniqueID"]]
    .assign(YearMonth=issue_closed_month)
    # Issues closed per tool per month. Rows with a missing close date have a
    # missing YearMonth, and groupby drops missing keys by default.
    .groupby(["UniqueID", "YearMonth"])
    .size()
    .reset_index(name="ClosedCount")
    # average over the months present for each tool
    .groupby("UniqueID")["ClosedCount"]
    .mean()
    .reset_index(name="AverageClosedPerMonth")
    .merge(tool_counts, on="UniqueID", how="left")
)

fig = px.scatter(
    monthly_closed_issues,
    x="AverageClosedPerMonth",
    y="AppCount",
    color="UniqueID",
    template="plotly_dark",
    title="Application Usage and Issues Closed per Month",
)

fig.show()

In [None]:
# Compare, per tool, the average number of issues closed per month with the
# average number opened per month. Both inputs carry an AppCount column, so
# the merged frame holds them as AppCount_x / AppCount_y (unused here).
fig = px.scatter(
    monthly_closed_issues.merge(monthly_opened_issues, on="UniqueID", how="left"),
    x="AverageClosedPerMonth",
    y="AverageOpenedPerMonth",
    template="plotly_dark",
    # The previous title was copied from the closed-issues/app-count plot and
    # did not describe these axes.
    title="Average Issues Opened vs Closed per Month",
    trendline="ols",
)
fig.show()

From the trendline above we can deduce that all these tool repos are actively being maintained, as on average the number of tickets opened per month is matched by the number of tickets closed per month.

#### App Count & Language

In [None]:
# Attach each tool repository's language to its application count. The right
# join keeps every tool repo, including those with no AppCount.
language_tool_counts = tool_counts.merge(
    zkp_repos[zkp_repos["Type"] == "Tool"],
    on="UniqueID",
    how="right",
)[["UniqueID", "AppCount", "Language"]]

fig = px.scatter(
    language_tool_counts,
    x="UniqueID",
    y="AppCount",
    color="Language",
    template="plotly_dark",
    # The previous title ("... Issue Resolution Rate") was copied from an
    # earlier plot; this figure shows usage by language.
    title="Application Usage and Language",
)

fig.show()

Why is Rust such a preferred language for these tools? 

@itsybitsycyber confirm that these tools don't use arkworks in the background, as they should then be classified as Applications.

In [None]:
# Total application count per language. Only AppCount is summed; summing the
# whole frame would also try to add up the string UniqueID column.
language_app_totals = (
    language_tool_counts.groupby("Language")["AppCount"]
    .sum()
    .reset_index()
    .sort_values("AppCount")
)

fig = px.bar(
    language_app_totals,
    x="Language",
    y="AppCount",
    color="Language",
    template="plotly_dark",
    title="Total Application Usage by Language",
)

fig.show()

#### App Count & Contributors

In [None]:
# Keep only contributor rows whose repository appears in `tool_counts`.
repo_contributors = repo_contributors.loc[
    repo_contributors["UniqueID"].isin(tool_counts["UniqueID"])
]

# Distinct contributors per tool, joined (inner) to the application count and
# ordered from most to fewest contributors.
contributor_counts = (
    repo_contributors.groupby("UniqueID")["Contributor"]
    .nunique()
    .reset_index(name="ContributorCount")
    .merge(tool_counts, on="UniqueID")
    .sort_values(by=["ContributorCount"], ascending=False)
)

fig = px.scatter(
    contributor_counts,
    x="ContributorCount",
    y="AppCount",
    color="UniqueID",
    labels={"ContributorCount": "No. of Contributors", "UniqueID": "Repository"},
    title="Application Usage and Contributor Count",
    template="plotly_dark",
)
fig.show()

In [None]:
# Contributor count against application count, with an OLS trendline instead
# of per-repository colours.
fig = px.scatter(
    contributor_counts,
    x="ContributorCount",
    y="AppCount",
    trendline="ols",
    labels={"ContributorCount": "No. of Contributors", "UniqueID": "Repository"},
    title="Application Usage and Contributor Count",
    template="plotly_dark",
)
fig.show()

@itsybitsycyber Add the number of authors count to the graph above.

#### App Count & Tool Age

In [None]:
is_tool = zkp_repos["Type"] == "Tool"

# utc=True yields timezone-aware timestamps whatever form the raw strings
# take, so they can be subtracted from the timezone-aware "now" below.
tool_created = pd.to_datetime(zkp_repos.loc[is_tool, "Created"], utc=True)

# Build the frame with assign/merge instead of writing columns onto a filtered
# slice of `zkp_repos`, which avoids pandas' SettingWithCopy warning.
tool_age = (
    zkp_repos.loc[is_tool, ["UniqueID"]]
    .assign(Age=(datetime.now(timezone.utc) - tool_created).dt.days)
    .merge(tool_counts, on="UniqueID", how="left")
)

fig = px.scatter(
    tool_age,
    x="Age",
    y="AppCount",
    title="Application Usage and Age",
    template="plotly_dark",
    color="UniqueID",
    labels={"UniqueID": "Repository"},
)
fig.show()

In [None]:
# Tool age in days against application count, with an OLS trendline.
fig = px.scatter(
    tool_age,
    x="Age",
    y="AppCount",
    trendline="ols",
    labels={"UniqueID": "Repository"},
    title="Application Usage and Age",
    template="plotly_dark",
)
fig.show()

There is no significant correlation between the age of the tool and the number of apps that use it.

#### App Count & Commit Count

In [None]:
# Distinct commit hashes per tool; the right join keeps every row of
# `tool_counts`, including tools with no commit data.
total_commits = (
    tool_commits.groupby("UniqueID")["CommitHash"]
    .nunique()
    .reset_index(name="CommitCount")
    .merge(tool_counts, on="UniqueID", how="right")
)

fig = px.scatter(
    total_commits,
    x="CommitCount",
    y="AppCount",
    color="UniqueID",
    labels={"UniqueID": "Repository"},
    title="Application Usage and CommitCount",
    template="plotly_dark",
)
fig.show()

Why does `zksync/matter-labs` have so many commits and so few apps using it?

#### App Count & Commit Frequency

In [None]:
# These two assignments deliberately write to `tool_commits` itself: the
# committer dates become UTC timestamps and a monthly period column is added.
tool_commits["CommitterDate"] = pd.to_datetime(
    tool_commits["CommitterDate"], utc=True
)
tool_commits["YearMonth"] = tool_commits["CommitterDate"].dt.to_period("M")

# Commits per tool per month, averaged over the months present for each tool,
# then joined to the application counts.
commit_frequency = (
    tool_commits.groupby(["UniqueID", "YearMonth"])
    .size()
    .reset_index(name="CommitCount")
    .groupby("UniqueID")["CommitCount"]
    .mean()
    .reset_index(name="AverageCommitFrequency")
    .merge(tool_counts, on="UniqueID", how="left")
)

fig = px.scatter(
    commit_frequency,
    x="AverageCommitFrequency",
    y="AppCount",
    color="UniqueID",
    labels={"UniqueID": "Repository"},
    title="Application Usage and Average Number Of Commits per Month",
    template="plotly_dark",
)
fig.show()

Why do these repos have so many commits per month?
|    | UniqueID             |   AverageCommitFrequency |   AppCount |
|---:|:---------------------|-------------------------:|-----------:|
|  5 | cairo/starkware-libs |                  229.471 |        343 |
| 16 | leo/aleohq           |                  140.14  |          7 |
| 23 | openzkp/0xproject    |                  100.19  |          1 |
| 25 | plonky2/mir-protocol |                  141.156 |         17 |
| 38 | zksync/matter-labs   |                  265.065 |          4 |

@itsybitsycyber are we using the commits as we think? Are these all commits in the repo, or just commits to main? 

#### App Count & Branch Count

In [None]:
# Number of rows in `branches_data` per repository, joined to app counts.
branch_count = (
    branches_data.groupby("UniqueID")
    .size()
    .reset_index(name="BranchCount")
    .merge(tool_counts, on="UniqueID", how="left")
)

fig = px.scatter(
    branch_count,
    x="BranchCount",
    y="AppCount",
    color="UniqueID",
    labels={"UniqueID": "Repository"},
    title="Application Usage and BranchCount",
    template="plotly_dark",
)
fig.show()

@itsybitsycyber can you identify the branching strategies that these repos use?

#### App Count & Active Committers

In [None]:
# Ensure the committer dates on `tool_commits` are UTC timestamps (written
# back to the shared frame), then preview the commits made in 2023.
tool_commits["CommitterDate"] = pd.to_datetime(
    tool_commits["CommitterDate"], utc=True
)
active_committers = tool_commits[tool_commits["CommitterDate"].dt.year == 2023]
active_committers

In [None]:
# Committer dates as UTC timestamps (written back to `tool_commits`).
tool_commits["CommitterDate"] = pd.to_datetime(
    tool_commits["CommitterDate"], utc=True
)

# Distinct committers per tool among commits dated 2023. The right join onto
# the tool repositories keeps tools with no 2023 commits; their Committer is
# missing, so nunique() reports 0 for them.
active_committers = (
    tool_commits.loc[tool_commits["CommitterDate"].dt.year == 2023]
    .merge(
        zkp_repos.loc[zkp_repos["Type"] == "Tool"],
        on="UniqueID",
        how="right",
    )
    .groupby("UniqueID")["Committer"]
    .nunique()
    .reset_index(name="ActiveCommitters")
    .merge(tool_counts, on="UniqueID", how="left")
)

fig = px.scatter(
    active_committers,
    x="ActiveCommitters",
    y="AppCount",
    color="UniqueID",
    labels={"UniqueID": "Repository"},
    title="Application Usage and Active Committers",
    template="plotly_dark",
)
fig.show()

In [None]:
# Active committers against application count with an OLS trendline; the
# repository identifier is shown on hover.
fig = px.scatter(
    active_committers,
    x="ActiveCommitters",
    y="AppCount",
    trendline="ols",
    hover_data=["UniqueID"],
    labels={"UniqueID": "Repository"},
    title="Application Usage and Active Committers",
    template="plotly_dark",
)
fig.show()

Why has nobody committed to these repos in the last 8 months? 

|    | UniqueID                  |   ActiveCommitters |   AppCount |
|---:|:--------------------------|-------------------:|-----------:|
|  3 | bulletproofs/sdiehl       |                  0 |          1 |
|  8 | circomlib/iden3           |                  0 |        167 |
| 17 | libsnark/scipr-lab        |                  0 |         64 |
| 18 | marlin/arkworks-rs        |                  0 |          4 |
| 19 | merlin/dalek-cryptography |                  0 |         53 |
| 22 | nonnative/arkworks-rs     |                  0 |          2 |
| 23 | openzkp/0xproject         |                  0 |          1 |
| 24 | plonky/mir-protocol       |                  0 |          1 |
| 28 | pysnark/charterhouse      |                  0 |          4 |

#### App Count & New Committers

@itsybitsycyber revise the calculation below: we need the set of contributors who contributed to each tool prior to 2023, and then take the 2023 contributors who are not in that set to count who is new to each tool.

In [None]:

# A committer is "new" to a tool in 2023 when their FIRST commit to that tool
# is dated 2023. The first-commit date must therefore be taken over the whole
# history; filtering to 2023 beforehand would count every committer active in
# 2023 as new.
first_commit = (
    tool_commits[["UniqueID", "Committer"]]
    # utc=True gives one timezone-aware dtype even when offsets differ.
    .assign(CommitterDate=pd.to_datetime(tool_commits["CommitterDate"], utc=True))
    .groupby(["UniqueID", "Committer"])["CommitterDate"]
    .min()
    .reset_index()
)

new_committers = (
    first_commit.loc[first_commit["CommitterDate"].dt.year == 2023]
    .groupby("UniqueID")["Committer"]
    .count()
    .reset_index(name="NewCommitterCount")
    # Right join keeps every row of `tool_counts`; tools without new
    # committers get 0 from the fillna.
    .merge(tool_counts, on="UniqueID", how="right")
    .fillna(0)
)

fig = px.scatter(
    new_committers,
    x="NewCommitterCount",
    y="AppCount",
    title="Application Usage and New Committers",
    template="plotly_dark",
    color="UniqueID",
    labels={"UniqueID": "Repository"},
)
fig.show()

#### App Count & Recent Commits

In [None]:
# Committer dates as UTC timestamps, used only to select the 2023 commits.
commit_dates = pd.to_datetime(tool_commits["CommitterDate"], utc=True)

# Number of 2023 commits per tool. The right join keeps every row of
# `tool_counts`; tools without 2023 commits get 0 from the fillna.
new_commits = (
    tool_commits.loc[commit_dates.dt.year == 2023]
    .groupby("UniqueID")
    .size()
    .reset_index(name="NewCommits")
    .merge(tool_counts, on="UniqueID", how="right")
    .fillna(0)
)

fig = px.scatter(
    new_commits,
    x="NewCommits",
    y="AppCount",
    title="Application Usage and New Commits",  # fixed typo ("Committs")
    template="plotly_dark",
    color="UniqueID",
    labels={"UniqueID": "Repository"},
)
fig.show()

#### App Count and Tool Type 

In [None]:
# Tool repositories with their type, joined to the application counts and
# ordered from most to least used.
tools = zkp_repos.loc[zkp_repos["Type"] == "Tool", ["UniqueID", "ToolType"]]
tools_types = tool_counts.merge(tools, on="UniqueID", how="right").sort_values(
    by="AppCount", ascending=False
)

fig = px.scatter(
    tools_types,
    x="UniqueID",
    y="AppCount",
    color="ToolType",
    labels={"UniqueID": "Repository"},
    # keep the x-axis in the sorted (by AppCount) order of the frame
    category_orders={"UniqueID": tools_types["UniqueID"]},
    title="Application Usage and Tool Type",
    template="plotly_dark",
)
fig.show()


In [None]:
# Total application count per tool type. Only AppCount is summed; summing the
# whole frame would also try to add up the string UniqueID column.
tool_type_totals = (
    tools_types.groupby("ToolType")["AppCount"]
    .sum()
    .reset_index()
    .sort_values(by="AppCount", ascending=False)
)

fig = px.bar(
    tool_type_totals,
    x="ToolType",
    y="AppCount",
    title="Total Application Usage by Tool Type",
    template="plotly_dark",
    color="ToolType",
    labels={"ToolType": "Tool Type"},
)
fig.show()

#### Tools Used In Combination

In [None]:
from itertools import combinations
from collections import Counter

zkp_applications = zkp_repos[zkp_repos['Type'] == 'Application']

# Unordered tool pairs per application. Each application's tools are
# de-duplicated and sorted first so that (a, b) and (b, a) are counted as the
# same pair and a tool is never paired with itself. Rows whose Tool value is
# not a list (e.g. missing) contribute no pairs.
tool_combinations = zkp_applications['Tool'].apply(
    lambda app_tools: list(combinations(sorted(set(app_tools)), 2))
    if isinstance(app_tools, list)
    else []
)
flat_combinations = [pair for pairs in tool_combinations for pair in pairs]
combination_counts = Counter(flat_combinations)

# most_common() orders the pairs from most to least frequent.
result_df = pd.DataFrame(
    combination_counts.most_common(), columns=['Tool Combination', 'Frequency']
)
result_df


In [None]:
import plotly.graph_objects as go
import networkx as nx

zkp_applications = zkp_repos[zkp_repos['Type'] == 'Application']

# Nodes are tools; an edge joins two tools that appear together in at least
# one application. Non-list Tool values (e.g. missing) are skipped.
G = nx.Graph()
for app_tools in zkp_applications['Tool']:
    if isinstance(app_tools, list):
        G.add_edges_from(combinations(sorted(set(app_tools)), 2))

# Fixed seed so the layout is the same on every run.
pos = nx.spring_layout(G, seed=42)

# Edge coordinates as one polyline; the None entries break the line between
# consecutive edges so they are not joined to each other.
edge_x, edge_y = [], []
for source, target in G.edges():
    x0, y0 = pos[source]
    x1, y1 = pos[target]
    edge_x.extend([x0, x1, None])
    edge_y.extend([y0, y1, None])

edge_trace = go.Scatter(
    x=edge_x,
    y=edge_y,
    line=dict(width=0.5, color='#888'),
    hoverinfo='none',
    mode='lines',
)

# Per-node position, degree and hover text, all in the same node order.
# Colour and text must be per graph node (tool), not per application row,
# otherwise they do not line up with the plotted markers.
node_x, node_y, node_degree, node_text = [], [], [], []
for node, degree in G.degree():
    x, y = pos[node]
    node_x.append(x)
    node_y.append(y)
    node_degree.append(degree)
    node_text.append(f'{node}: {degree} connection(s)')

node_trace = go.Scatter(
    x=node_x,
    y=node_y,
    mode='markers',
    hoverinfo='text',
    text=node_text,
    marker=dict(
        color=node_degree,
        showscale=True,
        colorscale='YlGnBu',
        size=10,
        colorbar=dict(
            thickness=15,
            # nested title dict instead of the deprecated `titleside`
            title=dict(text='Node Connections', side='right'),
            xanchor='left',
        ),
    ),
)

fig = go.Figure(
    data=[edge_trace, node_trace],
    layout=go.Layout(
        template='plotly_dark',
        height=900,
        showlegend=False,
        hovermode='closest',
        margin=dict(b=0, l=0, r=0, t=0),
        xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
        yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
    ),
)

fig.show()

Why are the most-used combination(s) used so often?

+ does one tool provide the front end (FE) and the other the back end?
+ the one proof system, the other the ...

#### Tools never used in combination

In [None]:
zkp_applications = zkp_repos[zkp_repos['Type'] == 'Application']

# Tool lists of the applications; non-list values (e.g. missing) are skipped.
tool_lists = [
    app_tools for app_tools in zkp_applications['Tool'] if isinstance(app_tools, list)
]
all_tools = {tool for app_tools in tool_lists for tool in app_tools}

# Both sides of the set difference must use the same canonical form for a
# pair. combinations() over a sorted sequence yields alphabetically ordered
# tuples, so candidate pairs and used pairs are directly comparable; taking
# combinations of the unsorted set would report used pairs as unused whenever
# the tuple happened to come out in the other order.
all_tool_combinations = list(combinations(sorted(all_tools), 2))
used_tool_combinations = {
    pair
    for app_tools in tool_lists
    for pair in combinations(sorted(set(app_tools)), 2)
}
unused_tool_combinations = set(all_tool_combinations) - used_tool_combinations


In [None]:
zkp_applications = zkp_repos.loc[zkp_repos["Type"] == "Application"]

# Every 2-tool combination inside each application, flattened and counted.
tool_combinations = zkp_applications["Tool"].apply(
    lambda used_tools: list(combinations(used_tools, 2))
)
flat_combinations = [
    tuple(pair) for app_pairs in tool_combinations for pair in app_pairs
]
combination_counts = Counter(flat_combinations)

# Tools that appear in some application but in no pair at all.
all_tools = {tool for used_tools in zkp_applications["Tool"] for tool in used_tools}
tools_in_combinations = {tool for pair in flat_combinations for tool in pair}
tools_never_used_in_combination = all_tools.difference(tools_in_combinations)

tools_never_used_in_combination


@itsybitsycyber why are `{'bulletproofs (sdiehl)', 'plonky3', 'zksync'}` never used in combination with another tool?

#### Applications & No. of Authors

In [None]:

# Distinct commit authors per application, most authors first.
application_committers = (
    application_commits.groupby("UniqueID")["Author"]
    .nunique()
    .reset_index(name="AuthorCount")
    .sort_values(by=["AuthorCount"], ascending=False)
)

# Number of equal-width bins from Sturges' rule: ceil(1 + log2(n)).
bins = int(np.ceil(1 + np.log2(len(application_committers))))
application_committers["AuthorGroup"] = pd.cut(
    application_committers["AuthorCount"], bins=bins
).astype(str)


#### Tools & Application Author Count

In [None]:
# TODO: May need to standardise means
zkp_applications = zkp_repos[zkp_repos['Type'] == 'Application']

# Author count per application alongside the tools it uses. The right join
# keeps applications that have no commit data (AuthorCount is NaN for them).
tool_authors = application_committers.merge(
    zkp_applications, on='UniqueID', how='right'
)[['UniqueID', 'Tool', 'AuthorCount']]

# `Tool` already holds Python lists once the AppCount cell has run, and
# literal_eval raises on non-string input, so only parse values that are
# still strings.
tool_authors = tool_authors.assign(
    Tool=tool_authors['Tool'].apply(
        lambda entry: literal_eval(entry) if isinstance(entry, str) else entry
    )
)

# One row per (application, tool), then the mean author count per tool.
exploded_tool_authors = tool_authors.explode('Tool')
tool_author_counts = (
    exploded_tool_authors.groupby('Tool')['AuthorCount']
    .mean()
    .reset_index()
    .sort_values(by='AuthorCount', ascending=False)
)

fig = px.scatter(
    tool_author_counts,
    x='Tool',
    y='AuthorCount',
    title='Mean AuthorCount Per Application using Tool',
    labels={'AuthorCount': 'Mean Author Count'},
    template='plotly_dark',
)

fig.show()

#### Applications & No. of Commits

In [None]:
# Distinct commits per application, largest first.
commit_count = (
    application_commits.groupby("UniqueID")["CommitHash"]
    .nunique()
    .reset_index(name="CommitCount")
    .sort_values(by=["CommitCount"], ascending=False)
)

# Number of equal-width bins from Sturges' rule: ceil(1 + log2(n)).
bins = int(np.ceil(1 + np.log2(len(commit_count))))
commit_count["CommitGroup"] = pd.cut(commit_count["CommitCount"], bins=bins).astype(
    str
)

# Number of applications that fall into each commit-count bin.
commit_count.groupby("CommitGroup")["UniqueID"].nunique().reset_index()

### Cluster Applications

#### Get Application Features

In [None]:
# features: tool used, language, stars, forks, watchers, issues, age, author, commitcount, size
zkp_applications = zkp_repos[zkp_repos['Type'] == 'Application']

# Per-application author counts. Stored under their own name so the
# `application_authors` frame loaded from application_authors.csv stays
# available for the author-clustering section further down.
application_author_counts = (
    application_commits.groupby('UniqueID')['Author']
    .nunique()
    .reset_index(name='AuthorCount')
)
commit_count = (
    application_commits.groupby('UniqueID')['CommitHash']
    .nunique()
    .reset_index(name='CommitCount')
)

# Left joins keep every application, with NaN counts where no commits exist.
application_features = zkp_applications.merge(
    application_author_counts, on='UniqueID', how='left'
).merge(commit_count, on='UniqueID', how='left')

# Age in days. utc=True makes the creation timestamps timezone-aware so they
# can be subtracted from the timezone-aware "now".
application_features['Age'] = (
    datetime.now(timezone.utc)
    - pd.to_datetime(application_features['Created'], utc=True)
).dt.days

application_features = application_features[
    ['UniqueID', 'Tool', 'Size', 'Language', 'Stars', 'Forks', 'Watchers',
     'Issues', 'AuthorCount', 'CommitCount', 'Age']
]
# One indicator column per language, named Language_<value>.
application_features = pd.get_dummies(
    application_features, columns=['Language'], prefix='Language'
)


### K-Means Clustering

In [None]:
# Numeric repository features used for clustering ('UniqueID' and 'Tool' are
# identifiers / lists and are left out).
numeric_cols = [
    'Size',
    'Age',
    'Stars',
    'Forks',
    'Watchers',
    'Issues',
    'AuthorCount',
    'CommitCount',
]
# Language indicator columns are taken from the frame itself rather than from
# a hand-maintained list, so the cell keeps working when the set of languages
# in the data changes.
language_cols = [
    column for column in application_features.columns
    if column.startswith('Language_')
]
cols = numeric_cols + language_cols

nr_components = 2

# AuthorCount / CommitCount come from left merges and are NaN for applications
# without commit data; PCA and KMeans reject NaN, so treat those as 0.
X = application_features[cols].fillna({'AuthorCount': 0, 'CommitCount': 0})

# Standardise so that large-valued features do not dominate the distances.
scaler = StandardScaler()
df_standardized = scaler.fit_transform(X)

# 2-D projection of the standardised features, used for plotting.
pca = PCA(n_components=nr_components)
df_pca = pca.fit_transform(df_standardized)

# n_init is set explicitly because the library default differs between
# scikit-learn releases; random_state makes the clustering repeatable.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
application_features['Cluster'] = kmeans.fit_predict(df_standardized)

# `pca` is not refitted on the unscaled X here, so the ratio displayed below
# belongs to the same projection that is plotted (`df_pca`).
components = df_pca
pca.explained_variance_ratio_

In [None]:
# Cluster ids are labels, not quantities: cast them to strings so plotly uses
# a discrete colour per cluster instead of a continuous colour scale.
fig = px.scatter(
    application_features.assign(
        Cluster=application_features['Cluster'].astype(str)
    ),
    x=df_pca[:, 0],
    y=df_pca[:, 1],
    color='Cluster',
    # x / y are passed as arrays, so they are labelled via the argument names.
    labels={'x': 'PC1', 'y': 'PC2'},
    title='PCA Scatter Plot with Clusters',
    template='plotly_dark',
)

fig.show()

### Cluster Authors 

#### Get Authors and Features

In [None]:

# Author profile columns joined to every commit by that author (matched on
# login); authors without commits are kept by the left join.
author_info = application_authors[
    ['Login', 'PublicRepos', 'PublicGists', 'Followers', 'CreatedAt']
].merge(application_commits, left_on='Login', right_on='Author', how='left')

# Distinct commits and distinct repositories per author.
author_commit_count = (
    author_info.groupby('Login')['CommitHash'].nunique().reset_index(name='CommitCount')
)
author_repos_count = (
    author_info.groupby('Login')['UniqueID'].nunique().reset_index(name='RepoCount')
)
author_info = author_info.merge(author_commit_count, on='Login', how='left').merge(
    author_repos_count, on='Login', how='left'
)

# Account age in days. utc=True makes the creation timestamps timezone-aware
# so they can be subtracted from the timezone-aware "now".
author_info['AccountAge'] = (
    datetime.now(timezone.utc) - pd.to_datetime(author_info['CreatedAt'], utc=True)
).dt.days

# The commit join repeats each author once per commit; collapse to unique
# rows. Chained (no inplace) so nothing is modified on a column slice.
author_info = (
    author_info[
        ['Login', 'PublicRepos', 'PublicGists', 'Followers', 'CommitCount',
         'RepoCount', 'AccountAge']
    ]
    .drop_duplicates()
    .reset_index(drop=True)
)


In [None]:
# Display whatever `application_authors` currently refers to.
application_authors

#### Cluster Authors using K-Means

In [None]:
# Author features used for the projection.
cols = [
    'PublicRepos',
    'PublicGists',
    'Followers',
    'CommitCount',
    'RepoCount',
    'AccountAge',
]

nr_components = 2

X = author_info[cols]

# Standardise, then project onto the first `nr_components` principal
# components. `pca` is fitted exactly once, on the scaled data, so
# `pca.explained_variance_ratio_` describes the projection that is plotted.
scaler = StandardScaler()
df_tools_features_scaled = scaler.fit_transform(X)
pca = PCA(n_components=nr_components)
components = pca.fit_transform(df_tools_features_scaled)

# Component columns are named 0 and 1 (integer positions); the logins are
# attached positionally so each row lines up with its row of `author_info`.
df_tools_features_pca = pd.DataFrame(components)
df_tools_features_pca['Login'] = author_info['Login'].to_numpy()
df_tools_features_pca = df_tools_features_pca.merge(
    author_info, on='Login', how='left'
)

In [None]:
# Share of variance captured by each component of the most recent `pca` fit.
pca.explained_variance_ratio_

In [None]:
# First two principal components of the author features; the login is shown
# as the hover title. Columns 0 and 1 are the integer-named component columns.
px.scatter(
    df_tools_features_pca,
    x=0,
    y=1,
    hover_name="Login",
    labels={"0": "PCA1", "1": "PCA2"},
    title="PCA of Author Features",
    template="plotly_dark",
    height=500,
    width=500,
)