#### Clean `zkp_repos.csv` 

In [2]:
import pandas as pd
import requests
import threading
import time
from pydriller import Repository

In [3]:

import os

# Read the GitHub personal access token from the environment instead of
# hard-coding it: a token stored in the notebook is exposed to everyone who can
# read the file and has to be revoked.
access_token = os.environ.get('GITHUB_TOKEN', '')
if not access_token:
    print('GITHUB_TOKEN is not set; authenticated GitHub API requests will fail.')

# Default headers for GitHub REST API (v3) requests.
headers = {
    'Authorization': f'token {access_token}',
    'Accept': 'application/vnd.github.v3+json',
}

In [4]:
# Load the repository table; the file uses ';' as its field separator.
df = pd.read_csv('zkp_repos.csv', sep=';')

In [None]:
def split_tools(row):
    """Split an Application row's comma-separated 'Tool' value into a list.

    Args:
        row: A mapping (e.g. a DataFrame row) with 'Type' and 'Tool' entries.

    Returns:
        The list obtained by splitting row['Tool'] on ', ' when row['Type'] is
        'Application' and the value is a string; otherwise None.
    """
    if row['Type'] != 'Application':
        return None
    tools = row['Tool']
    # A missing cell is read by pandas as NaN (a float), which has no .split().
    if not isinstance(tools, str):
        return None
    return tools.split(', ')
        

# Overwrite the 'Tool' column row by row with split_tools' result.
df['Tool'] = df.apply(split_tools, axis=1)


In [None]:
# UniqueID is the last two '/'-separated URL segments in reverse order, joined
# by '/' and lowercased: for a '.../<owner>/<repo>' URL this is '<repo>/<owner>'.
df['UniqueID'] = df['URL'].apply(lambda x: '/'.join(x.split('/')[-2:][::-1]).lower())

In [None]:
# df.to_csv('zkp_repos.csv')

### Get the commit data and file changes for the 'Tool' repositories using PyDriller

In [None]:
# Mine every commit, and every file touched by each commit, of the 'Tool'
# repositories with PyDriller.
repositories = df[df['Type'] == 'Tool'].URL.values.tolist()

commit_data = []
file_data = []

# Maps a PyDriller change-type name to the commit-row column listing those files.
change_type_columns = {
    'ADD': 'AddedFiles',
    'MODIFY': 'ModifiedFiles',
    'DELETE': 'DeletedFiles',
    'RENAME': 'RenamedFiles',
}

for repo_url in repositories:
    # Repository URLs end in '.../<owner>/<name>'; split once per repository.
    url_parts = repo_url.split('/')
    repo_name, repo_owner = url_parts[-1], url_parts[-2]
    unique_id = f'{repo_name}/{repo_owner}'

    for commit in Repository(repo_url).traverse_commits():
        # Read the property once and reuse the list for both the commit row and
        # the per-file rows.
        modified_files = commit.modified_files

        # Bucket file names by change type in a single pass.
        files_by_change = {column: [] for column in change_type_columns.values()}
        for modified_file in modified_files:
            column = change_type_columns.get(modified_file.change_type.name)
            if column is not None:
                files_by_change[column].append(modified_file.filename)

        commit_data.append({
            'UniqueID': unique_id,
            'Name': repo_name,
            'Owner': repo_owner,
            'CommitHash': commit.hash,
            'Message': commit.msg,
            'Author': commit.author.name,
            'AuthorEmail': commit.author.email,
            'Committer': commit.committer.name,
            'CommitterEmail': commit.committer.email,
            'AuthorDate': commit.author_date,
            'CommitterDate': commit.committer_date,
            'AuthorTimeZone': commit.author_timezone,
            'CommitterTimeZone': commit.committer_timezone,
            'Branches': commit.branches,
            'Main': commit.in_main_branch,
            'Merge': commit.merge,
            'ModificationCount': len(modified_files),
            'AddedFiles': files_by_change['AddedFiles'],
            'ModifiedFiles': files_by_change['ModifiedFiles'],
            'DeletedFiles': files_by_change['DeletedFiles'],
            'RenamedFiles': files_by_change['RenamedFiles'],
            'Parents': commit.parents,
            'Deletions': commit.deletions,
            'Insertions': commit.insertions,
            'Lines': commit.lines,
            'Files': commit.files,
        })

        # One row per modified file. The former per-file debug print was removed:
        # it flooded the cell output with one line for every file of every commit.
        for modified_file in modified_files:
            file_data.append({
                'UniqueID': unique_id,
                'Name': repo_name,
                'Owner': repo_owner,
                'CommitHash': commit.hash,
                'Filename': modified_file.filename,
                'ChangeType': modified_file.change_type.name,
                'OldPath': modified_file.old_path,
                'NewPath': modified_file.new_path,
                'Diff': modified_file.diff,
                'DiffParser': modified_file.diff_parsed,
                'AddedLines': modified_file.added_lines,
                'DeletedLines': modified_file.deleted_lines,
                # 'SourceCode': modified_file.source_code,
                # 'SourceCodeBefore': modified_file.source_code,
                'Methods': modified_file.methods,
                'MethodsBefore': modified_file.methods_before,
                'ChangedMethods': modified_file.changed_methods,
                'nloc': modified_file.nloc,
                'Complexity': modified_file.complexity,
                'TokenCount': modified_file.token_count,
            })


commit_df = pd.DataFrame(commit_data)
file_df = pd.DataFrame(file_data)

In [None]:
# Persist the mined tool commits and the per-file change rows.
commit_df.to_csv('tool_commits.csv')
file_df.to_csv('tool_file_changes.csv')

### Get Application Commits (PyDriller)

In [None]:
def get_commit_data(repos):
    """Mine commit metadata for each repository with PyDriller.

    After a repository has been traversed, its rows are appended to
    'application_commit_data.csv', so progress made on earlier repositories is
    already on disk if a later one fails.

    Args:
        repos: Iterable of repository locations ending in '<owner>/<name>'.

    Returns:
        A list with one dict per commit, across all repositories.
    """
    import os

    output_path = 'application_commit_data.csv'
    commit_data = []

    for repo in repos:
        print(f"Getting commits for {repo}")
        parts = repo.split('/')
        name, owner = parts[-1], parts[-2]
        unique_id = f'{name}/{owner}'

        repo_rows = []
        for commit in Repository(repo).traverse_commits():
            # Read the property once; it is used five times below.
            modified_files = commit.modified_files
            repo_rows.append({
                'UniqueID': unique_id,
                'Name': name,
                'Owner': owner,
                'CommitHash': commit.hash,
                'Message': commit.msg,
                'Author': commit.author.name,
                'AuthorEmail': commit.author.email,
                'Committer': commit.committer.name,
                'CommitterEmail': commit.committer.email,
                'AuthorDate': commit.author_date,
                'CommitterDate': commit.committer_date,
                'AuthorTimeZone': commit.author_timezone,
                'CommitterTimeZone': commit.committer_timezone,
                'Branches': commit.branches,
                'Main': commit.in_main_branch,
                'Merge': commit.merge,
                'ModificationCount': len(modified_files),
                'AddedFiles': [m.filename for m in modified_files if m.change_type.name == "ADD"],
                'ModifiedFiles': [m.filename for m in modified_files if m.change_type.name == "MODIFY"],
                'DeletedFiles': [m.filename for m in modified_files if m.change_type.name == "DELETE"],
                'RenamedFiles': [m.filename for m in modified_files if m.change_type.name == "RENAME"],
                'Parents': commit.parents,
                'Deletions': commit.deletions,
                'Insertions': commit.insertions,
                'Lines': commit.lines,
                'Files': commit.files,
            })

        if repo_rows:
            # Append only this repository's rows. Writing the cumulative list
            # (as before) duplicated every earlier repository's commits on each
            # iteration and inserted a new header line every time.
            write_header = (
                not os.path.exists(output_path)
                or os.path.getsize(output_path) == 0
            )
            # Continue the row numbering across repositories.
            first_index = len(commit_data)
            repo_df = pd.DataFrame(
                repo_rows,
                index=range(first_index, first_index + len(repo_rows)),
            )
            repo_df.to_csv(output_path, mode='a', header=write_header)
            commit_data.extend(repo_rows)

    return commit_data




### Get Application Commits (GitHub REST API)

In [26]:

def get_commit_count(applications):
    """Fetch commit metadata for each application from the GitHub REST API.

    Pages through the repository's commits endpoint, 30 commits per request.
    Requests are authenticated with the module-level ``access_token``.

    Args:
        applications: Iterable of '<repo>/<owner>' identifiers (the UniqueID
            format used in this notebook).

    Returns:
        A list with one dict per commit, across all applications. A repository
        whose request fails is abandoned at that page; commits already
        collected for it are kept.
    """
    commit_data = []
    delay_duration = 60
    # Upper bound on consecutive rate-limit waits per request. Without it a 403
    # that is not a rate limit (e.g. a blocked repository) would retry forever.
    max_rate_limit_retries = 10

    for application in applications:
        parts = application.split('/')
        repo, owner = parts[-2], parts[-1]
        headers = {'Authorization': f'token {access_token}'}
        page = 1
        rate_limit_retries = 0

        while True:
            url = f'https://api.github.com/repos/{owner}/{repo}/commits?page={page}&per_page=30'
            response = requests.get(url, headers=headers)

            if response.status_code == 200:
                rate_limit_retries = 0
                commits = response.json()

                if not commits:
                    break

                for commit in commits:
                    git_commit = commit['commit']
                    # The GitHub account objects are null when the commit is not
                    # linked to an account; fall back to the git metadata then.
                    author = commit.get('author') or {}
                    committer = commit.get('committer') or {}
                    commit_data.append({
                        'UniqueID': f'{repo}/{owner}',
                        'Name': repo,
                        'Owner': owner,
                        'CommitHash': commit['sha'],
                        'Message': git_commit['message'],
                        'Author': author['login'] if 'login' in author else git_commit['author']['name'],
                        'AuthorName': git_commit['author']['name'],
                        'AuthorEmail': git_commit['author']['email'],
                        'AuthorDate': git_commit['author']['date'],
                        'AuthorSiteAdmin': author.get('site_admin'),
                        'AuthorType': author.get('type'),
                        'Committer': committer['login'] if 'login' in committer else git_commit['committer']['name'],
                        'CommitterName': git_commit['committer']['name'],
                        'CommitterEmail': git_commit['committer']['email'],
                        'CommitterDate': git_commit['committer']['date'],
                        'CommitterSiteAdmin': committer.get('site_admin'),
                        'CommitterType': committer.get('type'),
                        'CommentCount': git_commit['comment_count'],
                    })

                page += 1

            elif response.status_code in (403, 429):
                # GitHub reports rate limiting with either status code.
                if rate_limit_retries >= max_rate_limit_retries:
                    print(f'Giving up on {url} after {rate_limit_retries} retries (status {response.status_code}).')
                    break
                rate_limit_retries += 1
                print(f'Rate limit exceeded. Waiting for {delay_duration} seconds...')
                time.sleep(delay_duration)

            else:
                print(f'Failed to retrieve user data for commits for {url} due to {response.status_code}. Check the username and API access.')
                break

    return commit_data
    # commit_df = pd.DataFrame(commit_data)
    # commit_df.to_csv('application_commits.csv')

# Identifiers ('<repo>/<owner>') of every Application row.
applications = df[df['Type'] == 'Application'].UniqueID.values.tolist()
# NOTE(review): this hard-coded list replaces the full list built on the line
# above, so only this single repository is fetched — presumably a one-off
# backfill; confirm before re-running.
applications = ['sourceCode-zkSync-rollupContract/LuozhuZhang']
commit_data = get_commit_count(applications)


In [27]:
# Tabulate the fetched commit dicts (one row per commit).
commit_data_df = pd.DataFrame(commit_data)

In [28]:
# NOTE: a notebook cell only displays its final expression, so this count is
# computed and then discarded.
commit_data_df.UniqueID.nunique()
# Application commits saved by an earlier run.
original_data = pd.read_csv('application_commits.csv')

In [None]:
# commit_data_df = pd.DataFrame(commit_data)
# commit_data_df.to_csv('application_commits.csv')

In [30]:
# original_data.drop(columns=['Unnamed: 0'], inplace=True)
# Merge the previously saved commits with the newly fetched ones. commit_data is
# a list of dicts, so it is wrapped in a DataFrame first; pd.concat is the
# public replacement for the private DataFrame._append.
combined = pd.concat([original_data, pd.DataFrame(commit_data)], ignore_index=True)
combined.UniqueID.nunique()
combined

Unnamed: 0.1,Unnamed: 0,UniqueID,Name,Owner,CommitHash,Message,Author,AuthorName,AuthorEmail,AuthorDate,AuthorSiteAdmin,AuthorType,Committer,CommitterName,CommitterEmail,CommitterDate,CommitterSiteAdmin,CommitterType,CommentCount
0,0.0,__archived__medjai/chyanju,__archived__medjai,chyanju,a982520827c7566385dbcf0010d3df36aa93aa54,Update README.md,chyanju,Yanju Chen,chyanju@gmail.com,2022-07-25T18:25:30Z,False,User,web-flow,GitHub,noreply@github.com,2022-07-25T18:25:30Z,False,User,0
1,1.0,__archived__medjai/chyanju,__archived__medjai,chyanju,4e018603ba6c7bc6cca0b6e99b468cbe663e8aa5,Create LICENSE,chyanju,Yanju Chen,chyanju@gmail.com,2022-07-25T18:15:45Z,False,User,web-flow,GitHub,noreply@github.com,2022-07-25T18:15:45Z,False,User,0
2,2.0,__archived__medjai/chyanju,__archived__medjai,chyanju,d961b9954d20dd540be385a85d2fe843207ca567,update docker,chyanju,chyanju,chyanju@gmail.com,2022-07-21T21:09:12Z,False,User,chyanju,chyanju,chyanju@gmail.com,2022-07-21T21:09:12Z,False,User,0
3,3.0,__archived__medjai/chyanju,__archived__medjai,chyanju,701d1aa48995ee8080a95095f53e3133b34debb8,Renaming,JacobVanGeffen,Jacob Van Geffen,jsvangeffen@gmail.com,2022-07-21T17:14:40Z,False,User,JacobVanGeffen,Jacob Van Geffen,jsvangeffen@gmail.com,2022-07-21T17:14:40Z,False,User,0
4,4.0,__archived__medjai/chyanju,__archived__medjai,chyanju,7a27319229ddb35b0a1251c0141e666c1b91c8e6,Basic support for Cairo push-button verification.,JacobVanGeffen,Jacob Van Geffen,jsvangeffen@gmail.com,2022-07-20T23:47:41Z,False,User,JacobVanGeffen,Jacob Van Geffen,jsvangeffen@gmail.com,2022-07-20T23:50:49Z,False,User,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
238390,,sourceCode-zkSync-rollupContract/LuozhuZhang,sourceCode-zkSync-rollupContract,LuozhuZhang,313edb7eba90464117f191a37ca40ab5912adb77,Update README.md,LuozhuZhang,Luozhu,70309026+LuozhuZhang@users.noreply.github.com,2022-04-29T12:50:02Z,False,User,LuozhuZhang,Mickey,hedgefund996@gmail.com,2022-04-30T00:40:00Z,False,User,0
238391,,sourceCode-zkSync-rollupContract/LuozhuZhang,sourceCode-zkSync-rollupContract,LuozhuZhang,1dc132559e0b0d71e7524ddf2f88e3decadb2b5c,Update zksync.svg,LuozhuZhang,Mickey,hedgefund996@gmail.com,2022-04-29T12:49:33Z,False,User,LuozhuZhang,Mickey,hedgefund996@gmail.com,2022-04-30T00:40:00Z,False,User,0
238392,,sourceCode-zkSync-rollupContract/LuozhuZhang,sourceCode-zkSync-rollupContract,LuozhuZhang,d874d0a0642a40d01079eeebdbdb637054ee82f2,zksync graph && .gv file,LuozhuZhang,Mickey,hedgefund996@gmail.com,2022-04-30T00:26:11Z,False,User,LuozhuZhang,Mickey,hedgefund996@gmail.com,2022-04-30T00:33:08Z,False,User,0
238393,,sourceCode-zkSync-rollupContract/LuozhuZhang,sourceCode-zkSync-rollupContract,LuozhuZhang,c61216c1fc652e9554d2c0977680eb2e2b51e485,zksync main contract\n\nzkSync继承了五个contract、in...,LuozhuZhang,Mickey,hedgefund996@gmail.com,2022-04-29T15:48:41Z,False,User,LuozhuZhang,Mickey,hedgefund996@gmail.com,2022-04-30T00:33:08Z,False,User,0


In [31]:
# NOTE(review): to_csv writes the index as an unnamed first column, and the
# displayed `combined` frame already carries 'Unnamed: 0' and 'Unnamed: 0.1'
# columns from earlier read/write round trips; this write adds another.
# Consider index=False once downstream readers are checked.
combined.to_csv('application_commits.csv')

In [None]:
# Re-label the commits recorded under 'nightfall_3/eyblockchain' as
# 'nightfall/eyblockchain'. The row mask is taken once, before UniqueID changes.
commit_data = pd.read_csv('application_commits_new.csv')
is_nightfall_3 = commit_data['UniqueID'] == 'nightfall_3/eyblockchain'
commit_data.loc[is_nightfall_3, 'Name'] = 'nightfall'
commit_data.loc[is_nightfall_3, 'UniqueID'] = 'nightfall/eyblockchain'
commit_data.UniqueID.nunique()

### Get Contributors for Each Repository

In [None]:
# Collect the contributor list of every repository in the table via the GitHub
# REST API, 30 contributors per page.
contributor_data = []

headers = {
    'Authorization': f'token {access_token}',
    'Accept': 'application/vnd.github.v3+json' 
}

for repo in df.URL.values.tolist():
    # Repository URLs end in '.../<owner>/<name>'.
    segments = repo.split('/')
    repo_owner = segments[-2]
    repo_short_name = segments[-1]
    repo_name = f"{repo_owner}/{repo_short_name}"

    page = 1
    while True:
        url = f'https://api.github.com/repos/{repo_name}/contributors?page={page}&per_page=30'
        response = requests.get(url, headers=headers)

        # Any non-200 answer ends the pagination for this repository.
        if response.status_code != 200:
            print(f'Failed to fetch data for {repo_name}, page {page}. Status code: {response.status_code}')
            break

        contributors = response.json()
        # An empty page means every contributor has been read.
        if len(contributors) == 0:
            break

        for contributor in contributors:
            record = {
                'Name': repo_short_name,
                'Owner': repo_owner,
                'UniqueID': f'{repo_short_name}/{repo_owner}',
                'Contributor': contributor['login'],
                'RepoUrl': repo,
                'ContributorURL': contributor['url'],
                'Contributions': contributor['contributions'],
                'Type': contributor['type'],
                'SiteAdmin': contributor['site_admin']
            }
            contributor_data.append(record)

        page += 1

In [None]:
# Tabulate the contributors, lowercase both identifier columns, then save.
contributor_df = pd.DataFrame(contributor_data)
for column in ('UniqueID', 'Contributor'):
    contributor_df[column] = contributor_df[column].str.lower()
contributor_df.to_csv('contributor_data.csv')

### Get the issue data for all tools

In [None]:
# Collect every issue (open and closed) of the 'Tool' repositories via the
# GitHub REST API, 30 issues per page.
issues_data = []

df_tools = df[df['Type'] == 'Tool']
repositories = df_tools['UniqueID'].tolist()

for repo in repositories:
    # UniqueID has the form '<repo>/<owner>'.
    owner = repo.split("/")[1]
    repo_name = repo.split("/")[0]

    print(f"Getting issues for {owner}/{repo_name}")

    # First pass: no state filter (the endpoint then returns open issues).
    # Second pass: closed issues. Row order matches the two former loops.
    for state_query in ('', '&state=closed'):
        page = 1

        while True:
            issues_url = f'https://api.github.com/repos/{owner}/{repo_name}/issues?page={page}&per_page=30{state_query}'
            response = requests.get(issues_url, headers=headers)

            if response.status_code != 200:
                print(f'Failed to retrieve issue data for {repo}, page {page}.  Status code {response.status_code}.')
                # Stop paging. Previously the loop only moved on to the next
                # page, so a persistent error (e.g. 404) never terminated.
                break

            issues = response.json()
            if not issues:
                break

            for issue in issues:
                issues_data.append({
                    'RepositoryName': repo_name,
                    'Owner': owner,
                    'URL': issue['html_url'],
                    'ApiURL': issue['url'],
                    'RepositoryURL': issue['repository_url'],
                    'User': issue['user']['login'],
                    'UserType': issue['user']['type'],
                    'UserURL': issue['user']['html_url'],
                    'SiteAdmin': issue['user']['site_admin'],
                    'Labels': issue['labels'],
                    'State': issue['state'],
                    'Locked': issue['locked'],
                    'Assignee': issue['assignee'],
                    'Assignees': issue['assignees'],
                    'Comments': issue['comments'],
                    'CreatedAt': issue['created_at'],
                    'ClosedAt': issue['closed_at'],
                    'AuthorAssociation': issue['author_association'],
                    'PullRequestURL': issue['pull_request'] if 'pull_request' in issue else None,
                    'ReactionCount': issue['reactions']['total_count'],
                })

            page += 1

issues_df = pd.DataFrame(issues_data)
issues_df

In [None]:
# Persist the collected issue rows.
issues_df.to_csv('tool_issues.csv')

### Get data for each contributor

In [None]:
# Reload the contributor table from disk.
contributor_df = pd.read_csv('contributor_data.csv')

In [None]:
# Exclude the rows whose contributor is the 'dependabot[bot]' account.
contributor_df = contributor_df[contributor_df['Contributor'] != 'dependabot[bot]']

In [None]:
# Fetch the public GitHub profile of every distinct contributor.
contributors = contributor_df['Contributor'].unique()
contributors_data = []

delay_duration = 60
# Upper bound on rate-limit waits per user, so a 403 that is not a rate limit
# cannot stall the loop indefinitely.
max_rate_limit_retries = 10

for contributor in contributors:
    contributor_url = f'https://api.github.com/users/{contributor}'
    rate_limit_retries = 0

    while True:
        response = requests.get(contributor_url, headers=headers)

        if response.status_code == 200:
            # Named `profile` so the loop no longer overwrites the module-level
            # `contributor_data` collection built in an earlier cell.
            profile = response.json()

            contributors_data.append({
                'Login': profile['login'],
                'ID': profile['id'],
                'URL': profile['html_url'],
                'Type': profile['type'],
                'SiteAdmin': profile['site_admin'],
                'Name': profile['name'],
                'Bio': profile['bio'],
                'Company': profile['company'],
                'Blog': profile['blog'],
                'Location': profile['location'],
                'Email': profile['email'],
                'Hireable': profile['hireable'],
                'Twitter': profile['twitter_username'],
                'PublicRepos': profile['public_repos'],
                'PublicGists': profile['public_gists'],
                'Followers': profile['followers'],
                'Following': profile['following'],
                'CreatedAt': profile['created_at'],
            })
            break

        # GitHub reports rate limiting with 403 as well as 429. Treating 403 as
        # a permanent failure silently dropped users whenever the limit was hit.
        if response.status_code in (403, 429) and rate_limit_retries < max_rate_limit_retries:
            rate_limit_retries += 1
            print(f'Rate limit exceeded. Waiting for {delay_duration} seconds...')
            time.sleep(delay_duration)
            continue

        print(f'Failed to retrieve user data for {contributor} due to {response.status_code}. Check the username and API access.')
        break


contributor_info_df = pd.DataFrame(contributors_data)
contributor_info_df

In [None]:
# Persist the fetched contributor profiles.
contributor_info_df.to_csv('application_contributor_data.csv')

### Get branch data for tools

In [None]:
# Collect the branch list of every 'Tool' repository via the GitHub REST API,
# 30 branches per page.
tools = df[df['Type'] == 'Tool']['UniqueID']

branches_data = []

for tool in tools:
    # UniqueID has the form '<repo>/<owner>'.
    owner = tool.split("/")[1]
    repo_name = tool.split("/")[0]

    print(f"Getting branches for {tool}")

    page = 1
    while True:
        branches_url = f'https://api.github.com/repos/{owner}/{repo_name}/branches?page={page}&per_page=30'
        response = requests.get(branches_url, headers=headers)

        if response.status_code != 200:
            # Report the repository being processed (`tool`); the message used
            # to read a leftover `repo` variable from a different cell.
            print(f'Failed to retrieve branch data for {tool}, page {page}.  Status code {response.status_code}.')
            # Stop paging. Previously the loop only moved on to the next page,
            # so a persistent error never terminated.
            break

        branches = response.json()
        if not branches:
            break

        for branch in branches:
            branches_data.append({
                'RepositoryName': repo_name,
                'Owner': owner,
                'BranchName': branch['name'],
                'CommitSHA': branch['commit']['sha'],
                'CommitURL': branch['commit']['url'],
                'Protected': branch['protected'],
                'UniqueID': tool,
            })

        page += 1

branches_df = pd.DataFrame(branches_data)
branches_df

In [None]:
# Persist the collected branch rows.
branches_df.to_csv('branches_data.csv')

### Get Application Authors

In [None]:
application_commits = pd.read_csv('application_commits.csv')
# Keep only authors that can be looked up as GitHub logins. A value containing
# a space is a git display name (the fallback stored when no login was
# available), and a missing value (NaN) cannot be looked up either. na=True
# counts NaN as "contains a space", so both kinds of row are dropped; without
# it, NaN entries in the mask make the boolean selection raise.
has_no_login = application_commits['Author'].str.contains(' ', regex=False, na=True)
application_commits = application_commits[~has_no_login]
old_application_commits = pd.read_csv('old_application_commits.csv')
# Only authors that do not already appear in the old commit file are fetched.
application_authors = set(application_commits['Author'].to_list()) - set(old_application_commits['Author'].to_list())

authors_data = []

delay_duration = 60
# Upper bound on rate-limit waits per user, so a 403 that is not a rate limit
# cannot stall the loop indefinitely.
max_rate_limit_retries = 10

for application_author in application_authors:
    author_url = f'https://api.github.com/users/{application_author}'
    rate_limit_retries = 0

    while True:
        response = requests.get(author_url, headers=headers)

        if response.status_code == 200:
            author_data = response.json()

            authors_data.append({
                'Login': author_data['login'],
                'ID': author_data['id'],
                'URL': author_data['html_url'],
                'Type': author_data['type'],
                'SiteAdmin': author_data['site_admin'],
                'Name': author_data['name'],
                'Bio': author_data['bio'],
                'Company': author_data['company'],
                'Blog': author_data['blog'],
                'Location': author_data['location'],
                'Email': author_data['email'],
                'Hireable': author_data['hireable'],
                'Twitter': author_data['twitter_username'],
                'PublicRepos': author_data['public_repos'],
                'PublicGists': author_data['public_gists'],
                'Followers': author_data['followers'],
                'Following': author_data['following'],
                'CreatedAt': author_data['created_at'],
            })
            break

        # GitHub reports rate limiting with 403 as well as 429. Treating 403 as
        # a permanent failure silently dropped authors whenever the limit was hit.
        if response.status_code in (403, 429) and rate_limit_retries < max_rate_limit_retries:
            rate_limit_retries += 1
            print(f'Rate limit exceeded. Waiting for {delay_duration} seconds...')
            time.sleep(delay_duration)
            continue

        print(f'Failed to retrieve user data for {application_author} due to {response.status_code}. Check the username and API access.')
        break


application_authors_df = pd.DataFrame(authors_data)
application_authors_df



In [None]:
import os

# Append the newly fetched authors to the running file. The header row is only
# written when the file is new or empty; writing it on every append (as before)
# left header lines in the middle of the data.
authors_csv_path = 'application_authors.csv'
needs_header = (
    not os.path.exists(authors_csv_path)
    or os.path.getsize(authors_csv_path) == 0
)
application_authors_df.to_csv(authors_csv_path, mode='a', header=needs_header)

In [None]:
# Load the three contributor tables combined in the cells below.
# NOTE: `contributor_data` is rebound here to a DataFrame read from disk.
repo_contributors = pd.read_csv('repo_contributors.csv')
contributor_data = pd.read_csv('contributor_data.csv')
application_contributors = pd.read_csv('application_contributors.csv')

In [None]:
# Select the profile rows to append to application_contributors: the
# contributors of the four repositories listed in add_applications.

add_applications = ['cairo/starkware-libs', 'noir/noir-lang', 'starknet-rs/xjonathanlei', 'zokrates/zokrates']

# repo_contributors = repo
# contributor_data[contributor_data['']]
# Contributor rows belonging to the listed repositories.
new_contributors = repo_contributors[repo_contributors['UniqueID'].isin(add_applications)]
# new_contributors.Contributor.nunique() #172
# NOTE(review): the 'contributor_data.csv' written earlier in this notebook has
# a 'Contributor' column and no 'Login' column; this cell presumably expects a
# different version of that file (one holding user profiles) — confirm.
contributor_data['Login'] = contributor_data['Login'].str.lower()
# Profiles whose lowercased login matches one of those contributors.
new_contributor_data = contributor_data[contributor_data['Login'].isin(new_contributors['Contributor'])]


In [None]:
# NOTE: only the last expression of a cell is displayed; the first count is
# computed and discarded. The trailing numbers are values noted by the author.
new_contributor_data.Login.nunique() # 171
application_contributors.Login.nunique() # 2843

In [None]:
# Preview every column except the first (presumably the index column saved by
# to_csv — confirm).
application_contributors.iloc[:,1:]

In [None]:
# Preview every column except the first (presumably the index column saved by
# to_csv — confirm).
new_contributor_data.iloc[:,1:]

In [None]:
# NOTE: bare expression; it has no effect because it is not the cell's last line.
application_contributors
# Stack both tables without their first column, renumber the rows, and overwrite
# 'application_contributors.csv' (to_csv writes a fresh index column).
application_contributors_combined = pd.concat([application_contributors.iloc[:,1:], new_contributor_data.iloc[:,1:]], ignore_index=True)
application_contributors_combined.to_csv('application_contributors.csv')

In [None]:
# Reload the combined contributor file.
# NOTE: this rebinds `application_authors`, which was a set of logins in an
# earlier cell, to a DataFrame.
application_authors = pd.read_csv('application_contributors.csv')
application_authors

In [None]:
# protocol-substrate/webb-tools
# zerokit/vacp2p
# contangle-zkcp/timoftime
# fastcrypto/mystenlabs