#### Clean `zkp_repos.csv` 

In [None]:
import pandas as pd
import requests
from pydriller import Repository

In [None]:
# Load the ';'-delimited repository list and shorten the verbose
# "Tool Resources (...)" column header to plain "Tool Resources".
df = (
    pd.read_csv('zkp_repos.csv', sep=';')
    .rename(columns={"Tool Resources (Twitter, Discord, Website etc.)": "Tool Resources"})
)

In [None]:
def split_tools(row):
    """Return the 'Tool' entry of an 'Application' row as a list of names.

    Args:
        row: A mapping-like row (e.g. a DataFrame row) with 'Type' and
            'Tool' entries.

    Returns:
        For rows whose 'Type' is 'Application': the 'Tool' string split on
        ', ', the value unchanged if it is already a list, or None when the
        value is missing (NaN/None or any other non-string).
        For every other row type: None.
    """
    if row['Type'] != 'Application':
        return None

    tools = row['Tool']
    if isinstance(tools, list):
        # Already split, e.g. when the cell is executed a second time.
        return tools
    if not isinstance(tools, str):
        # Missing values arrive as float NaN / None and have no .split().
        return None
    return tools.split(', ')
        

# Run split_tools once per row and overwrite the 'Tool' column with the result.
df['Tool'] = df.apply(split_tools, axis='columns')


In [None]:
# Overwrite the input file with the cleaned frame, using pandas' default
# output format (',' separator, index written as the first column).
# NOTE(review): the same file is read above with sep=';', so after this write
# it no longer parses the same way on a re-run — confirm this is intended.
df.to_csv('zkp_repos.csv', sep=',', index=True)

#### Get the commit data and file changes for the 'Tool' repositories using PyDriller

In [None]:
# Build two tables for every repository whose 'Type' is 'Tool':
#   commit_data -> one record per commit
#   file_data   -> one record per file touched by a commit
repositories = df[df['Type'] == 'Tool'].URL.values.tolist()

commit_data = []
file_data = []

# Change types that get their own filename list in the commit record;
# files with any other change type are only counted in 'ModificationCount'.
tracked_change_types = ('ADD', 'MODIFY', 'DELETE', 'RENAME')

for repo_url in repositories:
    # Owner and repository name are the last two '/'-separated URL segments;
    # they are constant per repository, so compute them once.
    url_parts = repo_url.split('/')
    repo_short_name = url_parts[-1]
    repo_owner = url_parts[-2]

    for commit in Repository(repo_url).traverse_commits():
        modified_files = commit.modified_files

        # Group the touched filenames by change type in a single pass.
        filenames_by_change = {change: [] for change in tracked_change_types}
        for modified_file in modified_files:
            change = modified_file.change_type.name
            if change in filenames_by_change:
                filenames_by_change[change].append(modified_file.filename)

        commit_data.append({
            'Name': repo_short_name,
            'Owner': repo_owner,
            'CommitHash': commit.hash,
            'Message': commit.msg,
            'Author': commit.author.name,
            'AuthorEmail': commit.author.email,
            'Committer': commit.committer.name,
            'CommitterEmail': commit.committer.email,
            'AuthorDate': commit.author_date,
            'CommitterDate': commit.committer_date,
            'AuthorTimeZone': commit.author_timezone,
            'CommitterTimeZone': commit.committer_timezone,
            'Branches': commit.branches,
            'Main': commit.in_main_branch,
            'Merge': commit.merge,
            'ModificationCount': len(modified_files),
            'AddedFiles': filenames_by_change['ADD'],
            'ModifiedFiles': filenames_by_change['MODIFY'],
            'DeletedFiles': filenames_by_change['DELETE'],
            'RenamedFiles': filenames_by_change['RENAME'],
            'Parents': commit.parents,
            'Deletions': commit.deletions,
            'Insertions': commit.insertions,
            'Lines': commit.lines,
            'Files': commit.files,
        })

        for modified_file in modified_files:
            file_data.append({
                'Name': repo_short_name,
                'Owner': repo_owner,
                'CommitHash': commit.hash,
                'Filename': modified_file.filename,
                'ChangeType': modified_file.change_type.name,
                'OldPath': modified_file.old_path,
                'NewPath': modified_file.new_path,
                'Diff': modified_file.diff,
                'DiffParser': modified_file.diff_parsed,
                'AddedLines': modified_file.added_lines,
                'DeletedLines': modified_file.deleted_lines,
                # Full file contents are intentionally left disabled:
                # 'SourceCode': modified_file.source_code,
                # 'SourceCodeBefore': modified_file.source_code_before,
                'Methods': modified_file.methods,
                'MethodsBefore': modified_file.methods_before,
                'ChangedMethods': modified_file.changed_methods,
                'nloc': modified_file.nloc,
                'Complexity': modified_file.complexity,
                'TokenCount': modified_file.token_count,
            })


commit_df = pd.DataFrame(commit_data)
file_df = pd.DataFrame(file_data)

In [None]:
# Write the commit-level and file-level tables to their CSV files
# (pandas defaults: ',' separator, index included).
for frame, csv_path in (
    (commit_df, 'tool_commits.csv'),
    (file_df, 'tool_file_changes.csv'),
):
    frame.to_csv(csv_path)

#### Get the contributor data for each repository

In [None]:
# Fetch the contributor list of every repository in `df` from the GitHub REST
# API, following the pagination until an empty page is returned.
contributor_data = []

# GitHub personal access token. Leave empty to send unauthenticated requests.
access_token = ''

headers = {
    'Accept': 'application/vnd.github.v3+json'
}
if access_token:
    # Only send credentials when a token is configured; an empty
    # "token " value would otherwise be submitted as (invalid) credentials.
    headers['Authorization'] = f'token {access_token}'

# Largest page size the GitHub API accepts; fewer round-trips per repository.
per_page = 100
# Seconds to wait for a response before giving up on a request.
request_timeout = 30

for repo in df.URL.values.tolist():

    # Owner and repository name are the last two '/'-separated URL segments.
    url_parts = repo.split('/')
    owner = url_parts[-2]
    name = url_parts[-1]
    repo_name = f"{owner}/{name}"

    page = 1

    while True:
        url = f'https://api.github.com/repos/{repo_name}/contributors?page={page}&per_page={per_page}'
        try:
            response = requests.get(url, headers=headers, timeout=request_timeout)
        except requests.RequestException as error:
            # Treat network failures like HTTP failures: report and move on
            # to the next repository instead of aborting the whole run.
            print(f'Failed to fetch data for {repo_name}, page {page}. Error: {error}')
            break

        if response.status_code == 204:
            # GitHub answers 204 (no body) for an empty repository.
            break

        if response.status_code != 200:
            print(f'Failed to fetch data for {repo_name}, page {page}. Status code: {response.status_code}')
            break

        contributors = response.json()
        if not contributors:
            # An empty page marks the end of the listing.
            break

        for contributor in contributors:
            contributor_data.append({
                'RepositoryName': name,
                'Owner': owner,
                'Contributor': contributor['login'],
                'RepoUrl': repo,
                'ContributorURL': contributor['url'],
                'Contributions': contributor['contributions'],
                'Type': contributor['type'],
                'SiteAdmin': contributor['site_admin']
            })
        page += 1

In [None]:
# Build the contributor table, normalise repository and contributor names to
# lower case, and save it.
# The columns are declared explicitly so the frame has the expected schema
# even when no contributor record was collected; without them the column
# lookups below raise KeyError on an empty frame.
contributor_columns = [
    'RepositoryName',
    'Owner',
    'Contributor',
    'RepoUrl',
    'ContributorURL',
    'Contributions',
    'Type',
    'SiteAdmin',
]
contributor_df = pd.DataFrame(contributor_data, columns=contributor_columns)
for column in ('RepositoryName', 'Contributor'):
    contributor_df[column] = contributor_df[column].str.lower()
contributor_df.to_csv('contributor_data.csv')