In [1]:
import requests
import pandas as pd
import subprocess
import re
from git import Repo
from datetime import datetime
%run functions.py

In [2]:
package_name = "matplotlib"
response = requests.get(f"https://pypi.org/pypi/{package_name}/json")
pypi_data = response.json()

In [3]:
repo_link = pypi_data["info"]["project_urls"]["Source code"]
owner = repo_link.split("/")[3]
repo = repo_link.split("/")[4]

In [4]:
pd.set_option('display.max_rows', None)

## Git quick stats

In [5]:
Repo.clone_from(repo_link, f'./repos/{owner}/{repo}')

<git.repo.base.Repo '/mnt/c/DEV/Forschungsseminar/code/repos/matplotlib/matplotlib/.git'>

Stark unterschiedliche Anzahl der commits abhängig vom Programm

In [5]:
git_quick_stat = subprocess.run(['git-quick-stats', '-T'], capture_output=True, text=True, cwd=f'./repos/{owner}/{repo}')

Doppelte Leute, da unterschiedliche Namen beim commiten angegeben -> Das Problem besteht beim benutzen der GitHub API nicht. -> Teilweise Gelöst mittels group auf E-Mail

In [6]:
data = git_quick_stat.stdout

def parse_contribution_stats(data):
    author_pattern = re.compile(
        r"\s*(.+) <(.+)>:\s*"
        r"insertions:\s*(\d+)\s*\((\d+)%\)\s*"
        r"deletions:\s*(\d+)\s*\((\d+)%\)\s*"
        r"files:\s*(\d+)\s*\((\d+)%\)\s*"
        r"commits:\s*(\d+)\s*\((\d+)%\)\s*"
        r"lines changed:\s*(\d+)\s*\((\d+)%\)\s*"
        r"first commit:\s*(.+)\s*"
        r"last commit:\s*(.+)\s*"
    )

    authors = []
    for match in author_pattern.finditer(data):
        (name, email, insertions, insertions_pct, deletions, deletions_pct, files, files_pct, 
         commits, commits_pct, lines_changed, lines_changed_pct, first_commit, last_commit) = match.groups()
        authors.append({
            'name': name,
            'email': email,
            'insertions': int(insertions),
            'deletions': int(deletions),
            'files': int(files),
            'commits': int(commits),
            'first_commit': datetime.strptime(first_commit, '%a %b %d %H:%M:%S %Y %z'),
            'last_commit': datetime.strptime(last_commit, '%a %b %d %H:%M:%S %Y %z'),
            'lines_changed': int(lines_changed),
        })

    return authors

# Parse the data
authors_data = parse_contribution_stats(data)

# Convert authors data to DataFrame
git_contributors_df = pd.DataFrame(authors_data)
git_contributors_df['email'] = git_contributors_df['email'].str.lower()
git_contributors_df = git_contributors_df.groupby(['email']).agg({'name':'sum', 'insertions':'sum', 'deletions':'sum', 'lines_changed':'sum', 'files':'sum', 'commits':'sum', 'first_commit':'min', 'last_commit':'max'}).reset_index()
git_contributors_df = git_contributors_df.sort_values(by=['commits'], ascending=False)
git_contributors_df = git_contributors_df.reset_index(drop=True)
git_contributors_df = git_contributors_df[['name', 'email', 'insertions', 'deletions', 'lines_changed', 'files', 'commits', 'first_commit', 'last_commit']]

git_contributors_df.head(100)

Unnamed: 0,name,email,insertions,deletions,lines_changed,files,commits,first_commit,last_commit
0,Antony Lee,anntzer.lee@gmail.com,113861,209229,323090,12417,3828,2013-01-04 13:05:35+01:00,2024-05-21 19:05:05+02:00
1,Thomas A CaswellDr. Thomas A Caswell,tcaswell@gmail.com,106494,92774,199268,7277,3513,2012-08-08 15:08:15-05:00,2024-05-15 16:03:26-04:00
2,Michael Droettboom,mdboom@gmail.com,208736,190271,399007,7893,3021,2007-07-03 13:02:53+00:00,2018-11-26 10:32:24-05:00
3,Elliott Sales de Andrade,quantum.analyst@gmail.com,103709,76121,179830,6553,2519,2014-07-06 16:50:50-04:00,2024-05-21 22:05:53-04:00
4,Tim Hoffmann,2836374+timhoffm@users.noreply.github.com,55987,45168,101155,5258,1559,2017-11-13 01:01:44+01:00,2024-05-09 03:14:58+02:00
5,John Hunter,jdh2358@gmail.com,356071,224386,580457,5956,1475,2005-04-11 15:17:26+00:00,2012-06-30 14:02:02-05:00
6,Eric Firing,efiring@hawaii.edu,56323,87645,143968,2842,1297,2005-09-25 23:03:37+00:00,2023-06-05 15:00:48-10:00
7,Jody Klymak,jklymak@gmail.com,48868,25076,73944,3508,1077,2015-06-06 20:45:54-07:00,2024-04-17 09:56:07-07:00
8,David Stansby,dstansby@gmail.com,23832,23061,46893,2812,1039,2016-05-05 12:12:33+01:00,2024-04-07 15:16:58+02:00
9,Jens Hedegaard Nielsen,jenshnielsen@gmail.com,8627,8531,17158,856,575,2011-06-24 14:42:31+02:00,2018-04-23 08:33:51+02:00


## PyPi maintainer (verified)

At the moment no crawler for pypi

Kann keine logins matchen, da ich die aus git nicht bekomme. Aus dem Grund matche ich jetzt logins mit emails.

In [7]:
pypi_supervisors_df = pd.DataFrame({'login': ['ivanov', 'matthew.brett', 'mdboom2'], 'name': [None, 'Matthew Brett', None]})
pypi_supervisors_df

Unnamed: 0,login,name
0,ivanov,
1,matthew.brett,Matthew Brett
2,mdboom2,


In [8]:
result = matching(pypi_supervisors_df, git_contributors_df)
result = result.sort_values(by=['commits'], ascending=False)
result

Unnamed: 0,login,name,rank,insertions,deletions,lines_changed,files,commits,first_commit,last_commit,score
0,ivanov,,35.0,5185.0,2869.0,8054.0,375.0,96.0,2011-01-12 00:49:36+00:00,2019-11-06 08:22:12-08:00,0.5
1,matthew.brett,Matthew Brett,53.0,3191.0,2264.0,5455.0,536.0,56.0,2013-08-21 19:51:54-07:00,2017-10-24 13:05:13+01:00,1.0
2,mdboom2,,,,,,,,,,0.0


## Python Authors (not verified)

In [9]:
python_author_email = pypi_data["info"]["author_email"]
if python_author_email == '':
    python_author_emails = []
else:
    python_author_emails = python_author_email.split(", ")
    python_author_emails

Assuming Author seperation by ,

In [10]:
python_author = pypi_data["info"]["author"]
python_authors = python_author.split(", ")
python_authors

['John D. Hunter', 'Michael Droettboom']

In [11]:
dic = []

if len(python_authors) >= len(python_author_emails):
    i = 0
    while i < len(python_authors):
        if i >= len(python_author_emails):
            dic.append({
                "name": python_authors[i],
                "email": None
            })
        else:
            dic.append({
                "name": python_authors[i],
                "email": python_author_emails[i]
            })
        i += 1
else:
    i = 0
    while i < len(python_author_emails):
        if i >= len(python_authors):
            dic.append({
                "name": None,
                "email": python_author_emails[i]
            })
        else:
            dic.append({
                "name": python_authors[i],
                "email": python_author_emails[i]
            })
        i += 1

python_authors_df = pd.DataFrame(dic)
python_authors_df

Unnamed: 0,name,email
0,John D. Hunter,Unknown <matplotlib-users@python.org>
1,Michael Droettboom,


In [12]:
result = matching(python_authors_df, git_contributors_df)
result = result.sort_values(by=['commits'], ascending=False)
result

Unnamed: 0,name,email,rank,insertions,deletions,lines_changed,files,commits,first_commit,last_commit,score
1,Michael Droettboom,,3.0,208736.0,190271.0,399007.0,7893.0,3021.0,2007-07-03 13:02:53+00:00,2018-11-26 10:32:24-05:00,0.5
0,John D. Hunter,Unknown <matplotlib-users@python.org>,,,,,,,NaT,NaT,0.0


## Python Maintainers (not verified)

In [13]:
python_maintainer_email = pypi_data["info"]["maintainer_email"]
python_maintainer_email

In [14]:
python_maintainer = pypi_data["info"]["maintainer"]
python_maintainer