Git Contributions Analysis
===

Date: March 2023

Experimenting with the GitPython package to understand aggregate contributions to a repository.

Useful source: https://www.feststelltaste.de/storing-git-commit-information-into-pandas-dataframe/

In [2]:
import git
import pandas as pd
import pathlib

In [7]:
# All local checkouts are expected under ~/repos; fail right away if that folder is missing.
repos_dir = pathlib.Path.home().joinpath("repos")
assert repos_dir.exists()

In [9]:
# Checkout to analyze, resolved relative to repos_dir; fail right away if it is not there.
repo_name = "dts_imputer"
repo_path = repos_dir.joinpath(repo_name)
assert repo_path.exists()

In [11]:
# Open the checkout with GitPython; the bare name on the last line displays the Repo object.
repo = git.Repo(path=repo_path)
repo

<git.repo.base.Repo '/Users/zlevonian/repos/dts_imputer/.git'>

In [53]:
# Build one row of change statistics per commit reachable from the listed branches.
# A commit reachable from more than one branch is recorded only once (keyed by hexsha).
branch_names = ["main", "DSSESCREEN-516-modularization"]
ds = {}  # commit hexsha -> row dict for that commit
for branch_name in branch_names:
    for commit in repo.iter_commits(branch_name):
        if commit.hexsha in ds:
            continue
        # GitPython computes commit.stats through git on each access, so read it
        # once per commit and reuse it for both the per-file and the total numbers.
        stats = commit.stats
        py_insertions, py_deletions, py_lines = 0, 0, 0
        # Python-only tallies are computed for this one author only;
        # every other author's py_* columns stay at zero.
        if commit.author.name == "Zachary Levonian":
            for file, changes in stats.files.items():
                if file.endswith(".py"):
                    py_insertions += changes["insertions"]
                    py_deletions += changes["deletions"]
                    py_lines += changes["lines"]

        ds[commit.hexsha] = {
            "raw": commit.hexsha,
            "author_name": commit.author.name,
            # Whole-commit totals, expanded into one column per key.
            **stats.total,
            "py_insertions": py_insertions,
            "py_deletions": py_deletions,
            "py_lines": py_lines,
        }
df = pd.DataFrame(ds.values())
df.shape

(1081, 9)

In [69]:
# Sanity check: each commit's line count must equal its insertions plus deletions.
assert (df["deletions"] + df["insertions"] == df["lines"]).all()
# Show a few random rows.
df.sample(n=3)

Unnamed: 0,raw,author_name,insertions,deletions,lines,files,py_insertions,py_deletions,py_lines
740,38355ae88943076e64b165b439b69af3d39ec8b0,Jericho Cain,13,0,13,1,0,0,0
703,169f480d5e016d33c0b1bb3b789e72f6d38df04d,Jericho Cain,2,1,3,1,0,0,0
491,f1567a2820c522b02b7dee7167f84ac34ab4167a,Jericho Cain,10,10,20,2,0,0,0


In [55]:
# Number of commits per author, largest first.
(
    df["author_name"]
    .value_counts()
    .sort_values(ascending=False)
    .reset_index()
)

Unnamed: 0,index,author_name
0,Jericho Cain,660
1,Zachary Levonian,146
2,rannand84,138
3,BabuNamburi,82
4,madhusudanl,31
5,Mike Powell,19
6,oacai,1
7,q,1
8,root,1
9,vagarwal77,1


In [56]:
# Lines changed across all commits, summed per author, largest first.
(
    df.groupby("author_name")["lines"]
    .sum()
    .sort_values(ascending=False)
    .reset_index()
)

Unnamed: 0,author_name,lines
0,rannand84,186193
1,Zachary Levonian,151326
2,Jericho Cain,95463
3,BabuNamburi,5373
4,madhusudanl,512
5,Mike Powell,170
6,q,149
7,oacai,113
8,vagarwal77,76
9,root,6


In [62]:
# Keep one author's commits, then total every numeric change column.
sdf = df.loc[df["author_name"] == "Zachary Levonian"]
(
    sdf.drop(columns=["raw", "author_name"])
    .sum()
    .rename("total")
    .reset_index()
    .rename(columns={"index": "change_type"})
)

Unnamed: 0,change_type,total
0,insertions,139083
1,deletions,12243
2,lines,151326
3,files,501
4,py_insertions,19660
5,py_deletions,5963
6,py_lines,25623
