In [None]:
import datetime
from dateutil import parser as date_parser
import git
from pathlib import Path
from datetime import date
from matplotlib import pyplot as plt
import pandas as pd

# Apply both style sheets in order ("science" first, then "notebook").
# NOTE(review): neither style ships with matplotlib itself; presumably they come
# from the SciencePlots package -- confirm it is installed in the kernel's environment.
plt.style.use(["science", "notebook"])

In [None]:
# Directory that holds one checkout per repository to be scanned.
base_path = Path("~/Documents/22/git_sync/hsf-training/").expanduser()
# Keep only the sub-directories that contain a ``.git`` directory.
repo_paths = [
    entry
    for entry in base_path.iterdir()
    if entry.is_dir() and (entry / ".git").is_dir()
]
print(f"{len(repo_paths) = }")

In [None]:
from collections import defaultdict


class CommitCounter:
    """Collect commit dates across repositories, counting each commit message once.

    A commit is identified purely by its message text: once a message has been
    seen (through ``ignore_commits_from`` or ``scan_repo``), every later commit
    carrying the same message is treated as a duplicate and is not recorded.
    """

    def __init__(self):
        # Messages of all commits seen so far, both ignored and scanned ones.
        self._commit_messages = set()
        #: Unique commits only
        self.timestamps = []
        #: Commits per repo
        self.timestamps_by_repo = defaultdict(list)
        #: Number of scanned commits skipped because their message was already known
        self.duplicates = 0

    def ignore_commits_from(self, repo_path: Path):
        """Register every commit message of a repository so later scans skip it.

        Nothing is added to ``timestamps``; the number of messages that were not
        known before is printed.

        Args:
            repo_path: Location of the git repository whose commits are ignored.
        """
        newly_ignored = 0
        # Use the repository as a context manager so its handles are released
        # deterministically instead of whenever the object is garbage collected.
        with git.Repo(repo_path) as repo:
            for commit in repo.iter_commits():
                if commit.message not in self._commit_messages:
                    newly_ignored += 1
                    self._commit_messages.add(commit.message)
        print(f"Ignored {newly_ignored:,} additional commits")

    def scan_repo(self, repo_path: Path):
        """Record the commit date of every not-yet-seen commit of a repository.

        New commits are appended to ``timestamps`` and to
        ``timestamps_by_repo[<directory name>]``; commits whose message is
        already known only increase ``duplicates``.

        Args:
            repo_path: Location of the git repository to scan (``Path`` or ``str``).
        """
        repo_name = Path(repo_path).name
        with git.Repo(repo_path) as repo:
            for commit in repo.iter_commits():
                if commit.message in self._commit_messages:
                    self.duplicates += 1
                    continue
                self._commit_messages.add(commit.message)
                # ``committed_date`` is a POSIX timestamp; ``date.fromtimestamp``
                # converts it to a calendar day in the local time zone.
                committed_on = date.fromtimestamp(commit.committed_date)
                self.timestamps_by_repo[repo_name].append(committed_on)
                self.timestamps.append(committed_on)

In [None]:
def dates2series(dates):
    """Turn a collection of dates into a daily cumulative-count series.

    Args:
        dates: Iterable of ``datetime.date`` objects (any order, repeats allowed).

    Returns:
        A tuple ``(index, values)`` of equally long lists: ``index`` holds every
        day from the earliest to the latest date (both included) and
        ``values[i]`` is the number of entries in ``dates`` that fall on or
        before ``index[i]``. The last value therefore equals the number of
        dates. Both lists are empty when ``dates`` is empty.
    """
    ordered = sorted(dates)
    if not ordered:
        return [], []
    start, end = ordered[0], ordered[-1]
    one_day = datetime.timedelta(days=1)
    index = []
    values = []
    counted = 0
    # ``+ 1`` keeps the final day in the series so the curve reaches the total.
    for offset in range((end - start).days + 1):
        day = start + offset * one_day
        # Single sweep over the sorted dates instead of rescanning them per day.
        while counted < len(ordered) and ordered[counted] <= day:
            counted += 1
        index.append(day)
        values.append(counted)
    return index, values

In [None]:
cc = CommitCounter()

# Repositories whose commits must not be counted. They are registered first so
# that identical commits found in the scanned repositories are skipped.
# NOTE(review): presumably the upstream Carpentries lesson/template repos (the
# per-repo plot is labelled "w/o carpentries") -- confirm.
ignored_repos = (
    "~/Documents/22/git_sync/styles/",
    "~/Documents/22/git_sync/python-novice-inflammation/",
    "~/Documents/22/git_sync/git-novice/",
    "~/Documents/22/git_sync/shell-novice/",
    "~/Documents/22/git_sync/git-novice-branch-pr/",
    "~/Documents/22/git_sync/workshop-template/",
)
for ignored in ignored_repos:
    cc.ignore_commits_from(Path(ignored).expanduser())

for repo_path in repo_paths:
    cc.scan_repo(repo_path)

In [None]:
# Number of scanned commits that were skipped because their message had already been seen.
print(f"{cc.duplicates = }")

In [None]:
# Daily cumulative series of the unique commits collected from all scanned repositories.
dates, cumm = dates2series(cc.timestamps)

In [None]:
import matplotlib.dates as mdates

# Cumulative number of unique commits over time, across all scanned repositories.
fig, ax = plt.subplots()
ax.plot(dates, cumm)
ax.set(xlabel="Date", ylabel="Number of unique commits")
plot_start = date_parser.parse("01/01/2019")
ax.set_xlim(plot_start, datetime.datetime.now())
# One major tick every six months.
ax.xaxis.set_major_locator(mdates.MonthLocator(interval=6))
fig.autofmt_xdate()

In [None]:
# Write the total-commit series to the ``data`` directory that sits next to the
# current working directory; that directory must already exist.
data_dir = Path(".").resolve().parent / "data"
assert data_dir.is_dir()
total_commits = pd.Series(cumm, index=dates)
total_commits.to_csv(data_dir / "total_commits.csv")

In [None]:
# Legend names for the per-repository plot, keyed by the cleaned-up repository
# name (directory name with "hsf-training-" and "-webpage" removed and "-"
# replaced by a space). Names without an entry are shown as they are.
label_mapping = {
    "ml": "Machine learning",
    "cpluspluscourse": "C++",
    "cmake": "CMake",
    "cicd github": "CI/CD w/ github",
}

In [None]:
# Cumulative commits per repository. Repositories with enough commits get a solid
# line and a legend entry; the others are drawn faded and without a label.
fig, ax = plt.subplots()
start_date = date_parser.parse("01/01/2019")
# Minimum number of recorded commits for a repository to be highlighted.
min_commits_for_label = 90
for repo_name, repo_timestamps in cc.timestamps_by_repo.items():
    # Cell-local names: `dates`/`cumm` keep holding the totals of the earlier
    # cell, so the CSV export stays correct when it is re-run after this cell.
    repo_dates, repo_cumm = dates2series(repo_timestamps)
    if len(repo_timestamps) >= min_commits_for_label:
        short_name = (
            repo_name.replace("hsf-training-", "")
            .replace("-webpage", "")
            .replace("-", " ")
        )
        label = label_mapping.get(short_name, short_name)
        # Report the true number of commits rather than the maximum of the plotted curve.
        label += rf" ($\Sigma$={len(repo_timestamps):,})"
        alpha, ls = 1, "-"
    else:
        label, alpha, ls = None, 0.3, "-."
    ax.plot(repo_dates, repo_cumm, label=label, alpha=alpha, ls=ls)
ax.set_xlim(start_date, datetime.datetime.now())
ax.legend()
ax.set_xlabel("Date")
ax.set_ylabel("Number of commits w/o carpentries")
# `mdates` is imported in the cell that draws the total-commits figure.
ax.xaxis.set_major_locator(mdates.MonthLocator(interval=6))
fig.autofmt_xdate()

In [None]:
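# NOTE: disabled, unfinished draft of a stacked per-repository plot; kept for reference only.
# FIXME before re-enabling: the loop binds `key, value` but calls dates2series(vs).
# import numpy as np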
#
# fig, ax = plt.subplots()
# start_date = date_parser.parse("01/01/2019")
# baseline = None
# for key, value in cc.timestamps_by_repo.items():
#     if baseline is None:
#         baseline = np.zeros_like(value)
#     dates, cumm = dates2series(vs)
#     label = None
#     ax.stairs(cumm+baseline, [*dates, datetime.datetime.now()], baseline=baseline, label=label, fill=True)
#     baseline = value
# ax.set_xlim(start_date, datetime.datetime.now())
# ax.legend()
# ax.set_ylabel("Number of commits w/o carpentries")
# ax.xaxis.set_major_locator(mdates.MonthLocator(interval=6))
# fig.autofmt_xdate()