In [None]:
#
# jupyter notebook to analyse git logs produced in this format:
#   git log --all --numstat --date=short --pretty=format:'--%h--%ad--%aN' --no-renames > highs_logs.txt
#
# The per-file line counts (blank / comment / code) are produced by the following command:
#   cloc ./ --unix --by-file --csv --quiet --report-file=highs_complexity.csv
#

import os
import sys
import argparse
import csv

# Marker that starts every commit header line in the git log and separates its
# fields (see the --pretty=format string in the header comment).
GIT_LOG_SEPARATOR = '--'

# Input files. Set HIGHS_GIT_LOG_FILEPATH / HIGHS_CLOC_FILEPATH in the
# environment to analyse a different checkout without editing the notebook;
# when they are unset the original locations are used.
GIT_LOG_FILEPATH = os.environ.get(
    "HIGHS_GIT_LOG_FILEPATH",
    "/Users/hjm/Documents/HiGHS/lp_code/HiGHS-repo/highs_logs.txt",
)
CLOC_FILEPATH = os.environ.get(
    "HIGHS_CLOC_FILEPATH",
    "/Users/hjm/Documents/HiGHS/lp_code/HiGHS-repo/highs_complexity.csv",
)

# Commits by these authors are ignored entirely.
EXCLUDED_AUTHORS = ["Documenter.jl"]
# Files with these (lower-cased) extensions are ignored.
EXCLUDED_FILE_EXTENSIONS = [".cmake", ".html", ".make"]
# Files whose repository-relative path starts with any of these are ignored.
EXCLUDED_PATH_STARTS = ["brl", ".github"]

In [None]:
def parse_git_log(file_path):
    """Count, per file, the commits that touched it in a ``git log --numstat`` dump.

    The log is expected to consist of commit header lines of the form
    ``--<hash>--<date>--<author>`` (fields separated by ``GIT_LOG_SEPARATOR``),
    each followed by one ``<added> <deleted> <path>`` line per changed file.

    Commits whose author is in ``EXCLUDED_AUTHORS`` are skipped. Files are
    skipped when their lower-cased extension is in ``EXCLUDED_FILE_EXTENSIONS``
    or their path starts with one of ``EXCLUDED_PATH_STARTS``.

    Args:
        file_path: Path of the git log text file.

    Returns:
        dict mapping file path -> ``{"no_revisions": int, "no_lines": int}``.
        ``no_lines`` is initialised to 0 and never changed by this function.

    Raises:
        ValueError: If a header line does not contain hash, date and author.
    """
    file_revisions = {}
    # False until the first header is seen, and while inside an excluded commit.
    include_commit = False

    # Decode explicitly so the result does not depend on the platform default;
    # undecodable bytes are replaced rather than aborting the whole analysis.
    with open(file_path, "r", encoding="utf-8", errors="replace") as file:
        for line in file:
            line = line.strip()

            if line.startswith(GIT_LOG_SEPARATOR):
                # New commit. maxsplit=3 keeps an author name that itself
                # contains the separator in one piece instead of failing the
                # unpack with "too many values".
                _, _commit_hash, _commit_date, commit_author = line.split(
                    GIT_LOG_SEPARATOR, 3
                )
                include_commit = commit_author not in EXCLUDED_AUTHORS
                continue

            if not include_commit:
                continue

            # File entry. Split from the LEFT: the two counts come first and
            # everything after them is the path, which may contain spaces.
            fields = line.split(maxsplit=2)
            if len(fields) != 3:
                continue  # blank separator line or anything else unexpected
            filepath = fields[2]

            extension = os.path.splitext(filepath)[1].lower()
            if extension in EXCLUDED_FILE_EXTENSIONS:
                continue
            if any(filepath.startswith(prefix) for prefix in EXCLUDED_PATH_STARTS):
                continue

            stats = file_revisions.setdefault(
                filepath, {"no_revisions": 0, "no_lines": 0}
            )
            stats["no_revisions"] += 1

    return file_revisions

In [None]:
def parse_cloc_file(file_revisions, file_path):
    """Fill in ``no_lines`` for known files from a ``cloc --by-file --csv`` report.

    The report must have at least the columns ``filename`` and ``code``. A
    leading ``./`` on the filename is removed so it can be compared with the
    keys of ``file_revisions``. Rows for files that are not already present in
    ``file_revisions`` are ignored.

    Args:
        file_revisions: dict mapping file path -> stats dict; the ``"no_lines"``
            entry of matching files is overwritten in place.
        file_path: Path of the cloc CSV report.

    Returns:
        None. ``file_revisions`` is modified in place.
    """
    # newline="" is what the csv module asks for so quoted fields containing
    # line breaks are parsed correctly.
    with open(file_path, "r", newline="", encoding="utf-8", errors="replace") as f:
        for row in csv.DictReader(f):
            filepath = row["filename"]
            # Remove exactly one "./" prefix. str.lstrip("./") would strip any
            # run of '.' and '/' characters and so mangle dot-files such as
            # "./.clang-format" into "clang-format".
            if filepath.startswith("./"):
                filepath = filepath[2:]

            if filepath in file_revisions:
                file_revisions[filepath]["no_lines"] = int(row["code"])

In [None]:
def print_statistics(file_revisions):
    """Print a table of files ordered by revision count, highest first.

    Each row shows the file path, its number of revisions and its lines of
    code. An empty mapping prints the headings only.

    Args:
        file_revisions: dict mapping file path -> dict with integer entries
            ``"no_revisions"`` and ``"no_lines"``.
    """
    print("File Statistics:")
    print("---------------")

    filepath_title = "Filepath"
    # Column is at least as wide as its title so the header stays aligned with
    # the rows; default=0 keeps max() from raising when there are no files.
    filepath_width = max(
        len(filepath_title),
        max((len(filepath) for filepath in file_revisions), default=0),
    )

    # Sort files by number of revisions in descending order
    sorted_files = sorted(
        file_revisions.items(), key=lambda x: x[1]["no_revisions"], reverse=True
    )

    print("\nFiles sorted by number of revisions (descending order):")
    print("------------------------------------------------------\n")
    padded_filepath_title = filepath_title.ljust(filepath_width)
    print(f"{padded_filepath_title}      Revisions  Lines of Code")
    for filepath, stats in sorted_files:
        padded_filepath = filepath.ljust(filepath_width)
        print(f"{padded_filepath}  {stats['no_revisions']:13d}  {stats['no_lines']:13d}")

In [None]:
# Pipeline: count revisions per file from the git log, attach the cloc line
# counts to those same entries (updated in place), then print the table.
file_revisions = parse_git_log(file_path=GIT_LOG_FILEPATH)
parse_cloc_file(file_revisions=file_revisions, file_path=CLOC_FILEPATH)
print_statistics(file_revisions=file_revisions)