In [None]:
import pandas as pd

In [None]:
# Column names of the commit table, in the order they are expected to appear.
COLS = [
    "commit",
    "parents",
    "author_time",
    "commit_time",
    "plus_diff",
    "minus_diff",
    "tags",
]

# Displays an empty frame as the cell output (COLS is not applied to it here).
pd.DataFrame()

In [12]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta
import json

def generate_mock_commit_data(num_commits, seed=None, base_time=None):
    """
    Generates mock data for a set of Git commits.

    The first commit is a root with no parents. Every later commit gets one or
    two distinct parents sampled from the commits generated before it, so the
    parent links always form a DAG. A commit's ``commit_time`` is never earlier
    than the ``commit_time`` of any of its parents.

    Args:
        num_commits (int): The number of mock commits to generate. Zero (or a
            negative value) yields an empty frame that still has all columns.
        seed (int, optional): When given, all random draws come from a private
            ``random.Random(seed)``. When omitted, the module-level ``random``
            state is used, so ``random.seed(...)`` set by the caller still
            applies.
        base_time (datetime, optional): Reference point the root commit is
            placed before (by up to ~1 year). Defaults to ``datetime.now()``.
            Pass a fixed value together with ``seed`` for fully reproducible
            timestamps.

    Returns:
        pandas.DataFrame: One row per commit with the columns ``commit``
        (7-digit hex string), ``parents`` (list of commit strings),
        ``author_time``, ``commit_time``, ``plus_diff``, ``minus_diff`` and
        ``tags`` (JSON string with ``author`` and ``committer`` keys).
    """
    columns = ["commit", "parents", "author_time", "commit_time",
               "plus_diff", "minus_diff", "tags"]

    # Loop-invariant name pools for the "tags" column.
    authors = ["Alice", "Bob", "Charlie", "David", "Eve"]
    committers = ["Alice", "Bob", "Charlie", "David", "Eve"]

    # A private generator keeps seeded runs from disturbing global random state.
    rng = random if seed is None else random.Random(seed)
    reference_time = datetime.now() if base_time is None else base_time

    # Unique commit identifiers: zero-padded hex of the commit's position.
    commit_hashes = [f"{i:07x}" for i in range(num_commits)]

    commit_times = {}  # commit hash -> commit_time, for parent lookups
    rows = []

    for i, commit in enumerate(commit_hashes):
        if i == 0:
            # Root commit: no parents, placed up to ~1 year before the reference.
            parents = []
            commit_time = reference_time - timedelta(days=rng.randint(0, 365),
                                                     hours=rng.randint(0, 23),
                                                     minutes=rng.randint(0, 59))
        else:
            # One or two distinct parents, always from earlier commits.
            num_parents = rng.choice([1, 2])
            parents = rng.sample(commit_hashes[:i], min(num_parents, i))

            # Offset from the *latest* parent so a merge commit can never be
            # timestamped before one of the commits it merges.
            latest_parent_time = max(commit_times[p] for p in parents)
            commit_time = latest_parent_time + timedelta(days=rng.randint(0, 15),
                                                         hours=rng.randint(0, 23),
                                                         minutes=rng.randint(0, 59))

        # Author time is at most an hour before the commit time.
        author_time = commit_time - timedelta(minutes=rng.randint(0, 60))

        commit_times[commit] = commit_time

        plus_diff = rng.randint(0, 500)
        minus_diff = rng.randint(0, 300)

        tags = {
            "author": rng.choice(authors),
            "committer": rng.choice(committers)
        }

        rows.append({
            "commit": commit,
            "parents": parents,
            "author_time": author_time,
            "commit_time": commit_time,
            "plus_diff": plus_diff,
            "minus_diff": minus_diff,
            "tags": json.dumps(tags)  # Store JSON as a string
        })

    # Explicit columns keep the schema stable even when no rows were generated.
    return pd.DataFrame(rows, columns=columns)

In [13]:
# Generate 50 mock commits and write them to "mock1.csv" without the index column.
generate_mock_commit_data(num_commits=50).to_csv("mock1.csv", index=False)

In [16]:
# Read the CSV written earlier back into a frame and inspect the "parents" column.
# NOTE(review): the recorded output shows each value as the text of a list
# (e.g. "['0000000']"), i.e. the CSV round trip appears to yield strings rather
# than Python lists — parse them before treating them as lists.
loaded_mock = pd.read_csv("mock1.csv")
loaded_mock["parents"]

0                         []
1                ['0000000']
2     ['0000000', '0000001']
3     ['0000001', '0000002']
4                ['0000003']
5                ['0000004']
6     ['0000004', '0000003']
7     ['0000003', '0000001']
8     ['0000002', '0000006']
9                ['0000003']
10               ['0000001']
11    ['0000001', '0000002']
12    ['0000004', '0000005']
13               ['0000000']
14    ['000000c', '0000007']
15               ['0000000']
16               ['0000008']
17    ['0000005', '0000007']
18    ['0000006', '0000009']
19    ['0000001', '0000011']
20    ['0000000', '0000013']
21    ['0000012', '0000014']
22               ['0000008']
23    ['0000009', '000000a']
24    ['0000006', '0000007']
25    ['0000002', '0000009']
26    ['0000009', '0000005']
27    ['0000013', '000001a']
28               ['0000013']
29    ['0000003', '0000016']
30               ['000001d']
31               ['000001d']
32               ['000001a']
33    ['000001d', '000001a']
34            