# **CS5901 - Assignment 2 - Stage 0**
*This .py file contains the module for local git and GitHub repository management operations*


In [None]:
import os
import subprocess
import tempfile
from google.colab import files
import pandas as pd
import io


# The code inside the triple-quoted string below is disabled and kept for reference only: it targets a local git repo. Here, we default to GitHub.
'''
def init_repo(directory, remote_url = None):
  """
  Initialize a local git repo and GitHub remote setup

  Args:
    directory (str): path to the working directory
    remote_url(str, optional): GitHub repo remote URL

  Returns:
    None
  """

  #Verify if git repo already exists
  if not os.path.exists(os.path.join(directory, '.git')):

    #Initialize new git repo
    subprocess.run(['git', 'init'], cwd = directory, check = True)

    #Create .gitignore file with exclusions
    with open(os.path.join(directory, '.gitignore'), 'w') as f:
      f.write('*csv\n__pycache__/\n*.ipynb_checkpoints/')

    #Stage and commit .gitignore
    subprocess.run(['git', 'add', '.gitignore'], cwd = directory, check = True)
    subprocess.run(['git', 'commit', '-m', 'Initial commit'], cwd = directory, check = True)

    #Set up remote GitHub repo if URL is provided
    if remote_url:
      subprocess.run(['git', 'remote', 'add', 'origin', remote_url], cwd = directory, check = True)


def commit_changes(directory, message):
  """
  Commit changes to the local git repo and push to GitHub

  Args:
    directory (str): path to the working directory
    message (str): commit message

  Returns:
    None
  """

  #Stage and commit changes
  subprocess.run(['git', 'add', '.'], cwd = directory, check = True)

  #Commit with message
  subprocess.run(['git', 'commit', '-m', message], cwd = directory, check = True)

  #Push changes to GitHub (origin must be set)
  try:
    subprocess.run(['git', 'push', 'origin', 'main'], cwd = directory, check = True)
  except:
    print("Failed to push to GitHub. Ensure remote is configured and authenticated.")
'''

def _parse_git_log(log_text):
    """Turn tab-separated ``git log`` output into a DataFrame, one row per commit."""
    columns = ['Commit Hash', 'Author', 'Date', 'Subject']
    rows = []
    for line in log_text.split('\n'):
        if not line:
            continue
        # Split on the first three tabs only, so a tab inside the subject
        # stays part of the subject instead of spilling into extra columns.
        fields = line.split('\t', 3)
        # Pad short lines so every row has exactly one value per column.
        fields += [''] * (len(columns) - len(fields))
        rows.append(fields)
    # Every value is kept as a string: an all-digit abbreviated hash must not
    # be converted to a number (that would drop leading zeros).
    return pd.DataFrame(rows, columns=columns)


def get_git_log_dataframe(repo_url):
    """
    Clones a Git repository, extracts the log, and returns it as a pandas DataFrame.

    The clone lives in a temporary directory that is removed before the
    function returns, and the process working directory is left unchanged.

    Args:
        repo_url (str): The URL (or local path) of the Git repository.

    Returns:
        pd.DataFrame or None: A DataFrame with the string columns
        'Commit Hash', 'Author', 'Date' and 'Subject' (one row per commit),
        or None if any step failed (the error is printed, not raised).
    """
    try:
        # The context manager deletes the clone on exit, success or failure.
        with tempfile.TemporaryDirectory() as temp_dir:
            print(f"Created temporary directory: {temp_dir}")

            # Clone the repository. The "--" stops git from treating a
            # repo_url that starts with "-" as a command-line option.
            print(f"Cloning repository from {repo_url}...")
            subprocess.run(["git", "clone", "--", repo_url, temp_dir], check=True)

            # Extract the git log. Run git with cwd= rather than os.chdir()
            # so the caller's working directory is never modified.
            print("Extracting git log...")
            log_format = "%h%x09%an%x09%ad%x09%s"  # %x09 is a tab character
            result = subprocess.run(
                ["git", "log", f"--pretty=format:{log_format}"],
                cwd=temp_dir,
                capture_output=True,
                text=True,
                # git writes log output as UTF-8 by default; do not depend on
                # the locale, and do not fail on a stray undecodable byte.
                encoding="utf-8",
                errors="replace",
                check=True,
            )
            df = _parse_git_log(result.stdout)

        # Display the DataFrame as a table. `display` only exists inside
        # IPython/Jupyter/Colab, so fall back to print() in a plain script.
        print("Git log in table format:")
        try:
            show = display  # noqa: F821 - injected by IPython when available
        except NameError:
            show = print
        show(df)

        return df

    except Exception as e:
        # Deliberate best-effort contract: report the problem and return None.
        print(f"An error occurred: {e}")
        return None

# Example usage: build the commit-history table for the assignment repository.
# NOTE: these statements sit at module level, so they execute whenever this
# file is run or imported.
repo_url = "https://github.com/jsp289/CS5901_Assignment2.git"
git_log_df = get_git_log_dataframe(repo_url)

# get_git_log_dataframe returns None when an error occurred, so only preview
# the result when a DataFrame actually came back.
if git_log_df is not None:
    print("\nDataFrame returned by the function:")
    # `display` is not defined or imported in this file; presumably the
    # notebook (IPython/Colab) builtin - confirm before running as a script.
    display(git_log_df.head())

Created temporary directory: /tmp/tmpsga0nerw
Cloning repository from https://github.com/jsp289/CS5901_Assignment2.git...
Extracting git log...
Git log in table format:


Unnamed: 0,Commit Hash,Author,Date,Subject
0,c309398,jsp289,Mon May 12 12:37:52 2025 -0400,Delete CS5901_assignment2_demo_notebook.ipynb
1,9280d24,jsp289,Mon May 12 12:37:40 2025 -0400,Delete CS5901_assignment2_stage1_data_cleaning...
2,53ad314,jsp289,Mon May 12 12:37:27 2025 -0400,Delete CS5901_assignment2_get_utils_py_py.py
3,2a969fa,jsp289,Mon May 12 12:37:17 2025 -0400,Delete CS5901_assignment2_main_py.py
4,8f718e0,jsp289,Mon May 12 12:36:59 2025 -0400,Delete CS5901_assignment2_stage2_time_space_co...
5,9b2c57f,jsp289,Mon May 12 11:58:23 2025 -0400,Created using Colab
6,32ee6bd,jsp289,Mon May 12 11:54:30 2025 -0400,Created using Colab
7,0cab392,jsp289,Mon May 12 11:26:00 2025 -0400,Created using Colab
8,d795c4f,jsp289,Mon May 12 11:22:20 2025 -0400,Created using Colab
9,36d199e,jsp289,Mon May 12 11:21:21 2025 -0400,Created using Colab



DataFrame returned by the function:


Unnamed: 0,Commit Hash,Author,Date,Subject
0,c309398,jsp289,Mon May 12 12:37:52 2025 -0400,Delete CS5901_assignment2_demo_notebook.ipynb
1,9280d24,jsp289,Mon May 12 12:37:40 2025 -0400,Delete CS5901_assignment2_stage1_data_cleaning...
2,53ad314,jsp289,Mon May 12 12:37:27 2025 -0400,Delete CS5901_assignment2_get_utils_py_py.py
3,2a969fa,jsp289,Mon May 12 12:37:17 2025 -0400,Delete CS5901_assignment2_main_py.py
4,8f718e0,jsp289,Mon May 12 12:36:59 2025 -0400,Delete CS5901_assignment2_stage2_time_space_co...
