In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from datetime import datetime

In [None]:
# Read the per-commit dataset into `df` and preview its last rows.
df = pd.read_csv(filepath_or_buffer='../FinalDataset/kubernetes_danielsmith_final.csv')
df.tail()

In [None]:
# Parse 'Commit Date' into datetimes so rows can be ordered chronologically and
# date arithmetic (used by the Seniority metric) works.
# NOTE(review): if the raw strings carry mixed UTC offsets, consider
# pd.to_datetime(..., utc=True) -- confirm against the dataset first, because it
# changes the date values that are later written back to the CSV.
df['Commit Date'] = pd.to_datetime(df['Commit Date'])

# Sort chronologically.
# - 'mergesort' is a stable sort: rows that share a timestamp keep their original
#   relative order (presumably relevant for the multiple rows of one merge
#   commit), so the result is deterministic.
# - reset_index(drop=True) renumbers the index 0..n-1 in sorted order, so
#   label-based lookups in later cells (e.g. df.loc[index - 1]) really address
#   the preceding row instead of whatever row carried that label before sorting.
df = df.sort_values('Commit Date', kind='mergesort').reset_index(drop=True)
df.head()

## Seniority Metric

In [None]:
# Set the 'Seniority' column for the first row to 0.
# If 'Seniority' is not yet a column, this label-based assignment creates it and
# only the first row receives a value.
# NOTE(review): the following cell reassigns the whole 'Seniority' column, so this
# seed value appears to be redundant -- confirm before removing.
df.loc[df.index[0], 'Seniority'] = 0
df.head()

In [None]:
# Seniority = whole days elapsed since the first row's commit date.
# Relies on 'Commit Date' already being parsed to datetimes and the frame being
# sorted by it, so the first row holds the earliest commit.
first_date = df['Commit Date'].iloc[0]

# Element-wise on purpose: each value is subtracted from the reference date and
# the resulting timedelta's day component is kept.
df['Seniority'] = df['Commit Date'].map(
    lambda commit_date: (commit_date - first_date).days
)

df.tail()

In [None]:
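# Disabled checkpoint save.
# NOTE(review): the path names a different dataset (gitlab_stanhu) than the
# kubernetes_danielsmith file loaded above -- presumably a copy-paste leftover;
# correct the path before re-enabling.
# df.to_csv('../FinalDataset/gitlab_stanhu_final.csv', index=False)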

## Commit Metric

In [None]:
# Number the commits in row (chronological) order and store the running number
# in 'Commit'.
#
# A row starts a new commit unless it is a 'Merge' row whose 'Current Commit'
# hash equals the hash of the row directly above it; such a row reuses the
# previous commit number.
#
# The comparison is positional (shift()), not label-based: after sort_values the
# index labels are not guaranteed to run 0..n-1 in row order, so looking up
# `index - 1` by label would compare against an unrelated row.
is_merge = df['Type'] == 'Merge'
same_hash_as_previous = df['Current Commit'] == df['Current Commit'].shift()

# shift() leaves the first row without a predecessor (NaN), which never compares
# equal, so the first row always opens commit number 1.
# The result is an integer column.
df['Commit'] = (~(is_merge & same_hash_as_previous)).cumsum()

# Keep the final count available under the name the loop-based version exposed.
commit_counter = int(df['Commit'].iloc[-1]) if len(df) else 0

In [None]:
# Preview the first rows (the preceding cell assigns the 'Commit' column).
df.head()

In [None]:
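# Disabled checkpoint save.
# NOTE(review): the path names a different dataset (homeassistant_jnickkoston)
# than the kubernetes_danielsmith file loaded above -- presumably a copy-paste
# leftover; correct the path before re-enabling.
# df.to_csv('../FinalDataset/homeassistant_jnickkoston_final.csv', index=False)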

## NCLOC Accumulation Per Commit

In [None]:
# Per-row NCLOC change: 'lines_current' minus 'lines_parent'.
df['lines'] = df['lines_current'].sub(df['lines_parent'])
df.head()

## NCLOC Accumulation So Far

In [None]:
# Running total of the per-row NCLOC delta ('lines'), accumulated in row order.
# Series.cumsum replaces the hand-written row-by-row loop.
# skipna=False keeps plain running-sum semantics: a missing delta makes that row
# and every later total NaN instead of being silently skipped.
df['cumulative_lines'] = df['lines'].cumsum(skipna=False)

df.head()

## TD Accumulation Per Commit

In [None]:
# Per-row change in 'sqale_index': 'sqale_index_current' minus 'sqale_index_parent'.
df['sqale_index'] = df['sqale_index_current'].sub(df['sqale_index_parent'])
df.head()

## TD Accumulation So Far

In [None]:
# Running total of the per-row 'sqale_index' delta, accumulated in row order.
# Series.cumsum replaces the hand-written row-by-row loop.
# skipna=False keeps plain running-sum semantics: a missing delta makes that row
# and every later total NaN instead of being silently skipped.
df['cumulative_sqale_index'] = df['sqale_index'].cumsum(skipna=False)

df.head()

In [None]:
# Write the enriched frame back to disk without the index column.
# Note: this is the same path the notebook read its input from, so the source
# CSV is overwritten with the added metric columns.
df.to_csv(path_or_buf="../FinalDataset/kubernetes_danielsmith_final.csv", index=False)