In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from lifelines import CoxPHFitter
from lifelines import CoxTimeVaryingFitter
from lifelines import KaplanMeierFitter
from matplotlib import pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
from matplotlib.patches import Patch

In [None]:
# Load the relations CSV and drop every row that has a missing value in any column.
df = pd.read_csv('../../data/relationships/relations_minified_versioning.csv')
df = df.dropna().copy()  # .copy(): the assignments below must write to an independent frame

## Data Preprocessing
# Map the 't'/'f' flag strings to booleans (any other value becomes NaN).
flag_map = {'t': True, 'f': False}
df['is_out_of_date'] = df['is_out_of_date'].map(flag_map)
df['is_exposed'] = df['is_exposed'].map(flag_map)

# Coerce the interval bounds to numbers; entries that cannot be parsed become NaN
# (the original note mentions a \N NA marker in the data) and those rows are dropped.
df['interval_start_days'] = pd.to_numeric(df['interval_start_days'], errors='coerce')
df = df.dropna(subset=['interval_start_days']).copy()
df['interval_end_days'] = pd.to_numeric(df['interval_end_days'], errors='coerce')
df = df.dropna(subset=['interval_end_days']).copy()

# Remove dependencies that have the same start and end dates.
# This runs AFTER the numeric coercion so the comparison is numeric rather than a
# comparison of raw CSV values (which may still be strings at load time).
df = df[df['interval_start_days'] != df['interval_end_days']].copy()

# Only the last expression of a cell is rendered, so intermediate inspections are printed.
print(df.dtypes)
print(df.head())

df.requirement_type.unique()

In [None]:
# Preview the first rows of the preprocessed frame (rendered as the cell output).
df.head()

In [None]:
# Sorting affects the model, which expects the rows of each individual (dependency_id)
# to be grouped together. The sorted frame is reassigned rather than sorted in place:
# `df` comes from a boolean filter, and in-place mutation of such a frame can trigger
# pandas' SettingWithCopyWarning.
df = df.sort_values(by=['dependency_id', 'interval_start_days', 'interval_end_days'])
print(df)
print(df.requirement_type.unique())

## Visualizing the Dependency Requirement Types

In [None]:
# Show the dtype of every column in the current frame.
df.dtypes

# Time Varying Cox Proportional Hazards Model

In [None]:
# Re-read the CSV so the modelling section starts from the file contents again.
relations_csv = '../../data/relationships/relations_minified_versioning.csv'
df = pd.read_csv(relations_csv)

## Data Preprocessing
# Map the 't'/'f' flag strings to booleans (any other value becomes NaN).
flag_map = {'t': True, 'f': False}
for flag_col in ('is_out_of_date', 'is_exposed'):
    df[flag_col] = df[flag_col].map(flag_map)

# Coerce each interval bound to a number and drop the rows where that fails
# (the original note mentions a \N NA marker in the data). The .copy() after each
# filter keeps the following column assignment from writing into a derived frame,
# which is what triggers pandas' SettingWithCopyWarning.
for day_col in ('interval_start_days', 'interval_end_days'):
    df[day_col] = pd.to_numeric(df[day_col], errors='coerce')
    df = df.dropna(subset=[day_col]).copy()

## Must remove dependencies whose start and end dates are the same
df = df[df['interval_start_days'] != df['interval_end_days']].copy()


df.requirement_type.unique()

In [None]:
# Order the intervals by dependency and start day so that "previous row" within a
# dependency refers to the interval that starts earlier.
df_sorted = df.sort_values(by=['dependency_id', 'interval_start_days'])

# For every row, record the requirement type of the preceding row of the same
# dependency (NaN for the first row of each dependency).
df_sorted['prev_requirement_type'] = (
    df_sorted
    .groupby('dependency_id')['requirement_type']
    .shift(1)
)

# Keep the rows where a floating-major requirement was followed by a pinning one.
was_floating_major = df_sorted['prev_requirement_type'].eq('floating-major')
is_now_pinning = df_sorted['requirement_type'].eq('pinning')
transitions = df_sorted.loc[was_floating_major & is_now_pinning]

# A dependency counts once, no matter how many times it made the transition.
num_deps_transitioned = transitions['dependency_id'].nunique()
print(f"Number of dependencies that transitioned from floating-major to pinning: {num_deps_transitioned}")

In [None]:
# One-hot encode requirement_type with 'pinning' as the reference level.
# The order of this list matters: get_dummies(drop_first=True) drops the FIRST
# category, so 'pinning' gets no dummy column and acts as the baseline.
requirement_levels = [
    'pinning',
    'floating-major',
    'floating-minor',
    'floating-patch',
    'fixed-ranging',
    'complex-expression',
    'at-most',
    'or-expression',
    'not-expression',
]

# Values missing from `requirement_levels` become NaN in the Categorical and are then
# encoded as all-zero dummies, i.e. they would be silently merged into the 'pinning'
# baseline. Surface them instead of letting that happen unnoticed.
unlisted_types = set(df['requirement_type'].dropna().unique()) - set(requirement_levels)
if unlisted_types:
    print(
        "WARNING: requirement types missing from the category list "
        f"(they will be encoded like the baseline): {sorted(map(str, unlisted_types))}"
    )

# The levels are nominal, so no ordering is declared; only the category order is used.
df['requirement_type'] = pd.Categorical(df['requirement_type'], categories=requirement_levels)
df = pd.get_dummies(df, columns=['requirement_type'], drop_first=True)

# Sorting affects the model, which expects the rows of each individual (dependency_id)
# to be grouped together. Reassigned rather than sorted in place.
df = df.sort_values(by=['dependency_id', 'interval_start_days', 'interval_end_days'])

df.head()


In [None]:
# List every column label of the encoded frame, one per line, under a header.
print("Columns in the DataFrame:")
column_labels = df.columns.tolist()
for label in column_labels:
    print(label)

In [None]:
# Show the dtype of every column after the requirement_type encoding.
df.dtypes

## Testing for complete separation

In [None]:
# Requirement-type levels to screen against the outcome. 'pinning' is not listed:
# it is the level dropped by get_dummies(drop_first=True), so it has no dummy column.
screened_levels = [
    'floating-minor',
    'floating-patch',
    'fixed-ranging',
    'floating-major',
    'complex-expression',
    'at-most',
    'or-expression',
    'not-expression',
]
covariates = [f"requirement_type_{level}" for level in screened_levels]

# One contingency table per dummy column: dummy value (rows) vs is_out_of_date (columns).
for dummy_col in covariates:
    print(f"\n=== Crosstab for: {dummy_col} ===")
    print(pd.crosstab(df[dummy_col], df['is_out_of_date']))

## Model 1: is_out_of_date

In [None]:
# Model 1 input: subject id, interval bounds, the is_out_of_date event flag and
# every requirement-type dummy ('pinning' is the reference level and has no column).
cols_to_keep = [
    'dependency_id', 'interval_start_days', 'interval_end_days', 'is_out_of_date',
    'requirement_type_floating-major',
    'requirement_type_floating-minor',
    'requirement_type_floating-patch',
    'requirement_type_fixed-ranging',
    'requirement_type_complex-expression',
    'requirement_type_at-most',
    'requirement_type_or-expression',
    'requirement_type_not-expression',
]

# NOTE: the variable name is kept as-is for compatibility with other cells, even though
# this frame feeds the is_out_of_date model (see event_col below).
df_model_vul = df[cols_to_keep]

# Fit the time-varying Cox model: one subject per dependency_id, each row being an
# interval (interval_start_days, interval_end_days] with its event indicator.
ctv = CoxTimeVaryingFitter()
ctv.fit(df_model_vul, id_col="dependency_id", event_col="is_out_of_date",
        start_col="interval_start_days", stop_col="interval_end_days",
        show_progress=True)
ctv.print_summary()

# Plot the fitted coefficients. The title names the outcome actually modelled here
# (is_out_of_date); it previously read "Vulnerable Dependencies".
ax = ctv.plot()
ax.set_title("Cox Time-Varying Coefficients: Outdated Dependencies", fontsize=14)
plt.show()

# **Model 2: is_exposed**

## Testing for separation

In [None]:
# Dummy columns to screen against the is_exposed outcome. The baseline level
# (requirement_type_pinning) has no dummy column and is therefore not listed.
dummy_prefix = 'requirement_type_'
covariates = [
    dummy_prefix + level
    for level in (
        'floating-minor',
        'floating-patch',
        'fixed-ranging',
        'floating-major',
        'complex-expression',
        'at-most',
        'or-expression',
        'not-expression',
    )
]

# Print one contingency table per dummy column: dummy value (rows) vs is_exposed (columns).
for dummy_col in covariates:
    header = f"\n=== Crosstab for: {dummy_col} ==="
    table = pd.crosstab(df[dummy_col], df['is_exposed'])
    print(header)
    print(table)

## Testing at-most as a single predictor for is_exposed - causing model separation

In [None]:
# Columns shared by every exposure frame: subject id, interval bounds, event flag.
id_and_event_cols = ['dependency_id', 'interval_start_days', 'interval_end_days', 'is_exposed']

# Exposure frame with the retained dummies; the floating-major, or-expression and
# not-expression dummies are left out of this selection.
cols_to_keep = id_and_event_cols + [
    'requirement_type_floating-minor',
    'requirement_type_floating-patch',
    'requirement_type_fixed-ranging',
    'requirement_type_complex-expression',
    'requirement_type_at-most',
]
df_model_vuln = df[cols_to_keep]

# Narrow down to a single covariate: the at-most dummy.
test_cols = id_and_event_cols + ['requirement_type_at-most']
df_test = df_model_vuln.loc[:, test_cols]

fit_options = dict(
    id_col="dependency_id",
    event_col="is_exposed",
    start_col="interval_start_days",
    stop_col="interval_end_days",
    show_progress=True,
)

# The failure itself is the result of interest here, so any exception raised while
# fitting or summarising is reported instead of stopping the notebook.
ctv_test = CoxTimeVaryingFitter()
try:
    ctv_test.fit(df_test, **fit_options)
    print("At-most works alone")
    ctv_test.print_summary()
except Exception as fit_error:
    print(f"At-most fails alone: {fit_error}")

## Showing potential quasi-separation graphically

In [None]:
# Exposure frame: subject id, interval bounds, event flag and the retained dummies
# (the floating-major, or-expression and not-expression dummies are left out).
cols_to_keep = [
    'dependency_id', 'interval_start_days', 'interval_end_days', 'is_exposed',
    'requirement_type_floating-minor',
    'requirement_type_floating-patch',
    'requirement_type_fixed-ranging',
    'requirement_type_complex-expression',
    'requirement_type_at-most',
]
df_model_vuln = df[cols_to_keep]

# Check whether at-most subjects cluster at specific time points.
# Event rows are those flagged is_exposed; at-most events additionally have the
# at-most dummy set.
is_event = df_model_vuln['is_exposed'] == 1
is_at_most = df_model_vuln['requirement_type_at-most'] == 1
all_events = df_model_vuln[is_event]
at_most_events = df_model_vuln[is_at_most & is_event]

print("At-most event timing distribution:")
print(at_most_events['interval_end_days'].describe())

# Two stacked histograms of the interval end day: all events vs at-most events.
# plt comes from the notebook's import cell, so no import is repeated here.
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(10, 8))

# Plot all events
ax1.hist(all_events['interval_end_days'], bins=50, alpha=0.7, label='All events')
ax1.set_title('All Events Over Time')
ax1.set_xlabel('Days')
ax1.set_ylabel('Number of events')
ax1.legend()  # the label above is only shown once a legend is drawn

# Plot at-most events
ax2.hist(at_most_events['interval_end_days'], bins=20, alpha=0.7, color='red',
         label='At-most events')
ax2.set_title('At-Most Events Over Time')
ax2.set_xlabel('Days')
ax2.set_ylabel('Number of events')
ax2.legend()

plt.tight_layout()
plt.show()

# Check for exact time clustering: the ten most frequent event end days.
print("\nMost common event times for at-most:")
print(at_most_events['interval_end_days'].value_counts().head(10))

In [None]:
# Model 2 input: subject id, interval bounds, the is_exposed event flag and the
# retained requirement-type dummies. The floating-major, at-most, or-expression and
# not-expression dummies are left out. Only columns are selected, so rows of those
# types stay in the data and are indistinguishable from the reference level here.
id_and_event_cols = ['dependency_id', 'interval_start_days', 'interval_end_days', 'is_exposed']
retained_dummies = [
    'requirement_type_floating-minor',
    'requirement_type_floating-patch',
    'requirement_type_fixed-ranging',
    'requirement_type_complex-expression',
]
cols_to_keep = id_and_event_cols + retained_dummies

# NOTE: despite the "outdated" in its name, this frame is fitted with
# event_col="is_exposed" below.
df_model_outdated = df.loc[:, cols_to_keep]

fit_options = dict(
    id_col="dependency_id",
    event_col="is_exposed",
    start_col="interval_start_days",
    stop_col="interval_end_days",
    show_progress=True,
)

ctv = CoxTimeVaryingFitter()
ctv.fit(df_model_outdated, **fit_options)
ctv.print_summary()

# Coefficient plot with its title
ax = ctv.plot()
ax.set_title("Cox Time-Varying Coefficients: Vulnerable Dependencies", fontsize=14)
plt.show()

## Dependency outdated and vulnerability rates

In [None]:
def _percent_of(part, whole):
    """Return `part` as a percentage of `whole`, or NaN when `whole` is 0.

    Guards the rate computation against ZeroDivisionError when the frame contains
    no dependencies (e.g. after every row has been filtered out).
    """
    if not whole:
        return float('nan')
    return (part / whole) * 100


# Total number of unique dependencies
total_dependencies = df['dependency_id'].nunique()

# Number of dependencies with at least one row flagged is_exposed.
# `== True` is kept on purpose: it also works if the flag column holds NaN next to
# True/False, where indexing with the raw column would fail.
exposed_dependencies = df.loc[df['is_exposed'] == True, 'dependency_id'].nunique()

# Number of dependencies with at least one row flagged is_out_of_date
outdated_dependencies = df.loc[df['is_out_of_date'] == True, 'dependency_id'].nunique()

# Percentages of all unique dependencies
exposure_rate = _percent_of(exposed_dependencies, total_dependencies)
outdated_rate = _percent_of(outdated_dependencies, total_dependencies)

# Output
print(f"Total unique dependencies: {total_dependencies}")
print(f"Dependencies ever exposed: {exposed_dependencies} ({exposure_rate:.2f}%)")
print(f"Dependencies ever outdated: {outdated_dependencies} ({outdated_rate:.2f}%)")