In [None]:
from lifelines import CoxTimeVaryingFitter
import pandas as pd
import copy

## Data Import

In [2]:
df = pd.read_csv('../../data/relationships/relations_minified_versioning.csv')


In [3]:
df.head()

Unnamed: 0,dependency_id,interval_start_days,interval_end_days,requirement_type,is_out_of_date,is_exposed
0,313,2584,2734,floating - minor,f,f
1,832528,554,601,floating - minor,f,f
2,832528,258,274,floating - minor,f,f
3,832528,833,849,floating - minor,f,f
4,3039,847,848,floating - major,f,f


In [4]:
df.columns

Index(['dependency_id', 'interval_start_days', 'interval_end_days',
       'requirement_type', 'is_out_of_date', 'is_exposed'],
      dtype='object')

In [5]:
df.dtypes

dependency_id           int64
interval_start_days     int64
interval_end_days      object
requirement_type       object
is_out_of_date         object
is_exposed             object
dtype: object

## Data Pre Processing

In [6]:
# Convert to numeric, rows with non-numeric values will become NaN
df['interval_end_days'] = pd.to_numeric(df['interval_end_days'], errors='coerce')
# Drop rows where conversion failed (NaN values)
df = df.dropna(subset=['interval_end_days'])

# Convert to numeric, rows with non-numeric values will become NaN
df['interval_start_days'] = pd.to_numeric(df['interval_start_days'], errors='coerce')
# Drop rows where conversion failed (NaN values)
df = df.dropna(subset=['interval_start_days'])

In [7]:
df['is_out_of_date'] = df['is_out_of_date'].map({'t': True, 'f': False})
df['is_exposed'] = df['is_exposed'].map({'t': True, 'f': False})

In [8]:
df.dtypes

dependency_id            int64
interval_start_days      int64
interval_end_days      float64
requirement_type        object
is_out_of_date            bool
is_exposed                bool
dtype: object

## Statistical Difference Test

In [9]:
df.dtypes

dependency_id            int64
interval_start_days      int64
interval_end_days      float64
requirement_type        object
is_out_of_date            bool
is_exposed                bool
dtype: object

In [10]:
from scipy.stats import chi2_contingency, mannwhitneyu, kruskal
import seaborn as sns
import matplotlib.pyplot as plt

# Compute duration for non-parametric tests
df['duration'] = df['interval_end_days'] - df['interval_start_days']
df = df[df['duration'] > 0]  # Filter invalid durations

# ============================
# 1. Chi-Square Test
# ============================

print("\n🔢 Chi-Square Test of Independence (Requirement Type vs Outdated):")
ct_outdated = pd.crosstab(df['requirement_type'], df['is_out_of_date'])
chi2_outdated, p_outdated, _, _ = chi2_contingency(ct_outdated)
print(ct_outdated)
print(f"Chi² = {chi2_outdated:.2f}, p = {p_outdated:.4f}")

print("\n🔢 Chi-Square Test of Independence (Requirement Type vs Vulnerability):")
ct_exposed = pd.crosstab(df['requirement_type'], df['is_exposed'])
chi2_exposed, p_exposed, _, _ = chi2_contingency(ct_exposed)
print(ct_exposed)
print(f"Chi² = {chi2_exposed:.2f}, p = {p_exposed:.4f}")


🔢 Chi-Square Test of Independence (Requirement Type vs Outdated):
is_out_of_date                   False   True 
requirement_type                              
floating - major                240694    1150
floating - major - restrictive   12764    2908
floating - minor                522566  121691
floating - patch                 55498   21417
other                           107618   17392
pinned                          128630   55134
Chi² = 77496.76, p = 0.0000

🔢 Chi-Square Test of Independence (Requirement Type vs Vulnerability):
is_exposed                       False  True 
requirement_type                             
floating - major                241833     11
floating - major - restrictive   15615     57
floating - minor                643576    681
floating - patch                 76637    278
other                           124512    498
pinned                          183358    406
Chi² = 1280.12, p = 0.0000


## One hot encoding for categorical data

In [None]:
# Check your original column
print(df['requirement_type'].unique())

# # Option 1: Clean strings if they have extra whitespace
# df['requirement_type'] = df['requirement_type'].str.strip()

# Option 2: One-hot encode the requirement_type column
df['requirement_type'] = pd.Categorical(
    df['requirement_type'],
    categories=[
        'floating - major', # This is the baseline category for one-hot encoding
        'floating - major - restrictive',
        'floating - minor',
        'floating - patch',
        'pinned',
        'other'
    ],
    ordered=True
)
df = pd.get_dummies(df, columns=['requirement_type'], drop_first=True)

['floating - minor' 'floating - major' 'pinned' 'other' 'floating - patch'
 'floating - major - restrictive']


In [12]:
## Problem: deaths at time zero not allowed
# Cox models require that the start time is less than the end time.
# Solution: Add a small value to the end time
# to avoid zero duration intervals.

# Avoid 0-duration intervals where event == 1
mask = (df['interval_start_days'] == df['interval_end_days']) & (df['interval_end_days'] == 0) & (df['is_out_of_date'])
df.loc[mask, 'interval_end_days'] = 0.5  # or 1.0 if using days

# Optional: ensure all stop > start
df = df[df['interval_end_days'] > df['interval_start_days']]

In [13]:
# Sorting has effect on the model since the model expects the data for each individual together
df.sort_values(by=['dependency_id', 'interval_start_days', 'interval_end_days'], inplace=True)

## Survival Analysis

In [14]:
df_copy = pd.DataFrame(copy.deepcopy(df.values), columns=df.columns)
ctv = CoxTimeVaryingFitter()
ctv.fit(df_copy, id_col='dependency_id', start_col='interval_start_days', stop_col='interval_end_days', event_col='is_out_of_date')
ctv.print_summary()


>>> events = df['is_out_of_date'].astype(bool)
>>> print(df.loc[events, 'is_exposed'].var())
>>> print(df.loc[~events, 'is_exposed'].var())

A very low variance means that the column is_exposed completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.



0,1
model,lifelines.CoxTimeVaryingFitter
event col,'is_out_of_date'
number of subjects,17504
number of periods,1287462
number of events,219692
partial log-likelihood,-1765961.78
time fit was run,2025-04-21 21:49:11 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,cmp to,z,p,-log2(p)
is_exposed,1.58,4.86,0.02,1.54,1.63,4.65,5.09,0.0,68.85,<0.005,inf
duration,-0.06,0.94,0.0,-0.06,-0.06,0.94,0.94,0.0,-321.79,<0.005,inf
requirement_type_floating - major - restrictive,3.53,34.26,0.03,3.47,3.6,31.99,36.68,0.0,101.43,<0.005,inf
requirement_type_floating - minor,3.68,39.78,0.03,3.63,3.74,37.53,42.16,0.0,124.29,<0.005,inf
requirement_type_floating - patch,4.01,55.28,0.03,3.95,4.07,52.09,58.66,0.0,132.5,<0.005,inf
requirement_type_pinned,4.14,62.89,0.03,4.08,4.2,59.33,66.68,0.0,138.93,<0.005,inf
requirement_type_other,3.4,29.83,0.03,3.34,3.46,28.1,31.66,0.0,111.44,<0.005,inf

0,1
Partial AIC,3531937.57
log-likelihood ratio test,410630.37 on 7 df
-log2(p) of ll-ratio test,inf


In [15]:
df_copy = pd.DataFrame(copy.deepcopy(df.values), columns=df.columns)
ctv = CoxTimeVaryingFitter()
ctv.fit(df_copy, id_col='dependency_id', start_col='interval_start_days', stop_col='interval_end_days', event_col='is_exposed')
ctv.print_summary()


>>> events = df['is_exposed'].astype(bool)
>>> print(df.loc[events, 'is_out_of_date'].var())
>>> print(df.loc[~events, 'is_out_of_date'].var())

A very low variance means that the column is_out_of_date completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.



0,1
model,lifelines.CoxTimeVaryingFitter
event col,'is_exposed'
number of subjects,17504
number of periods,1287462
number of events,1931
partial log-likelihood,-12298.95
time fit was run,2025-04-21 22:54:48 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,cmp to,z,p,-log2(p)
is_out_of_date,28.44,2250000000000.0,9089.51,-17786.67,17843.55,0.0,inf,0.0,0.0,1.00,0.0
duration,-0.04,0.96,0.0,-0.05,-0.04,0.95,0.96,0.0,-26.87,<0.005,526.04
requirement_type_floating - major - restrictive,0.78,2.18,0.33,0.13,1.43,1.14,4.16,0.0,2.36,0.02,5.78
requirement_type_floating - minor,-0.38,0.68,0.3,-0.98,0.21,0.38,1.24,0.0,-1.26,0.21,2.27
requirement_type_floating - patch,0.35,1.42,0.31,-0.26,0.95,0.77,2.59,0.0,1.13,0.26,1.95
requirement_type_pinned,-0.04,0.96,0.31,-0.64,0.56,0.53,1.74,0.0,-0.14,0.89,0.17
requirement_type_other,1.33,3.79,0.31,0.74,1.93,2.09,6.9,0.0,4.37,<0.005,16.29

0,1
Partial AIC,24611.90
log-likelihood ratio test,10030.91 on 7 df
-log2(p) of ll-ratio test,inf
