In [1]:
import numpy as np
import pandas as pd
from pandas import Series
from sklearn.preprocessing import OneHotEncoder
from pymsm.multi_state_competing_risks_model import PathObject, MultiStateModel


def get_categorical_columns(df, cat_cols):
    """One-hot encode the given categorical columns of ``df``.

    The first category of every column is dropped (``drop='first'``), so a
    column with k distinct values yields k-1 indicator columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns named in ``cat_cols``.
    cat_cols : list of str
        Names of the categorical columns to encode.

    Returns
    -------
    pandas.DataFrame
        Integer 0/1 indicator columns, named by
        ``encoder.get_feature_names_out(cat_cols)`` and indexed like ``df``.
    """
    # scikit-learn 1.2 renamed ``sparse`` to ``sparse_output`` and 1.4 removed
    # the old keyword; try the new spelling first and fall back for old versions.
    try:
        encoder = OneHotEncoder(drop='first', sparse_output=False)
    except TypeError:
        encoder = OneHotEncoder(drop='first', sparse=False)

    encoded = encoder.fit_transform(df[cat_cols])

    # Reuse df's index so that pd.concat(..., axis=1) with the source frame
    # aligns row-for-row even when df does not carry a default RangeIndex.
    return pd.DataFrame(
        encoded,
        columns=encoder.get_feature_names_out(cat_cols),
        index=df.index,
        dtype=int,
    )

In [14]:
# Read the transition table and keep only the rows whose 'status' equals 1,
# renumbering the remaining rows from 0.
longdata = (
    pd.read_csv('msebmt.csv', index_col=0)
    .loc[lambda frame: frame['status'] == 1]
    .reset_index(drop=True)
)
longdata.head(10)

Unnamed: 0,id,from,to,trans,Tstart,Tstop,time,status,match,proph,year,agecl
0,1,1,2,1,0.0,22.0,22.0,1,no gender mismatch,no,1995-1998,20-40
1,2,1,3,2,0.0,12.0,12.0,1,no gender mismatch,no,1995-1998,20-40
2,2,3,4,8,12.0,29.0,17.0,1,no gender mismatch,no,1995-1998,20-40
3,2,4,5,11,29.0,422.0,393.0,1,no gender mismatch,no,1995-1998,20-40
4,3,1,3,2,0.0,27.0,27.0,1,no gender mismatch,no,1995-1998,20-40
5,4,1,3,2,0.0,42.0,42.0,1,gender mismatch,no,1995-1998,20-40
6,4,3,4,8,42.0,50.0,8.0,1,gender mismatch,no,1995-1998,20-40
7,4,4,5,11,50.0,84.0,34.0,1,gender mismatch,no,1995-1998,20-40
8,5,1,2,1,0.0,22.0,22.0,1,gender mismatch,no,1995-1998,>40
9,5,2,5,6,22.0,114.0,92.0,1,gender mismatch,no,1995-1998,>40


In [3]:
# One-hot encode the categorical covariates and swap the encoded columns in
# for the raw string columns.
cat_cols = ['match', 'proph', 'year', 'agecl']
cat_df = get_categorical_columns(longdata, cat_cols)
covariate_cols = cat_df.columns  # names of the encoded indicator columns
data = pd.concat(
    [longdata.drop(columns=cat_cols), cat_df],
    axis=1,
)
data

Unnamed: 0,id,from,to,trans,Tstart,Tstop,time,status,match_no gender mismatch,proph_yes,year_1990-1994,year_1995-1998,agecl_<=20,agecl_>40
0,1,1,2,1,0.0,22.0,22.0,1,1,0,0,1,0,0
1,2,1,3,2,0.0,12.0,12.0,1,1,0,0,1,0,0
2,2,3,4,8,12.0,29.0,17.0,1,1,0,0,1,0,0
3,2,4,5,11,29.0,422.0,393.0,1,1,0,0,1,0,0
4,3,1,3,2,0.0,27.0,27.0,1,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3250,2277,1,3,2,0.0,8.0,8.0,1,1,0,1,0,1,0
3251,2277,3,4,8,8.0,18.0,10.0,1,1,0,1,0,1,0
3252,2278,1,2,1,0.0,15.0,15.0,1,1,0,0,0,1,0
3253,2279,1,2,1,0.0,18.0,18.0,1,0,0,0,1,1,0


In [40]:
# Display the encoded transition frame again (same frame as built in the encoding cell).
data

Unnamed: 0,id,from,to,trans,Tstart,Tstop,time,status,match_no gender mismatch,proph_yes,year_1990-1994,year_1995-1998,agecl_<=20,agecl_>40
0,1,1,2,1,0.0,22.0,22.0,1,1,0,0,1,0,0
1,2,1,3,2,0.0,12.0,12.0,1,1,0,0,1,0,0
2,2,3,4,8,12.0,29.0,17.0,1,1,0,0,1,0,0
3,2,4,5,11,29.0,422.0,393.0,1,1,0,0,1,0,0
4,3,1,3,2,0.0,27.0,27.0,1,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3250,2277,1,3,2,0.0,8.0,8.0,1,1,0,1,0,1,0
3251,2277,3,4,8,8.0,18.0,10.0,1,1,0,1,0,1,0
3252,2278,1,2,1,0.0,15.0,15.0,1,1,0,0,0,1,0
3253,2279,1,2,1,0.0,18.0,18.0,1,0,0,0,1,1,0


In [46]:
def default_update_covariates_function(
    covariates_entering_origin_state,
    origin_state=None,
    target_state=None,
    time_at_origin=None,
    abs_time_entry_to_target_state=None,
):
    """Return the covariates unchanged.

    Identity covariate-update callback: every argument other than
    ``covariates_entering_origin_state`` is accepted and ignored, and the
    very same covariates object is handed back to the caller.
    """
    return covariates_entering_origin_state


# State ids treated as terminal when building paths and constructing the model.
terminal_states = [5, 6]

In [47]:
# Build one PathObject per sample from the rows of `data`.
# NOTE(review): only paths whose last row ends in a terminal state are kept;
# every other sample is dropped from `dataset` — confirm this is intended.
dataset = []
final_states = []

# groupby(sort=False) visits ids in order of first appearance, like
# data.id.unique(), without re-scanning the whole frame once per id.
for sample_id, sample_df in data.groupby('id', sort=False):
    # Covariates are taken from the sample's first row.
    path = PathObject(
        covariates=sample_df.iloc[0][covariate_cols],
        sample_id=sample_id,
    )

    # Every row contributes its origin state and the value of its 'time' column.
    # int(...) works for numpy and plain Python scalars alike, unlike .astype(int).
    path.states.extend(int(state) for state in sample_df['from'])
    path.time_at_each_state.extend(sample_df['time'].tolist())

    # The destination of the sample's last row is where its path ends;
    # read it explicitly instead of relying on a leaked loop variable.
    final_state = int(sample_df['to'].iloc[-1])
    if final_state in terminal_states:
        path.states.append(final_state)
        final_states.append(final_state)
        dataset.append(path)

# Inspect the last path built by the loop. Note that this path is only part of
# `dataset` if its final state was terminal.
print(type(path))
print('\n------covariates------')
print(path.covariates)
print('\n-------states---------')
print(path.states)
print('\n--time at each state--')
print(path.time_at_each_state)
print('\n------sample id-------')
print(path.sample_id)

<class 'pymsm.multi_state_competing_risks_model.PathObject'>

------covariates------
match_no gender mismatch    0.0
proph_yes                   0.0
year_1990-1994              0.0
year_1995-1998              1.0
agecl_<=20                  1.0
agecl_>40                   0.0
Name: 3253, dtype: float64

-------states---------
[1, 2]

--time at each state--
[18.0, 12.0]

------sample id-------
2279


In [48]:
# Count how many of the kept paths end in each terminal state.
np.unique(final_states, return_counts=True)

(array([5, 6]), array([370, 533]))

In [49]:
# MultiStateModel is already imported in the notebook's first cell, so the
# duplicate mid-notebook import is not repeated here.
# Arguments, in order: the list of paths, the terminal state ids, the
# covariate-update callback and the covariate column names.
multi_state_model = MultiStateModel(
    dataset,
    terminal_states,
    default_update_covariates_function,
    covariate_cols,
)

In [50]:
# Fit the model; progress is printed per origin state and per transition.
multi_state_model.fit()

Fitting Model at State: 1
>>> Fitting Transition to State: 3, n events: 404
>>> Fitting Transition to State: 2, n events: 244
>>> Fitting Transition to State: 5, n events: 95
>>> Fitting Transition to State: 6, n events: 160
Fitting Model at State: 3
>>> Fitting Transition to State: 4, n events: 151
>>> Fitting Transition to State: 6, n events: 197
>>> Fitting Transition to State: 5, n events: 56
Fitting Model at State: 4
>>> Fitting Transition to State: 5, n events: 107
>>> Fitting Transition to State: 6, n events: 137
Fitting Model at State: 2
>>> Fitting Transition to State: 5, n events: 112
>>> Fitting Transition to State: 6, n events: 39
>>> Fitting Transition to State: 4, n events: 93


In [51]:
# Works: the multi-state model above was fitted without errors.