In [None]:
# Code for data analysis of article: 
# Teclistamab for Relapsed/Refractory Multiple Myeloma, real-World experience

# Import pandas to utilize dataframes and to read the xlsx files
import pandas as pd

# Import numpy to utilize mathematical functions
import numpy as np

# Import stats for statistical tests such as the Kruskal-Wallis test and Fisher's exact test.
import scipy.stats as stats

# Import matplotlib for plotting
import matplotlib.pyplot as plt
        
# Import datetime for working with dates and times
import datetime as datetime

#To calculate Progression free survival
from lifelines import KaplanMeierFitter

# Import os to utilize the built in functionality like the current working directory
import os

# Find out the current working directory
#print(os.getcwd())


In [None]:
class LoadData():
    """Read the study spreadsheet and keep the columns/rows used for analysis.

    Attributes
    ----------
    data : pandas.DataFrame or None
        Data of interest (set by ``loadfile``).
    col_response : int or None
        Column number containing responses.
        Detail of responses:
            1=none or minimal response
            2=PR, 3=VGPR, 4 = nCR+CR
    data_description : pandas.DataFrame or None
        Output of ``DataFrame.describe()`` for ``data``.
    """

    def __init__(self):
        # Class constructor: every attribute stays None until loadfile() runs.
        self.data = None              # data of interest
        self.col_response = None      # column number containing responses
        self.data_description = None  # description of the data

    def loadfile(self, file_path, intr_columns, col_resp, sheet_name=2, n_rows=None):
        """Read the data from an xlsx file.

        Parameters
        ----------
         file_path: str
            A valid file path and file name containing the data

         intr_columns: list
            A list of column names to keep. Names that are missing from the
            sheet are kept as all-NaN columns (pandas reindexing behaviour).

         col_resp: int
            Column number containing responses
            Detail of responses:
            1=none or minimal response
            2=PR, 3=VGPR, 4 = nCR+CR

         sheet_name: int or str, optional
            Sheet passed to ``pandas.read_excel``. Defaults to 2, the value
            that used to be hard-coded.

         n_rows: int, optional
            Number of leading rows to keep. When omitted, the legacy rule is
            applied: ``len(intr_columns) - 2`` rows are kept.

        Returns/Update
        --------------
            It updates the attributes,
                            self.data
                            self.col_response
                            self.data_description
        """
        self.col_response = col_resp

        # Read raw data from the xlsx file
        raw_data = pd.read_excel(file_path, sheet_name=sheet_name)

        # Extract columns relevant for analysis
        selected = pd.DataFrame(raw_data, columns=intr_columns)

        if n_rows is None:
            # Legacy rule. The previous expression
            #     selected.loc[1].notnull().count() - 2
            # counted the entries of a boolean mask, which is always the
            # number of selected columns, so the result never depended on
            # which cells were null. Computing it from the column count gives
            # the same number and no longer fails on sheets with fewer than
            # two rows.
            # NOTE(review): tying the row count to the column count looks
            # unintended -- pass n_rows explicitly once the real number of
            # patient rows is confirmed.
            n_rows = len(selected.columns) - 2

        self.data = selected.iloc[0:n_rows, :]

        # Descriptive statistics of data
        self.data_description = self.data.describe()



In [None]:
class ExtractData(LoadData): # Class ExtractData inherits the class LoadData
    """Split the loaded data by response group for univariate tests.

    Rows are grouped on the response column (``self.col_response``):
    response == 1 versus response > 1. Columns are addressed by position
    (``iloc``) throughout.
    """

    def __init__(self):
        LoadData.__init__(self)

    def _split_by_response(self):
        """Return (rows with response == 1, rows with response > 1)."""
        response = self.data.iloc[:, self.col_response]
        return self.data[response == 1], self.data[response > 1]

    def _count_records(self, selection, col_intr, col_val, empty_msg, found_msg):
        """Count non-null ``col_intr`` values among the selected rows and print a summary line."""
        subset = self.data[selection]
        if subset.empty:
            print(empty_msg.format(col_val))
            return 0
        n_records = subset.iloc[:, col_intr].count()
        print(found_msg.format(col_val, n_records))
        return n_records

    def extract_continuous_data(self, col_intr, cat_low=None, cat_high=None):
        """Extract a continuous variable per response group, e.g. for the
        Kruskal-Wallis test or a t-test.

        The optional arguments select one of three modes:
            1- cat_low and cat_high given: keep values with
               cat_low <= value < cat_high
            2- only cat_high given: keep values with value > cat_high
            3- neither given: keep all values and print descriptive
               statistics for both response groups

        NOTE(review): a value exactly equal to cat_high is kept by neither
        mode 1 nor mode 2 -- confirm this boundary is intended.

        Parameters
        ----------
         col_intr: int
            A column number that is under analysis
         cat_low: int
            Lower bound (inclusive) of the subcategory. It has no effect
            unless cat_high is given as well; a warning is issued then.
         cat_high: int
            Upper bound (exclusive) in mode 1, strict lower bound in mode 2.

        Returns/Update
        --------------
            It returns,
                       cnt_data: tuple of two pandas Series,
                       (values where response == 1, values where response > 1)
        """
        import warnings

        # Rows where response == 1 / rows where response > 1
        my_data_1, my_data_more_1 = self._split_by_response()

        if cat_low is not None and cat_high is not None:
            def keep(frame):
                values = frame.iloc[:, col_intr]
                return frame[(values >= cat_low) & (values < cat_high)]
        elif cat_high is not None:
            def keep(frame):
                return frame[frame.iloc[:, col_intr] > cat_high]
        else:
            if cat_low is not None:
                # Previously this combination was ignored without notice.
                warnings.warn('cat_low is ignored when cat_high is not given; '
                              'returning unfiltered data')

            # Descriptive statistics of column col_intr when response = 1
            # (none or minimal response)
            print('Descriptive statistics for column no = {} where response = {}, are:\n'.format(col_intr, '1') ,
                  pd.DataFrame(my_data_1.iloc[:,col_intr]).round(1).describe())

            # Descriptive statistics of column col_intr when response > 1
            print('Descriptive statistics for column no = {} where response = {}, are:\n'.format(col_intr, '>1') ,
                  pd.DataFrame(my_data_more_1.iloc[:,col_intr]).round(1).describe())

            # Return continuous data based on drug response categories
            return (my_data_1.iloc[:, col_intr], my_data_more_1.iloc[:, col_intr])

        # "high" refers to the response > 1 group, "low" to the response == 1 group.
        high_vals = keep(my_data_more_1)
        print('Number of high category values are={} for column no ={}'.format(high_vals.iloc[:,col_intr].count()
                                                                                , col_intr))
        low_vals = keep(my_data_1)
        print('Number of low category values are={} for column no ={}'.format(low_vals.iloc[:,col_intr].count()
                                                                                , col_intr))

        # Return continuous data based on drug response categories
        cnt_data = (low_vals.iloc[:, col_intr], high_vals.iloc[:, col_intr])
        return cnt_data

    def make_contigency_table(self, col_intr, col_val ):
        """Build a 2x2 contingency table for the Fisher exact test on a
        categorical variable.

        Parameters
        ----------
         col_intr: int
            A column number that is under analysis
         col_val: int
            Data coding in the column col_intr that defines the first row of
            the table, for example 0=Male; every other coding goes to the
            second row.

        Returns/Update
        --------------
            It returns,
                       df_stat: pandas DataFrame with columns
                       'lPR' (response == 1) and 'gPR' (response > 1);
                       row 0 counts code == col_val, row 1 counts
                       code != col_val. Counts use Series.count(), so
                       missing values in col_intr are not counted.
        """
        response = self.data.iloc[:, self.col_response]
        codes = self.data.iloc[:, col_intr]

        # Response = 1, code == col_val
        lPR_male = self._count_records(
            (response == 1) & (codes == col_val), col_intr, col_val,
            "No data found for Response = 1 and code = {}",
            "Number of records found for code = {} are = {}:")

        # Response = 1, code != col_val
        lPR_female = self._count_records(
            (response == 1) & (codes != col_val), col_intr, col_val,
            "No data found for Response = 1 and code != {}",
            "Number of records found for code != {} are = {}:")

        # Response > 1 (that means response was >=PR), code == col_val
        gPR_male = self._count_records(
            (response > 1) & (codes == col_val), col_intr, col_val,
            "No data found for Response > 1 and code = {}",
            "Number of records found for code = {} are {}:")

        # Response > 1, code != col_val. The "no data" message used to say
        # "Response = 1" here, which mislabelled the empty cell.
        gPR_female = self._count_records(
            (response > 1) & (codes != col_val), col_intr, col_val,
            "No data found for Response > 1 and code != {}",
            "Number of records found for code != {} are = {}:")

        # Return contingency table
        df_stat = pd.DataFrame({'lPR':[ lPR_male, lPR_female ],
                                'gPR':[ gPR_male, gPR_female ]})
        return df_stat


In [None]:
class ProgressionFreeSurvival(LoadData): # Class ProgressionFreeSurvival inherits the class LoadData
    """Kaplan-Meier estimate of progression free survival for the loaded data."""

    # One month as a fixed duration: the mean Gregorian month
    # (365.2425 days / 12 = 30.436875 days). A month has no fixed length, so
    # the unit np.timedelta64(1, 'M') used before is ambiguous and is rejected
    # by recent pandas releases; this explicit Timedelta is the same length
    # numpy uses when converting a month to a fixed duration.
    _ONE_MONTH = pd.Timedelta(days=365.2425) / 12

    def __init__(self):
        LoadData.__init__(self)

    def _pfs_durations(self, study_end_date):
        """Return (durations in months, event flags) for every row of self.data."""
        # to_datetime leaves datetime columns unchanged and turns an all-empty
        # column into NaT values, so the subtraction below is always defined.
        start = pd.to_datetime(self.data['Tec start date'])
        progress = pd.to_datetime(self.data['Progress, date'])
        study_end = pd.to_datetime(study_end_date, format='%Y-%m-%d')

        # Time from start of treatment to progression, in months
        pfs_months = (progress - start) / self._ONE_MONTH

        # No computable time to progression: censor at the study end date.
        # Done column-wise so it does not depend on the row index labels.
        censored = pfs_months.isna()
        pfs_months = pfs_months.where(~censored, (study_end - start) / self._ONE_MONTH)

        # 1 = progression observed, 0 = censored
        events = (~censored).astype(int)
        return pfs_months, events

    def PFS(self, study_end_date):
        """Method to calculate the progression free survival

        Parameters
        ----------
         study_end_date: a date in the form of str '2022-06-30'
            Study end date, used as the censoring date for rows without a
            progression date.

        Returns/Update
        --------------
            It prints,
                       1- Progression free survival graph
                       2- The median survival time
                       3- Progression free survival timeline
        """
        pfs_months, events = self._pfs_durations(study_end_date)

        ## create a kmf object
        kmf = KaplanMeierFitter()
        ## Fit the data into the model
        kmf.fit(pfs_months, events,label='Progression free survival')
        ## Plot the estimate with its confidence interval and at-risk counts
        pfs_plt=kmf.plot(ci_show=True, color=(0.3, 0.5, 0.6, 0.85), at_risk_counts=True)
        pfs_plt.set_xlabel("Months", fontdict = {'fontsize' : 12.5})
        # NOTE(review): "possibility" is presumably meant to read "probability".
        pfs_plt.set_ylabel("Survival possibility", fontdict = {'fontsize' : 12.5})
        plt.show()
        #plt.savefig("Your_Path/test_today.jpg")

        #Print the median survival
        print("The median survival time:", kmf.median_survival_time_)
        #Print the Progression free survival timeline
        print("Progression free survival timeline:", kmf.survival_function_)



In [None]:
#MAIN of the code, #if __name__ == '__main__'
#For analysis of continuous and categorical variables

d_ext = ExtractData() # Create object of the class ExtractData that inherit LoadData

# The workbook location can be overridden with the TEC_DATA_PATH environment
# variable; without it the original local path is used.
fl_path = os.environ.get(
    'TEC_DATA_PATH',
    '/Users/muhkas/Desktop/HN/Katarina_Utt/Teclistamab/Tec_ytterligare_info2.xlsx')

# NOTE(review): 'Last follow up' is listed twice. LoadData.loadfile derives the
# number of rows it keeps from the number of columns, so removing the duplicate
# would change the analysed rows -- check before cleaning this list up.
intr_columns=['Age when starting Tec',
              'Best response, minimal+non=1, PR=2, VGPR=3, nCR+CR=4',
              'Sex, m=0', 'ISS (at diagnose)','t(4;14), yes=1',
              't(14;16), yes=1', 'del17p, yes=1', 'High Risk','ECOG',
              'eGFR', 'Date of diagnosis', 'Last follow up', 'Studynr',
              'Days in hospital', 'Progress, date', 'Tec start date',
              'First response, date', 'Best response, date','Last follow up'
             ]

# Column positions are looked up by name instead of being typed as bare numbers.
response_col = intr_columns.index('Best response, minimal+non=1, PR=2, VGPR=3, nCR+CR=4')  # 1
egfr_col = intr_columns.index('eGFR')      # 9
sex_col = intr_columns.index('Sex, m=0')   # 2

#Load data
d_ext.loadfile(fl_path, intr_columns, response_col)
print('Descriptive statistics of the data:\n', d_ext.data_description)

#Univariate analysis of continuous variable
ret_data=d_ext.extract_continuous_data(egfr_col, cat_low=30, cat_high=60)
print("\nStatistics for continuous variable:\n", stats.kruskal(ret_data[0], ret_data[1]))

#Analysis for categorical variable
ret_data=d_ext.make_contigency_table(col_intr = sex_col, col_val = 0)
# The label used to read "continuous variable" for this categorical test.
print("\nStatistics for categorical variable:\n",stats.fisher_exact(table  = ret_data.to_numpy(),
                         alternative = 'two-sided')) # Since number of sample is small,
                                                     # therefore, only fisher exact test is used.
                                                     # Chi-square test was not used.




In [None]:
#MAIN of the code, #if __name__ == '__main__'
# Progression free survival analysis: load the same workbook and columns
# as the univariate analysis, then fit and plot the Kaplan-Meier curve.
pfs = ProgressionFreeSurvival()
pfs.loadfile(file_path=fl_path, intr_columns=intr_columns, col_resp=1)
pfs.PFS(study_end_date='2022-06-30')  # end date of the study, used for censoring


### 