# Feature Processor

In this notebook, the feature engineering notebook is sorted and simplified for better readability and to do the short term CPET analysis

In [7]:
import matplotlib.pyplot as plt
import matplotlib.animation as anim
import matplotlib.animation
import numpy as np
import pandas as pd
import seaborn as sns
import math
from IPython.display import HTML
from scipy import stats
from scipy.signal import butter, lfilter
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc

## Theoretical peak functions

In this section we will run the functions that generates the expected values for the patients

In [8]:
def get_vo2_peak(row):
    sex = 0
    bmi = 0
    age = row.age
    if row.sex == 'F':
        sex=2
    else:
        sex=1
    if row.BMI <= 25:
        bmi = 0
    else:
        bmi = 1
    if age < 34:
        age = 1
    elif 34 <= age < 44:
        age = 2
    elif 44 <= age < 54:
        age = 3
    elif 54 <= age < 64:
        age = 4
    elif age >=64:
        age = 5
    peak_est = 47.7565 -0.9880*age-0.2356*age**2-8.8697*sex+2.3597*bmi-2.0308*age*bmi-3.7405*sex*bmi+0.2512*age*sex+1.3797*age*sex*bmi
    peak_est = peak_est * 10**-3 * row['weight-kg']
    return peak_est

In [9]:
def get_o2_peak(row):
    sex = 0
    bmi = 0
    age = row.age
    if row.sex == 'F':
        sex=2
    else:
        sex=1
    if row.BMI <= 25:
        bmi = 0
    else:
        bmi = 1
    if age < 34:
        age = 1
    elif 34 <= age < 44:
        age = 2
    elif 44 <= age < 54:
        age = 3
    elif 54 <= age < 64:
        age = 4
    elif age >=64:
        age = 5
    peak_est = 22.1667 -0.8167*age+0.0167*age**2-5.8667*sex+4.8897*bmi-1.4230*age*bmi-2.4230*sex*bmi+0.3333*age*sex+0.7897*age*sex*bmi
    peak_est = peak_est * 10**-3 
    return peak_est

In [10]:
def max_hr_predicted(row):
    peak_hr = 208 - 0.7*row['age']
    return peak_hr

## Patient Info collection

The process is divided in two subparts, one for the patient information and the other for the data. The first thing to do is to collect the patient's information add their expected maximal values

In [11]:
file_info = './data/DigitalDataEnrichedUpdatedLabelsBJA.xlsx'

In [12]:
patient_info = pd.read_excel(file_info, sheet_name = 'Patient')
patient_info['BMI']=patient_info['weight-kg']/(patient_info['height-cm']/100)**2
patient_info.dropna(subset = ["sex"], inplace=True)
patient_info['MaxVO2_EST']=patient_info.apply(lambda x: get_vo2_peak(x),axis=1)
patient_info['MaxO2_EST']=patient_info.apply(lambda x: get_o2_peak(x),axis=1)
patient_info['PredictedMaxHR']=patient_info.apply(lambda x: max_hr_predicted(x),axis=1)
patient_info.head()

Unnamed: 0,patientid,sex,age,height-cm,weight-kg,BMI,TypeExercise,TypePatient,FileName,Condition,...,CardiacLim,Observations,BA-PrimaryPulmonaryLim,BA-PrimaryCardiacLim,OtherPrimaryLim,Healthy,Observation,MaxVO2_EST,MaxO2_EST,PredictedMaxHR
0,7,F,40,162.56,94.9091,35.915381,Treadmill,Unknown,CHF.1.xlsx,OK,...,1,,0,1,0,0,0,2.319521,0.010556,180.0
1,8,M,69,175.26,107.6818,35.057157,Treadmill,Unknown,CHF.2.xlsx,OK,...,1,,0,1,0,0,0,2.65721,0.013601,159.7
2,9,F,16,160.528,49.31,19.135218,Treadmill,YoungAthlete,DEAS.1.xlsx,OK,...,0,,0,0,0,1,0,1.444581,0.0103,196.8
3,10,M,17,176.784,60.5,19.3584,Treadmill,YoungAthlete,DEAS.2.xlsx,OK,...,0,,0,0,0,1,0,2.293821,0.015833,196.1
4,11,F,15,179.07,66.0,20.582507,Treadmill,YoungAthlete,DEAS.3.xlsx,OK,...,0,,0,0,0,1,0,1.933529,0.0103,197.5


## 30s sampling

The data is going to be sampled in 30s windows so can noise be removed and their features can be captured with ease. It may induce some errors but they are negligible

In [13]:
def round_to_30s(row):
    base = np.floor(row.minutes)
    med = base +0.5
    sup = np.round(row.minutes)
    if row.minutes <= med:
        return med
    else:
        return sup
    pass

Reading the CPET data

In [14]:
data = pd.read_excel(file_info, sheet_name = 'CPET')
data_filt=data[['PatientId','Time','TestLevel','HR','VO2','VO2/kg','VCO2','VE','VE/VO2','VE/VCO2','FE02','FECO2','RER','RR','METS','Source','TypeUser','Summary']]

Transforming the time in the minute scale, this is a decimal scale, thus 00:06:30 would be 6.5

In [15]:
minutes = []
for i in data_filt.Time:
    minutes.append(i.second/60+i.minute)
data['minutes']=np.array(minutes)

Merging the data

In [16]:
data= pd.merge(data, patient_info, left_on='PatientId', right_on='patientid', how='left')
data_filtered = data[['PatientId','SessionId','minutes','Time','TestLevel','HR','VO2','VCO2','VE','VE/VO2','VE/VCO2','RER','RR','MaxVO2_EST','MaxO2_EST','Condition','sex','age','BMI','BA-PrimaryCardiacLim','BA-PrimaryPulmonaryLim','OtherPrimaryLim','Healthy']]

Creating the data with the max time within the 30s frame. If th max time is 8.90 then it is 9 in the scale

In [17]:
patient_times = data_filtered.groupby(['PatientId','SessionId'])['minutes'].max().reset_index()
patient_times['max_time'] = patient_times.apply(round_to_30s, axis=1)
patient_times = patient_times.loc[patient_times.PatientId >= 7]
patient_times.head()

Unnamed: 0,PatientId,SessionId,minutes,max_time
6,7,7.0,15.75,16.0
7,8,8.0,13.7,14.0
8,9,9.0,18.616667,19.0
9,10,10.0,18.633333,19.0
10,11,11.0,13.683333,14.0


Creating the 30s windowed dataframe

In [18]:
df_info_avg = pd.DataFrame(columns = ['PatientId','SessionId','minutes'])
for i in patient_times.SessionId:
    patient_time_min = np.arange(0.5,patient_times.loc[(patient_times.PatientId==np.floor(i)) & 
                                                       (patient_times.SessionId==i)]['max_time'].values[0],.5)
    patient_ids=np.ones(patient_time_min.shape)*np.floor(i)
    patient_session_ids = np.ones(patient_time_min.shape)*i
    tempdf = pd.DataFrame({'PatientId': patient_ids, 'SessionId':patient_session_ids, 'minutes': patient_time_min })
    df_info_avg=df_info_avg.append(tempdf, ignore_index=True)
    pass
df_info_avg.head()

Unnamed: 0,PatientId,SessionId,minutes
0,7.0,7.0,0.5
1,7.0,7.0,1.0
2,7.0,7.0,1.5
3,7.0,7.0,2.0
4,7.0,7.0,2.5


Generating the 30s windowed dataset. To do it, we relly in the following functions

In [19]:
# Round the values within the 30s window
def round_to_mean(column, row, df):
    return np.mean(df.loc[(df.PatientId == row.PatientId) &
                   (df.SessionId == row.SessionId) &
                   (df.minutes > (row.minutes-0.5)) &
                   (df.minutes <= (row.minutes))][column].values)
# Function that does the left join without shenaningans
def place_label_fake_join(column, row, df):
    return df.loc[(df.patientid == row.PatientId)][column].values[0]
# Superficial function to detect anaerobic threshold
def has_anaerobic_threshold(row, df):
    return np.max(df.loc[(df.PatientId == row.PatientId) &
                   (df.SessionId == row.SessionId)]['RER'].values) >= 1

Filtering patients because the patients before the Id 7 have no diagnostic information

In [20]:
df_info_avg = df_info_avg.loc[df_info_avg.PatientId>= 7]

In [21]:
df_info_avg['HR']=np.round(df_info_avg.apply(lambda x: round_to_mean('HR',x,data_filtered),axis=1),0)
df_info_avg['VO2']=df_info_avg.apply(lambda x: round_to_mean('VO2',x,data_filtered),axis=1)
df_info_avg['VCO2']=df_info_avg.apply(lambda x: round_to_mean('VCO2',x,data_filtered),axis=1)
df_info_avg['VE']=df_info_avg.apply(lambda x: round_to_mean('VE',x,data_filtered),axis=1)
df_info_avg['VE/VO2']=df_info_avg.apply(lambda x: round_to_mean('VE/VO2',x,data_filtered),axis=1)
df_info_avg['VE/VCO2']=df_info_avg.apply(lambda x: round_to_mean('VE/VCO2',x,data_filtered),axis=1)
df_info_avg['RER']=df_info_avg.apply(lambda x: round_to_mean('RER',x,data_filtered),axis=1)
df_info_avg['RR']=df_info_avg.apply(lambda x: round_to_mean('RR',x,data_filtered),axis=1)
df_info_avg['HasAnaerobicThreshold']=df_info_avg.apply(lambda x: has_anaerobic_threshold(x,data_filtered),axis=1)
df_info_avg['sex']=df_info_avg.apply(lambda x: place_label_fake_join('sex',x,patient_info),axis=1)
df_info_avg['age']=df_info_avg.apply(lambda x: place_label_fake_join('age',x,patient_info),axis=1)
df_info_avg['BMI']=df_info_avg.apply(lambda x: place_label_fake_join('BMI',x,patient_info),axis=1)
df_info_avg['MaxVO2_EST']=df_info_avg.apply(lambda x: place_label_fake_join('MaxVO2_EST',x,patient_info),axis=1)
df_info_avg['MaxO2_EST']=df_info_avg.apply(lambda x: place_label_fake_join('MaxO2_EST',x,patient_info),axis=1)
df_info_avg['PredictedMaxHR']=df_info_avg.apply(lambda x: place_label_fake_join('PredictedMaxHR',x,patient_info),axis=1)
df_info_avg['CardiacLim']=df_info_avg.apply(lambda x: place_label_fake_join('BA-PrimaryCardiacLim',x,patient_info),axis=1)
df_info_avg['PulmonaryLim']=df_info_avg.apply(lambda x: place_label_fake_join('BA-PrimaryPulmonaryLim',x,patient_info),axis=1)
df_info_avg['MuscleSkeletalLim']=df_info_avg.apply(lambda x: place_label_fake_join('OtherPrimaryLim',x,patient_info),axis=1)
df_info_avg['Healthy']=df_info_avg.apply(lambda x: place_label_fake_join('Healthy',x,patient_info),axis=1)
df_info_avg['O2']=df_info_avg.VO2/df_info_avg.HR*1000

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **k

Filtering the data that contains abnormalities and no HR

In [22]:
df_info_avg = df_info_avg[df_info_avg.HR != 0]
df_info_avg = df_info_avg[df_info_avg.PatientId != 217]
df_info_avg = df_info_avg[df_info_avg.PatientId != 110]
df_info_avg = df_info_avg[df_info_avg.PatientId != 111]
df_info_avg = df_info_avg[df_info_avg.PatientId != 112]
df_info_avg = df_info_avg[df_info_avg.PatientId != 113]
df_info_avg = df_info_avg[df_info_avg.PatientId != 147]
df_info_avg = df_info_avg[df_info_avg.PatientId != 179]
df_info_avg = df_info_avg[df_info_avg.PatientId != 182]

## Feature's generation

Once the patient's expected values are collected and the 30s windowed values are generated, the next step is to generate the features. This section will contain two main functions, one for the full feature and the other for the early data detection

### Functions' section
This is the section where the functions are created for the feature generation. The final process calls different functions. In this section most of them are described and grouped

#### Basic functions

In [23]:
# Basic functions where the maximum, minimum, mean and std values from the dataset are obtained
def get_lowest_variable(column, row, df):
    return np.min(df.loc[(df.PatientId == row.PatientId) &
                   (df.SessionId == row.SessionId)][column].values)

def get_highest_variable(column, row, df):
    return np.max(df.loc[(df.PatientId == row.PatientId) &
                   (df.SessionId == row.SessionId)][column].values)

def get_mean_variable(column, row, df):
    return np.mean(df.loc[(df.PatientId == row.PatientId) &
                   (df.SessionId == row.SessionId)][column].values)

def get_std_variable(column, row, df):
    return np.std(df.loc[(df.PatientId == row.PatientId) &
                   (df.SessionId == row.SessionId)][column].values)

def get_median_variable(column, row, df):
    return np.median(df.loc[(df.PatientId == row.PatientId) &
                   (df.SessionId == row.SessionId)][column].values)

#### HR functions

In [24]:
#HR complex functions, this are the HR agains the max expected HR
def get_HR_percent(row,df):
    max_hr_real = np.max(df.loc[(df.PatientId == row.PatientId) & (df.SessionId == row.SessionId)]['HR'].values)
    max_hr_expected = row.PredictedMaxHR
    return max_hr_real/max_hr_expected
def get_HR_diff(row, df):
    max_hr_real = np.max(df.loc[(df.PatientId == row.PatientId) & (df.SessionId == row.SessionId)]['HR'].values)
    max_hr_expected = row.PredictedMaxHR
    return max_hr_real-max_hr_expected
#Deprecated
def get_peak_heart_rate(column, row, df):
    return np.max(df.loc[(df.PatientId == row.PatientId) &
                   (df.SessionId == row.SessionId)][column].values)

#### VO2 functions

In [25]:
#VO2 vs expected VO2 functions
def get_VO2_percent(row,df):
    max_vo2_real = np.max(df.loc[(df.PatientId == row.PatientId) & (df.SessionId == row.SessionId)]['VO2'].values)
    max_vo2_expected = row.MaxVO2_EST
    return max_vo2_real/max_vo2_expected

def get_MaxVO2_expected_vs_real(row, df):
    max_vo2_real = np.max(df.loc[(df.PatientId == row.PatientId) & (df.SessionId == row.SessionId)]['VO2'].values)
    max_vo2_expected = row.MaxVO2_EST
    return max_vo2_real-max_vo2_expected

#### Against VO2 slope functions

In [26]:
# Slope against VO2 functions
def get_HR_VO2_slope(row,df):
    lin_reg = LinearRegression()
    y_val = df.loc[(df.PatientId == row.PatientId) & (df.SessionId == row.SessionId)]['HR'].values
    X_val = df.loc[(df.PatientId == row.PatientId) & (df.SessionId == row.SessionId)]['VO2'].values.reshape(-1, 1)
    lin_reg.fit(X_val,y_val)
    return lin_reg.coef_[0]

def get_VE_VCO2_slope(row,df):
    lin_reg = LinearRegression()
    y_val = df.loc[(df.PatientId == row.PatientId) & (df.SessionId == row.SessionId)]['VE'].values
    X_val = df.loc[(df.PatientId == row.PatientId) & (df.SessionId == row.SessionId)]['VCO2'].values.reshape(-1, 1)
    lin_reg.fit(X_val,y_val)
    return lin_reg.coef_[0]

#### Basic O2 Pulse functions

In [27]:
def get_mean_O2_pulse(row, df):
    hr_values = df.loc[(df.PatientId == row.PatientId) &
                   (df.SessionId == row.SessionId)]['HR'].values
    zero_ind = np.where(hr_values == 0)[0]
    if len(hr_values) == len(zero_ind):
        return -1
    vo2_values =df.loc[(df.PatientId == row.PatientId) &
                   (df.SessionId == row.SessionId)]['VO2'].values
    hr_values = np.delete(hr_values, zero_ind)
    vo2_values = np.delete(vo2_values, zero_ind)
    O2_pulse = vo2_values/hr_values
    return np.mean(O2_pulse)

def get_max_O2_pulse(row, df):
    hr_values = df.loc[(df.PatientId == row.PatientId) &
                   (df.SessionId == row.SessionId)]['HR'].values
    zero_ind = np.where(hr_values == 0)[0]
    if len(hr_values) == len(zero_ind):
        return -1
    vo2_values =df.loc[(df.PatientId == row.PatientId) &
                   (df.SessionId == row.SessionId)]['VO2'].values
    hr_values = np.delete(hr_values, zero_ind)
    vo2_values = np.delete(vo2_values, zero_ind)
    O2_pulse = vo2_values/hr_values
    return np.max(O2_pulse)

def get_min_O2_pulse(row, df):
    hr_values = df.loc[(df.PatientId == row.PatientId) &
                   (df.SessionId == row.SessionId)]['HR'].values
    zero_ind = np.where(hr_values == 0)[0]
    if len(hr_values) == len(zero_ind):
        return -1
    vo2_values =df.loc[(df.PatientId == row.PatientId) &
                   (df.SessionId == row.SessionId)]['VO2'].values
    hr_values = np.delete(hr_values, zero_ind)
    vo2_values = np.delete(vo2_values, zero_ind)
    O2_pulse = vo2_values/hr_values
    return np.min(O2_pulse)

def get_std_O2_pulse(row, df):
    hr_values = df.loc[(df.PatientId == row.PatientId) &
                   (df.SessionId == row.SessionId)]['HR'].values
    zero_ind = np.where(hr_values == 0)[0]
    if len(hr_values) == len(zero_ind):
        return -1
    vo2_values =df.loc[(df.PatientId == row.PatientId) &
                   (df.SessionId == row.SessionId)]['VO2'].values
    hr_values = np.delete(hr_values, zero_ind)
    vo2_values = np.delete(vo2_values, zero_ind)
    O2_pulse = vo2_values/hr_values
    return np.std(O2_pulse)

def get_median_O2_pulse(row, df):
    hr_values = df.loc[(df.PatientId == row.PatientId) &
                   (df.SessionId == row.SessionId)]['HR'].values
    zero_ind = np.where(hr_values == 0)[0]
    if len(hr_values) == len(zero_ind):
        return -1
    vo2_values =df.loc[(df.PatientId == row.PatientId) &
                   (df.SessionId == row.SessionId)]['VO2'].values
    hr_values = np.delete(hr_values, zero_ind)
    vo2_values = np.delete(vo2_values, zero_ind)
    O2_pulse = vo2_values/hr_values
    return np.median(O2_pulse)

#### O2 benchmark functions

In [28]:
def get_O2_percent(row,df):
    max_o2_real = np.max(df.loc[(df.PatientId == row.PatientId) & (df.SessionId == row.SessionId)]['O2'].values)
    max_o2_expected = row.MaxO2_EST
    return max_o2_real/max_o2_expected

def get_O2_diff(row,df):
    max_o2_real = np.max(df.loc[(df.PatientId == row.PatientId) & (df.SessionId == row.SessionId)]['O2'].values)
    max_o2_expected = row.MaxO2_EST
    return max_o2_real-max_o2_expected

#### Slope functions

In [29]:
def get_first_quarter(row, df, column):
    time_span=df.loc[df.SessionId == row.SessionId].minutes.values.max()-df.loc[df.SessionId == row.SessionId].minutes.values.min()
    last_quarter_time = df.loc[df.SessionId == row.SessionId].minutes.values.min()+.25*time_span
    initial_quarter_time = df.loc[df.SessionId == row.SessionId].minutes.values.min()
    y_val = df.loc[(df.PatientId == row.PatientId) &
                   (df.SessionId == row.SessionId) & 
                   (df.minutes >= initial_quarter_time)& 
                   (df.minutes <= last_quarter_time)][column].values
    X_val = df.loc[(df.PatientId == row.PatientId) &
                   (df.SessionId == row.SessionId) & 
                   (df.minutes >= initial_quarter_time)& 
                   (df.minutes <= last_quarter_time)].minutes.values
    lin_reg = LinearRegression()
    lin_reg.fit(X_val.reshape(-1, 1),y_val.reshape(-1, 1))
    return lin_reg.coef_[0][0]

In [30]:
def get_second_quarter(row, df, column):
    time_span=df.loc[df.SessionId == row.SessionId].minutes.values.max()-df.loc[df.SessionId == row.SessionId].minutes.values.min()
    last_quarter_time = df.loc[df.SessionId == row.SessionId].minutes.values.min()+.5*time_span
    initial_quarter_time = df.loc[df.SessionId == row.SessionId].minutes.values.min()+.25*time_span
    y_val = df.loc[(df.PatientId == row.PatientId) &
                   (df.SessionId == row.SessionId) & 
                   (df.minutes >= initial_quarter_time)& 
                   (df.minutes <= last_quarter_time)][column].values
    X_val = df.loc[(df.PatientId == row.PatientId) &
                   (df.SessionId == row.SessionId) & 
                   (df.minutes >= initial_quarter_time)& 
                   (df.minutes <= last_quarter_time)].minutes.values
    lin_reg = LinearRegression()
    lin_reg.fit(X_val.reshape(-1, 1),y_val.reshape(-1, 1))
    return lin_reg.coef_[0][0]

In [31]:
def get_third_quarter(row, df, column):
    time_span=df.loc[df.SessionId == row.SessionId].minutes.values.max()-df.loc[df.SessionId == row.SessionId].minutes.values.min()
    last_quarter_time = df.loc[df.SessionId == row.SessionId].minutes.values.min()+.75*time_span
    initial_quarter_time = df.loc[df.SessionId == row.SessionId].minutes.values.min()+.5*time_span
    y_val = df.loc[(df.PatientId == row.PatientId) &
                   (df.SessionId == row.SessionId) & 
                   (df.minutes >= initial_quarter_time)& 
                   (df.minutes <= last_quarter_time)][column].values
    X_val = df.loc[(df.PatientId == row.PatientId) &
                   (df.SessionId == row.SessionId) & 
                   (df.minutes >= initial_quarter_time)& 
                   (df.minutes <= last_quarter_time)].minutes.values
    lin_reg = LinearRegression()
    lin_reg.fit(X_val.reshape(-1, 1),y_val.reshape(-1, 1))
    return lin_reg.coef_[0][0]

In [32]:
def get_last_quarter(row, df, column):
    time_span=df.loc[df.SessionId == row.SessionId].minutes.values.max()-df.loc[df.SessionId == row.SessionId].minutes.values.min()
    last_quarter_time = df.loc[df.SessionId == row.SessionId].minutes.values.min()+time_span
    initial_quarter_time = df.loc[df.SessionId == row.SessionId].minutes.values.min()+.75*time_span
    y_val = df.loc[(df.PatientId == row.PatientId) &
                   (df.SessionId == row.SessionId) & 
                   (df.minutes >= initial_quarter_time)& 
                   (df.minutes <= last_quarter_time)][column].values
    X_val = df.loc[(df.PatientId == row.PatientId) &
                   (df.SessionId == row.SessionId) & 
                   (df.minutes >= initial_quarter_time)& 
                   (df.minutes <= last_quarter_time)].minutes.values
    lin_reg = LinearRegression()
    lin_reg.fit(X_val.reshape(-1, 1),y_val.reshape(-1, 1))
    return lin_reg.coef_[0][0]

In [33]:
def get_first_half(row, df, column):
    time_span=df.loc[df.SessionId == row.SessionId].minutes.values.max()-df.loc[df.SessionId == row.SessionId].minutes.values.min()
    last_half_time = df.loc[df.SessionId == row.SessionId].minutes.values.min()+.5*time_span
    initial_half_time = df.loc[df.SessionId == row.SessionId].minutes.values.min()
    y_val = df.loc[(df.PatientId == row.PatientId) &
                   (df.SessionId == row.SessionId) & 
                   (df.minutes >= initial_half_time)& 
                   (df.minutes <= last_half_time)][column].values
    X_val = df.loc[(df.PatientId == row.PatientId) &
                   (df.SessionId == row.SessionId) & 
                   (df.minutes >= initial_half_time)& 
                   (df.minutes <= last_half_time)].minutes.values
    lin_reg = LinearRegression()
    lin_reg.fit(X_val.reshape(-1, 1),y_val.reshape(-1, 1))
    return lin_reg.coef_[0][0]

In [34]:
def get_second_half(row, df, column):
    time_span=df.loc[df.SessionId == row.SessionId].minutes.values.max()-df.loc[df.SessionId == row.SessionId].minutes.values.min()
    last_half_time = df.loc[df.SessionId == row.SessionId].minutes.values.min()+time_span
    initial_half_time = df.loc[df.SessionId == row.SessionId].minutes.values.min()+.5*time_span
    y_val = df.loc[(df.PatientId == row.PatientId) &
                   (df.SessionId == row.SessionId) & 
                   (df.minutes >= initial_half_time)& 
                   (df.minutes <= last_half_time)][column].values
    X_val = df.loc[(df.PatientId == row.PatientId) &
                   (df.SessionId == row.SessionId) & 
                   (df.minutes >= initial_half_time)& 
                   (df.minutes <= last_half_time)].minutes.values
    lin_reg = LinearRegression()
    lin_reg.fit(X_val.reshape(-1, 1),y_val.reshape(-1, 1))
    return lin_reg.coef_[0][0]

In [35]:
def get_slope_15_85(row, df, column):
    last_quarter_time = df.loc[df.SessionId == row.SessionId].minutes.values.max()*.85
    initial_quarter_time = df.loc[df.SessionId == row.SessionId].minutes.values.max()*.15
    y_val = df.loc[(df.PatientId == row.PatientId) &
                   (df.SessionId == row.SessionId) & 
                   (df.minutes >= initial_quarter_time)& 
                   (df.minutes <= last_quarter_time)][column].values
    X_val = df.loc[(df.PatientId == row.PatientId) &
                   (df.SessionId == row.SessionId) & 
                   (df.minutes >= initial_quarter_time)& 
                   (df.minutes <= last_quarter_time)].minutes.values
    lin_reg = LinearRegression()
    lin_reg.fit(X_val.reshape(-1, 1),y_val.reshape(-1, 1))
    return lin_reg.coef_[0][0]

#### AT functions

In [36]:
def get_vt_time(row, df):
    df_vals = df.loc[(df.PatientId == row.PatientId) &
                   (df.SessionId == row.SessionId)][['VE/VCO2','VE/VO2','minutes']]
    minmax = np.max(df_vals.minutes.values)
    df_selected = df_vals.loc[(df_vals['VE/VO2']>=df_vals['VE/VCO2']) & (df_vals.minutes>=(minmax*1/2.5))]
    if df_selected.shape[0]<=2:
        return -1
    for minute in df_selected.minutes.values:
        df_vals_times = df_vals.loc[(df_vals.minutes>=minute-0.5) & (df_vals.minutes<=minute+1)]
        if df_vals_times.shape[0]<4:
            continue
        is_minor = np.round(df_vals_times.iloc[0]['VE/VCO2'])>= np.round(df_vals_times.iloc[0]['VE/VO2'])
        is_growing_1 = df_vals_times.iloc[1]['VE/VO2']< df_vals_times.iloc[2]['VE/VO2']
        is_growing_2 = df_vals_times.iloc[1]['VE/VO2']< df_vals_times.iloc[3]['VE/VO2']
        if((is_minor & is_growing_1) | (is_minor & is_growing_2))==True:
            return minute
        pass
    return -1
    pass

In [37]:
def get_vt_index(row, df):
    df_vals = df.loc[(df.PatientId == row.PatientId) &
                   (df.SessionId == row.SessionId)][['VE/VCO2','VE/VO2','minutes']]
    minutes_list = df_vals.minutes
    minmax = np.max(df_vals.minutes.values)
    df_selected = df_vals.loc[(df_vals['VE/VO2']>=df_vals['VE/VCO2']) & (df_vals.minutes>=(minmax*1/2.5))]
    if df_selected.shape[0]<=2:
        return -1
    for minute in df_selected.minutes.values:
        df_vals_times = df_vals.loc[(df_vals.minutes>=minute-0.5) & (df_vals.minutes<=minute+1)]
        if df_vals_times.shape[0]<4:
            continue
        is_minor = np.round(df_vals_times.iloc[0]['VE/VCO2'])>= np.round(df_vals_times.iloc[0]['VE/VO2'])
        is_growing_1 = df_vals_times.iloc[1]['VE/VO2']< df_vals_times.iloc[2]['VE/VO2']
        is_growing_2 = df_vals_times.iloc[1]['VE/VO2']< df_vals_times.iloc[3]['VE/VO2']
        if((is_minor & is_growing_1) | (is_minor & is_growing_2))==True:
            index = np.where(minutes_list == minute)
            return index[0][0]
        pass
    return -1
    pass

In [38]:
def get_VO2atVT(row, df):
    rer_list = df.loc[(df.PatientId == row.PatientId) &
                   (df.SessionId == row.SessionId)]['RER'].values
    at_index = get_vt_index(row, df)
    if at_index == -1:
        return -1
    vo2_list = df.loc[(df.PatientId == row.PatientId) &
                   (df.SessionId == row.SessionId)]['VO2'].values
    return vo2_list[at_index]

In [39]:
def get_percent_time_after_VT(row, df):
    rer_list = df.loc[(df.PatientId == row.PatientId) &
                   (df.SessionId == row.SessionId)]['RER'].values
    at_index = get_vt_index(row, df)
    if at_index == -1:
        return 0
    df_times = df.loc[(df.PatientId == row.PatientId) &
                   (df.SessionId == row.SessionId)]['minutes'].values
    return df_times[at_index]/df_times[-1]

### Generating the datasets
In this section we will generate a dataset depending on our needs this has two function the one that generates everything while to other generates the data for short term

In [40]:
def generate_cpet_data_full(df_data_sampled, df_patient_info,  file_name):
    #getting sessions
    data_resume_cpet_df = pd.DataFrame({ 'SessionId':df_data_sampled.SessionId.drop_duplicates().values})
    data_resume_cpet_df['PatientId'] = np.floor(data_resume_cpet_df.SessionId)
    df_data_sampled = df_data_sampled.dropna()
    
    #Joining with personal data
    data_resume_cpet_df['sex']=data_resume_cpet_df.apply(lambda x: place_label_fake_join('sex',x,df_patient_info),axis=1)
    data_resume_cpet_df['age']=data_resume_cpet_df.apply(lambda x: place_label_fake_join('age',x,df_patient_info),axis=1)
    data_resume_cpet_df['BMI']=data_resume_cpet_df.apply(lambda x: place_label_fake_join('BMI',x,df_patient_info),axis=1)
    data_resume_cpet_df['MaxVO2_EST']=data_resume_cpet_df.apply(lambda x: place_label_fake_join('MaxVO2_EST',x,df_patient_info),axis=1)
    data_resume_cpet_df['MaxO2_EST']=data_resume_cpet_df.apply(lambda x: place_label_fake_join('MaxO2_EST',x,df_patient_info),axis=1)
    data_resume_cpet_df['PredictedMaxHR']=data_resume_cpet_df.apply(lambda x: place_label_fake_join('PredictedMaxHR',x,df_patient_info),axis=1)
    data_resume_cpet_df['CardiacLim']=data_resume_cpet_df.apply(lambda x: place_label_fake_join('BA-PrimaryCardiacLim',x,df_patient_info),axis=1)
    data_resume_cpet_df['PulmonaryLim']=data_resume_cpet_df.apply(lambda x: place_label_fake_join('BA-PrimaryPulmonaryLim',x,df_patient_info),axis=1)
    data_resume_cpet_df['MuscleSkeletalLim']=data_resume_cpet_df.apply(lambda x: place_label_fake_join('OtherPrimaryLim',x,df_patient_info),axis=1)
    data_resume_cpet_df['Healthy']=data_resume_cpet_df.apply(lambda x: place_label_fake_join('Healthy',x,df_patient_info),axis=1)
    
    data_resume_cpet_df['PeakHeartRate']=data_resume_cpet_df.apply(lambda x: get_highest_variable('HR',x,df_data_sampled),axis=1)
    data_resume_cpet_df['MeanHeartRate']=data_resume_cpet_df.apply(lambda x: get_mean_variable('HR',x,df_data_sampled),axis=1)
    data_resume_cpet_df['MinHeartRate']=data_resume_cpet_df.apply(lambda x: get_lowest_variable('HR',x,df_data_sampled),axis=1)
    data_resume_cpet_df['StdHeartRate']=data_resume_cpet_df.apply(lambda x: get_std_variable('HR',x,df_data_sampled),axis=1)
    data_resume_cpet_df['LowestVE/VCO2']=data_resume_cpet_df.apply(lambda x: get_lowest_variable('VE/VCO2',x,df_data_sampled),axis=1)
    data_resume_cpet_df['PeakVE/VCO2']=data_resume_cpet_df.apply(lambda x: get_highest_variable('VE/VCO2',x,df_data_sampled),axis=1)
    data_resume_cpet_df['MeanVE/VCO2']=data_resume_cpet_df.apply(lambda x: get_mean_variable('VE/VCO2',x,df_data_sampled),axis=1)
    data_resume_cpet_df['StdVE/VCO2']=data_resume_cpet_df.apply(lambda x: get_std_variable('VE/VCO2',x,df_data_sampled),axis=1)
    data_resume_cpet_df['PeakVO2Real']=data_resume_cpet_df.apply(lambda x: get_highest_variable('VO2',x,df_data_sampled),axis=1)
    data_resume_cpet_df['DiffPeakVO2']=data_resume_cpet_df.apply(lambda x: get_MaxVO2_expected_vs_real(x,df_data_sampled),axis=1)
    data_resume_cpet_df['DiffPeakHR']=data_resume_cpet_df.apply(lambda x: get_HR_diff(x,df_data_sampled),axis=1)
    data_resume_cpet_df['DiffPercentPeakVO2']=data_resume_cpet_df.apply(lambda x: get_VO2_percent(x,df_data_sampled),axis=1)
    data_resume_cpet_df['DiffPercentPeakHR']=data_resume_cpet_df.apply(lambda x: get_HR_percent(x,df_data_sampled),axis=1)
    data_resume_cpet_df['MeanRER']=data_resume_cpet_df.apply(lambda x: get_mean_variable('RER',x,df_data_sampled),axis=1)
    data_resume_cpet_df['PeakRER']=data_resume_cpet_df.apply(lambda x: get_highest_variable('RER',x,df_data_sampled),axis=1)
    data_resume_cpet_df['LowestRER']=data_resume_cpet_df.apply(lambda x: get_lowest_variable('RER',x,df_data_sampled),axis=1)
    data_resume_cpet_df['MeanVE']=data_resume_cpet_df.apply(lambda x: get_mean_variable('VE',x,df_data_sampled),axis=1)
    data_resume_cpet_df['PeakVE']=data_resume_cpet_df.apply(lambda x: get_highest_variable('VE',x,df_data_sampled),axis=1)
    data_resume_cpet_df['LowestVE']=data_resume_cpet_df.apply(lambda x: get_lowest_variable('VE',x,df_data_sampled),axis=1)
    data_resume_cpet_df['MeanRR']=data_resume_cpet_df.apply(lambda x: get_mean_variable('RR',x,df_data_sampled),axis=1)
    data_resume_cpet_df['PeakRR']=data_resume_cpet_df.apply(lambda x: get_highest_variable('RR',x,df_data_sampled),axis=1)
    data_resume_cpet_df['LowestRR']=data_resume_cpet_df.apply(lambda x: get_lowest_variable('RR',x,df_data_sampled),axis=1)
    data_resume_cpet_df['MeanVO2']=data_resume_cpet_df.apply(lambda x: get_mean_variable('VO2',x,df_data_sampled),axis=1)
    data_resume_cpet_df['PeakVO2']=data_resume_cpet_df.apply(lambda x: get_highest_variable('VO2',x,df_data_sampled),axis=1)
    data_resume_cpet_df['LowestVO2']=data_resume_cpet_df.apply(lambda x: get_lowest_variable('VO2',x,df_data_sampled),axis=1)
    data_resume_cpet_df['MeanVCO2']=data_resume_cpet_df.apply(lambda x: get_mean_variable('VCO2',x,df_data_sampled),axis=1)
    data_resume_cpet_df['PeakVCO2']=data_resume_cpet_df.apply(lambda x: get_highest_variable('VCO2',x,df_data_sampled),axis=1)
    data_resume_cpet_df['LowestVCO2']=data_resume_cpet_df.apply(lambda x: get_lowest_variable('VCO2',x,df_data_sampled),axis=1)
    data_resume_cpet_df['HRvsVO2Slope']=data_resume_cpet_df.apply(lambda x: get_HR_VO2_slope(x,df_data_sampled),axis=1)
    data_resume_cpet_df['VEvsVCO2Slope']=data_resume_cpet_df.apply(lambda x: get_VE_VCO2_slope(x,df_data_sampled),axis=1)
    data_resume_cpet_df['MeanO2Pulse']=data_resume_cpet_df.apply(lambda x: get_mean_O2_pulse(x,df_data_sampled),axis=1)
    data_resume_cpet_df['MaxO2Pulse']=data_resume_cpet_df.apply(lambda x: get_max_O2_pulse(x,df_data_sampled),axis=1)
    data_resume_cpet_df['MinO2Pulse']=data_resume_cpet_df.apply(lambda x: get_min_O2_pulse(x,df_data_sampled),axis=1)
    data_resume_cpet_df['StdO2Pulse']=data_resume_cpet_df.apply(lambda x: get_std_O2_pulse(x,df_data_sampled),axis=1)
    data_resume_cpet_df['O2PulseDiff']=data_resume_cpet_df.MaxO2Pulse-data_resume_cpet_df.MaxO2_EST
    data_resume_cpet_df['O2PulsePercent']=data_resume_cpet_df.MaxO2Pulse/data_resume_cpet_df.MaxO2_EST
    # Given that this is an early detection case
    # First half
    data_resume_cpet_df['0_to_25_VO2Slope']=data_resume_cpet_df.apply(lambda x: get_first_quarter(x,df_data_sampled,'VO2'),axis=1)
    data_resume_cpet_df['0_to_25_HRSlope']=data_resume_cpet_df.apply(lambda x: get_first_quarter(x,df_data_sampled,'HR'),axis=1)
    data_resume_cpet_df['0_to_25_VCO2Slope']=data_resume_cpet_df.apply(lambda x: get_first_quarter(x,df_data_sampled,'VCO2'),axis=1)
    data_resume_cpet_df['0_to_25_VESlope']=data_resume_cpet_df.apply(lambda x: get_first_quarter(x,df_data_sampled,'VE'),axis=1)
    data_resume_cpet_df['0_to_25_RERSlope']=data_resume_cpet_df.apply(lambda x: get_first_quarter(x,df_data_sampled,'RER'),axis=1)
    data_resume_cpet_df['0_to_25_RRSlope']=data_resume_cpet_df.apply(lambda x: get_first_quarter(x,df_data_sampled,'RR'),axis=1)
    data_resume_cpet_df['0_to_25_O2Slope']=data_resume_cpet_df.apply(lambda x: get_first_quarter(x,df_data_sampled,'O2'),axis=1)
    data_resume_cpet_df['0_to_25_VEVCO2Slope']=data_resume_cpet_df.apply(lambda x: get_first_quarter(x,df_data_sampled,'VE/VCO2'),axis=1)
    data_resume_cpet_df['0_to_25_VEVO2Slope']=data_resume_cpet_df.apply(lambda x: get_first_quarter(x,df_data_sampled,'VE/VO2'),axis=1)
    # Second Quarter
    data_resume_cpet_df['25_to_50_VO2Slope']=data_resume_cpet_df.apply(lambda x: get_second_quarter(x,df_data_sampled,'VO2'),axis=1)
    data_resume_cpet_df['25_to_50_HRSlope']=data_resume_cpet_df.apply(lambda x: get_second_quarter(x,df_data_sampled,'HR'),axis=1)
    data_resume_cpet_df['25_to_50_VCO2Slope']=data_resume_cpet_df.apply(lambda x: get_second_quarter(x,df_data_sampled,'VCO2'),axis=1)
    data_resume_cpet_df['25_to_50_VESlope']=data_resume_cpet_df.apply(lambda x: get_second_quarter(x,df_data_sampled,'VE'),axis=1)
    data_resume_cpet_df['25_to_50_RERSlope']=data_resume_cpet_df.apply(lambda x: get_second_quarter(x,df_data_sampled,'RER'),axis=1)
    data_resume_cpet_df['25_to_50_RRSlope']=data_resume_cpet_df.apply(lambda x: get_second_quarter(x,df_data_sampled,'RR'),axis=1)
    data_resume_cpet_df['25_to_50_O2Slope']=data_resume_cpet_df.apply(lambda x: get_second_quarter(x,df_data_sampled,'O2'),axis=1)
    data_resume_cpet_df['25_to_50_VEVCO2Slope']=data_resume_cpet_df.apply(lambda x: get_second_quarter(x,df_data_sampled,'VE/VCO2'),axis=1)
    data_resume_cpet_df['25_to_50_VEVO2Slope']=data_resume_cpet_df.apply(lambda x: get_second_quarter(x,df_data_sampled,'VE/VO2'),axis=1)
    # Third quarter
    data_resume_cpet_df['50_to_75_VO2Slope']=data_resume_cpet_df.apply(lambda x: get_third_quarter(x,df_data_sampled,'VO2'),axis=1)
    data_resume_cpet_df['50_to_75_HRSlope']=data_resume_cpet_df.apply(lambda x: get_third_quarter(x,df_data_sampled,'HR'),axis=1)
    data_resume_cpet_df['50_to_75_VCO2Slope']=data_resume_cpet_df.apply(lambda x: get_third_quarter(x,df_data_sampled,'VCO2'),axis=1)
    data_resume_cpet_df['50_to_75_VESlope']=data_resume_cpet_df.apply(lambda x: get_third_quarter(x,df_data_sampled,'VE'),axis=1)
    data_resume_cpet_df['50_to_75_RERSlope']=data_resume_cpet_df.apply(lambda x: get_third_quarter(x,df_data_sampled,'RER'),axis=1)
    data_resume_cpet_df['50_to_75_RRSlope']=data_resume_cpet_df.apply(lambda x: get_third_quarter(x,df_data_sampled,'RR'),axis=1)
    data_resume_cpet_df['50_to_75_O2Slope']=data_resume_cpet_df.apply(lambda x: get_third_quarter(x,df_data_sampled,'O2'),axis=1)
    data_resume_cpet_df['50_to_75_VEVCO2Slope']=data_resume_cpet_df.apply(lambda x: get_third_quarter(x,df_data_sampled,'VE/VCO2'),axis=1)
    data_resume_cpet_df['50_to_75_VEVO2Slope']=data_resume_cpet_df.apply(lambda x: get_third_quarter(x,df_data_sampled,'VE/VO2'),axis=1)
    # Last quarter
    data_resume_cpet_df['75_to_100_VO2Slope']=data_resume_cpet_df.apply(lambda x: get_last_quarter(x,df_data_sampled,'VO2'),axis=1)
    data_resume_cpet_df['75_to_100_HRSlope']=data_resume_cpet_df.apply(lambda x: get_last_quarter(x,df_data_sampled,'HR'),axis=1)
    data_resume_cpet_df['75_to_100_VCO2Slope']=data_resume_cpet_df.apply(lambda x: get_last_quarter(x,df_data_sampled,'VCO2'),axis=1)
    data_resume_cpet_df['75_to_100_VESlope']=data_resume_cpet_df.apply(lambda x: get_last_quarter(x,df_data_sampled,'VE'),axis=1)
    data_resume_cpet_df['75_to_100_RERSlope']=data_resume_cpet_df.apply(lambda x: get_last_quarter(x,df_data_sampled,'RER'),axis=1)
    data_resume_cpet_df['75_to_100_RRSlope']=data_resume_cpet_df.apply(lambda x: get_last_quarter(x,df_data_sampled,'RR'),axis=1)
    data_resume_cpet_df['75_to_100_O2Slope']=data_resume_cpet_df.apply(lambda x: get_last_quarter(x,df_data_sampled,'O2'),axis=1)
    data_resume_cpet_df['75_to_100_VEVCO2Slope']=data_resume_cpet_df.apply(lambda x: get_last_quarter(x,df_data_sampled,'VE/VCO2'),axis=1)
    data_resume_cpet_df['75_to_100_VEVO2Slope']=data_resume_cpet_df.apply(lambda x: get_last_quarter(x,df_data_sampled,'VE/VO2'),axis=1)
    # 15 to 85 percent
    data_resume_cpet_df['15_to_85_VO2Slope']=data_resume_cpet_df.apply(lambda x: get_slope_15_85(x,df_data_sampled,'VO2'),axis=1)
    data_resume_cpet_df['15_to_85_HRSlope']=data_resume_cpet_df.apply(lambda x: get_slope_15_85(x,df_data_sampled,'HR'),axis=1)
    data_resume_cpet_df['15_to_85_VCO2Slope']=data_resume_cpet_df.apply(lambda x: get_slope_15_85(x,df_data_sampled,'VCO2'),axis=1)
    data_resume_cpet_df['15_to_85_VESlope']=data_resume_cpet_df.apply(lambda x: get_slope_15_85(x,df_data_sampled,'VE'),axis=1)
    data_resume_cpet_df['15_to_85_RERSlope']=data_resume_cpet_df.apply(lambda x: get_slope_15_85(x,df_data_sampled,'RER'),axis=1)
    data_resume_cpet_df['15_to_85_RRSlope']=data_resume_cpet_df.apply(lambda x: get_slope_15_85(x,df_data_sampled,'RR'),axis=1)
    data_resume_cpet_df['15_to_85_O2Slope']=data_resume_cpet_df.apply(lambda x: get_slope_15_85(x,df_data_sampled,'O2'),axis=1)
    data_resume_cpet_df['15_to_85_VEVCO2Slope']=data_resume_cpet_df.apply(lambda x: get_slope_15_85(x,df_data_sampled,'VE/VCO2'),axis=1)
    data_resume_cpet_df['15_to_85_VEVO2Slope']=data_resume_cpet_df.apply(lambda x: get_slope_15_85(x,df_data_sampled,'VE/VO2'),axis=1)

    #data_resume_cpet_df['VTTime']=data_resume_cpet_df.apply(lambda x: get_vt_time(x,df_data_sampled),axis=1)
    data_resume_cpet_df['VTTime']=data_resume_cpet_df.apply(lambda x: get_vt_time(x,df_data_sampled),axis=1)
    data_resume_cpet_df['VO2atVT']=data_resume_cpet_df.apply(lambda x: get_VO2atVT(x,df_data_sampled),axis=1)
    data_resume_cpet_df['PeakVO2']=data_resume_cpet_df.apply(lambda x: get_highest_variable('VO2',x,df_data_sampled),axis=1)
    data_resume_cpet_df['PercentTimeAfterVT']=data_resume_cpet_df.apply(lambda x: get_percent_time_after_VT(x,df_data_sampled),axis=1)
    data_resume_cpet_df['VO2vsPeakVO2atVT'] = data_resume_cpet_df['VO2atVT']/data_resume_cpet_df['MaxVO2_EST']
    return data_resume_cpet_df
    pass

In [41]:
result_full = generate_cpet_data_full(df_info_avg, patient_info, 'file_name')

In [165]:
result_full.to_csv('./data/full_data.csv', index = True) 

In [42]:
def generate_cpet_data_by_time(df_data_sampled, df_patient_info,  file_name, time_limit=8):
    #getting sessions
    data_resume_cpet_df = pd.DataFrame({ 'SessionId':df_data_sampled.SessionId.drop_duplicates().values})
    data_resume_cpet_df['PatientId'] = np.floor(data_resume_cpet_df.SessionId)
    df_data_sampled = df_data_sampled.dropna()
    df_data_sampled = df_data_sampled.loc[df_data_sampled.minutes<=time_limit]
    
    #Joining with personal data
    data_resume_cpet_df['sex']=data_resume_cpet_df.apply(lambda x: place_label_fake_join('sex',x,df_patient_info),axis=1)
    data_resume_cpet_df['age']=data_resume_cpet_df.apply(lambda x: place_label_fake_join('age',x,df_patient_info),axis=1)
    data_resume_cpet_df['BMI']=data_resume_cpet_df.apply(lambda x: place_label_fake_join('BMI',x,df_patient_info),axis=1)
    data_resume_cpet_df['MaxVO2_EST']=data_resume_cpet_df.apply(lambda x: place_label_fake_join('MaxVO2_EST',x,df_patient_info),axis=1)
    data_resume_cpet_df['MaxO2_EST']=data_resume_cpet_df.apply(lambda x: place_label_fake_join('MaxO2_EST',x,df_patient_info),axis=1)
    data_resume_cpet_df['PredictedMaxHR']=data_resume_cpet_df.apply(lambda x: place_label_fake_join('PredictedMaxHR',x,df_patient_info),axis=1)
    data_resume_cpet_df['CardiacLim']=data_resume_cpet_df.apply(lambda x: place_label_fake_join('BA-PrimaryCardiacLim',x,df_patient_info),axis=1)
    data_resume_cpet_df['PulmonaryLim']=data_resume_cpet_df.apply(lambda x: place_label_fake_join('BA-PrimaryPulmonaryLim',x,df_patient_info),axis=1)
    data_resume_cpet_df['MuscleSkeletalLim']=data_resume_cpet_df.apply(lambda x: place_label_fake_join('OtherPrimaryLim',x,df_patient_info),axis=1)
    data_resume_cpet_df['Healthy']=data_resume_cpet_df.apply(lambda x: place_label_fake_join('Healthy',x,df_patient_info),axis=1)
    
    data_resume_cpet_df['PeakHeartRate']=data_resume_cpet_df.apply(lambda x: get_highest_variable('HR',x,df_data_sampled),axis=1)
    data_resume_cpet_df['MeanHeartRate']=data_resume_cpet_df.apply(lambda x: get_mean_variable('HR',x,df_data_sampled),axis=1)
    data_resume_cpet_df['MinHeartRate']=data_resume_cpet_df.apply(lambda x: get_lowest_variable('HR',x,df_data_sampled),axis=1)
    data_resume_cpet_df['StdHeartRate']=data_resume_cpet_df.apply(lambda x: get_std_variable('HR',x,df_data_sampled),axis=1)
    data_resume_cpet_df['LowestVE/VCO2']=data_resume_cpet_df.apply(lambda x: get_lowest_variable('VE/VCO2',x,df_data_sampled),axis=1)
    data_resume_cpet_df['PeakVE/VCO2']=data_resume_cpet_df.apply(lambda x: get_highest_variable('VE/VCO2',x,df_data_sampled),axis=1)
    data_resume_cpet_df['MeanVE/VCO2']=data_resume_cpet_df.apply(lambda x: get_mean_variable('VE/VCO2',x,df_data_sampled),axis=1)
    data_resume_cpet_df['StdVE/VCO2']=data_resume_cpet_df.apply(lambda x: get_std_variable('VE/VCO2',x,df_data_sampled),axis=1)
    data_resume_cpet_df['PeakVO2Real']=data_resume_cpet_df.apply(lambda x: get_highest_variable('VO2',x,df_data_sampled),axis=1)
    data_resume_cpet_df['DiffPeakVO2']=data_resume_cpet_df.apply(lambda x: get_MaxVO2_expected_vs_real(x,df_data_sampled),axis=1)
    data_resume_cpet_df['DiffPeakHR']=data_resume_cpet_df.apply(lambda x: get_HR_diff(x,df_data_sampled),axis=1)
    data_resume_cpet_df['DiffPercentPeakVO2']=data_resume_cpet_df.apply(lambda x: get_VO2_percent(x,df_data_sampled),axis=1)
    data_resume_cpet_df['DiffPercentPeakHR']=data_resume_cpet_df.apply(lambda x: get_HR_percent(x,df_data_sampled),axis=1)
    data_resume_cpet_df['MeanRER']=data_resume_cpet_df.apply(lambda x: get_mean_variable('RER',x,df_data_sampled),axis=1)
    data_resume_cpet_df['PeakRER']=data_resume_cpet_df.apply(lambda x: get_highest_variable('RER',x,df_data_sampled),axis=1)
    data_resume_cpet_df['LowestRER']=data_resume_cpet_df.apply(lambda x: get_lowest_variable('RER',x,df_data_sampled),axis=1)
    data_resume_cpet_df['MeanVE']=data_resume_cpet_df.apply(lambda x: get_mean_variable('VE',x,df_data_sampled),axis=1)
    data_resume_cpet_df['PeakVE']=data_resume_cpet_df.apply(lambda x: get_highest_variable('VE',x,df_data_sampled),axis=1)
    data_resume_cpet_df['LowestVE']=data_resume_cpet_df.apply(lambda x: get_lowest_variable('VE',x,df_data_sampled),axis=1)
    data_resume_cpet_df['MeanRR']=data_resume_cpet_df.apply(lambda x: get_mean_variable('RR',x,df_data_sampled),axis=1)
    data_resume_cpet_df['PeakRR']=data_resume_cpet_df.apply(lambda x: get_highest_variable('RR',x,df_data_sampled),axis=1)
    data_resume_cpet_df['LowestRR']=data_resume_cpet_df.apply(lambda x: get_lowest_variable('RR',x,df_data_sampled),axis=1)
    data_resume_cpet_df['MeanVO2']=data_resume_cpet_df.apply(lambda x: get_mean_variable('VO2',x,df_data_sampled),axis=1)
    data_resume_cpet_df['PeakVO2']=data_resume_cpet_df.apply(lambda x: get_highest_variable('VO2',x,df_data_sampled),axis=1)
    data_resume_cpet_df['LowestVO2']=data_resume_cpet_df.apply(lambda x: get_lowest_variable('VO2',x,df_data_sampled),axis=1)
    data_resume_cpet_df['MeanVCO2']=data_resume_cpet_df.apply(lambda x: get_mean_variable('VCO2',x,df_data_sampled),axis=1)
    data_resume_cpet_df['PeakVCO2']=data_resume_cpet_df.apply(lambda x: get_highest_variable('VCO2',x,df_data_sampled),axis=1)
    data_resume_cpet_df['LowestVCO2']=data_resume_cpet_df.apply(lambda x: get_lowest_variable('VCO2',x,df_data_sampled),axis=1)
    data_resume_cpet_df['HRvsVO2Slope']=data_resume_cpet_df.apply(lambda x: get_HR_VO2_slope(x,df_data_sampled),axis=1)
    data_resume_cpet_df['VEvsVCO2Slope']=data_resume_cpet_df.apply(lambda x: get_VE_VCO2_slope(x,df_data_sampled),axis=1)
    data_resume_cpet_df['MeanO2Pulse']=data_resume_cpet_df.apply(lambda x: get_mean_O2_pulse(x,df_data_sampled),axis=1)
    data_resume_cpet_df['MaxO2Pulse']=data_resume_cpet_df.apply(lambda x: get_max_O2_pulse(x,df_data_sampled),axis=1)
    data_resume_cpet_df['MinO2Pulse']=data_resume_cpet_df.apply(lambda x: get_min_O2_pulse(x,df_data_sampled),axis=1)
    data_resume_cpet_df['StdO2Pulse']=data_resume_cpet_df.apply(lambda x: get_std_O2_pulse(x,df_data_sampled),axis=1)
    data_resume_cpet_df['O2PulseDiff']=data_resume_cpet_df.MaxO2Pulse-data_resume_cpet_df.MaxO2_EST
    data_resume_cpet_df['O2PulsePercent']=data_resume_cpet_df.MaxO2Pulse/data_resume_cpet_df.MaxO2_EST
    # Given that this is an early detection case
    # First half
    data_resume_cpet_df['first_half_VO2Slope']=data_resume_cpet_df.apply(lambda x: get_first_half(x,df_data_sampled,'VO2'),axis=1)
    data_resume_cpet_df['first_half_HRSlope']=data_resume_cpet_df.apply(lambda x: get_first_half(x,df_data_sampled,'HR'),axis=1)
    data_resume_cpet_df['first_half_VCO2Slope']=data_resume_cpet_df.apply(lambda x: get_first_half(x,df_data_sampled,'VCO2'),axis=1)
    data_resume_cpet_df['first_half_VESlope']=data_resume_cpet_df.apply(lambda x: get_first_half(x,df_data_sampled,'VE'),axis=1)
    data_resume_cpet_df['first_half_RERSlope']=data_resume_cpet_df.apply(lambda x: get_first_half(x,df_data_sampled,'RER'),axis=1)
    data_resume_cpet_df['first_half_RRSlope']=data_resume_cpet_df.apply(lambda x: get_first_half(x,df_data_sampled,'RR'),axis=1)
    data_resume_cpet_df['first_half_O2Slope']=data_resume_cpet_df.apply(lambda x: get_first_half(x,df_data_sampled,'O2'),axis=1)
    data_resume_cpet_df['first_half_VEVCO2Slope']=data_resume_cpet_df.apply(lambda x: get_first_half(x,df_data_sampled,'VE/VCO2'),axis=1)
    data_resume_cpet_df['first_half_VEVO2Slope']=data_resume_cpet_df.apply(lambda x: get_first_half(x,df_data_sampled,'VE/VO2'),axis=1)
    # Second half
    data_resume_cpet_df['second_half_VO2Slope']=data_resume_cpet_df.apply(lambda x: get_second_half(x,df_data_sampled,'VO2'),axis=1)
    data_resume_cpet_df['second_half_HRSlope']=data_resume_cpet_df.apply(lambda x: get_second_half(x,df_data_sampled,'HR'),axis=1)
    data_resume_cpet_df['second_half_VCO2Slope']=data_resume_cpet_df.apply(lambda x: get_second_half(x,df_data_sampled,'VCO2'),axis=1)
    data_resume_cpet_df['second_half_VESlope']=data_resume_cpet_df.apply(lambda x: get_second_half(x,df_data_sampled,'VE'),axis=1)
    data_resume_cpet_df['second_half_RERSlope']=data_resume_cpet_df.apply(lambda x: get_second_half(x,df_data_sampled,'RER'),axis=1)
    data_resume_cpet_df['second_half_RRSlope']=data_resume_cpet_df.apply(lambda x: get_second_half(x,df_data_sampled,'RR'),axis=1)
    data_resume_cpet_df['second_half_O2Slope']=data_resume_cpet_df.apply(lambda x: get_second_half(x,df_data_sampled,'O2'),axis=1)
    data_resume_cpet_df['second_half_VEVCO2Slope']=data_resume_cpet_df.apply(lambda x: get_second_half(x,df_data_sampled,'VE/VCO2'),axis=1)
    data_resume_cpet_df['second_half_VEVO2Slope']=data_resume_cpet_df.apply(lambda x: get_second_half(x,df_data_sampled,'VE/VO2'),axis=1)
    # 15 to 85 percent
    data_resume_cpet_df['15_to_85_VO2Slope']=data_resume_cpet_df.apply(lambda x: get_slope_15_85(x,df_data_sampled,'VO2'),axis=1)
    data_resume_cpet_df['15_to_85_HRSlope']=data_resume_cpet_df.apply(lambda x: get_slope_15_85(x,df_data_sampled,'HR'),axis=1)
    data_resume_cpet_df['15_to_85_VCO2Slope']=data_resume_cpet_df.apply(lambda x: get_slope_15_85(x,df_data_sampled,'VCO2'),axis=1)
    data_resume_cpet_df['15_to_85_VESlope']=data_resume_cpet_df.apply(lambda x: get_slope_15_85(x,df_data_sampled,'VE'),axis=1)
    data_resume_cpet_df['15_to_85_RERSlope']=data_resume_cpet_df.apply(lambda x: get_slope_15_85(x,df_data_sampled,'RER'),axis=1)
    data_resume_cpet_df['15_to_85_RRSlope']=data_resume_cpet_df.apply(lambda x: get_slope_15_85(x,df_data_sampled,'RR'),axis=1)
    data_resume_cpet_df['15_to_85_O2Slope']=data_resume_cpet_df.apply(lambda x: get_slope_15_85(x,df_data_sampled,'O2'),axis=1)
    data_resume_cpet_df['15_to_85_VEVCO2Slope']=data_resume_cpet_df.apply(lambda x: get_slope_15_85(x,df_data_sampled,'VE/VCO2'),axis=1)
    data_resume_cpet_df['15_to_85_VEVO2Slope']=data_resume_cpet_df.apply(lambda x: get_slope_15_85(x,df_data_sampled,'VE/VO2'),axis=1)

    #data_resume_cpet_df['VTTime']=data_resume_cpet_df.apply(lambda x: get_vt_time(x,df_data_sampled),axis=1)
    data_resume_cpet_df['VTTime']=data_resume_cpet_df.apply(lambda x: get_vt_time(x,df_data_sampled),axis=1)
    data_resume_cpet_df['VO2atVT']=data_resume_cpet_df.apply(lambda x: get_VO2atVT(x,df_data_sampled),axis=1)
    data_resume_cpet_df['PeakVO2']=data_resume_cpet_df.apply(lambda x: get_highest_variable('VO2',x,df_data_sampled),axis=1)
    data_resume_cpet_df['PercentTimeAfterVT']=data_resume_cpet_df.apply(lambda x: get_percent_time_after_VT(x,df_data_sampled),axis=1)
    data_resume_cpet_df['VO2vsPeakVO2atVT'] = data_resume_cpet_df['VO2atVT']/data_resume_cpet_df['MaxVO2_EST']
    return data_resume_cpet_df
    pass

In [43]:
result = generate_cpet_data_by_time(df_info_avg, patient_info, 'file_name')

In [145]:
result.head()

Unnamed: 0,SessionId,PatientId,sex,age,BMI,MaxVO2_EST,MaxO2_EST,PredictedMaxHR,CardiacLim,PulmonaryLim,...,15_to_85_VESlope,15_to_85_RERSlope,15_to_85_RRSlope,15_to_85_O2Slope,15_to_85_VEVCO2Slope,15_to_85_VEVO2Slope,VTTime,VO2atVT,PercentTimeAfterVT,VO2vsPeakVO2atVT
0,7.0,7.0,F,40,35.915381,2.319521,0.010556,180.0,1,0,...,1.477203,-0.002217,0.617583,0.767727,-2.538605,-2.05157,-1.0,-1.0,0.0,-0.431123
1,8.0,8.0,M,69,35.057157,2.65721,0.013601,159.7,1,0,...,2.069995,-0.006356,1.061668,0.698637,-0.848303,-0.883907,-1.0,-1.0,0.0,-0.376335
2,9.0,9.0,F,16,19.135218,1.444581,0.0103,196.8,0,0,...,8.615519,-0.008381,5.446448,2.058699,-3.033679,-2.759117,-1.0,-1.0,0.0,-0.692242
3,10.0,10.0,M,17,19.3584,2.293821,0.015833,196.1,0,0,...,9.441237,-0.05302,3.934768,2.886351,-2.96337,-4.646961,-1.0,-1.0,0.0,-0.435954
4,11.0,11.0,F,15,20.582507,1.933529,0.0103,197.5,0,0,...,13.500423,0.016762,2.955699,2.665404,-3.945277,-3.252611,-1.0,-1.0,0.0,-0.517189


In [147]:
result.columns

Index(['SessionId', 'PatientId', 'sex', 'age', 'BMI', 'MaxVO2_EST',
       'MaxO2_EST', 'PredictedMaxHR', 'CardiacLim', 'PulmonaryLim',
       'MuscleSkeletalLim', 'Healthy', 'PeakHeartRate', 'MeanHeartRate',
       'MinHeartRate', 'StdHeartRate', 'LowestVE/VCO2', 'PeakVE/VCO2',
       'MeanVE/VCO2', 'StdVE/VCO2', 'PeakVO2Real', 'DiffPeakVO2', 'DiffPeakHR',
       'DiffPercentPeakVO2', 'DiffPercentPeakHR', 'MeanRER', 'PeakRER',
       'LowestRER', 'MeanVE', 'PeakVE', 'LowestVE', 'MeanRR', 'PeakRR',
       'LowestRR', 'MeanVO2', 'PeakVO2', 'LowestVO2', 'MeanVCO2', 'PeakVCO2',
       'LowestVCO2', 'HRvsVO2Slope', 'VEvsVCO2Slope', 'MeanO2Pulse',
       'MaxO2Pulse', 'MinO2Pulse', 'StdO2Pulse', 'O2PulseDiff',
       'O2PulsePercent', 'first_half_VO2Slope', 'first_half_HRSlope',
       'first_half_VCO2Slope', 'first_half_VESlope', 'first_half_RERSlope',
       'first_half_RRSlope', 'first_half_O2Slope', 'first_half_VEVCO2Slope',
       'first_half_VEVO2Slope', 'second_half_VO2Slope', 'secon

In [146]:
result.to_csv('./data/8_min_data_updated_corrected.csv', index = True) 

Saving the data for the cardiac limitation

In [148]:
cardiac_early_data = result[['CardiacLim','second_half_VCO2Slope','second_half_O2Slope','second_half_VO2Slope','StdO2Pulse',
'second_half_HRSlope','15_to_85_VO2Slope','second_half_VESlope','15_to_85_VEVCO2Slope','PeakVO2',
'PeakVO2Real','15_to_85_VEVO2Slope','DiffPercentPeakVO2','MinO2Pulse','15_to_85_O2Slope',
'MeanO2Pulse','DiffPercentPeakHR','15_to_85_VCO2Slope','MaxO2Pulse']]

In [149]:
cardiac_early_data.to_csv('./data/short_cardiac_data.csv', index = True) 

In [152]:
pulmonary_early_data = result[['PulmonaryLim','DiffPercentPeakVO2','O2PulsePercent','O2PulseDiff','MeanVE',
'MeanVCO2','15_to_85_VCO2Slope','MaxO2Pulse','PeakVO2Real','PeakVO2','PeakVCO2','MeanVO2',
'MeanO2Pulse','15_to_85_O2Slope','LowestVE/VCO2','MeanHeartRate','MeanRR','first_half_RRSlope']]

In [153]:
pulmonary_early_data.to_csv('./data/short_pulmonary_data.csv', index = True) 

In [154]:
other_early_data = result[['MuscleSkeletalLim','LowestVE/VCO2','first_half_VO2Slope','15_to_85_VEVCO2Slope','second_half_RERSlope',
'O2PulseDiff','O2PulsePercent','StdHeartRate','DiffPercentPeakVO2','VEvsVCO2Slope','15_to_85_VO2Slope','MeanVE/VCO2',
'first_half_O2Slope','MeanVO2','PeakVO2','MeanVCO2','PeakVCO2']]

In [155]:
other_early_data.to_csv('./data/short_other_data.csv', index = True) 