In [1]:
import pandas as pd
import numpy as np

# Data visualisation
import seaborn as sns
import matplotlib.pyplot as plt

# Machine learning
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn import tree
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn import set_config
from sklearn.metrics import r2_score

In [2]:
# Load the modelling dataset from its absolute Google Drive location.
ml_csv_path = "/content/drive/MyDrive/501 Final Project/code/data/ml.csv"
df = pd.read_csv(ml_csv_path)

In [3]:

# Keep only the rows whose masking_level is not -1.
has_valid_masking = df['masking_level'] != -1
df = df.loc[has_valid_masking]

# Pull the regression target out before it is removed from the feature frame;
# it keeps the same index as the filtered rows.
attr = df['attrition_percentage']

# Remove the target plus the nct_id and gender columns from the features.
df = df.drop(columns=['attrition_percentage', 'nct_id', 'gender'])

In [None]:
# Number of rows left in the feature frame after the filtering cell above.
len(df)

965

In [4]:
# NOTE: this binds a second name to the same DataFrame object; no copy is made,
# so any in-place change to `df` is also visible through `df_temp`.
df_temp = df

In [11]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    df_temp, attr, test_size=0.2, random_state=42
)

# Report the dimensions of each piece of the split.
for _label, _part in (
    ("X_train shape: ", X_train),
    ("Y_train shape: ", Y_train),
    ("X_test shape: ", X_test),
    ("Y_test shape: ", Y_test),
):
    print(_label, _part.shape)

X_train shape:  (772, 13)
Y_train shape:  (772,)
X_test shape:  (193, 13)
Y_test shape:  (193,)


In [None]:
# Dimensions (rows, columns) of the training feature frame.
X_train.shape

(772, 13)

In [None]:
# Frequency of each masking_level value in the feature frame.
df['masking_level'].value_counts()

4    298
0    271
2    263
3    133
Name: masking_level, dtype: int64

In [5]:
# Columns routed to the numeric (scaling) transformer.
numerical_features = ['date_diff_num', 'location_count']

# Columns routed to the one-hot transformer.
categorical_features = [
    'allocation',
    'intervention_model',
    'primary_purpose',
    'intervention_type',
    'arm_group_type',
    'healthy_vols',
    'pt_mask',
]

# Explicit level orderings handed to the ordinal encoders.
age_cat = ['infant', 'child', 'teen', 'adult', 'late adult']
phase_cat = ['Phase 1.5', 'Phase 2', 'Phase 2.5', 'Phase 3', 'Phase 4']
masking_level = list(range(5))


In [6]:
def _single_step(step_name, estimator):
    """Wrap one estimator in a one-step Pipeline under the given step name."""
    return Pipeline(steps=[(step_name, estimator)])


def _ordinal_encoder(ordered_levels):
    """Build an ordinal encoder over an explicit level order; unknown levels raise."""
    return OrdinalEncoder(categories=[ordered_levels], handle_unknown='error')


# Standardise numeric columns.
numeric_transformer = _single_step("scaler", StandardScaler())

# One-hot encode nominal columns; binary columns collapse to a single indicator.
categorical_transformer = _single_step(
    "onehot", OneHotEncoder(handle_unknown="error", drop="if_binary")
)

# Ordinal encoders, one per ordered vocabulary.
ordinal_age_transformer = _single_step("ordinal_age", _ordinal_encoder(age_cat))
ordinal_phase_transformer = _single_step("ordinal_phase", _ordinal_encoder(phase_cat))
ordinal_masking_transformer = _single_step("ordinal_mask", _ordinal_encoder(masking_level))



In [7]:
# (name, transformer, columns) routes; the min/max age columns share one
# ordinal-age transformer definition.
_column_routes = [
    ("numeric", numeric_transformer, numerical_features),
    ("categorical", categorical_transformer, categorical_features),
    ("oe_min_age", ordinal_age_transformer, ['min_age_cat']),
    ("oe_max_age", ordinal_age_transformer, ['max_age_cat']),
    ("oe_phase", ordinal_phase_transformer, ['phase']),
    ("oe_masking", ordinal_masking_transformer, ['masking_level']),
]

# Columns not named in any route are forwarded unchanged.
col_transformer = ColumnTransformer(
    transformers=_column_routes,
    remainder='passthrough',
)

In [10]:
# Fit the preprocessing transformer on the training features; fit() is the
# cell's last expression, so its return value is displayed as the cell output.
col_transformer.fit(X_train)

ColumnTransformer(remainder='passthrough',
                  transformers=[('numeric',
                                 Pipeline(steps=[('scaler', StandardScaler())]),
                                 ['date_diff_num', 'location_count']),
                                ('categorical',
                                 Pipeline(steps=[('onehot',
                                                  OneHotEncoder(drop='if_binary'))]),
                                 ['allocation', 'intervention_model',
                                  'primary_purpose', 'intervention_type',
                                  'arm_group_type', 'healthy_vols',
                                  'pt_mask']),
                                ('oe_min...
                                                  OrdinalEncoder(categories=[['infant',
                                                                              'child',
                                                                              'teen',
   

In [8]:
def _prefixed(feature, levels):
    """Return one '<feature>_<level>' label per level, keeping the given order."""
    return [feature + '_' + level for level in levels]


# Hand-maintained labels for the preprocessed matrix: the two numeric columns,
# the one-hot indicator columns, then the four ordinally encoded columns.
columns = (
    ['date_diff_num', 'location_count', 'allocation_Randomized']
    + _prefixed('intervention_model', [
        'Crossover Assignment',
        'Factorial Assignment',
        'Parallel Assignment',
        'Single Group Assignment',
    ])
    + _prefixed('primary_purpose', [
        'Basic Science',
        'Diagnostic',
        'Health Services Research',
        'Other',
        'Prevention',
        'Screening',
        'Supportive Care',
        'Treatment',
    ])
    + _prefixed('intervention_type', [
        'Biological',
        'Device',
        'Dietary Supplement',
        'Drug',
        'Other',
        'Radiation',
    ])
    + _prefixed('arm_group_type', [
        'Active Comparator',
        'Experimental',
        'Other',
        'Placebo Comparator',
    ])
    + ['healthy_vols_No', 'pt_mask_Yes']
    + ['min_age_cat', 'max_age_cat', 'phase', 'masking_level']
)

In [12]:
# Apply the preprocessor to the training features and label the resulting
# matrix with the hand-maintained `columns` list so encoded columns can be
# inspected by name. Requires `col_transformer` to have been fitted already.
X_train_pp = col_transformer.transform(X_train)
df_train = pd.DataFrame(X_train_pp, columns=columns)
# Displayed as the cell output: value counts of the encoded pt_mask indicator.
df_train['pt_mask_Yes'].value_counts()

1.0    518
0.0    254
Name: pt_mask_Yes, dtype: int64

In [9]:
# Linear-regression model on top of the shared preprocessing transformer.
_lr_steps = [
    ("preprocessing", col_transformer),
    ("regressor", LinearRegression()),
]
lr_pipe = Pipeline(steps=_lr_steps)

In [12]:
# Fit the linear-regression pipeline on the training split, then score it on
# the held-out split.
lr_pipe.fit(X_train, Y_train)
lm_predictions = lr_pipe.predict(X_test)

print("First 5 LM predictions: ", list(lm_predictions[:5]))

# scikit-learn metrics are declared as metric(y_true, y_pred). MAE and MSE give
# the same value either way, but the documented order is used for every metric
# so the calls stay consistent with r2_score, where the order does matter.
lm_mae = mean_absolute_error(Y_test, lm_predictions)
lm_rmse = np.sqrt(mean_squared_error(Y_test, lm_predictions))
lm_r2 = r2_score(Y_test, lm_predictions)
print("LM R2: {:.2f}".format(round(lm_r2, 2)))
print("LM MAE: {:.2f}".format(round(lm_mae, 2)))
print("LM RMSE: {:.2f}".format(round(lm_rmse, 2)))


First 5 LM predictions:  [11.09235520688383, 8.841475272474234, 10.735265057583032, 10.223897329126418, 11.49817079601567]
LM R2: -0.04
LM MAE: 7.98
LM RMSE: 10.38


In [20]:
# Number of held-out target values the metrics above were computed on.
len(Y_test)

193

In [13]:
# Gradient-boosting model on top of the shared preprocessing transformer.
# random_state is pinned so that re-running the notebook reproduces the same
# fitted model and metrics; without it every fit can give different numbers.
gbm_pipe = Pipeline(
    steps=[
        ("preprocessing", col_transformer),
        ("gbm", GradientBoostingRegressor(random_state=42)),
    ]
)



In [14]:
# Fit the gradient-boosting pipeline on the training split, then score it on
# the held-out split.
gbm_pipe.fit(X_train, Y_train)
gbm_predictions = gbm_pipe.predict(X_test)

print("First 5 GBM predictions: ", list(gbm_predictions[:5]))

# scikit-learn metrics are declared as metric(y_true, y_pred). MAE and MSE give
# the same value either way, but the documented order is used for every metric
# so the calls stay consistent with r2_score, where the order does matter.
gbm_mae = mean_absolute_error(Y_test, gbm_predictions)
gbm_rmse = np.sqrt(mean_squared_error(Y_test, gbm_predictions))
gbm_r2 = r2_score(Y_test, gbm_predictions)
print("GBM R2: {:.2f}".format(round(gbm_r2, 2)))
print("GBM MAE: {:.2f}".format(round(gbm_mae, 2)))
print("GBM RMSE: {:.2f}".format(round(gbm_rmse, 2)))


First 5 GBM predictions:  [7.598475563639472, 7.23271328626186, 10.133172856004256, 7.429527112181727, 11.8354337123399]
GBM R2: -0.13
GBM MAE: 8.31
GBM RMSE: 10.80


In [15]:
# Random-forest model (100 trees) on top of the shared preprocessing transformer.
# random_state is pinned so that the bootstrap samples and feature draws, and
# therefore the reported metrics, are reproducible across notebook runs.
rf_pipe = Pipeline(
    steps=[
        ("preprocessing", col_transformer),
        ("rf", RandomForestRegressor(n_estimators=100, random_state=42)),
    ]
)



In [16]:

# Fit the random-forest pipeline on the training split, then score it on the
# held-out split.
rf_pipe.fit(X_train, Y_train)
rf_predictions = rf_pipe.predict(X_test)
print("First 5 RF predictions: ", list(rf_predictions[:5]))

# scikit-learn metrics are declared as metric(y_true, y_pred). MAE and MSE give
# the same value either way, but the documented order is used for every metric
# so the calls stay consistent with r2_score, where the order does matter.
rf_mae = mean_absolute_error(Y_test, rf_predictions)
rf_rmse = np.sqrt(mean_squared_error(Y_test, rf_predictions))
rf_r2 = r2_score(Y_test, rf_predictions)
print("RF R2: {:.2f}".format(round(rf_r2, 2)))
print("RF MAE: {:.2f}".format(round(rf_mae, 2)))
print("RF RMSE: {:.2f}".format(round(rf_rmse, 2)))

First 5 RF predictions:  [9.037486450632999, 6.382023737149002, 7.448683678744, 7.850173467453991, 9.785583633153005]
RF R2: -0.23
RF MAE: 8.74
RF RMSE: 11.28


In [None]:
# Pair every fitted linear-regression coefficient with its feature label.
# `columns` is maintained by hand and zip() silently truncates to the shorter
# input, so check the lengths first instead of producing a misaligned table.
_lr_model = lr_pipe.named_steps['regressor']
assert len(_lr_model.coef_) == len(columns), (
    "coefficient count does not match the hand-written `columns` list"
)
val = list(zip(_lr_model.coef_, columns))

In [None]:
# Sort the (coefficient, feature) tuples in place, ascending by coefficient
# (ties fall back to the feature name).
val.sort()

In [None]:
# Turn the sorted (coefficient, feature) tuples into a two-column frame; no
# column names are supplied, so pandas assigns the default integer labels.
df_val = pd.DataFrame(val)

In [None]:
# Write the sorted coefficient table to disk; index=False leaves out the row index.
# NOTE(review): absolute Google Drive path — presumably needs the Colab Drive
# mount to be in place; confirm before running elsewhere.
df_val.to_csv("/content/drive/MyDrive/501 Final Project/sort.csv" , index=False)

In [26]:
# Coefficients of the fitted linear model, printed one per line with their
# positional index.
importance = lr_pipe.named_steps['regressor'].coef_
for _position, _weight in enumerate(importance):
    print('Feature: %0d, Score: %.5f' % (_position, _weight))

Feature: 0, Score: 0.62404
Feature: 1, Score: 0.20329
Feature: 2, Score: 1.16874
Feature: 3, Score: -4.29485
Feature: 4, Score: 6.15896
Feature: 5, Score: -2.99000
Feature: 6, Score: 1.12588
Feature: 7, Score: -3.11972
Feature: 8, Score: -1.60791
Feature: 9, Score: 3.84013
Feature: 10, Score: 1.56143
Feature: 11, Score: 3.31544
Feature: 12, Score: -6.72118
Feature: 13, Score: 3.29801
Feature: 14, Score: -0.56619
Feature: 15, Score: 0.93277
Feature: 16, Score: -1.20880
Feature: 17, Score: 1.73259
Feature: 18, Score: 2.56497
Feature: 19, Score: 3.08605
Feature: 20, Score: -7.10758
Feature: 21, Score: 1.38246
Feature: 22, Score: -1.25854
Feature: 23, Score: -0.93818
Feature: 24, Score: 0.81426
Feature: 25, Score: 0.13080
Feature: 26, Score: -0.65646
Feature: 27, Score: 0.04152
Feature: 28, Score: -0.20722
Feature: 29, Score: -0.30476
Feature: 30, Score: 0.11162


In [None]:
# Switch scikit-learn's estimator display to the diagram form, then show the
# linear-regression pipeline as the cell output.
set_config(display='diagram')
lr_pipe

In [None]:
# Switch scikit-learn's estimator display to the diagram form, then show the
# gradient-boosting pipeline as the cell output.
set_config(display='diagram')
gbm_pipe

In [None]:
# Switch scikit-learn's estimator display to the diagram form, then show the
# random-forest pipeline as the cell output.
set_config(display='diagram')
rf_pipe

In [None]:
# Preview the first rows of the feature frame.
df.head()

Unnamed: 0,phase,allocation,intervention_model,primary_purpose,intervention_type,arm_group_type,healthy_vols,location_count,min_age_cat,max_age_cat,pt_mask,masking_level,date_diff_num
0,Phase 2,Randomized,Parallel Assignment,Treatment,Drug,Active Comparator,No,1,teen,late adult,Yes,4,3895
1,Phase 2,Randomized,Parallel Assignment,Treatment,Drug,Placebo Comparator,No,1,child,teen,Yes,4,4901
2,Phase 3,Randomized,Parallel Assignment,Treatment,Drug,Experimental,No,1,teen,late adult,Yes,2,3318
3,Phase 3,Randomized,Parallel Assignment,Treatment,Drug,Active Comparator,No,408,teen,late adult,No,0,2587
4,Phase 3,Randomized,Parallel Assignment,Screening,Device,Experimental,Accepts Healthy Volunteers,33,adult,adult,No,0,2983


In [17]:
# Feature subset for the numeric-only model: just the two numeric columns.
_numeric_only_cols = ['location_count', 'date_diff_num']
df_num_lm = df.loc[:, _numeric_only_cols]

In [18]:
# Preprocessor for the numeric-only model: scale the numeric columns and
# forward any other column unchanged.
col_transformer_numerical = ColumnTransformer(
    transformers=[("numeric", numeric_transformer, numerical_features)],
    remainder='passthrough',
)

In [19]:
# Linear regression restricted to the numeric-only preprocessor.
_lr_numerical_steps = [
    ("preprocessing", col_transformer_numerical),
    ("regressor", LinearRegression()),
]
lr_pipe_numerical = Pipeline(steps=_lr_numerical_steps)

In [20]:
# Re-split using only the numeric feature subset (same test fraction and seed
# as the first split).
# NOTE: this rebinds X_train / X_test / Y_train / Y_test, replacing the
# full-feature split created earlier in the notebook; cells that expect the
# full-feature frames must not be re-run after this one.
X_train, X_test, Y_train, Y_test = train_test_split(df_num_lm, attr, test_size = 0.2, random_state = 42)

In [21]:
# Fit the numeric-only linear model on the current training split, then score
# it on the current held-out split.
lr_pipe_numerical.fit(X_train, Y_train)
lm_predictions_num = lr_pipe_numerical.predict(X_test)

print("First 5 LM predictions: ", list(lm_predictions_num[:5]))

# scikit-learn metrics are declared as metric(y_true, y_pred). MAE and MSE give
# the same value either way, but the documented order is used for every metric
# so the calls stay consistent with r2_score, where the order does matter.
# NOTE: lm_mae / lm_rmse reuse the names assigned by the full-feature linear
# model cell and overwrite those values.
lm_mae = mean_absolute_error(Y_test, lm_predictions_num)
lm_rmse = np.sqrt(mean_squared_error(Y_test, lm_predictions_num))
lm_r2_num = r2_score(Y_test, lm_predictions_num)
print("LM R2: {:.2f}".format(round(lm_r2_num, 2)))
print("LM MAE: {:.2f}".format(round(lm_mae, 2)))
print("LM RMSE: {:.2f}".format(round(lm_rmse, 2)))


First 5 LM predictions:  [10.058404074561228, 10.305739410474429, 11.119603333170609, 10.920501244424225, 10.431764869821281]
LM R2: -0.01
LM MAE: 7.77
LM RMSE: 10.21


In [None]:
# Pair each coefficient of the numeric-only model with its feature name.
# The numeric-only preprocessor emits its columns in `numerical_features`
# order, so label with that list. The 31-name `columns` list belongs to the
# full-feature model and only lined up here because zip() truncates and its
# first two entries happen to be the same two names.
_lr_numerical_model = lr_pipe_numerical.named_steps['regressor']
assert len(_lr_numerical_model.coef_) == len(numerical_features), (
    "coefficient count does not match `numerical_features`"
)
val = list(zip(_lr_numerical_model.coef_, numerical_features))

In [None]:
# Display the (coefficient, feature) pairs of the numeric-only model.
val

[(0.5754475599596758, 'date_diff_num'),
 (-0.01395052664018014, 'location_count')]

In [22]:
# Feature subset for the reduced model: numeric columns plus the masking and
# age-category columns.
_reduced_cols = [
    'location_count',
    'date_diff_num',
    'pt_mask',
    'masking_level',
    'min_age_cat',
    'max_age_cat',
]
df_last_lm = df.loc[:, _reduced_cols]

In [24]:
# In the reduced feature set only pt_mask goes through the one-hot transformer.
categorical_features_last = ['pt_mask']

In [25]:
# (name, transformer, columns) routes for the reduced feature set.
_column_routes_last = [
    ("numeric", numeric_transformer, numerical_features),
    ("categorical", categorical_transformer, categorical_features_last),
    ("oe_min_age", ordinal_age_transformer, ['min_age_cat']),
    ("oe_max_age", ordinal_age_transformer, ['max_age_cat']),
    ("oe_masking", ordinal_masking_transformer, ['masking_level']),
]

# Columns not named in any route are forwarded unchanged.
col_transformer_last = ColumnTransformer(
    transformers=_column_routes_last,
    remainder='passthrough',
)

In [26]:
# Linear regression on top of the reduced-feature preprocessor.
_lr_last_steps = [
    ("preprocessing", col_transformer_last),
    ("regressor", LinearRegression()),
]
lr_pipe_last = Pipeline(steps=_lr_last_steps)

In [27]:
# Re-split using the reduced feature subset (same test fraction and seed as
# the earlier splits).
# NOTE: this rebinds X_train / X_test / Y_train / Y_test again, replacing the
# split produced by the previous train_test_split cell.
X_train, X_test, Y_train, Y_test = train_test_split(df_last_lm, attr, test_size = 0.2, random_state = 42)

In [28]:
# Fit the reduced-feature linear model on the current training split, then
# score it on the current held-out split.
lr_pipe_last.fit(X_train, Y_train)
lm_predictions_num = lr_pipe_last.predict(X_test)

print("First 5 LM predictions: ", list(lm_predictions_num[:5]))

# scikit-learn metrics are declared as metric(y_true, y_pred). MAE and MSE give
# the same value either way, but the documented order is used for every metric
# so the calls stay consistent with r2_score, where the order does matter.
# NOTE: lm_predictions_num, lm_mae, lm_rmse and lm_r2_num reuse names assigned
# by the earlier linear-model cells and overwrite those values.
lm_mae = mean_absolute_error(Y_test, lm_predictions_num)
lm_rmse = np.sqrt(mean_squared_error(Y_test, lm_predictions_num))
lm_r2_num = r2_score(Y_test, lm_predictions_num)
print("LM R2: {:.2f}".format(round(lm_r2_num, 2)))
print("LM MAE: {:.2f}".format(round(lm_mae, 2)))
print("LM RMSE: {:.2f}".format(round(lm_rmse, 2)))


First 5 LM predictions:  [10.646126294620608, 10.136131770515894, 11.135309103847426, 10.16198220339639, 10.03658743797846]
LM R2: -0.01
LM MAE: 7.78
LM RMSE: 10.19
