In [None]:
import numpy as np
import pandas as pd
import os

# Project root that every relative data path in this notebook ("Processed data/...",
# "Raw Data/...") is resolved against. Set the RPDR_DIR environment variable to point at
# your own copy instead of editing this cell; the original location stays the default.
# e.g. RPDR_DIR="/Users/dexinli/Dropbox (MIT)/MGH Prostate Research Group/RPDR/"
RPDR_DIR = os.environ.get("RPDR_DIR", "/Users/ashwin/Dropbox (MIT)/MGH Prostate Research Group/RPDR/")
os.chdir(RPDR_DIR)

# Import and Clean Data

In [None]:
# Load the merged feature table; its 'Unnamed: 0' column is renamed to EMPI,
# the key used for every merge below.
merged_df = (
    pd.read_csv('Processed data/merged_data/df_merged_rp_positive_based_E.csv')
    .rename(columns={'Unnamed: 0': 'EMPI'})
)
print(merged_df.shape[0])

In [None]:
# Disabled cell: everything below sits inside a triple-quoted string, so none of it executes.
# NOTE(review): the text refers to `data_merged`, which is not defined in the cells shown here;
# it would need updating before being re-enabled.
# NOTE(review): `diab_agg`, read by the diabetes cells at the end of the notebook, is only
# created in this disabled text unless the merged CSV already provides it - confirm.
'''
# import other datasets when looking at comorbidity and diabetes features
# ultimately, we did not decide to include these in the final prediction analysis
data_merged_comorb = pd.read_csv('Processed data/merged_data/df_merged_rp_positive_based_C.csv')
data_merged_comorb.set_index('Unnamed: 0', inplace = True)
data_merged_comorb.index.name = 'EMPI'
print(len(data_merged_comorb.index))

empis_oi = set(data_merged.index) & set(data_merged_comorb.index)
comorbs_oi = ['wscore_agg','diab_agg']
data_merged2 = pd.concat([data_merged.loc[empis_oi], data_merged_comorb.loc[empis_oi][comorbs_oi]], axis = 1)
data_merged2.reset_index(inplace=True)

print(len(data_merged2.index))
data_merged2.head()

# Change first column to Unnamed: 0
merged_df = merged_df.rename(columns={"Unnamed: 0": "EMPI"})

# merge data_merged2 back onto data_merged; keep outer, make all the NAs 0
# only keep EMPI, wscore_agg, diab_agg
data_merged2 = data_merged2[["EMPI", "wscore_agg", "diab_agg"]]
merged_df = data_merged.merge(data_merged2, on="EMPI", how="outer")
print(len(merged_df.index))
merged_df.head()

# Make all the NAs 0; we are assuming people with no comorbidities have comorbidity 0 here
# Not an ideal assumption, which is why we ultimately did not go this route
merged_df["wscore_agg"] = merged_df["wscore_agg"].fillna(0)
merged_df["diab_agg"] = merged_df["diab_agg"].fillna(0)

# drop any extra rows with NAs (those only have comorb data)
merged_df = merged_df.dropna()
print(len(merged_df.index))
merged_df.head()
'''

In [None]:
# Import the outcome data; it is joined to the feature table on EMPI in the next cells,
# and only its bcr_ind column is kept as the label.
outcome_df = pd.read_csv("Processed data/merged_data/df_outcome_rp_positive_final.csv")
outcome_df.head()

In [None]:
# Keep only the outcome rows whose EMPI also appears in merged_df (default inner join)
x_empi = merged_df.loc[:, ["EMPI"]]
y_df = pd.merge(outcome_df, x_empi, on="EMPI")

print(y_df.shape[0])
y_df.head()

In [None]:
# Only the bcr_ind outcome is modelled for now (5-year / 10-year outcomes can come later);
# EMPI is kept alongside it as the row key.
y_df = y_df.loc[:, ["EMPI", "bcr_ind"]]
y_df.head()

In [None]:
# Mirror image of the outcome subset: keep only feature rows whose EMPI has an outcome row
y_empi = outcome_df.loc[:, ["EMPI"]]
x_df = pd.merge(merged_df, y_empi, on="EMPI")

print(x_df.shape[0])
x_df.head()

In [None]:
# Order features and outcomes by EMPI so the two tables line up row for row;
# train_test_split below is handed both tables together.
x_df = x_df.sort_values("EMPI")
y_df = y_df.sort_values("EMPI")

x_df.head()

In [None]:
# Turn the string column pT_stage_combined into 0/1 indicator columns
print(x_df["pT_stage_combined"].unique())

stage = x_df["pT_stage_combined"]
# indicator column -> raw stage values that switch it on;
# the pt2 sub-stages (pt2a/b/c) are folded into a single pt2 indicator
stage_members = {
    "pt1": ["pt1"],
    "pt1a": ["pt1a"],
    "pt1b": ["pt1b"],
    "pt2": ["pt2", "pt2a", "pt2b", "pt2c"],
    "pt3": ["pt3"],
    "pt3a": ["pt3a"],
    "pt3b": ["pt3b"],
    "pt3c": ["pt3c"],
}
for column, members in stage_members.items():
    x_df[column] = np.where(stage.isin(members), 1, 0)

# the raw string column is no longer needed once the indicators exist
x_df = x_df.drop(columns=["pT_stage_combined"])
x_df.head()

# Split Data into Training, Testing, Validation

In [None]:
from sklearn.model_selection import train_test_split

# Create train, val, and test datasets using a .6, .2, .2 split
validation = 0.2
test = 0.2

# Step 1: hold out validation + test (0.4 of the rows) so that 0.6 remains for training.
# (test_size used to be 1 - validation - test = 0.6, which held out 60% of the rows and
# produced a .4/.3/.3 split instead of the intended .6/.2/.2.)
X_train_, X_test_, Y_train_, Y_test_ = train_test_split(x_df, y_df, test_size = validation + test, random_state = 0)
# Step 2: divide the held-out rows between validation and test in proportion to their shares.
X_val_, X_test_, Y_val_, Y_test_ = train_test_split(X_test_, Y_test_, test_size = test/(test + validation), random_state = 0)

X_test_.head()

In [None]:
#Oversampling 
from imblearn.over_sampling import RandomOverSampler
# Randomly resample the minority class of the training split only; validation and test
# rows are left untouched. random_state is fixed (like the train_test_split calls) so that
# re-running the notebook draws the same duplicated rows and reproduces the same scores.
oversample = RandomOverSampler(sampling_strategy='minority', random_state=0)
X_train, Y_train = oversample.fit_resample(X_train_, Y_train_['bcr_ind'])
# EMPI becomes the index everywhere so it is not fed to the models as a feature
Y_test = Y_test_.set_index('EMPI')
X_test = X_test_.set_index('EMPI')
Y_val = Y_val_.set_index('EMPI')
X_val = X_val_.set_index('EMPI')
X_train = X_train.set_index('EMPI')

In [None]:
# Alternative to the oversampling cell: run this one when NOT oversampling.
# It overwrites X_train / Y_train with the plain (un-resampled) training split.
X_train = X_train_.set_index('EMPI')
Y_train = Y_train_.set_index('EMPI')
X_val = X_val_.set_index('EMPI')
Y_val = Y_val_.set_index('EMPI')
X_test = X_test_.set_index('EMPI')
Y_test = Y_test_.set_index('EMPI')

In [None]:
# Normalize training dataset to have zero mean and unit variance
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing

# Learn the scaling parameters from the training features only ...
scaler = StandardScaler().fit(X_train)
X_train_norm = scaler.transform(X_train)

# ... and reuse those training-set parameters on the validation and test features
X_test_norm = scaler.transform(X_test)
X_val_norm = scaler.transform(X_val)

# Logistic Regression!

In [None]:
# Train a logistic regression classifier on the training data for every combination of
# C = [0.1, 0.25, 0.5, 1.0] and penalty = ['l1', 'l2'], and print each one's accuracy on the validation data.
from sklearn.linear_model import LogisticRegression

for C_ in (.1, .25, .5, 1):
    for penalty_ in ("l1", "l2"):
        # the l1 runs pass solver="liblinear"; the l2 runs leave the solver at the library default
        solver_kwargs = {"solver": "liblinear"} if penalty_ == "l1" else {}
        clf = LogisticRegression(penalty = penalty_, C = C_, class_weight = 'balanced', **solver_kwargs)
        clf.fit(X_train_norm, Y_train)
        # score on the validation split, one line per (C, penalty) pair
        print("C = ", C_, "penalty =", penalty_, clf.score(X_val_norm, Y_val))

In [None]:
from xgboost import XGBClassifier

# Run XGBoost, varying max_depth and min_child_weight; print the validation score of each pair
for max_depth in (3, 6, 10):
    for min_child_weight in (1, 5, 10):
        model = XGBClassifier(max_depth = max_depth, min_child_weight = min_child_weight)
        model.fit(X_train_norm, Y_train)
        print("max depth = ", max_depth, "min child weight =", min_child_weight, model.score(X_val_norm, Y_val))

In [None]:
# Final fit WITHOUT oversampling: training and validation rows are pooled,
# so the data is re-split 80/20 into train and test.
X_train_, X_test_, Y_train_, Y_test_ = train_test_split(x_df, y_df, test_size = 0.2, random_state = 0)
X_train = X_train_.set_index('EMPI')
Y_train = Y_train_.set_index('EMPI')
X_test = X_test_.set_index('EMPI')
Y_test = Y_test_.set_index('EMPI')

# Refit the scaler on the new training features, then apply the training parameters to the test features
scaler = StandardScaler().fit(X_train)
X_train_norm = scaler.transform(X_train)
X_test_norm = scaler.transform(X_test)

In [None]:
#Oversampling, afterwards we retrain model on the training and validation datasets combined
# turns into a 80-20 split
from imblearn.over_sampling import RandomOverSampler
# random_state is fixed (like the train_test_split call) so the oversampled training set,
# and therefore every reported metric, is the same on each run of the notebook.
oversample = RandomOverSampler(sampling_strategy='minority', random_state=0)
X_train_, X_test_, Y_train_, Y_test_ = train_test_split(x_df, y_df, test_size = 0.2, random_state = 0)
# only the training split is resampled; the test split keeps its original rows
X_train, Y_train = oversample.fit_resample(X_train_, Y_train_['bcr_ind'])
Y_test = Y_test_.set_index('EMPI')
X_test = X_test_.set_index('EMPI')
X_train = X_train.set_index('EMPI')
scaler = StandardScaler()
X_train_norm = scaler.fit_transform(X_train)

# Normalize the test data by transforming it via the training dataset parameters
X_test_norm = scaler.transform(X_test)

In [None]:
# Final logistic regression: l1 penalty, C = 0.1, liblinear solver
clf_best = LogisticRegression(penalty = "l1", C = .1, solver ="liblinear")
clf_best.fit(X_train_norm, Y_train)

# Accuracy on the held-out test data
print(clf_best.score(X_test_norm, Y_test))

In [None]:
# Final XGBoost model: max_depth = 10, min_child_weight = 1
model_best = XGBClassifier(max_depth = 10, min_child_weight = 1)
model_best.fit(X_train_norm, Y_train)

# Accuracy on the held-out test data
print(model_best.score(X_test_norm, Y_test))

In [None]:
# Calculate the ROC AUC of the best Logistic Regression model on the test set,
# scored from column 1 (the positive-class column) of predict_proba
from sklearn.metrics import roc_auc_score
roc_auc_score(Y_test, clf_best.predict_proba(X_test_norm)[:, 1])

In [None]:
# Calculate the ROC AUC of the best XGBoost model on the test set,
# scored from column 1 (the positive-class column) of predict_proba
from sklearn.metrics import roc_auc_score
roc_auc_score(Y_test, model_best.predict_proba(X_test_norm)[:, 1])

In [None]:
# Make confusion matrix of predictions vs reality for best logistic regression model
from sklearn.metrics import confusion_matrix
import numpy as np

# Hard 0/1 predictions: the positive-class probability rounded to the nearest integer
test_ypred = clf_best.predict_proba(X_test_norm)[:, 1].round()

# Authors' note on the result: there seem to be too few data points; the model predicts
# everything as no recurrence because recurrence is so unlikely in this dataset.
confusion_matrix(Y_test, test_ypred)

In [None]:
# Make confusion matrix of predictions vs reality for best XGBoost model
from sklearn.metrics import confusion_matrix
import numpy as np

# Hard 0/1 predictions from XGBoost: positive-class probability rounded to the nearest integer.
# This reassigns test_ypred, replacing the logistic regression predictions.
test_ypred = model_best.predict_proba(X_test_norm)[:, 1].round()

confusion_matrix(Y_test, test_ypred)

In [None]:
# Plot the ROC curve for best logistic regression model
import matplotlib.pyplot as plt  
from sklearn import datasets, metrics, model_selection, svm

# metrics.plot_roc_curve was deprecated in scikit-learn 1.0 and removed in 1.2, so the curve
# is built from roc_curve/auc, which exist in both old and new releases.
lr_roc_scores = clf_best.predict_proba(X_test_norm)[:, 1]
lr_fpr, lr_tpr, lr_roc_thresholds = metrics.roc_curve(Y_test, lr_roc_scores, pos_label=clf_best.classes_[1])

# same content as the old helper's figure: one curve labelled with the estimator name and its AUC
lr_fig, lr_ax = plt.subplots()
lr_ax.plot(lr_fpr, lr_tpr, label="%s (AUC = %0.2f)" % (type(clf_best).__name__, metrics.auc(lr_fpr, lr_tpr)))
lr_ax.set_xlabel("False Positive Rate")
lr_ax.set_ylabel("True Positive Rate")
lr_ax.legend(loc="lower right")
plt.show()

In [None]:
# Plot the ROC curve for best XGBoost model
import matplotlib.pyplot as plt  
from sklearn import datasets, metrics, model_selection, svm

# metrics.plot_roc_curve was deprecated in scikit-learn 1.0 and removed in 1.2, so the curve
# is built from roc_curve/auc, which exist in both old and new releases.
xgb_roc_scores = model_best.predict_proba(X_test_norm)[:, 1]
xgb_fpr, xgb_tpr, xgb_roc_thresholds = metrics.roc_curve(Y_test, xgb_roc_scores, pos_label=model_best.classes_[1])

# same content as the old helper's figure: one curve labelled with the estimator name and its AUC
xgb_fig, xgb_ax = plt.subplots()
xgb_ax.plot(xgb_fpr, xgb_tpr, label="%s (AUC = %0.2f)" % (type(model_best).__name__, metrics.auc(xgb_fpr, xgb_tpr)))
xgb_ax.set_xlabel("False Positive Rate")
xgb_ax.set_ylabel("True Positive Rate")
xgb_ax.legend(loc="lower right")
plt.show()

In [None]:
# Calculate the F1 score of the hard predictions in test_ypred.
# NOTE(review): test_ypred is whatever the most recently run confusion-matrix cell assigned -
# the XGBoost predictions when the cells are run top to bottom, not clf_best's. Confirm which
# model this number is meant to describe.
from sklearn.metrics import f1_score
f1_score(Y_test, test_ypred) # low

In [None]:
# Optimal Threshold Tuning
# Search classification thresholds (imbalanced classes) for the best logistic regression model
from numpy import arange
from numpy import argmax
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

def to_labels(pos_probs, threshold):
    """Return 1 where pos_probs >= threshold and 0 elsewhere, as an int array."""
    return (pos_probs >= threshold).astype('int')

# column 1 of predict_proba is used as the positive-outcome probability
yhat = clf_best.predict_proba(X_test_norm)
probs = yhat[:, 1]

# candidate thresholds 0.000, 0.001, ..., 0.999
thresholds = arange(0, 1, 0.001)

# F1 on the test labels at every candidate threshold
scores = []
for cutoff in thresholds:
    scores.append(f1_score(Y_test, to_labels(probs, cutoff)))

# index of the highest F1; the hospital and race cells further down reuse ix_best
# NOTE(review): the threshold is selected on the test set itself - confirm this is intended
ix_best = argmax(scores)
best_threshold = thresholds[ix_best]
print('Threshold=%.3f, F-Score=%.5f' % (best_threshold, scores[ix_best]))
print(confusion_matrix(Y_test, to_labels(probs, best_threshold)))

In [None]:
# Optimal Threshold for Precision-Recall Curve
# pr curve for logistic regression model
from sklearn.metrics import precision_recall_curve
from matplotlib import pyplot

# Precision and recall at every threshold returned by precision_recall_curve
precision, recall, thresholds = precision_recall_curve(Y_test, probs)

# drop the final precision/recall point so both arrays can be plotted against thresholds
precision, recall = precision[:-1], recall[:-1]

# precision and recall as functions of the decision threshold (this is not a ROC curve)
for curve, curve_label in ((precision, 'Precision'), (recall, 'Recall')):
    pyplot.plot(thresholds, curve, marker='.', label=curve_label)

# axis labels and legend
pyplot.xlabel('Threshold')
pyplot.ylabel('Precision/Recall')
pyplot.legend()

# show the plot
pyplot.show()

# Check Performance on Different Hospitals

In [None]:
# Import hospital data from the four diagnosis extracts (pipe-delimited text, header on the first row)
diagnosis = pd.read_csv('Raw Data/First/txt/KS185_20200918_114153_Dia.txt', sep="|", header=0, low_memory=False)
diagnosis_second = pd.read_csv('Raw Data/Second/txt/KS185_20200918_114153_Dia.txt', sep="|", header=0, low_memory=False)
diagnosis_third = pd.read_csv('Raw Data/Third/txt/KS185_20200918_114153_Dia.txt', sep="|", header=0, low_memory=False)
diagnosis_fourth = pd.read_csv('Raw Data/Fourth/txt/KS185_20200918_114153_Dia.txt', sep="|", header=0, low_memory=False)
# Stack the four extracts. DataFrame.append was removed in pandas 2.0; pd.concat with the
# same sort=True gives the same stacked frame and works on old and new pandas alike.
diamerged = pd.concat([diagnosis, diagnosis_second, diagnosis_third, diagnosis_fourth], sort=True)
diamerged.head()

In [None]:
# Reduce the diagnosis rows to one (EMPI, Hospital) pair per EMPI.
# drop_duplicates keeps the first row encountered for each EMPI.
empi_hospital = diamerged.loc[:, ["EMPI", "Hospital"]]
print(empi_hospital.shape[0])   # rows before de-duplication
diamerged2 = empi_hospital.drop_duplicates(subset="EMPI")
print(diamerged2.shape[0])      # rows after de-duplication
hospital_col = diamerged2["Hospital"]
print(hospital_col.unique())
print(hospital_col.value_counts())
diamerged2.head()

In [None]:
# Attach a Hospital column to the test features and outcomes by joining on EMPI
# (default inner join); row counts are printed before and after the join
print(len(X_test_))
print(len(Y_test_))

X_test_hospital = pd.merge(X_test_, diamerged2, on="EMPI")
Y_test_hospital = pd.merge(Y_test_, diamerged2, on="EMPI")

print(X_test_hospital.shape[0])
print(Y_test_hospital.shape[0])
X_test_hospital.head()

In [None]:
# Split the hospital-annotated test data into MGH and BWH subsets (the two biggest hospitals).
# Row labels are looked up in the feature table and then reused to pick the matching outcome rows.

# mgh
mgh_indices = X_test_hospital.index[X_test_hospital["Hospital"].eq("MGH")].tolist()
# EMPI and Hospital are removed so only the model's feature columns go through the scaler
X_test_mgh = X_test_hospital.loc[mgh_indices].drop(columns=["EMPI", "Hospital"])
X_test_mgh_norm = scaler.transform(X_test_mgh)
Y_test_mgh = Y_test_hospital.loc[mgh_indices].drop(columns=["EMPI", "Hospital"])

# bwh
bwh_indices = X_test_hospital.index[X_test_hospital["Hospital"].eq("BWH")].tolist()
X_test_bwh = X_test_hospital.loc[bwh_indices].drop(columns=["EMPI", "Hospital"])
X_test_bwh_norm = scaler.transform(X_test_bwh)
Y_test_bwh = Y_test_hospital.loc[bwh_indices].drop(columns=["EMPI", "Hospital"])


In [None]:
# Number of test rows in each hospital subset
for hospital_name, hospital_frame in (("mgh", X_test_mgh), ("bwh", X_test_bwh)):
    print(hospital_name, len(hospital_frame.index))

In [None]:
# Test accuracy of clf_best per hospital, printed in the order MGH then BWH
for features_norm, labels in ((X_test_mgh_norm, Y_test_mgh), (X_test_bwh_norm, Y_test_bwh)):
    print(clf_best.score(features_norm, labels))

In [None]:
# Hard 0/1 predictions per hospital: positive-class probability rounded to the nearest integer
test_ypred_mgh = clf_best.predict_proba(X_test_mgh_norm)[:, 1].round()
test_ypred_bwh = clf_best.predict_proba(X_test_bwh_norm)[:, 1].round()

In [None]:
# F1 score of the rounded predictions for each hospital
for hospital_name, labels, predictions in (
    ("mgh", Y_test_mgh, test_ypred_mgh),
    ("bwh", Y_test_bwh, test_ypred_bwh),
):
    print(hospital_name, f1_score(labels, predictions))

In [None]:
# Threshold check for MGH
# Score every threshold for the best overall-population logistic regression model, then report the F1 at the overall-population best threshold (ix_best)
from numpy import arange
from numpy import argmax
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

def to_labels(pos_probs, threshold):
    """Return 1 where pos_probs >= threshold and 0 elsewhere, as an int array."""
    return (pos_probs >= threshold).astype('int')

# positive-outcome (column 1) probabilities for the MGH test rows
yhat_mgh = clf_best.predict_proba(X_test_mgh_norm)
probs_mgh = yhat_mgh[:, 1]

# candidate thresholds 0.000, 0.001, ..., 0.999
thresholds = arange(0, 1, 0.001)

# F1 on the MGH labels at every candidate threshold
scores = []
for cutoff in thresholds:
    scores.append(f1_score(Y_test_mgh, to_labels(probs_mgh, cutoff)))

# ix_best is NOT recomputed here: it is the index found by the overall-population search,
# so this prints the MGH F1 at the threshold chosen for all populations
print('Threshold=%.3f, F-Score=%.5f' % (thresholds[ix_best], scores[ix_best]))

In [None]:
# Threshold check for BWH
# Score every threshold for the best overall-population logistic regression model, then report the F1 at the overall-population best threshold (ix_best)
from numpy import arange
from numpy import argmax
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

def to_labels(pos_probs, threshold):
    """Return 1 where pos_probs >= threshold and 0 elsewhere, as an int array."""
    return (pos_probs >= threshold).astype('int')

# positive-outcome (column 1) probabilities for the BWH test rows
yhat = clf_best.predict_proba(X_test_bwh_norm)
probs = yhat[:, 1]

# candidate thresholds 0.000, 0.001, ..., 0.999
thresholds = arange(0, 1, 0.001)

# F1 on the BWH labels at every candidate threshold
scores = []
for cutoff in thresholds:
    scores.append(f1_score(Y_test_bwh, to_labels(probs, cutoff)))

# ix_best is NOT recomputed here: it still holds the index from the overall-population search,
# so this prints the BWH F1 at that overall threshold
print('Threshold=%.3f, F-Score=%.5f' % (thresholds[ix_best], scores[ix_best]))

# Check Performance on Different Races

In [None]:
# Split the test set into one subset per race indicator column.
# For each group: find the index labels where the indicator equals 1, select those feature
# rows, scale them with the already-fitted scaler, and select the same labels from Y_test.

# white
white_indices = X_test.index[X_test["White"].eq(1)].tolist()
X_test_white, Y_test_white = X_test.loc[white_indices], Y_test.loc[white_indices]
X_test_white_norm = scaler.transform(X_test_white)

# black
black_indices = X_test.index[X_test["Black"].eq(1)].tolist()
X_test_black, Y_test_black = X_test.loc[black_indices], Y_test.loc[black_indices]
X_test_black_norm = scaler.transform(X_test_black)

# asian
asian_indices = X_test.index[X_test["Asian"].eq(1)].tolist()
X_test_asian, Y_test_asian = X_test.loc[asian_indices], Y_test.loc[asian_indices]
X_test_asian_norm = scaler.transform(X_test_asian)

# hispanic
hispanic_indices = X_test.index[X_test["Hispanic"].eq(1)].tolist()
X_test_hispanic, Y_test_hispanic = X_test.loc[hispanic_indices], Y_test.loc[hispanic_indices]
X_test_hispanic_norm = scaler.transform(X_test_hispanic)


In [None]:
# Number of test rows in each race subset
for group_name, group_frame in (
    ("white", X_test_white),
    ("black", X_test_black),
    ("asian", X_test_asian),
    ("hispanic", X_test_hispanic),
):
    print(group_name, len(group_frame.index))

In [None]:
# Test accuracy of clf_best within each race subset
for group_name, features_norm, labels in (
    ("white", X_test_white_norm, Y_test_white),
    ("black", X_test_black_norm, Y_test_black),
    ("asian", X_test_asian_norm, Y_test_asian),
    ("hispanic", X_test_hispanic_norm, Y_test_hispanic),
):
    print(group_name, clf_best.score(features_norm, labels))

In [None]:
# Hard 0/1 predictions per race subset: positive-class probability rounded to the nearest integer
test_ypred_white = clf_best.predict_proba(X_test_white_norm)[:, 1].round()
test_ypred_black = clf_best.predict_proba(X_test_black_norm)[:, 1].round()
test_ypred_asian = clf_best.predict_proba(X_test_asian_norm)[:, 1].round()
test_ypred_hispanic = clf_best.predict_proba(X_test_hispanic_norm)[:, 1].round()


In [None]:
# F1 score of the rounded predictions within each race subset
for group_name, labels, predictions in (
    ("white", Y_test_white, test_ypred_white),
    ("black", Y_test_black, test_ypred_black),
    ("asian", Y_test_asian, test_ypred_asian),
    ("hispanic", Y_test_hispanic, test_ypred_hispanic),
):
    print(group_name, f1_score(labels, predictions))

In [None]:
# Threshold check for White
# Score every threshold for the best overall-population logistic regression model, then report the F1 at the overall-population best threshold (ix_best)
from numpy import arange
from numpy import argmax
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

def to_labels(pos_probs, threshold):
    """Return 1 where pos_probs >= threshold and 0 elsewhere, as an int array."""
    return (pos_probs >= threshold).astype('int')

# positive-outcome (column 1) probabilities for the White test rows
yhat = clf_best.predict_proba(X_test_white_norm)
probs = yhat[:, 1]

# candidate thresholds 0.000, 0.001, ..., 0.999
thresholds = arange(0, 1, 0.001)

# F1 on the White labels at every candidate threshold
scores = []
for cutoff in thresholds:
    scores.append(f1_score(Y_test_white, to_labels(probs, cutoff)))

# ix_best is NOT recomputed here: it still holds the index from the overall-population search,
# so this prints the White F1 at that overall threshold
print('Threshold=%.3f, F-Score=%.5f' % (thresholds[ix_best], scores[ix_best]))

In [None]:
# Threshold check for Black
# Score every threshold for the best overall-population logistic regression model, then report the F1 at the overall-population best threshold (ix_best)
from numpy import arange
from numpy import argmax
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

def to_labels(pos_probs, threshold):
    """Return 1 where pos_probs >= threshold and 0 elsewhere, as an int array."""
    return (pos_probs >= threshold).astype('int')

# positive-outcome (column 1) probabilities for the Black test rows
yhat = clf_best.predict_proba(X_test_black_norm)
probs = yhat[:, 1]

# candidate thresholds 0.000, 0.001, ..., 0.999
thresholds = arange(0, 1, 0.001)

# F1 on the Black labels at every candidate threshold
scores = []
for cutoff in thresholds:
    scores.append(f1_score(Y_test_black, to_labels(probs, cutoff)))

# ix_best is NOT recomputed here: it still holds the index from the overall-population search,
# so this prints the Black F1 at that overall threshold
print('Threshold=%.3f, F-Score=%.5f' % (thresholds[ix_best], scores[ix_best]))

In [None]:
# Threshold check for Asian
# Score every threshold for the best overall-population logistic regression model, then report the F1 at the overall-population best threshold (ix_best)
from numpy import arange
from numpy import argmax
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

def to_labels(pos_probs, threshold):
    """Return 1 where pos_probs >= threshold and 0 elsewhere, as an int array."""
    return (pos_probs >= threshold).astype('int')

# positive-outcome (column 1) probabilities for the Asian test rows
yhat = clf_best.predict_proba(X_test_asian_norm)
probs = yhat[:, 1]

# candidate thresholds 0.000, 0.001, ..., 0.999
thresholds = arange(0, 1, 0.001)

# F1 on the Asian labels at every candidate threshold
scores = []
for cutoff in thresholds:
    scores.append(f1_score(Y_test_asian, to_labels(probs, cutoff)))

# ix_best is NOT recomputed here: it still holds the index from the overall-population search,
# so this prints the Asian F1 at that overall threshold
print('Threshold=%.3f, F-Score=%.5f' % (thresholds[ix_best], scores[ix_best]))

In [None]:
# Threshold check for Hispanic
# Score every threshold for the best overall-population logistic regression model, then report the F1 at the overall-population best threshold (ix_best)
from numpy import arange
from numpy import argmax
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

def to_labels(pos_probs, threshold):
    """Return 1 where pos_probs >= threshold and 0 elsewhere, as an int array."""
    return (pos_probs >= threshold).astype('int')

# positive-outcome (column 1) probabilities for the Hispanic test rows
yhat = clf_best.predict_proba(X_test_hispanic_norm)
probs = yhat[:, 1]

# candidate thresholds 0.000, 0.001, ..., 0.999
thresholds = arange(0, 1, 0.001)

# F1 on the Hispanic labels at every candidate threshold
scores = []
for cutoff in thresholds:
    scores.append(f1_score(Y_test_hispanic, to_labels(probs, cutoff)))

# ix_best is NOT recomputed here: it still holds the index from the overall-population search,
# so this prints the Hispanic F1 at that overall threshold
print('Threshold=%.3f, F-Score=%.5f' % (thresholds[ix_best], scores[ix_best]))

# Demographics in Training and Test Sets

In [None]:
# Row count of the test feature table (one row per test patient record)
X_test.shape[0]

In [None]:
# Row count of the training feature table (includes duplicated rows if oversampling was used)
X_train.shape[0]

In [None]:
# finding mean and confidence intervals by race in training set
import math
# For each race indicator in the training set, print "lower,upper" of mean +/- 1.96 * std / sqrt(n)
for var in ('White', 'Black', 'Hispanic', 'Asian'):
    column = X_train[var]
    mean = column.mean()
    std = column.std()
    half_width = 1.96*std/math.sqrt(len(column))
    print(str(mean-half_width) + "," + str(mean+half_width))

In [None]:
# finding mean and confidence intervals by race in test set
import math
# For each race indicator in the test set, print "lower,upper" of mean +/- 1.96 * std / sqrt(n)
for var in ('White', 'Black', 'Hispanic', 'Asian'):
    column = X_test[var]
    mean = column.mean()
    std = column.std()
    half_width = 1.96*std/math.sqrt(len(column))
    print(str(mean-half_width) + "," + str(mean+half_width))

In [None]:
# Training set: print "lower,upper" of mean +/- 1.96 * std / sqrt(n) for each listed feature
for var in ('overall_grade_group', 'Age at RP', 'psa_prior_to_rp', 'margin', 'pt2', 'pt3a', 'pt3b'):
    column = X_train[var]
    mean = column.mean()
    std = column.std()
    half_width = 1.96*std/math.sqrt(len(column))
    print(str(mean-half_width) + "," + str(mean+half_width))

In [None]:
# Test set: print "lower,upper" of mean +/- 1.96 * std / sqrt(n) for each listed feature
for var in ('overall_grade_group', 'Age at RP', 'psa_prior_to_rp', 'margin', 'pt2', 'pt3a', 'pt3b'):
    column = X_test[var]
    mean = column.mean()
    std = column.std()
    half_width = 1.96*std/math.sqrt(len(column))
    print(str(mean-half_width) + "," + str(mean+half_width))

In [None]:
# Training set: for grade groups 1..5, print "lower,upper" of the interval
# mean +/- 1.96 * std / sqrt(n) around the share of rows in that group
for var in range(1, 6):
    temp = X_train['overall_grade_group'] == var   # boolean indicator: row is in this grade group
    mean = temp.mean()
    std = temp.std()
    half_width = 1.96*std/math.sqrt(len(temp))
    print(str(mean-half_width) + "," + str(mean+half_width))

In [None]:
# Test set: for grade groups 1..5, print "lower,upper" of the interval
# mean +/- 1.96 * std / sqrt(n) around the share of rows in that group
for var in range(1, 6):
    temp = X_test['overall_grade_group'] == var   # boolean indicator: row is in this grade group
    mean = temp.mean()
    std = temp.std()
    half_width = 1.96*std/math.sqrt(len(temp))
    print(str(mean-half_width) + "," + str(mean+half_width))

In [None]:
# Keep only training rows with a non-null diab_agg value (X_train is overwritten in place).
# NOTE(review): diab_agg must already be a column of X_train for this to run - confirm its source.
X_train = X_train.loc[X_train['diab_agg'].notnull()]

In [None]:
# Training set: mean of diab_agg, then "lower,upper" of mean +/- 1.96 * std / sqrt(n)
diab_train = X_train['diab_agg']
mean = diab_train.mean()
std = diab_train.std()
print(mean)
half_width = 1.96*std/math.sqrt(len(diab_train))
print(str(mean-half_width) + "," + str(mean+half_width))

In [None]:
# Keep only test rows with a non-null diab_agg value (X_test is overwritten in place).
# NOTE(review): diab_agg must already be a column of X_test for this to run - confirm its source.
X_test = X_test.loc[X_test['diab_agg'].notnull()]

In [None]:
# Test set: mean of diab_agg, then "lower,upper" of mean +/- 1.96 * std / sqrt(n)
diab_test = X_test['diab_agg']
mean = diab_test.mean()
std = diab_test.std()
print(mean)
half_width = 1.96*std/math.sqrt(len(diab_test))
print(str(mean-half_width) + "," + str(mean+half_width))