# Building Predictive Models

In [1]:
# imports
import pandas as pd
import numpy as np
import sklearn
import os

## Import Data

Train and test data were randomly split within R, using 0.80 ratio.
<br>The two dataframes were written to independent csv files, and will be brought into the Python notebook now.

In [27]:
# set path to processed train/test data
processed_data_path = os.path.join(os.path.pardir, 'data', 'processed')
train_file_path = os.path.join(processed_data_path, 'train.txt')
test_file_path = os.path.join(processed_data_path, 'test.txt')

In [28]:
df_train = pd.read_csv(train_file_path, index_col='Obs')
df_test = pd.read_csv(test_file_path, index_col='Obs')

In [29]:
df_train.info()
df_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 76 entries, 1 to 97
Data columns (total 9 columns):
Unnamed: 0                    76 non-null int64
Y_HighGradeCancer             76 non-null int64
PSALevel                      76 non-null float64
CancerVol                     76 non-null float64
Weight                        76 non-null float64
Age                           76 non-null float64
BenignProstaticHyperplasia    76 non-null float64
SeminalVesicleInvasion        76 non-null int64
CapsularPenetration           76 non-null float64
dtypes: float64(6), int64(3)
memory usage: 5.9 KB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 21 entries, 5 to 95
Data columns (total 9 columns):
Unnamed: 0                    21 non-null int64
Y_HighGradeCancer             21 non-null int64
PSALevel                      21 non-null float64
CancerVol                     21 non-null float64
Weight                        21 non-null float64
Age                           21 non-null float64
Benign

In [30]:
df_train.columns

Index(['Unnamed: 0', 'Y_HighGradeCancer', 'PSALevel', 'CancerVol', 'Weight',
       'Age', 'BenignProstaticHyperplasia', 'SeminalVesicleInvasion',
       'CapsularPenetration'],
      dtype='object')

In [31]:
# drop the redudent columns (R auto-created an index column of its own)
df_train = df_train.drop(columns='Unnamed: 0')
df_test = df_test.drop(columns='Unnamed: 0')

In [35]:
# examine train set
df_train.head()

Unnamed: 0_level_0,Y_HighGradeCancer,PSALevel,CancerVol,Weight,Age,BenignProstaticHyperplasia,SeminalVesicleInvasion,CapsularPenetration
Obs,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,0,-2.5337,-1.645747,-0.649508,-1.872101,-0.840562,0,-0.596573
2,0,-2.29925,-1.995368,-0.392167,-0.791989,-0.840562,0,-0.596573
3,0,-2.29925,-1.586043,-0.676493,1.368234,-0.840562,0,-0.596573
4,0,-2.29925,-2.174506,-0.416007,-0.791989,-0.840562,0,-0.596573
6,0,-1.488689,-2.046685,-0.44451,-1.872101,-0.840562,0,-0.596573


In [33]:
# examine test set
df_test.head()

Unnamed: 0_level_0,Y_HighGradeCancer,PSALevel,CancerVol,Weight,Age,BenignProstaticHyperplasia,SeminalVesicleInvasion,CapsularPenetration
Obs,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
5,0,-1.837148,-0.511447,-0.321415,-0.251933,-0.840562,0,-0.596573
8,0,-1.418947,-0.562625,-0.24246,-0.791989,0.706307,0,-0.596573
14,0,-0.983519,0.111131,-0.558742,0.423137,-0.840562,0,-0.596573
17,0,-0.878912,-1.509353,-0.257481,0.828178,0.30538,0,-0.450762
23,0,-0.678455,-1.611706,-0.354536,-0.656975,-0.691566,0,-0.596573


In [37]:
# create a list which capture fields to ommit from model
skip = ['Y_HighGradeCancer', 'Weight', 'BenignProstaticHyperplasia', 
        'SeminalVesicleInvasion', 'CapsularPenetration'
       ]
cols_model = [col for col in df_train.columns if col not in skip]
cols_model

['PSALevel', 'CancerVol', 'Age']

## Data Preperation

In [245]:
X_model = df.loc[:, cols_model]
y = df['Y_HighGradeCancer']

In [246]:
print(X_model.shape, y.shape)

(97, 2) (97,)


In [247]:
# train-test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_model, y, test_size=0.1, random_state=0)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(87, 2) (87,)
(10, 2) (10,)


In [248]:
# average survival in train and test sets
print(f'Mean y in train set: {round(np.mean(y_train), 3)}')
print(f'Mean y in test set: {round(np.mean(y_test), 3)}')

Mean y in train set: 0.218
Mean y in test set: 0.2


## Baseline Model

In [213]:
# import function
from sklearn.dummy import DummyClassifier

In [214]:
# create model
# because mean y in train = 0.234 (shown above), this "most frequent" model will predict y=0 for all test observations
model_dummy = DummyClassifier(strategy='most_frequent', random_state=0)

In [215]:
# train model
model_dummy.fit(X_train, y_train)

DummyClassifier(constant=None, random_state=0, strategy='most_frequent')

In [216]:
print(f'Score for baseline model: {round(model_dummy.score(X_test, y_test), 2)}')

Score for baseline model: 0.8


In [217]:
model_dummy.predict(X_test)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)

In [218]:
# performance metrics
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score

In [219]:
# confusion matrix
print(f'Confusion matrix for baseline model: \n {confusion_matrix(y_test, model_dummy.predict(X_test))}')

Confusion matrix for baseline model: 
 [[8 0]
 [2 0]]


In [220]:
# precision and recall scores
print(f'Precision for baseline model: {round(precision_score(y_test, model_dummy.predict(X_test)), 2)}')
print(f'Recall for baseline model: {round(recall_score(y_test, model_dummy.predict(X_test)), 2)}')

Precision for baseline model: 0.0
Recall for baseline model: 0.0


  'precision', 'predicted', average, warn_for)


## Logistic Regression Model

In [221]:
# import function
from sklearn.linear_model import LogisticRegression

In [222]:
# create model
model_lr_1 = LogisticRegression(random_state=0)

In [223]:
# train model
model_lr_1.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=0, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [224]:
# evaluate model
print(f'Score for logistic regression - version 1: {round(model_lr_1.score(X_test, y_test), 4)}')

Score for logistic regression - version 1: 0.9


In [225]:
print(f'Intercept coefficient is: {model_lr_1.intercept_}')
print(f'Slope coefficients are: {model_lr_1.coef_}')

Intercept coefficient is: [-0.6957814]
Slope coefficients are: [[-0.6957814   1.08641739  0.94667516]]


In [226]:
model_lr_1.predict(X_test)

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0], dtype=int64)

In [227]:
y_test

Obs
27    0
85    1
3     0
56    0
69    0
82    0
17    0
79    0
55    1
67    0
Name: Y_HighGradeCancer, dtype: int64

In [228]:
model_lr_1.predict(X_test) == y_test

Obs
27     True
85    False
3      True
56     True
69     True
82     True
17     True
79     True
55     True
67     True
Name: Y_HighGradeCancer, dtype: bool

## STATSMODELS Library

### Full Logistics Model

In [229]:
import statsmodels.api as sm

In [249]:
X_model = sm.add_constant(X_model)
model = sm.Logit(y, X_model)

In [250]:
results = model.fit()

Optimization terminated successfully.
         Current function value: 0.353732
         Iterations 7


In [251]:
### full model statistical output
print(results.summary2(alpha=0.10))

                          Results: Logit
Model:              Logit             Pseudo R-squared: 0.323     
Dependent Variable: Y_HighGradeCancer AIC:              74.6239   
Date:               2020-10-08 00:46  BIC:              82.3481   
No. Observations:   97                Log-Likelihood:   -34.312   
Df Model:           2                 LL-Null:          -50.676   
Df Residuals:       94                LLR p-value:      7.8177e-08
Converged:          1.0000            Scale:            1.0000    
No. Iterations:     7.0000                                        
--------------------------------------------------------------------
            Coef.    Std.Err.      z      P>|z|     [0.05     0.95] 
--------------------------------------------------------------------
const      -1.4285     0.3407   -4.1932   0.0000   -1.9889   -0.8682
PSALevel    2.7969     0.7660    3.6513   0.0003    1.5369    4.0568
Age         0.6828     0.3803    1.7954   0.0726    0.0573    1.3084

