# Exploring and Processing Data

In [83]:
# imports
import pandas as pd
import numpy as np
import os

## Import Data

In [84]:
# locate the raw data folder (one level up from the notebook) and the data file inside it
raw_data_path = os.path.join(os.pardir, 'data', 'raw')
data_file_path = os.path.join(raw_data_path, 'APPENC05.txt')

In [90]:
# preview the raw .txt file to inspect its layout before parsing it;
# the context manager closes the file handle even if reading or printing fails
with open(data_file_path, 'r') as f:
    print(f.read(300))  # print the first 300 characters

   1     0.651    0.5599    15.959    50    0.0000    0    0.0000    6
   2     0.852    0.3716    27.660    58    0.0000    0    0.0000    7
   3     0.852    0.6005    14.732    74    0.0000    0    0.0000    7
   4     0.852    0.3012    26.576    58    0.0000    0    0.0000    6
   5     1.448  


In [110]:
# column headers to assign, in the order the fields appear in the file;
# 'obs' is used as the row index of the resulting dataframe
cols = [
    'obs',
    'psa_level',
    'cancer_vol',
    'weight',
    'age',
    'benign_prostatic_hyperlasia',
    'seminal_vesicle_invasion',
    'capsular_penetration',
    'gleason_score',
]

# parse the fixed-width text file into a pandas dataframe indexed by 'obs'
df = pd.read_fwf(data_file_path, index_col='obs', names=cols)

## Basic Structure

In [140]:
# Add a binary response variable Y ("high-grade cancer"):
#   Y = 1 when the Gleason score equals 8,
#   Y = 0 otherwise (i.e., when the Gleason score equals 6 or 7).
is_high_grade = df['gleason_score'] == 8

df['Y_high_grade_cancer'] = np.where(is_high_grade, 1, 0)

In [141]:
# preview the first 5 rows of the dataframe
df.head(n=5)

Unnamed: 0_level_0,psa_level,cancer_vol,weight,age,benign_prostatic_hyperlasia,seminal_vesicle_invasion,capsular_penetration,gleason_score,Y_high_grade_cancer
obs,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,0.651,0.5599,15.959,50,0.0,0,0.0,6,0
2,0.852,0.3716,27.66,58,0.0,0,0.0,7,0
3,0.852,0.6005,14.732,74,0.0,0,0.0,7,0
4,0.852,0.3012,26.576,58,0.0,0,0.0,6,0
5,1.448,2.117,30.877,62,0.0,0,0.0,6,0


In [121]:
# preview the last 5 rows of the dataframe
df.tail(n=5)

Unnamed: 0_level_0,psa_level,cancer_vol,weight,age,benign_prostatic_hyperlasia,seminal_vesicle_invasion,capsular_penetration,gleason_score,Y
obs,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
93,80.64,16.9455,48.424,68,0.0,1,3.7434,8,1
94,107.77,45.6042,49.402,44,0.0,1,8.7583,8,1
95,170.716,18.3568,29.964,52,0.0,1,11.7048,8,1
96,239.847,17.8143,43.38,68,4.7588,1,4.7588,8,1
97,265.072,32.1367,52.985,68,1.5527,1,18.1741,8,1


In [123]:
# use .info() to get basic information about the dataframe:
# index range, per-column non-null counts and dtypes, and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 97 entries, 1 to 97
Data columns (total 9 columns):
psa_level                      97 non-null float64
cancer_vol                     97 non-null float64
weight                         97 non-null float64
age                            97 non-null int64
benign_prostatic_hyperlasia    97 non-null float64
seminal_vesicle_invasion       97 non-null int64
capsular_penetration           97 non-null float64
gleason_score                  97 non-null int64
Y                              97 non-null int32
dtypes: float64(5), int32(1), int64(3)
memory usage: 7.2 KB


In [132]:
# count observations by whether the Gleason score equals 8 (high-grade) or not
is_gleason_eight = df['gleason_score'] == 8
gleason_eight = int(is_gleason_eight.sum())
gleason_not_eight = int((~is_gleason_eight).sum())

print(f'Count of high-grade cancer: {gleason_eight}')
print(f'Count of non high-grade cancer: {gleason_not_eight}')

Count of high-grade cancer: 21
Count of non high-grade cancer: 76


## Summary Statistics

In [124]:
# use .describe() to view summary statistics (count, mean, std, min, quartiles, max)
# for all numerical columns
df.describe()

Unnamed: 0,psa_level,cancer_vol,weight,age,benign_prostatic_hyperlasia,seminal_vesicle_invasion,capsular_penetration,gleason_score,Y
count,97.0,97.0,97.0,97.0,97.0,97.0,97.0,97.0,97.0
mean,23.730134,6.998682,45.491361,63.865979,2.534725,0.216495,2.245367,6.876289,0.216495
std,40.782925,7.880869,45.705053,7.445117,3.031176,0.413995,3.783329,0.739619,0.413995
min,0.651,0.2592,10.697,41.0,0.0,0.0,0.0,6.0,0.0
25%,5.641,1.6653,29.371,60.0,0.0,0.0,0.0,6.0,0.0
50%,13.33,4.2631,37.338,65.0,1.3499,0.0,0.4493,7.0,0.0
75%,21.328,8.4149,48.424,68.0,4.7588,0.0,3.2544,7.0,0.0
max,265.072,45.6042,450.339,79.0,10.2779,1.0,18.1741,8.0,1.0
