# Data Exploration

The objective of this notebook is to explore the raw data and gather the descriptive statistics that will inform model building and post-analysis.

---------------------------------------

In [1]:
# Module imports
import os
import glob
import numpy as np
import pandas as pd
from scipy import stats
import math

In [2]:
# Constants
# Root directory under which all data lives; the entries below are relative
# to it. The trailing '/' is significant: later cells build full paths from
# these strings.
data_url = '/kaggle/dev/data-science-bowl-2017-data/'
# Relative directory of the sample images. NOTE: the validation cell rebinds
# this name to the verified full path.
sample_data = 'sample_images/'
# Relative name of the labels CSV file.
labels_data = 'stage1_labels.csv'
# Relative directory of the stage-1 data. NOTE: also rebound to the verified
# full path by the validation cell.
train_data = 'stage1/'


In [3]:
# Validating if files exist

def verify_location(loc, base=None):
    """Resolve ``loc`` against the data root and confirm that it exists.

    Parameters
    ----------
    loc : str
        File or directory name relative to the data root. An absolute path
        is used as-is, so passing an already-resolved path (for example when
        the calling cell is re-run) does not prefix the root a second time.
    base : str, optional
        Root directory to resolve against. Defaults to the notebook-level
        ``data_url`` constant.

    Returns
    -------
    str
        The resolved path.

    Raises
    ------
    FileNotFoundError
        If the resolved path is neither an existing directory nor an
        existing regular file. ``FileNotFoundError`` is a subclass of
        ``Exception``, so callers catching ``Exception`` are unaffected.
    """
    if base is None:
        base = data_url
    # os.path.join works whether or not ``base`` ends with a separator and
    # leaves an absolute ``loc`` untouched, unlike plain string concatenation.
    loc = os.path.join(base, loc)
    if os.path.isdir(loc) or os.path.isfile(loc):
        print('Found and verified location: ' + loc)
    else:
        raise FileNotFoundError('Failed to verify location: ' + loc)
    return loc
    
# Resolve and check each data location; every call prints a confirmation or
# raises. NOTE: sample_data and train_data are overwritten here, so after this
# cell they hold the value returned by verify_location rather than the
# relative names from the constants cell. `labels` receives the value returned
# for labels_data.
sample_data = verify_location(sample_data)
train_data = verify_location(train_data)
labels = verify_location(labels_data)

Found and verified location: /kaggle/dev/data-science-bowl-2017-data/sample_images/
Found and verified location: /kaggle/dev/data-science-bowl-2017-data/stage1/
Found and verified location: /kaggle/dev/data-science-bowl-2017-data/stage1_labels.csv


In [4]:
# Extracting patient scan data
def folder_explorer(folder):
    """Count the entries inside every sub-directory of ``folder``.

    Parameters
    ----------
    folder : str
        Directory whose immediate sub-directories are inspected (one
        sub-directory per patient in this notebook). A trailing path
        separator is optional.

    Returns
    -------
    dict
        Maps each sub-directory name to the number of entries it contains
        (this notebook treats that number as the patient's scan count).
    """
    patient_info = {}
    for name in os.listdir(folder):
        patient_dir = os.path.join(folder, name)
        # Stray files next to the patient folders (e.g. OS metadata files)
        # are not patients; listing them would raise NotADirectoryError.
        if not os.path.isdir(patient_dir):
            continue
        patient_info[name] = len(os.listdir(patient_dir))
    return patient_info

# Scan counts per patient: sample set
patient_scans_sample = folder_explorer(sample_data)
df_patient_scans_sample = pd.DataFrame(
    list(patient_scans_sample.items()),
    columns=["id", "scans-per-patient"],
)
patient_scans_sample_describe = df_patient_scans_sample.describe()

# Scan counts per patient: train set
patient_scans_train = folder_explorer(train_data)
df_patient_scans_train = pd.DataFrame(
    list(patient_scans_train.items()),
    columns=["id", "scans-per-patient"],
)
patient_scans_train_describe = df_patient_scans_train.describe()

# Print both summaries, each under its title, separated by blank lines.
print("Descriptive statistics for sample data:", patient_scans_sample_describe, sep="\n")
print("\n")
print("Descriptive statistics for train data:", patient_scans_train_describe, sep="\n")





Descriptive statistics for sample data:
       scans-per-patient
count           20.00000
mean           180.20000
std             75.23619
min            110.00000
25%            131.75000
50%            167.50000
75%            197.75000
max            435.00000


Descriptive statistics for train data:
       scans-per-patient
count        1595.000000
mean          178.921630
std            67.070653
min            94.000000
25%           138.500000
50%           160.000000
75%           190.000000
max           541.000000


In [5]:
# Adding Label Data
# Read the CSV at the path held in `labels` into a DataFrame.
df_labels = pd.read_csv(filepath_or_buffer=labels)

# Preview the first five rows (the default for head()).
df_labels.head(n=5)

Unnamed: 0,id,cancer
0,0015ceb851d7251b8f399e39779d1e7d,1
1,0030a160d58723ff36d73f41b170ec21,0
2,003f41c78e6acfa92430a057ac0b306e,0
3,006b96310a37b36cccb2ab48d10b49a3,1
4,008464bb8521d09a42985dd8add3d0d2,1


In [15]:
# Attach the label columns to each patient's scan count. The inner join keeps
# only ids present in both frames. validate="one_to_one" makes pandas raise a
# MergeError if an id is duplicated on either side, instead of silently
# multiplying rows in the result.
result_sample = pd.merge(df_patient_scans_sample, df_labels, how="inner", on=['id'],
                         validate="one_to_one")
result_train = pd.merge(df_patient_scans_train, df_labels, how="inner", on=['id'],
                        validate="one_to_one")
# `train` is a second name for the same object as result_train, not a copy.
train = result_train

## Size Analysis

In [31]:
# Report, for each frame, its shape and its number of distinct ids.
# nunique(dropna=False) counts missing values too, matching len(unique()).
print("size analysis:")

# Per-patient scan counts found on disk
print("patient_data (size): ", df_patient_scans_train.shape, sep="")
print("unique patient_data : ", df_patient_scans_train['id'].nunique(dropna=False), sep="")

# Labels read from the CSV
print("df_labels (size): ", df_labels.shape, sep="")
print("unique df_labels: ", df_labels['id'].nunique(dropna=False), sep="")

# Merged frame
print("train (size): ", train.shape, sep="")
print("unique train: ", train['id'].nunique(dropna=False), sep="")

size analysis:
patient_data (size): (1595, 2)
unique patient_data : 1595
df_labels (size): (1397, 2)
unique df_labels: 1397
train (size): (1397, 3)
unique train: 1397
