# Data Exploration

The objective of this notebook is to analyze the raw data and perform exploratory analysis that will aid in the construction of model building and post-analysis. 

---------------------------------------

In [450]:
# Module imports
import os
import glob
import numpy as np
import pandas as pd
from scipy import stats
import math

In [451]:
# Constants
data_url = "../../data-science-bowl-2017-data/"
sample_data = 'sample_images/'
labels_data = 'stage1_labels.csv'
train_data = 'stage1/'


In [452]:
# Validating if files exist

def verify_location(loc):
    loc = data_url + loc
    if os.path.isdir(loc) or os.path.isfile(loc) :
        print('Found and verified location: ' + loc)
    else:
        raise Exception('Failed to verify location: ' + loc)
    return loc
    
sample_data = verify_location(sample_data)
train_data = verify_location(train_data)
labels = verify_location(labels_data)

Found and verified location: ../../data-science-bowl-2017-data/sample_images/
Found and verified location: ../../data-science-bowl-2017-data/stage1/
Found and verified location: ../../data-science-bowl-2017-data/stage1_labels.csv


In [453]:
# Extracting patient scan data
def folder_explorer(folder):
    patient_info = {}
    for d in os.listdir(folder):
        patient_info[d] = int(len(os.listdir(folder + d)))
    return patient_info

# Sample Data
patient_scans_sample = folder_explorer(sample_data)
df_patient_scans_sample = pd.DataFrame(list(patient_scans_sample.items()), 
                             columns=["id", "scans-per-patient"])
patient_scans_sample_describe = pd.DataFrame.describe(df_patient_scans_sample)

# Train Data
patient_scans_train = folder_explorer(train_data)
df_patient_scans_train = pd.DataFrame(list(patient_scans_train.items()), 
                             columns=["id", "scans-per-patient"])
patient_scans_train_describe = pd.DataFrame.describe(df_patient_scans_train)


print("Descriptive statistics for sample data:")
print(patient_scans_sample_describe)

print("\n")

print("Descriptive statistics for train data:")
print(patient_scans_train_describe)





Descriptive statistics for sample data:
       scans-per-patient
count           20.00000
mean           180.20000
std             75.23619
min            110.00000
25%            131.75000
50%            167.50000
75%            197.75000
max            435.00000


Descriptive statistics for train data:
       scans-per-patient
count        1595.000000
mean          178.921630
std            67.070653
min            94.000000
25%           138.500000
50%           160.000000
75%           190.000000
max           541.000000


In [454]:
# Adding Label Data

df_labels = pd.read_csv(labels)
df_labels.head()

Unnamed: 0,id,cancer
0,0015ceb851d7251b8f399e39779d1e7d,1
1,0030a160d58723ff36d73f41b170ec21,0
2,003f41c78e6acfa92430a057ac0b306e,0
3,006b96310a37b36cccb2ab48d10b49a3,1
4,008464bb8521d09a42985dd8add3d0d2,1


In [455]:
result_sample = pd.merge(df_patient_scans_sample, df_labels, how="left", on=['id'])
result_train = pd.merge(df_patient_scans_train, df_labels, how="left", on=['id'])

print(result_sample)
print(result_train)



                                  id  scans-per-patient  cancer
0   0acbebb8d463b4b9ca88cf38431aac69                203     1.0
1   0b20184e0cd497028bdd155d9fb42dc9                196     NaN
2   0a0c32c9e08cc2ea76a71649de56be6d                133     0.0
3   0c9d8314f9c69840e25febabb1229fa4                221     0.0
4   0ca943d821204ceb089510f836a367fd                147     0.0
5   0c60f4b87afcb3e2dfa65abbbf3ef2f9                136     1.0
6   0c37613214faddf8701ca41e6d43f56e                164     1.0
7   0bd0e3056cbf23a1cb7f0f0b18446068                280     0.0
8   0c0de3749d4fe175b7a5098b060982a1                123     1.0
9   0a099f2549429d29b32f349e95fb2244                128     0.0
10  0ddeb08e9c97227853422bd71a2a695e                171     0.0
11  0c59313f52304e25d5a7dcf9877633b1                244     0.0
12  0c98fcb55e3f36d0c2b6507f62f4c5f1                180     0.0
13  0de72529c30fe642bc60dcb75c87f6bd                113     0.0
14  0d2fcf787026fece4e57be167d079383    

In [462]:
# NaN labels?
null_ids_sample = result_sample.loc[result_sample['cancer'].isnull(), :]
null_ids_train = result_train.loc[result_train['cancer'].isnull(), :]

print(null_ids_sample)
print("/n")
print(null_ids_train)



                                 id  scans-per-patient  cancer
1  0b20184e0cd497028bdd155d9fb42dc9                196     NaN
/n
                                    id  scans-per-patient  cancer
4     1e62be2c3b6430b78ce31a8f023531ac                457     NaN
19    f7c387290d7e3074501eac167c849000                173     NaN
24    6993396b31078993e13cf9c0a6fd470b                162     NaN
25    5ce91933688cc8400105bf640ac11535                126     NaN
27    8be7a7cc747365030bee8297221ab5bc                164     NaN
37    cdb53f3be6d8cce07fa41c833488d8a5                301     NaN
50    8bb7dd5fbfa5ecb95552d9c587f2fea5                135     NaN
59    c2ef34cc347bc224b5a123426009d027                136     NaN
68    eb9db3f740f8e153e85f83c57bc4e522                250     NaN
72    48ab0b98fc7789304c21430978624f32                152     NaN
81    50cdacec399071cf70d8badd2511d0b3                171     NaN
87    580cffecce8d3d53cde1abb922adf21a                149     NaN
100   bdc2daa

TODO:
* Mean and varince for scans for patients (will help if a min trashhold need to be built)
* If sample data is representative of the total data
    * Sample total data and above analysis 