# Data

In [1]:

import warnings
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, HTML
warnings.filterwarnings('ignore')

In [2]:
%matplotlib inline
# Seaborn
sns.set_palette("Blues_r")
sns.set_style("whitegrid")
FIGSIZE = (12,8)
DARKBLUE = "#1C3879"
LIGHTBLUE = "steelblue"
# Pandas
pd.options.display.float_format = '{:,.2f}'.format
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 20)
pd.set_option("display.max_colwidth", 100)
# Directories and Filepaths
TRAIN_IMAGES_DIR = "data/raw/train_images"
TEST_IMAGES_DIR = "data/raw/test_images"
SEGMENTATIONS_DIR = "data/raw/segmentations"
training_metadata_FILEPATH = "data/raw/train.csv"
TEST_METADATA_FILEPATH = "data/raw/test.csv"
TRAIN_BOUNDING_BOXES_FILEPATH = "data/raw/train_bounding_boxes.csv"


## Data Profile
### Training Metadata

In [3]:
training_metadata = pd.read_csv(training_metadata_FILEPATH, index_col=False)

## Augmented Data
- Craniovertebral Region (C1-C2) 
- Mid-Cervical Region (C3-C6)
- Total Fractures


In [None]:
display(HTML('<h3>Training Metadata Summary</h3>'))
training_metadata.info()

In [None]:
display(HTML('<h3>Training Metadata Sample</h3>'))
training_metadata.head()

In [None]:
display(HTML('<h3>Patient Fracture and Vertebrae Distributions</h3>'))
# Patient Fracture Distribution
fig, ax1 = plt.subplots(figsize=FIGSIZE)
ax1 = sns.countplot(data=training_metadata, x='patient_overall', hue="patient_overall", dodge=False)
for container in ax1.containers:
    _ = ax1.bar_label(container)
ax1.set_title('Patient / Fracture Distribution')
ax1.set_xlabel("Patient Overall Diagnosis")
ax1.set_ylabel("Patients")
ax1.set_xticklabels(['No Fracture', 'Fracture(s)'])
ax1.set_ylim([0,1200])
plt.legend(labels=["No Fracture", "Fracture(s)"]);

In [None]:
display(HTML('<h3>Patients with Fracture Vertebrae</h3>'))
fig, ax = plt.subplots(figsize=FIGSIZE)
df = training_metadata[['C1','C2','C3','C4','C5','C6','C7']].sum(axis=0).to_frame()
df.columns = ["Patients with Fracture"]
df['Patients'] = training_metadata.shape[0]
df['Patients w/o Fracture'] = df['Patients'] - df["Patients with Fracture"]
df
df[["Patients with Fracture",'Patients w/o Fracture']].plot(kind='bar', stacked=True, color=[DARKBLUE, LIGHTBLUE],ax=ax)
for container in ax.containers:
    ax.bar_label(container)
ax.set_title("Patients by Vertebrae Fracture");


In [None]:
display(HTML('<h3>Distribution of Fractures by Vertebrae</h3>'))
fig, ax = plt.subplots(figsize=FIGSIZE)
df = training_metadata[['C1','C2','C3','C4','C5','C6','C7']].sum(axis=0).to_frame()
df.columns = ["Fractures"]
df['Total Fractures'] = training_metadata['total_fractures'].sum()
df['Other Fractures'] = df['Total Fractures'] - df["Fractures"]
df
df[["Fractures",'Other Fractures']].plot(kind='bar', stacked=True, color=[DARKBLUE, LIGHTBLUE],ax=ax)
for container in ax.containers:
    ax.bar_label(container)
ax.set_title("Distribution of Fractures by Vertebrae");


In [None]:
df = training_metadata['total_fractures']
fig, ax = plt.subplots(figsize=FIGSIZE)
ax = sns.countplot(x=df, palette='Blues_r')
for container in ax.containers:
    ax.bar_label(container)
ax.set_title("Number of Patients by Number of Fracture\nn=2019")
ax.set_xlabel("Number of Fractures")
ax.set_ylabel("Number of Patients");

In [7]:
d = {}
d['Columns'] = training_metadata.columns.values
d['Dtype'] = training_metadata.dtypes.values
d['Non-Null Count'] = training_metadata.count().values
d['Null Count'] = training_metadata.isnull().sum(axis=0).values
d['Minimum'] = training_metadata.min(axis=0, numeric_only=None).values
d['Maximum'] = training_metadata.max(axis=0, numeric_only=None).values
d['Num Unique'] = training_metadata.nunique(axis=0).values
d['Memory Usage'] = training_metadata.memory_usage(deep=True).values
print(d)

{'Columns': array(['StudyInstanceUID', 'patient_overall', 'C1', 'C2', 'C3', 'C4',
       'C5', 'C6', 'C7'], dtype=object), 'Dtype': array([dtype('O'), dtype('int64'), dtype('int64'), dtype('int64'),
       dtype('int64'), dtype('int64'), dtype('int64'), dtype('int64'),
       dtype('int64')], dtype=object), 'Non-Null Count': array([2019, 2019, 2019, 2019, 2019, 2019, 2019, 2019, 2019]), 'Null Count': array([0, 0, 0, 0, 0, 0, 0, 0, 0]), 'Minimum': array(['1.2.826.0.1.3680043.10001', 0, 0, 0, 0, 0, 0, 0, 0], dtype=object), 'Maximum': array(['1.2.826.0.1.3680043.9997', 1, 1, 1, 1, 1, 1, 1, 1], dtype=object), 'Num Unique': array([2019,    2,    2,    2,    2,    2,    2,    2,    2]), 'Memory Usage': array([   128, 164874,  16152,  16152,  16152,  16152,  16152,  16152,
        16152,  16152])}
