## Workshop 3 - Exploratory Data Analysis with Hospital Admissions Data
### Justin Ritenburgh
### MDST Winter 2020

### Import pandas, numpy, matplotlib.pyplot, and stats from scipy

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

### Load the Hospital Admissions dataset into a dataframe

In [None]:
# Relative path: the CSV is expected in the notebook's working directory.
filepath = 'HospitalAdmissionsData.csv'
# Parse the CSV into the DataFrame that the analysis cells read from.
admissions = pd.read_csv(filepath_or_buffer=filepath)

### What are the column headers or features in this data set?  

In [None]:
# Display the Index of column labels (the feature names) of the admissions frame.
admissions.columns

### How many features are floats? How many are integers?

In [None]:
# Per-column overview: non-null counts and the dtype of every feature.
admissions.info()
# Tally the columns by dtype so the number of float and integer features can
# be read off directly instead of being picked out of the report above.
admissions.dtypes.value_counts()

### What are the possible 'values' for insurance type?

In [None]:
# Distinct values that appear in the Insurance_Type column.
admissions["Insurance_Type"].unique()

### Show the summary statistics for the admission length

In [None]:
# Summary statistics for the AdmissionLengthDays column.
admissions["AdmissionLengthDays"].describe()

### What is the most common admission type, insurance type, religion type, race, and diagnosis (Dx)?

In [None]:
# Summarise only the object-dtype columns; in that summary the `top` row is
# the most frequent value of each column and `freq` is how often it occurs.
admissions.describe(include=["object"])

### Show a histogram for admission days (with appropriate labels, titles, etc.), and also show a histogram for admission days on a log scale. Describe what you see.
The vast majority of hospital stays last only one day, and the number of stays decreases roughly exponentially as the length of stay increases.

In [None]:
# Histogram of stay lengths with a linear count axis.
# Series.hist returns the Axes it drew on, so the labels below are attached to
# that exact plot rather than to whatever pyplot considers the "current" axes.
ax = admissions["AdmissionLengthDays"].hist()
ax.set_title("Admission Length Histogram")
ax.set_xlabel("Length (Days)")
ax.set_ylabel("Count")
# Render the figure explicitly so the cell does not echo a Text(...) repr.
plt.show()

In [None]:
# Same histogram, but log=True is forwarded to matplotlib's hist so the count
# axis is logarithmic and the sparsely populated long-stay bins stay visible.
# Series.hist returns the Axes it drew on; label that Axes directly instead of
# relying on pyplot's "current axes" state.
ax = admissions["AdmissionLengthDays"].hist(log=True)
ax.set_title("Admission Length Histogram (log-scale)")
ax.set_xlabel("Length (Days)")
ax.set_ylabel("Count")
# Render the figure explicitly so the cell does not echo a Text(...) repr.
plt.show()

### Compare the average length of admission between those who died versus those who did not die. Show a visualization, with appropriate labels, titles, etc.

In [None]:
# Mean of AdmissionLengthDays within each distinct value of Death_1.
(
    admissions
    .groupby("Death_1")["AdmissionLengthDays"]
    .mean()
)

In [None]:
# Box plot of stay length, one box per value of Death_1.
# An explicit figure/axes pair is created so every label is applied to the
# plot that pandas draws on, not to pyplot's implicit "current" axes.
fig, ax = plt.subplots()
admissions.boxplot(
    column="AdmissionLengthDays",
    by="Death_1",
    ax=ax,
    # The exercise compares *average* stay length, but a box's centre line is
    # the median; showmeans adds a marker at each group's mean as well.
    showmeans=True,
)
# pandas adds its own "Boxplot grouped by ..." super-title when `by` is used;
# blank it so only the title set below is shown.
fig.suptitle('')
ax.set_title("Boxplot of Admission Lengths by Death Status")
ax.set_xlabel("Death Status")
ax.set_ylabel("Length (Days)")
# Render the figure explicitly so the cell does not echo a Text(...) repr.
plt.show()

### Describe the association between death and insurance type and show a visualization or test.
A $\chi^2$ test of independence, whose null hypothesis is that there is no association between death and insurance type, rejects the null hypothesis.

In [None]:
# Contingency table of counts: one row per Death_1 value, one column per
# Insurance_Type value.
deathInsuranceType = pd.crosstab(
    index=admissions["Death_1"],
    columns=admissions["Insurance_Type"],
)
# Chi-square test of independence on the observed counts.
chi = stats.chi2_contingency(deathInsuranceType)
print(chi)

In [None]:
# 100 evenly spaced sample points on [0, 10].
x = np.linspace(0, 10, 100)
# Three cosine curves, each offset from the previous one by a phase of 1.
y1, y2, y3 = (np.cos(x + phase) for phase in (0, 1, 2))
# Panel titles, in the same order as y1, y2, y3.
names = ['Signal 1', 'Signal 2', 'Signal 3']

In [None]:
# One figure with three vertically stacked axes, one per signal.
fig, axs = plt.subplots(3)
# Pair each axes with its curve and its title instead of repeating the calls.
for panel, curve, label in zip(axs, (y1, y2, y3), names):
    panel.plot(x, curve)
    panel.set_title(label)
# Adjust spacing so the panel titles do not overlap neighbouring axes.
plt.tight_layout()