# DAY 80
Dr. Semmelweis's research of 1861

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from pandas.plotting import register_matplotlib_converters
from scipy import stats

# Render floats with a thousands separator and two decimal places.
pd.options.display.float_format = '{:,.2f}'.format
# Register pandas' datetime converters with matplotlib.
register_matplotlib_converters()
# Tick locators and label formatter for the time axis of the charts below:
# `years` is used for major ticks (labelled via `years_fmt` as a 4-digit year)
# and `months` for minor ticks.
years = mdates.YearLocator()
months = mdates.MonthLocator()
years_fmt = mdates.DateFormatter('%Y')

yearly_data = pd.read_csv('annual_deaths_by_clinic.csv')
# parse_dates converts the 'date' column to datetime while loading.
monthly_data = pd.read_csv('monthly_deaths.csv',parse_dates=['date'])

In [None]:
#yearly_data.isna()
#yearly_data.duplicated()
#yearly_data.describe()
#yearly_data.info()

#monthly_data.describe()
#monthly_data.info()
#monthly_data.isna()
#monthly_data.duplicated()

In [None]:
# Overall death rate over every row of yearly_data, as a percentage of births.
dth_prob = yearly_data.deaths.sum() / yearly_data.births.sum() * 100
print(f'Chances of the mother dying in the 1840s in Vienna: {dth_prob:.3}%')

# Births and deaths share the time axis but differ in scale, so each series
# is drawn against its own y-axis (left: births, right: deaths).
plt.figure(figsize=(14,8), dpi=200)
plt.title('Number of Monthly Births and Deaths', fontsize=15)
axis1 = plt.gca()
axis2 = axis1.twinx()

# Label each axis explicitly with the series it carries. A bare plt.ylabel()
# issued before twinx() only labels the left (births) axis, which previously
# ended up captioned as deaths while the right axis had no label at all.
axis1.set_ylabel('Births', color='skyblue')
axis2.set_ylabel('Deaths', color='red')

axis1.grid(color='gray', linestyle='--')

# Births: solid light-blue line on the left axis.
axis1.plot(monthly_data.date,
         monthly_data.births,
         color='skyblue',
         linewidth=3)

# Deaths: dashed red line on the right axis.
axis2.plot(monthly_data.date,
         monthly_data.deaths,
         color='red',
         linewidth=2,
         linestyle='--')

plt.show()

In [None]:
# Same births/deaths chart as before, now with year/month ticks on the time axis.
plt.figure(figsize=(14,8), dpi=200)
plt.title('Monthly Births and Deaths', fontsize=16)
# Tick label styling is applied to the current axes before the twin is created.
plt.yticks(fontsize=14)
plt.xticks(fontsize=14, rotation=45)

# ax1 (left y-axis) carries births; ax2 shares the x-axis and carries deaths.
ax1 = plt.gca()
ax2 = ax1.twinx()

ax1.set_ylabel('Births', color='green', fontsize=18)
ax2.set_ylabel('Deaths', color='red', fontsize=18)

# Clamp the x-axis to the data range and use the date locators:
# major ticks from `years` (formatted with `years_fmt`), minor ticks from `months`.
ax1.set_xlim([monthly_data.date.min(), monthly_data.date.max()])
ax1.xaxis.set_major_locator(years)
ax1.xaxis.set_major_formatter(years_fmt)
ax1.xaxis.set_minor_locator(months)

ax1.grid(color='grey', linestyle='--')

# Births: solid light-green line on the left axis.
ax1.plot(monthly_data.date,
         monthly_data.births,
         color='lightgreen',
         linewidth=3)

# Deaths: dashed red line on the right axis.
ax2.plot(monthly_data.date,
         monthly_data.deaths,
         color='red',
         linewidth=2,
         linestyle='--')

plt.show()

In [None]:
# Breakdown of patients by clinic
# born_graph = px.line(yearly_data,
#                x='year',
#                y='births', #main arg
#                color='clinic',
#                title='Yearly Births by Clinic')

# born_graph.show()

# dth_graph = px.line(yearly_data,
#                x='year',
#                y='deaths', #main arg
#                color='clinic',
#                title='Yearly Deaths by Clinic')

# dth_graph.show()

# Yearly births and deaths for each clinic in a single chart.
# Passing a list to `y` puts Plotly Express in wide-form mode, which creates an
# implicit 'variable' column (births vs. deaths). Because `color` is taken by
# 'clinic', 'variable' must be mapped to another visual channel; otherwise the
# births and deaths points of a clinic are joined into one zig-zagging trace.
# Mapping it to `line_dash` yields one line per (clinic, measure) pair.
both_graph = px.line(
    yearly_data,
    x='year',
    y=['births','deaths'],
    color='clinic',
    line_dash='variable',
    markers=True,
    title='Yearly Births and Deaths by Clinic'
)

both_graph.show()

In [None]:
# Mortality rate: deaths as a share of births, per year and clinic.
# Note that 'pct_deaths' holds a proportion (deaths / births), not a value
# multiplied by 100.
yearly_data['pct_deaths'] = yearly_data['deaths'].div(yearly_data['births'])

# Clinic 1: aggregate death rate over all years, expressed as a percentage.
is_clinic_1 = yearly_data['clinic'] == 'clinic 1'
clinic_1 = yearly_data[is_clinic_1]
deaths_c1 = clinic_1['deaths'].sum()
births_c1 = clinic_1['births'].sum()
avg_c1 = deaths_c1 / births_c1 * 100
print(f'Average death rate in clinic 1 is {avg_c1:.3}%.')

# Clinic 2: same aggregate, same units.
is_clinic_2 = yearly_data['clinic'] == 'clinic 2'
clinic_2 = yearly_data[is_clinic_2]
deaths_c2 = clinic_2['deaths'].sum()
births_c2 = clinic_2['births'].sum()
avg_c2 = deaths_c2 / births_c2 * 100
print(f'Average death rate in clinic 2 is {avg_c2:.3}%.')

# One line per clinic showing the yearly death proportion.
line_chart = px.line(
    yearly_data,
    x='year',
    y='pct_deaths',
    color='clinic',
    title='Yearly Deaths by Clinic',
)
line_chart.show()

In [None]:
# Effect of handwashing on mortality.
# 'pct_deaths' is the monthly proportion deaths / births (not multiplied by 100).
monthly_data['pct_deaths'] = monthly_data['deaths'].div(monthly_data['births'])

# Date from which handwashing was made mandatory.
handwashing_start = pd.to_datetime('1847-06-01')

# Split the months into the periods before and from the start date onwards.
is_before = monthly_data['date'] < handwashing_start
is_after = monthly_data['date'] >= handwashing_start
before_washing = monthly_data.loc[is_before]
after_washing = monthly_data.loc[is_after]

# Aggregate death rate of each period (total deaths over total births), in percent.
before_ratio = before_washing['deaths'].sum() / before_washing['births'].sum()
after_ratio = after_washing['deaths'].sum() / after_washing['births'].sum()
bw_rate = before_ratio * 100
aw_rate = after_ratio * 100
print(f'Average death rate before 1847 was {bw_rate:.4}%')
print(f'Average death rate AFTER 1847 was {aw_rate:.3}%')

# 6-row rolling mean of the pre-handwashing data, indexed by date.
roll_df = before_washing.set_index('date').rolling(window=6).mean()

In [None]:
# Combine the series computed in the previous cell into a single chart:
# the monthly death proportion before and after handwashing, plus the
# 6-month moving average of the "before" period.
plt.figure(figsize=(14,8), dpi=200)
plt.title('Percentage of Monthly Deaths over Time', fontsize=15)
plt.yticks(fontsize=14)
plt.xticks(fontsize=14, rotation=45)

plt.ylabel('Death Percentage', color='red', fontsize=16)

# Use locator/formatter instances created for this axis only. The module-level
# `years` / `months` objects are already attached to an earlier figure's axis,
# and matplotlib locators keep a reference to the axis they serve, so they
# should not be shared between axes.
year_ticks = mdates.YearLocator()
month_ticks = mdates.MonthLocator()
year_labels = mdates.DateFormatter('%Y')

main_axis = plt.gca()
main_axis.xaxis.set_major_locator(year_ticks)
main_axis.xaxis.set_major_formatter(year_labels)
main_axis.xaxis.set_minor_locator(month_ticks)
main_axis.set_xlim([monthly_data.date.min(), monthly_data.date.max()])

plt.grid(color='grey', linestyle='--')

# 6-month moving average of the pre-handwashing death proportion.
ma_line, = plt.plot(roll_df.index,
                    roll_df.pct_deaths,
                    color='red',
                    linewidth=3,
                    linestyle='--',
                    label='6 month Moving Average')

# Raw monthly death proportion before handwashing.
bw_line, = plt.plot(before_washing.date,
                    before_washing.pct_deaths,
                    color='orange',
                    linewidth=1,
                    linestyle='--',
                    label='Before Handwashing')

# Raw monthly death proportion after handwashing.
aw_line, = plt.plot(after_washing.date,
                    after_washing.pct_deaths,
                    color='purple',
                    linewidth=3,
                    marker='o',
                    label='After Handwashing')

plt.legend(handles=[ma_line, bw_line, aw_line],fontsize=18)

plt.show()

In [None]:
# Conclusions that can be drawn from the previous chart.
# These are unweighted means of the monthly death proportions, scaled to percent.

monthly_rate_before = before_washing['pct_deaths']
avg_before = monthly_rate_before.mean() * 100
print(f'Death chance during childbirth BEFORE handwashing: {avg_before:.3}%.')

monthly_rate_after = after_washing['pct_deaths']
avg_after = monthly_rate_after.mean() * 100
print(f'Death chance during childbirth AFTER handwashing: {avg_after:.3}%.')

# Absolute difference between the two averages.
mean_diff = avg_before - avg_after
print(f'Handwashing reduced the proportion of deaths by {mean_diff:.3}%')

# Ratio of the "before" average to the "after" average.
times = avg_before / avg_after
print(f'A {times:.2}x improvement')

In [None]:
# Relationship between the handwashing condition and the monthly death proportion.
# Months dated before the handwashing start are tagged 'No', all others 'Yes'.
predates_handwashing = monthly_data['date'] < handwashing_start
monthly_data['washing_hands'] = np.where(predates_handwashing, 'No', 'Yes')

# One box per condition, coloured by the same condition.
box_death_graph = px.box(
    monthly_data,
    x='washing_hands',
    y='pct_deaths',
    color='washing_hands',
    title='Stats Change with Handwashing',
)

box_death_graph.update_layout(
    xaxis_title='Washing Hands Condition',
    yaxis_title='Monthly Deaths Percentage',
)

box_death_graph.show()


In [None]:
# Histogram of the monthly death proportion per handwashing condition, with a
# marginal box plot on top. The two groups are overlaid semi-transparently.
hist = px.histogram(monthly_data,
                   x='pct_deaths',
                   color='washing_hands',
                   nbins=30,
                   opacity=0.6,
                   barmode='overlay',
                   histnorm='percent',
                   marginal='box',)

# histnorm='percent' makes the bar heights percentages rather than raw counts,
# so the y-axis title states that instead of the previous 'Quantity'.
hist.update_layout(xaxis_title='Monthly Deaths Proportion',
                   yaxis_title='Share of Months (%)',)

hist.show()

In [None]:
# Kernel Density Estimate (KDE) of the monthly death proportion for each period.
# clip=(0, 1) keeps the estimate inside the valid range of a proportion.

plt.figure(dpi=200)
# Each density gets a label so the legend can tell the two periods apart;
# without it the curves are distinguishable only by their default colours.
sns.kdeplot(before_washing.pct_deaths,
            fill=True,
            clip=(0,1),
            label='Before Handwashing')
sns.kdeplot(after_washing.pct_deaths,
            fill=True,
            clip=(0,1),
            label='After Handwashing')
plt.title('Estimate Distribution of M Death Rate Bef and Aft Handwashing')
plt.xlim(0, 0.40)
plt.legend()
plt.show()

In [None]:
# Independent two-sample t-test on the monthly death proportions of the two
# periods: is the difference between their means statistically significant?
# If the p-value is below 0.01 the null hypothesis (equal means) is rejected.
# NOTE(review): ttest_ind is called with its defaults, which assume equal
# variances in both groups — confirm that this is intended.

test_result = stats.ttest_ind(
    before_washing['pct_deaths'],
    after_washing['pct_deaths'],
)
t_stat = test_result.statistic
p_value = test_result.pvalue
print(f'p-value is {p_value:.10f}')
print(f't-statistic is {t_stat:.4f}')

CONCLUSION:

<img src=https://i.imgur.com/rvjNVzQ.gif>