## Advanced Pandas, Basic Python Viz and Exercise

### Exercise
1. Use the same dataframes you generated or copied to this directory from the 03 Exercise:
    1. Extract from any SQL database
    1. Imported file from your capstone work
    1. Any other data from other exercises you have already used
1. Import into Pandas
1. Generate at least three additional dataframes with analytics introduced in this notebook
    1. Melt
    1. Pivot
    1. Quantile
    1. Aggregations
1. Build at least three visualizations supported directly by Pandas, using prior analytics from the 03 Exercise or new analytics from this exercise

In [None]:
!pip install openpyxl

In [None]:
import pandas as pd

## Get the data

### Read a CSV file from a website into a DataFrame

In [None]:
# CSV export served by data.cdc.gov; pandas downloads and parses it directly.
url = 'https://data.cdc.gov/api/views/v6ab-adf5/rows.csv?accessType=DOWNLOAD'
mortality_data = pd.read_csv(filepath_or_buffer=url)

In [None]:
# Quick look at the raw data; the column names still contain spaces here.
mortality_data.plot(kind="line", x="Year", y="Death Rate")

In [None]:
# A bare expression on the last line of a cell is rendered as the cell output.
mortality_data

### Save and restore a DataFrame

In [None]:
# Persist the DataFrame to a pickle file in the working directory.
mortality_data.to_pickle(path='mortality_data.pkl')

In [None]:
# Restore the DataFrame from the pickle file written by to_pickle().
# NOTE: unpickling can execute arbitrary code, so only load pickle files you
# created yourself or otherwise trust.
mortality_data = pd.read_pickle('mortality_data.pkl')

In [None]:
# First five rows of the restored frame.
mortality_data.head(n=5)

## Examine and clean the data

In [None]:
# Print the structural attributes of the frame, one labelled line each.
for label, value in (
    ("Index:  ", mortality_data.index),
    ("Columns:", mortality_data.columns),
    ("Size:   ", mortality_data.size),
    ("Shape:  ", mortality_data.shape),
):
    print(label, value)

### Use the columns attribute to replace spaces with nothing


In [None]:
# Drop every space from the column labels so the columns can be used with
# attribute access (e.g. mortality_data.DeathRate).
mortality_data.columns = [
    label.replace(' ', '') for label in mortality_data.columns
]
mortality_data.columns

In [None]:
# Confirm the renamed columns on the first five rows.
mortality_data.head(n=5)

In [None]:
# Histogram of the DeathRate column.
mortality_data['DeathRate'].hist()

In [None]:
# Summary (count / unique / top / freq) restricted to object-dtype columns.
mortality_data.describe(include=['O'])

In [None]:
# Numeric summary, transposed so each column becomes a row.
mortality_data.describe().transpose()

## Access the data

In [None]:
# Rows for the year 2000, excluding the '1-4 Years' age group.
mortality_data.query(
    expr="Year == 2000 and AgeGroup != '1-4 Years'"
)

In [None]:
# Rows from either 1900 or 2000; show the first five matches.
(
    mortality_data
    .query('Year == 1900 or Year == 2000')
    .head()
)

In [None]:
# Backticks quote a column name inside a query string. They are required only
# when the name is not a valid Python identifier (for example, when it contains
# spaces); AgeGroup does not need them, so they are shown here for illustration.
mortality_data.query('Year == 2000 and `AgeGroup` != "1-4 Years"')

### Access a subset of rows and columns

In [None]:
# Select one column from the filtered rows using attribute (dot) access.
mortality_data.query('Year == 1900').DeathRate.head()

In [None]:
# Same selection using bracket access with a single column name.
mortality_data.query('Year == 1900')['DeathRate'].head()

In [None]:
# Passing a list of column names selects several columns at once and returns a
# DataFrame rather than a Series.
mortality_data.query('Year == 1900')[['AgeGroup','DeathRate']].head()

### Apply statistical methods

In [None]:
# Arithmetic mean of the DeathRate column.
mortality_data['DeathRate'].mean()

In [None]:
# Median of the DeathRate column.
mortality_data['DeathRate'].median()

In [None]:
# Per-column maximum of the two selected columns.
mortality_data.loc[:, ['AgeGroup', 'DeathRate']].max()

In [None]:
# Number of non-null values in each column.
mortality_data.count(axis=0)

In [None]:

# Quantiles of every numeric column at 0%, 5%, ..., 95%.
# numeric_only=True is needed on pandas >= 2.0, where the default became False
# and the text AgeGroup column would otherwise raise a TypeError.
mort_dist = mortality_data.quantile(
    [i / 20 for i in range(20)],
    numeric_only=True,
)
# Copy the quantile levels out of the index so they can be used as a plot axis.
mort_dist["quantile"] = mort_dist.index
mort_dist


In [None]:
# Quantile level plotted against the DeathRate value at that level.
mort_dist.plot(kind="line", x="DeathRate", y="quantile")

In [None]:
# Running total of DeathRate in the frame's current row order.
mortality_data["CumDeathRate"] = mortality_data["DeathRate"].cumsum()

In [None]:
# Line plot of the running total against the row index.
mortality_data.plot(kind="line", y="CumDeathRate")

### Use Python for column arithmetic

In [None]:
# Distance of each DeathRate value from the overall DeathRate mean.
mortality_data['MeanCentered'] = mortality_data['DeathRate'].sub(
    mortality_data['DeathRate'].mean()
)

In [None]:
# Summary statistics of the mean-centered column.
mortality_data['MeanCentered'].describe()

In [None]:
# Rescale DeathRate in place by a factor of 1/100000.
# NOTE: re-running this cell divides again, and the CumDeathRate and
# MeanCentered columns computed earlier keep the original scale.
mortality_data['DeathRate'] = mortality_data['DeathRate'].div(100000)

In [None]:
# First four rows, including the derived columns.
mortality_data.head(n=4)

## Shape the data

### Set and use an index

In [None]:
# Move Year out of the columns and into the row index.
mortality_data = mortality_data.set_index(keys='Year')
mortality_data.head(n=2)

In [None]:
# Turn the index back into an ordinary column.
mortality_data = mortality_data.reset_index()

In [None]:
# NOTE: the following line of code causes ValueError: Index has duplicate keys
# mortality_data = mortality_data.set_index('Year', verify_integrity=True)
# verify_integrity=True makes set_index reject duplicate index values; Year on
# its own is not unique in this frame, so the call above fails.
mortality_data

In [None]:
# Year together with AgeGroup forms the index; verify_integrity=True makes
# pandas raise if any (Year, AgeGroup) pair is duplicated.
mortality_data = mortality_data.set_index(
    keys=['Year', 'AgeGroup'],
    verify_integrity=True,
)
mortality_data.head(n=2)

In [None]:
# Restore Year and AgeGroup as ordinary columns.
mortality_data = mortality_data.reset_index()
mortality_data.head(n=2)

### Pivot the data

In [None]:
# One row per Year and one column per (value column, AgeGroup) pair.
mortality_wide = mortality_data.pivot(
    index='Year',
    columns='AgeGroup',
    values=['DeathRate', 'MeanCentered'],
)
mortality_wide.head(n=3)

In [None]:
# Two-level row index (Year, MeanCentered); one DeathRate column per AgeGroup.
mortality_wide = mortality_data.pivot(
    index=['Year', 'MeanCentered'],
    columns='AgeGroup',
    values='DeathRate',
)
mortality_wide.head(n=3)

In [None]:
# Without `values`, every remaining column is spread across the AgeGroup labels.
mortality_wide = mortality_data.pivot(
    index='Year',
    columns='AgeGroup',
)
mortality_wide.head(n=3)

### Melt the data

In [None]:
# Wide layout used as input for the melt example: one row per Year, one
# DeathRate column per AgeGroup.
mortality_wide = mortality_data.pivot(
    index='Year',
    columns='AgeGroup',
    values='DeathRate',
)
mortality_wide.head(n=3)

In [None]:
# Starting data: the wide frame built by the pivot in the previous cell.


# Save to Excel format and read it back to remove the index: the melt in the
# next cell uses Year as an ordinary column (id_vars='Year').
# Writes 'mortality_wide.xlsx' to the working directory.
mortality_wide.to_excel('mortality_wide.xlsx')
mortality_wide = pd.read_excel('mortality_wide.xlsx')

mortality_wide.head(4)

In [None]:
# Unpivot two of the age-group columns back into long form: one row per
# (Year, AgeGroup) pair with the value stored in DeathRate.
mortality_long = pd.melt(
    mortality_wide,
    id_vars='Year',
    value_vars=['1-4 Years', '5-9 Years'],
    var_name='AgeGroup',
    value_name='DeathRate',
)

# Temporarily cap the number of rendered rows for this one display call.
with pd.option_context('display.max_rows', 4):
    display(mortality_long)

## Analyze the data

### Group the data

In [None]:
# Mean of every remaining column within each age group.
mortality_data.groupby(by='AgeGroup').mean()

In [None]:
# Median of each numeric column per year. numeric_only=True keeps the text
# AgeGroup column out of the aggregation; without it pandas >= 2.0 raises a
# TypeError instead of silently dropping the column.
mortality_data.groupby('Year').median(numeric_only=True).head(4)

In [None]:
# Non-null value counts for each (Year, AgeGroup) combination.
(
    mortality_data
    .groupby(by=['Year', 'AgeGroup'])
    .count()
    .head(n=5)
)

### Aggregate the data

In [None]:
# Several DeathRate statistics per age group, computed in one pass.
(
    mortality_data[['AgeGroup', 'DeathRate']]
    .groupby('AgeGroup')
    .agg(['mean', 'median', 'min', 'max', 'std'])
)

In [None]:
# groupby() alone computes nothing; it returns a GroupBy object that the
# aggregation methods operate on. Show its type.
AgeGS = mortality_data.groupby(by="AgeGroup")
type(AgeGS)

In [None]:
# DeathRate statistics per age group, including the count of distinct values.
(
    mortality_data
    .groupby('AgeGroup')['DeathRate']
    .agg(['mean', 'median', 'std', 'nunique'])
)

In [None]:
# DeathRate statistics per year; show the first three years.
(
    mortality_data
    .groupby('Year')['DeathRate']
    .agg(['mean', 'median', 'std', 'min', 'max', 'var', 'nunique'])
    .head(n=3)
)

In [None]:
def fun_1q(foo, q=0.75):
    """Return the ``q`` quantile of ``foo``.

    NOTE: despite the ``1q`` in the name, the default is the 0.75 quantile
    (the third quartile); pass ``q=0.25`` for the first quartile.

    Args:
        foo: Any object with a ``quantile`` method, such as a pandas Series.
        q: Quantile level passed straight to ``foo.quantile``.

    Returns:
        Whatever ``foo.quantile(q)`` returns.
    """
    return foo.quantile(q)

# Apply the helper to the DeathRate column.
fun_1q(mortality_data['DeathRate'])

## Visualize the data

In [None]:
# DeathRate by year, one column per age group.
mortality_data.pivot(
    index='Year',
    columns='AgeGroup',
    values='DeathRate',
)

In [None]:
# One line per age group, with Year on the x-axis.
(
    mortality_data
    .pivot(index='Year', columns='AgeGroup', values='DeathRate')
    .plot.line()
)

In [None]:
# Mean, median and standard deviation of DeathRate for each age group.
(
    mortality_data
    .groupby('AgeGroup')['DeathRate']
    .agg(['mean', 'median', 'std'])
)

In [None]:
# The same per-age-group statistics drawn as a horizontal bar chart.
(
    mortality_data
    .groupby('AgeGroup')['DeathRate']
    .agg(['mean', 'median', 'std'])
    .plot(kind='barh')
)