# Chapter 2: The Pandas essentials for data analysis

In [1]:
import pandas as pd

## Get the data

### Read a CSV file from a website into a DataFrame

In [2]:
# Source: CSV export served by data.cdc.gov; it is fetched over the network
# every time this cell runs.
url = 'https://data.cdc.gov/api/views/v6ab-adf5/rows.csv?accessType=DOWNLOAD'
# read_csv accepts a URL directly, so no separate download step is needed.
mortality_data = pd.read_csv(filepath_or_buffer=url)

In [3]:
# Print the schema summary: column names, non-null counts, dtypes, memory use.
mortality_data.info()

# Bare last expression: renders the frame itself as the cell's output.
mortality_data

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 476 entries, 0 to 475
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Year        476 non-null    int64  
 1   Age Group   476 non-null    object 
 2   Death Rate  476 non-null    float64
dtypes: float64(1), int64(1), object(1)
memory usage: 11.3+ KB


Unnamed: 0,Year,Age Group,Death Rate
0,1900,1-4 Years,1983.8
1,1901,1-4 Years,1695.0
2,1902,1-4 Years,1655.7
3,1903,1-4 Years,1542.1
4,1904,1-4 Years,1591.5
...,...,...,...
471,2014,15-19 Years,45.5
472,2015,15-19 Years,48.3
473,2016,15-19 Years,51.2
474,2017,15-19 Years,51.5


In [4]:
# Parse the integer Year column into timestamps; with format "%Y" each year
# becomes January 1st of that year.
# Bracket indexing is used on both sides: attribute-style assignment only
# updates a column that already exists, and otherwise just sets a plain Python
# attribute on the frame (pandas emits a warning but creates no column).
mortality_data['Year'] = pd.to_datetime(mortality_data['Year'], format="%Y")

mortality_data

Unnamed: 0,Year,Age Group,Death Rate
0,1900-01-01,1-4 Years,1983.8
1,1901-01-01,1-4 Years,1695.0
2,1902-01-01,1-4 Years,1655.7
3,1903-01-01,1-4 Years,1542.1
4,1904-01-01,1-4 Years,1591.5
...,...,...,...
471,2014-01-01,15-19 Years,45.5
472,2015-01-01,15-19 Years,48.3
473,2016-01-01,15-19 Years,51.2
474,2017-01-01,15-19 Years,51.5


In [5]:
# Make Year the (datetime) index used by the reindex/resample cells, and switch
# to underscore column names so they can be written in query() strings without
# backtick quoting.
# Both steps are done in one non-mutating chain instead of inplace=True, so a
# failure in either step leaves mortality_data untouched rather than
# half-transformed.
mortality_data = (
    mortality_data
    .set_index('Year')
    .rename(columns={'Age Group': 'Age_Group', 'Death Rate': 'Death_Rate'})
)

mortality_data

Unnamed: 0_level_0,Age_Group,Death_Rate
Year,Unnamed: 1_level_1,Unnamed: 2_level_1
1900-01-01,1-4 Years,1983.8
1901-01-01,1-4 Years,1695.0
1902-01-01,1-4 Years,1655.7
1903-01-01,1-4 Years,1542.1
1904-01-01,1-4 Years,1591.5
...,...,...
2014-01-01,15-19 Years,45.5
2015-01-01,15-19 Years,48.3
2016-01-01,15-19 Years,51.2
2017-01-01,15-19 Years,51.5


In [6]:
# One timestamp per year start (freq 'YS'), from 1900 through 2018 inclusive.
dateRange = pd.date_range(
    start='01/01/1900',
    end='01/01/2018',
    freq='YS',
)

In [7]:

# Keep only the "1-4 Years" rows, then align them to the yearly calendar in
# dateRange; a year with no matching row would appear as a row of NaN.
(
    mortality_data
    .query('Age_Group == "1-4 Years"')
    .reindex(dateRange)
)



Unnamed: 0,Age_Group,Death_Rate
1900-01-01,1-4 Years,1983.8
1901-01-01,1-4 Years,1695.0
1902-01-01,1-4 Years,1655.7
1903-01-01,1-4 Years,1542.1
1904-01-01,1-4 Years,1591.5
...,...,...
2014-01-01,1-4 Years,24.0
2015-01-01,1-4 Years,24.9
2016-01-01,1-4 Years,25.3
2017-01-01,1-4 Years,24.3


In [8]:
# Mean death rate per 5-year bin (rule '5YS': bins anchored on year starts).
# Death_Rate is selected explicitly because Age_Group holds strings: pandas 2.x
# raises a TypeError when mean() meets a non-numeric column, whereas older
# versions silently dropped it. The result is still a one-column DataFrame.
# NOTE: each bin pools the rows of every age group together.
mortality_data[['Death_Rate']].resample(rule='5YS').mean()

Unnamed: 0_level_0,Death_Rate
Year,Unnamed: 1_level_1
1900-01-01,713.44
1905-01-01,624.815
1910-01-01,515.42
1915-01-01,534.995
1920-01-01,399.045
1925-01-01,332.93
1930-01-01,267.375
1935-01-01,218.98
1940-01-01,154.49
1945-01-01,112.84


In [9]:
# Display the frame again: the query/reindex and resample cells above did not
# assign their results, so mortality_data is still the Year-indexed frame.
mortality_data

Unnamed: 0_level_0,Age_Group,Death_Rate
Year,Unnamed: 1_level_1,Unnamed: 2_level_1
1900-01-01,1-4 Years,1983.8
1901-01-01,1-4 Years,1695.0
1902-01-01,1-4 Years,1655.7
1903-01-01,1-4 Years,1542.1
1904-01-01,1-4 Years,1591.5
...,...,...
2014-01-01,15-19 Years,45.5
2015-01-01,15-19 Years,48.3
2016-01-01,15-19 Years,51.2
2017-01-01,15-19 Years,51.5


In [10]:
 rolling_mortality = mortality_data.set_index('Age_Group', append=True) \
    .rolling(window=5, min_periods=1).mean() 

rolling_mortality

Unnamed: 0_level_0,Unnamed: 1_level_0,Death_Rate
Year,Age_Group,Unnamed: 2_level_1
1900-01-01,1-4 Years,1983.800000
1901-01-01,1-4 Years,1839.400000
1902-01-01,1-4 Years,1778.166667
1903-01-01,1-4 Years,1719.150000
1904-01-01,1-4 Years,1693.620000
...,...,...
2014-01-01,15-19 Years,47.160000
2015-01-01,15-19 Years,46.940000
2016-01-01,15-19 Years,47.400000
2017-01-01,15-19 Years,48.260000
