In [1]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [2]:
# Relative path to the 2016 celebrity-deaths CSV
filename = '../data/celebrity_deaths_2016.csv'

# Read just the two columns this exercise works with
wanted_columns = ['dateofdeath', 'age']
df = pd.read_csv(filename, usecols=wanted_columns)

# Peek at the first rows to confirm the load
df.head()

Unnamed: 0,dateofdeath,age
0,2016-01-01,71
1,2016-01-01,74
2,2016-01-01,79
3,2016-01-01,45
4,2016-01-01,83


In [3]:
# Clean all non-integers from the "age" column.
# (1) Remove all NaNs
df = df.dropna(subset=['age'])

# (2) Remove all non-digits
# The .str accessor only works on string data, so the filter is skipped once
# the column already has an integer dtype (for example when this cell is run a
# second time). Without the guard, a re-run stops with an AttributeError.
if not pd.api.types.is_integer_dtype(df['age']):
    # .copy() gives the filtered rows their own data, so the assignment
    # below cannot be mistaken for a write to a slice of the old frame.
    df = df[df['age'].str.isdigit()].copy()

# (3) Store the ages as 64-bit integers
df['age'] = df['age'].astype(np.int64)

# Beyond 1

Add a new column, `day`, containing the day of the month on which each celebrity died. Then create a multi-index (from `month` and `day`). What was the average age at death from Feb. 15th through July 15th?

In [4]:
# Work from the date strings; the slices below take characters 5-6 as the
# month and everything from position 8 onward as the day.
date_strings = df['dateofdeath']
df['month'] = date_strings.str[5:7]
df['day'] = date_strings.str[8:]

# Build the (month, day) multi-index and sort it in one step
df = df.set_index(['month', 'day']).sort_index()

# Average age for everyone who died from Feb 15th through July 15th
first_day = ('02', '15')
last_day = ('07', '15')
df.loc[first_day:last_day, 'age'].mean()

77.05183037332367

# Beyond 2

The CSV file contains another column, `causeofdeath`. Load that into a data frame, and find the five most common causes of death. Now replace any `NaN` values in that column with the string `'unknown'`, and again find the five most common causes of death.

In [5]:
# Relative path to the 2016 celebrity-deaths CSV
filename = '../data/celebrity_deaths_2016.csv'

# This time also read the cause-of-death column
columns_to_load = ['dateofdeath', 'age', 'causeofdeath']
df = pd.read_csv(filename, usecols=columns_to_load)

# Count how often each cause appears, then show the five most common
top_causes = df['causeofdeath'].value_counts()
top_causes.head()

causeofdeath
 cancer               248
 heart attack         125
 traffic collision     56
 lung cancer           51
 pneumonia             50
Name: count, dtype: int64

In [6]:
# Label every missing cause as 'unknown'. More than 5,000 rows end up with
# that label, so this data set isn't very reliable when it comes to causes
# of death!
causes = df['causeofdeath'].fillna('unknown')
df['causeofdeath'] = causes

# Five most common causes, now including 'unknown'
causes.value_counts().head()

causeofdeath
unknown               5008
 cancer                248
 heart attack          125
 traffic collision      56
 lung cancer            51
Name: count, dtype: int64

# Beyond 3

If someone asks whether cancer is in the top 10 causes, what would you say? Can we be more specific than that?

In [7]:
# The ranking contains a generic "cancer" entry and also specific ones such as
# "lung cancer" and "pancreatic cancer."

# We cannot tell whether plain "cancer" stands for "some other cancer," for a
# death that was not classified precisely, or for something else.

# That makes this an instructive data set: it is not very reliable, at least
# for causes of death. Serious decisions would call for something more rigorous.
counts_by_cause = df['causeofdeath'].value_counts()
counts_by_cause.head(10)

causeofdeath
unknown               5008
 cancer                248
 heart attack          125
 traffic collision      56
 lung cancer            51
 pneumonia              50
 heart failure          49
 shot                   42
 stroke                 36
 pancreatic cancer      35
Name: count, dtype: int64