In [None]:
# Data set: New York City Leading Causes of Death
# https://data.cityofnewyork.us/Health/New-York-City-Leading-Causes-of-Death/jb7j-dtam

%matplotlib inline
import requests
import json
import pandas as pd
import numpy as np
  

In [None]:
# Download the data set as JSON from the NYC open data portal.
# We use HTTPS, and we bound the wait with a timeout so that a stalled
# connection cannot hang the notebook forever.
url = 'https://data.cityofnewyork.us/api/views/jb7j-dtam/rows.json'
resp = requests.get(url, timeout=30)

# Fail loudly on an HTTP error (4xx/5xx) instead of trying to parse an error page as JSON
resp.raise_for_status()

# Parse the JSON body of the response into Python dictionaries/lists
results = resp.json()

In [None]:
# The actual rows of the data set are stored under the "data" key of the response
data = results["data"]

# Load the rows into a pandas dataframe (no column names yet) and display it
df = pd.DataFrame(data=data)
df

In [None]:
# The table is hard to read without column names...

# The descriptions and names of the columns are stored in the metadata of the view
columns = results["meta"]["view"]["columns"]

# Collect the field name of every column, so that we can label the dataframe with them
headers = [column["fieldName"] for column in columns]

# Rebuild the dataframe, this time passing the list of column names as well
df = pd.DataFrame(data, columns=headers)
df

In [None]:
# Not all of these columns are useful. Let's remove the ones that we will definitely not use
#
# "axis=1" means that the labels we pass refer to columns
# (with "axis=0" we would be dropping the rows with the passed id's instead)
#
# drop() returns a new dataframe with fewer columns; we assign it back to "df",
# so that "df" now refers to the slimmer dataframe.
#
unused_columns = [':sid', ':position', ':meta', ':created_meta', ':updated_meta']
df = df.drop(labels=unused_columns, axis=1)
df

In [None]:
# We do not like some of these column names, so let's rename them.
#
# The dictionary maps each existing column name to its new name;
# rename() returns a new dataframe, which we assign back to "df"
df = df.rename(
    columns={
        ':id': 'key',
        ':created_at': 'created_at',
        ':updated_at': 'updated_at',
    }
)
df

In [None]:
# Use the "key" column as the index (the "primary key") of the table
df = df.set_index(keys="key")
df

In [None]:
# Check the data type that pandas assigned to each column
df.dtypes

In [None]:
# The year, count and percent columns hold numbers, so let's convert them to numeric types
for numeric_column in ("year", "count", "percent"):
    df[numeric_column] = pd.to_numeric(df[numeric_column])
df.dtypes

In [None]:
# The timestamps get converted to dates as well.
#
# unit='s' tells pandas that the values are seconds since the epoch. This is equivalent to
# import datetime
# df["created_at"] = map(datetime.datetime.utcfromtimestamp, df["created_at"])

for timestamp_column in ("created_at", "updated_at"):
    df[timestamp_column] = pd.to_datetime(df[timestamp_column], unit='s')
df.dtypes


In [None]:

# These columns hold labels that repeat across rows, so we store them as categorical variables
for label_column in ("sex", "ethnicity", "cause_of_death"):
    df[label_column] = pd.Categorical(df[label_column])
df.dtypes

In [None]:
# Let's take another look at the dataframe, after the type conversions above
df

In [None]:
# Count how many rows we have for each ethnicity
df["ethnicity"].value_counts()

In [None]:
# Count how many rows we have for each cause of death
df["cause_of_death"].value_counts()

In [None]:
# Count how many rows we have for each sex
df["sex"].value_counts()

In [None]:
# Let's create a pivot table now: one row per cause of death, one column per
# (sex, ethnicity) pair, and each cell holds the sum of "count" for that combination
#
# We pass the aggregation by name ("sum"): recent pandas versions emit a FutureWarning
# when they are given the numpy function np.sum here, while the result is the same
pivot = pd.pivot_table(
    df,
    values='count',
    index=['cause_of_death'],
    columns=['sex', 'ethnicity'],
    aggfunc='sum',
)
pivot

In [None]:
# Transposing the dataframe is easy: ".T" swaps the rows with the columns
pivot.T

In [None]:
# And of course we can plot: one bar per (sex, ethnicity) pair,
# showing the summed counts for diseases of the heart
by_group = pivot.transpose()
heart_disease = by_group["DISEASES OF HEART"]
heart_disease.plot.bar()