In [None]:
!pip install pandas --user

In [None]:
!pip install matplotlib --user

In [None]:
!pip install argparse --user

In [None]:
# Import Libraries
import pandas as pd
import zipfile
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

In [None]:
"""This function takes as input a path and a productID associated to a table obtained from StatsCan
    In this exercise we will explore the dataset https://www150.statcan.gc.ca/t1/tbl1/en/tv.action?pid=1310011101#tables 
""" 
def read_data_compute_df(path_to_data, productID=None):
    """Load a table from a zipped CSV download into a DataFrame.

    The CSV is parsed straight out of the archive; nothing is extracted to disk.

    Parameters
    ----------
    path_to_data : str or path-like
        Location of the zip archive.
    productID : int or str, optional
        Product ID of the table. When given, the archive member
        ``<productID>.csv`` is read. When omitted, the archive must contain
        exactly one ``.csv`` member whose name does not end in
        ``_MetaData.csv``; that member is read.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the selected CSV member.

    Raises
    ------
    ValueError
        If ``productID`` is omitted and the data file cannot be identified
        unambiguously from the archive contents.
    KeyError
        If ``<productID>.csv`` is not a member of the archive.
    """
    # Context managers make sure the archive and the member are closed again.
    with zipfile.ZipFile(path_to_data) as zf:
        if productID is None:
            # NOTE(review): the archive is presumed to carry a companion
            # "<productID>_MetaData.csv" next to the data file -- confirm.
            candidates = [
                name for name in zf.namelist()
                if name.lower().endswith('.csv')
                and not name.lower().endswith('_metadata.csv')
            ]
            if len(candidates) != 1:
                raise ValueError(
                    "Cannot infer the data file from the archive (found "
                    + str(candidates) + "); pass productID explicitly."
                )
            member = candidates[0]
        else:
            member = str(productID) + '.csv'
        # Parse csv content without unzipping
        with zf.open(member) as csv_file:
            df = pd.read_csv(csv_file, low_memory=False)

    return df

In [None]:
# Read data
# Location of the downloaded zip archive -- fill this in before running the cell.
path = ""
# Product ID of the table; it names the "<productID>.csv" file inside the archive.
# NOTE(review): 13100111 is presumed from the pid (1310011101) in the dataset
# URL -- confirm against the actual archive contents.
productID = 13100111
# The reader requires the archive path; calling it without arguments raises TypeError.
cancer_data = read_data_compute_df(path)

In [None]:
# Clean up data
# Discard the columns that none of the visible analysis cells reference.
clean_cancer_data = cancer_data.drop(
    labels=[
        "DGUID", "UOM_ID", "UOM", "SCALAR_ID", "VECTOR",
        "COORDINATE", "STATUS", "TERMINATED", "DECIMALS", "SYMBOL",
    ],
    axis="columns",
)

In [None]:
# Get data subsets
# Distinct values of every dimension, looked up by column name (the same names
# the filtering cell uses) so the result does not depend on column order.
dates = clean_cancer_data["REF_DATE"].unique()
geog = clean_cancer_data["GEO"].unique()
age_group = clean_cancer_data["Age group"].unique()
sex = clean_cancer_data["Sex"].unique()
cancer_type = clean_cancer_data["Primary types of cancer (ICD-O-3)"].unique()
characteristics = clean_cancer_data["Characteristics"].unique()

In [None]:
# Specify values
# Cancer type analysed in this example: lung and bronchus.
# Every other available type is listed in the cancer_type variable.
cancer = 'Lung and bronchus [C34.0-C34.9]'
# The remaining selections are picked by position from the unique-value arrays.
geoLoc = geog[0]
eage = age_group[7]
esex = sex[0]
year = dates[15]

In [None]:
# Extract subsets of data
# Each step narrows the previous frame by one criterion and keeps its own name.
df_subdata_time = clean_cancer_data.loc[lambda frame: frame['REF_DATE'] == year]
df_subdata_time_sex = df_subdata_time.loc[lambda frame: frame["Sex"] == esex]
# No age-group restriction is applied (eage is not used here): this name simply
# refers to the sex-filtered frame, so every age group stays available.
df_subdata_time_sex_age = df_subdata_time_sex
df_subdata_time_sex_age_geo = df_subdata_time_sex_age.loc[
    lambda frame: frame["GEO"] == geoLoc
]
df_subdata_time_sex_age_geo_canc = df_subdata_time_sex_age_geo.loc[
    lambda frame: frame["Primary types of cancer (ICD-O-3)"] == cancer
]
df_subdata_time_sex_age_geo_canc_char = df_subdata_time_sex_age_geo_canc.loc[
    lambda frame: frame["Characteristics"] == "Number of new cancer cases"
]
# Preview the first rows of the fully filtered table.
df_subdata_time_sex_age_geo_canc_char.head(10)

In [None]:
# Plot our table
plot_data = df_subdata_time_sex_age_geo_canc_char
# Specify figure size (explicit figure/axes handles instead of pyplot's implicit state)
fig, ax = plt.subplots(figsize=(15, 10))
# Plot a bar chart of data subset
ax.bar(plot_data["Age group"], plot_data["VALUE"], label=cancer)
# Add a title: join the distinct characteristic names as plain text, so the
# heading does not show a NumPy array repr such as "['Number of new cancer cases']"
title_text = ", ".join(str(name) for name in plot_data["Characteristics"].unique())
ax.set_title(title_text + ", " + str(year), fontsize=25)
# Add a legend
ax.legend(loc='upper left', bbox_to_anchor=(0.7, 1))
# Add x and y labels
ax.set_ylabel("Number of individuals", fontsize=15)
ax.set_xlabel("Age group", fontsize=15)
# Formatting the plot: rotate x axis labels
plt.xticks(rotation=90)
# Save image to png. The file name is built exactly as before so that existing
# references to the saved file keep working.
fig.savefig(str(plot_data["Characteristics"].unique()) + "_" + str(plot_data["Primary types of cancer (ICD-O-3)"].unique()) + '.png')