In [None]:
import numpy as np
import pandas as pd
import pylab as plt
import matplotlib

%matplotlib inline
%config InlineBackend.figure_format = 'retina' ##better figure resolution

In [None]:
# First attempt: read the cleaned gapminder table through a path relative to
# the notebook's working directory, then preview the first five rows.
cleaned_data_location = 'data/gapminder_cleaned.csv'
df = pd.read_csv(filepath_or_buffer=cleaned_data_location)
df.head(n=5)
# did not work :( -- the following cells retry this load defensively

In [None]:
# Try defensive coding now: catch the missing-file error and report the path
# that was tried instead of letting the traceback end the cell.
cleaned_data_location = 'data/gapminder_cleaned.csv'

try:
    df = pd.read_csv(cleaned_data_location)
except FileNotFoundError:
    # this cell does not assign `df` when the file is missing
    print("Could not find data file, check path? You tried", cleaned_data_location)


In [None]:
VERBOSE = True  # when True, print a preview of the frame after loading

# now try returning to parent directory first
cleaned_data_location = '../data/gapminder_cleaned.csv'

try:
    # keep the guarded region down to the one call that can raise
    df = pd.read_csv(cleaned_data_location)
except FileNotFoundError:
    print("Could not find data file, check path? You tried", cleaned_data_location)
else:
    # only reached when the read succeeded, so `df` is the freshly loaded frame
    if VERBOSE:
        print(df.head())


In [None]:
# Sanity check on the loaded data: after sorting the distinct years,
# the last (largest) one must be 2007.
years = np.sort(df['year'].unique())
assert years[-1] == 2007, 'Wrong years'

In [None]:
# Mean life expectancy for the continent 'asia', one value per year.
category = 'lifeexp'
continent = 'asia'

# keep only the rows that belong to the chosen continent
mask_continent = df['continent'] == continent
df_continent = df.loc[mask_continent]

# one (continent, year, mean) record per distinct year of that continent
years = df_continent['year'].unique()
summary = []
for year in years:
    mask_year = df_continent['year'] == year
    df_year = df_continent.loc[mask_year]
    value = np.mean(df_year[category])
    summary.append((continent, year, value))

# there should be 12 records for the continent
assert len(summary) == 12

# turn the summary into a data frame so we can easily visualize it
result_df = pd.DataFrame(data=summary, columns=['continent', 'year', category])
result_df

In [None]:
# Line plot of the per-year values computed above, then write the current
# figure to disk (dpi=250 for better resolution than the default).
result_df.plot.line(x='year', y=category, label='life expectancy')
plt.savefig(fname='fig.png', dpi=250)

In [None]:
def calculate_mean_over_time(data, category, continent, verbose=False):
    """Calculate the mean of a category for one continent, for every year.

    Args:
        data: a data frame with 'continent' and 'year' columns plus `category`
        category: name of the column to average (e.g. 'lifeexp')
        continent: value of the 'continent' column to select (e.g. 'asia')
        verbose: when True, print each year as it is processed

    Returns:
        a data frame with columns 'continent', 'year' and `category`,
        one row per distinct year of the selected continent
    """
    # restrict the frame to the rows of the requested continent
    subset = data.loc[data['continent'] == continent]

    # one (continent, year, mean) record per distinct year
    rows = []
    for yr in subset['year'].unique():
        if verbose:
            print(yr)
        yearly_values = subset.loc[subset['year'] == yr, category]
        rows.append((continent, yr, np.mean(yearly_values)))

    # There should be 12 records for each continent
    assert len(rows) == 12

    # a data frame is easier to inspect and plot than a list of tuples
    return pd.DataFrame(rows, columns=['continent', 'year', category])

In [None]:
# mean life expectancy per year for Asia, via the function defined above
calculate_mean_over_time(data=df, category='lifeexp', continent='asia')

In [None]:
def calculate_stat_over_time(data, category, continent, func, expected_years=12):
    """Calculate any stat of any category for a continent by all years

    Args:
        data: a data frame with 'continent' and 'year' columns plus `category`
        category: one of the column headers of the data frame (e.g. 'lifeexp')
        continent: possible value of continent column in that data frame (e.g. 'asia')
        func: the function to apply to data values (e.g. np.mean)
        expected_years: number of distinct years the continent must have
            (default 12); pass None to skip this check

    Returns:
        a summary table of value per year, with columns 'continent', 'year'
        and `category`

    Raises:
        AssertionError: if an argument fails validation, or if the number of
            years found differs from `expected_years`
    """

    #validate arguments
    assert category in data.columns.values, 'Category must be one of the data frame columns'
    assert 'continent' in data.columns.values, 'Continent column not found in data frame'
    assert continent in data['continent'].unique(), 'Continent value not found in data frame'

    #create a mask that selects the continent of choice
    mask_continent = data['continent'] == continent
    data_continent = data[mask_continent]

    #loop over years and calculate the statistic of interest
    years = data_continent['year'].unique()
    summary = []
    for year in years:
        mask_year = data_continent['year'] == year
        data_year = data_continent[mask_year]
        #func is passed in as a callable (e.g. np.mean), so any function
        #works and no function name has to be eval()-ed from a string
        value = func(data_year[category])
        summary.append((continent, year, value))

    #by default there should be 12 records for each continent
    if expected_years is not None:
        assert len(summary) == expected_years, (
            'Expected {} years for continent {!r}, found {}'.format(
                expected_years, continent, len(summary)))

    #turn the summary into a data frame so we can easily visualize it
    result_data = pd.DataFrame(summary, columns = ['continent', 'year', category])
    return result_data

    

In [None]:
# same summary as before, now passing the statistic (np.mean) as an argument
calculate_stat_over_time(data=df, category='lifeexp', continent='asia', func=np.mean)

In [None]:
# print the help text for calculate_stat_over_time, which includes its docstring
help(calculate_stat_over_time)

In [None]:
#plot life expectancy over time for all continents
continents = df['continent'].unique()
category = 'lifeexp'
func = np.mean  # the same statistic is used for every continent, so set it once
fig, ax = plt.subplots()

# draw one line per continent on the shared axes
for continent in continents:
    output = calculate_stat_over_time(df, category, continent, func)
    output.plot.line('year', category, ax=ax, label=continent)

# label the axes so the figure can be read on its own;
# the trailing ';' keeps the notebook from echoing the Text repr
ax.set_xlabel('year')
ax.set_ylabel('mean ' + category)
ax.set_title('Mean ' + category + ' over time by continent');