In [2]:
# importing data manipulation as well as plotting packages
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [7]:
# Load the Environment Canada data set, keyed by its DATE column, and drop the
# two columns that arrive empty in the same step
gwv_data = pd.read_csv('data/c46131.csv', sep=',', index_col='DATE').drop(
    ['WSS$', 'WSS$.1'], axis=1
)

In [33]:
# Load the zooplankton data set, keyed by its Date column, then swap that index
# for its pandas datetime equivalent
zoo = pd.read_csv('data/Zooplankton_2009-2012.csv', sep=',', index_col='Date')
zoo = zoo.set_index(pd.to_datetime(zoo.index, unit='d'))

In [37]:
# Convert the data index (time) into pandas datetime format so that the
# time-based resampling below can be applied.
# NOTE(review): `unit` is only meaningful for numeric index values -- confirm
# that it matches the raw DATE column.
gwv_data.index = pd.to_datetime(gwv_data.index, unit='m')

# Collapse the data so that everything is averaged over weekly periods.
# `resample(..., how=...)` is no longer accepted by pandas, so the aggregation
# is called on the resampler instead; numeric_only=True leaves non-numeric
# columns out of the average rather than raising on them.
gwv_weekly = gwv_data.resample('W').mean(numeric_only=True)

# Test Driven Development

For my final project, I want to create a series of time-series plots that include my three variables of focus (zooplankton abundance, sea surface temperature, and significant wave height) for each year of the zooplankton data (2009-2012). To make these plots in an organized fashion, I need to pull the zooplankton data and the environmental data from their respective dataframes for each spring (2009-2012). Instead of repeating the same code for each year to match the time series of both data tables for plotting purposes, I think it would be easier to develop a function that does this automatically. I want this function to be general enough to work for any two of my datasets (a combination of environmental data and spring bloom data). Ideally, the function would convert each dataset into a data table containing weekly averages of the data over the spring season of a desired year. Thus I envision the function requiring three inputs: my two datasets and the year of the spring desired by the user.

To develop this function, I will use test driven development. Since this is predominantly a data munging function, I will test its functionality by indexing a single data point with `.iloc[1, 4]` (row position 1 and column position 4, counting from zero). I compare this value between the observation and the expectation inside the test functions. For the function's actual use, however, I plan to make it return the new dataframes for both datasets. Additionally, my function will assume that both datasets have a date-based index. Below I will build my function piece by piece and use a new test each time to ensure its functionality.

This first step of the function aims to resample any general dataset and average it over week-long periods.

In [97]:
def organize_1(dataset_1):
    '''Average a date-indexed dataset per week and return one probe value.

    Parameters
    ----------
    dataset_1 : pandas.DataFrame
        Table whose index holds the observation times. The caller's table is
        not modified; the conversion is done on a copy.
        NOTE(review): the index is converted with ``unit='m'``, which is only
        meaningful for numeric index values -- confirm against the raw data.

    Returns
    -------
    scalar
        The weekly mean found at ``.iloc[1, 4]`` (row position 1, column
        position 4) of the resampled table. Only numeric columns are
        averaged, so the column position counts numeric columns only.

    Raises
    ------
    IndexError
        If the weekly table has fewer than 2 rows or fewer than 5 columns.
    '''
    # Work on a copy so the caller's index is not silently rewritten
    dataset_1_indexed = dataset_1.copy()
    dataset_1_indexed.index = pd.to_datetime(dataset_1.index, unit='m')

    # Resampling into weekly averages. The removed `how=` keyword is replaced
    # by calling the aggregation on the resampler.
    dataset_1_weekly = dataset_1_indexed.resample('W').mean(numeric_only=True)

    # producing an output to test the function
    return dataset_1_weekly.iloc[1, 4]

In [98]:
def test_organize_1():
    '''Check organize_1 against a manual weekly resample of gwv_data.

    Reads the module-level ``gwv_data`` table and raises AssertionError when
    the value returned by organize_1 differs from the manually computed one.
    '''
    # Changing the data index (time) into pandas datetime format, on a copy so
    # that running the test leaves gwv_data itself untouched
    gwv_indexed = gwv_data.copy()
    gwv_indexed.index = pd.to_datetime(gwv_data.index, unit='m')

    # Collapsing the data so that all data are averaged over a weekly period
    # (the removed `how=` keyword is replaced by the resampler method)
    gwv_weekly = gwv_indexed.resample('W').mean(numeric_only=True)

    # Testing my manual method of data resampling against the automated
    # function by comparing the value at row position 1, column position 4
    expected = gwv_weekly.iloc[1, 4]
    obs = organize_1(gwv_data)

    # NaN never compares equal to itself, so two missing values are accepted
    # explicitly as agreement between the two methods
    assert expected == obs or (pd.isna(expected) and pd.isna(obs)), (
        'organize_1 returned %r, manual resample gave %r' % (obs, expected)
    )

In [99]:
# Run the test; it finishes silently when its assertion holds
test_organize_1()

In [111]:
def organize_2(dataset_1, year):
    '''Average a date-indexed dataset per week over the spring of 2010.

    Parameters
    ----------
    dataset_1 : pandas.DataFrame
        Table whose index holds the observation times. The caller's table is
        not modified; the conversion is done on a copy.
        NOTE(review): the index is converted with ``unit='m'``, which is only
        meaningful for numeric index values -- confirm against the raw data.
    year : int
        Spring to extract. Only 2010 is supported at this step.

    Returns
    -------
    scalar
        The weekly mean found at ``.iloc[1, 4]`` (row position 1, column
        position 4) of the spring subset. Only numeric columns are averaged.

    Raises
    ------
    ValueError
        If ``year`` has no spring window defined.
    IndexError
        If the spring subset has fewer than 2 rows or fewer than 5 columns.
    '''
    # Spring plankton bloom window (first day, last day) per supported year
    spring_windows = {2010: ('2010-03-03', '2010-6-28')}
    if year not in spring_windows:
        # Previously an unsupported year fell through to unbound variables
        raise ValueError(
            'organize_2 has no spring window for year %r; supported years: %s'
            % (year, sorted(spring_windows))
        )
    first, last = spring_windows[year]

    # Work on a copy so the caller's index is not silently rewritten
    dataset_1_indexed = dataset_1.copy()
    dataset_1_indexed.index = pd.to_datetime(dataset_1.index, unit='m')

    # Resampling into weekly averages. The removed `how=` keyword is replaced
    # by calling the aggregation on the resampler.
    dataset_1_weekly = dataset_1_indexed.resample('W').mean(numeric_only=True)

    # Subsetting the weekly data for the requested spring
    dataset_1_year_weekly = dataset_1_weekly.loc[first:last]

    # producing an output to test the function
    return dataset_1_year_weekly.iloc[1, 4]





In [112]:
def test_organize_2():
    '''Check organize_2 against a manual weekly resample for spring 2010.

    Reads the module-level ``gwv_data`` table and raises AssertionError when
    the value returned by organize_2 differs from the manually computed one.
    '''
    # Changing the gwv_data index (time) into pandas datetime format, on a copy
    # so that running the test leaves gwv_data itself untouched
    gwv_indexed = gwv_data.copy()
    gwv_indexed.index = pd.to_datetime(gwv_data.index, unit='m')

    # Collapsing the data so that all data are averaged over a weekly period
    # (the removed `how=` keyword is replaced by the resampler method)
    gwv_weekly = gwv_indexed.resample('W').mean(numeric_only=True)

    # Subsetting the gwv_data for the spring of 2010
    gwv_year_weekly = gwv_weekly.loc['2010-03-03':'2010-6-28']

    # Testing my manual method of data resampling against the automated
    # function by comparing the value at row position 1, column position 4
    expected = gwv_year_weekly.iloc[1, 4]
    obs = organize_2(gwv_data, 2010)

    # NaN never compares equal to itself, so two missing values are accepted
    # explicitly as agreement between the two methods
    assert expected == obs or (pd.isna(expected) and pd.isna(obs)), (
        'organize_2 returned %r, manual resample gave %r' % (obs, expected)
    )

In [113]:
# Re-run every test written so far; each one finishes silently when its
# assertion holds
test_organize_1()
test_organize_2()

In [114]:
def organize_3(dataset_1, year):
    '''Average a date-indexed dataset per week over the spring of 2009 or 2010.

    Parameters
    ----------
    dataset_1 : pandas.DataFrame
        Table whose index holds the observation times. The caller's table is
        not modified; the conversion is done on a copy.
        NOTE(review): the index is converted with ``unit='m'``, which is only
        meaningful for numeric index values -- confirm against the raw data.
    year : int
        Spring to extract: 2009 or 2010.

    Returns
    -------
    scalar
        The weekly mean found at ``.iloc[1, 4]`` (row position 1, column
        position 4) of the spring subset. Only numeric columns are averaged.

    Raises
    ------
    ValueError
        If ``year`` has no spring window defined.
    IndexError
        If the spring subset has fewer than 2 rows or fewer than 5 columns.
    '''
    # Spring plankton bloom window (first day, last day) per supported year
    spring_windows = {
        2009: ('2009-02-24', '2009-07-05'),
        2010: ('2010-03-03', '2010-6-28'),
    }
    if year not in spring_windows:
        # Previously an unsupported year fell through to unbound variables
        raise ValueError(
            'organize_3 has no spring window for year %r; supported years: %s'
            % (year, sorted(spring_windows))
        )
    first, last = spring_windows[year]

    # Work on a copy so the caller's index is not silently rewritten
    dataset_1_indexed = dataset_1.copy()
    dataset_1_indexed.index = pd.to_datetime(dataset_1.index, unit='m')

    # Resampling into weekly averages. The removed `how=` keyword is replaced
    # by calling the aggregation on the resampler.
    dataset_1_weekly = dataset_1_indexed.resample('W').mean(numeric_only=True)

    # Subsetting the weekly data for the requested spring
    dataset_1_year_weekly = dataset_1_weekly.loc[first:last]

    # producing an output to test the function
    return dataset_1_year_weekly.iloc[1, 4]


In [116]:
def test_organize_3():
    '''Check organize_3 against a manual weekly resample for spring 2009.

    Reads the module-level ``gwv_data`` table and raises AssertionError when
    the value returned by organize_3 differs from the manually computed one.
    '''
    # Changing the data index (time) into pandas datetime format, on a copy so
    # that running the test leaves gwv_data itself untouched
    gwv_indexed = gwv_data.copy()
    gwv_indexed.index = pd.to_datetime(gwv_data.index, unit='m')

    # Collapsing the data so that all data are averaged over a weekly period
    # (the removed `how=` keyword is replaced by the resampler method)
    gwv_weekly = gwv_indexed.resample('W').mean(numeric_only=True)

    # Subsetting the gwv_data for the spring of 2009
    gwv_year_weekly = gwv_weekly.loc['2009-02-24':'2009-07-05']

    # Testing my manual method of data resampling against the automated
    # function by comparing the value at row position 1, column position 4
    expected = gwv_year_weekly.iloc[1, 4]
    obs = organize_3(gwv_data, 2009)

    # NaN never compares equal to itself, so two missing values are accepted
    # explicitly as agreement between the two methods
    assert expected == obs or (pd.isna(expected) and pd.isna(obs)), (
        'organize_3 returned %r, manual resample gave %r' % (obs, expected)
    )

In [117]:
# Re-run every test written so far; each one finishes silently when its
# assertion holds
test_organize_1()
test_organize_2()
test_organize_3()

In [118]:
def organize_4(dataset_1, year):
    '''Average a date-indexed dataset per week over one spring, 2009 to 2012.

    Parameters
    ----------
    dataset_1 : pandas.DataFrame
        Table whose index holds the observation times. The caller's table is
        not modified; the conversion is done on a copy.
        NOTE(review): the index is converted with ``unit='m'``, which is only
        meaningful for numeric index values -- confirm against the raw data.
    year : int
        Spring to extract: 2009, 2010, 2011 or 2012.

    Returns
    -------
    scalar
        The weekly mean found at ``.iloc[1, 4]`` (row position 1, column
        position 4) of the spring subset. Only numeric columns are averaged.

    Raises
    ------
    ValueError
        If ``year`` has no spring window defined.
    IndexError
        If the spring subset has fewer than 2 rows or fewer than 5 columns.
    '''
    # Spring plankton bloom window (first day, last day) per supported year
    spring_windows = {
        2009: ('2009-02-24', '2009-07-05'),
        2010: ('2010-03-03', '2010-6-28'),
        2011: ('2011-03-07', '2011-06-30'),
        2012: ('2012-03-30', '2012-06-25'),
    }
    if year not in spring_windows:
        # Previously an unsupported year fell through to unbound variables
        raise ValueError(
            'organize_4 has no spring window for year %r; supported years: %s'
            % (year, sorted(spring_windows))
        )
    first, last = spring_windows[year]

    # Work on a copy so the caller's index is not silently rewritten
    dataset_1_indexed = dataset_1.copy()
    dataset_1_indexed.index = pd.to_datetime(dataset_1.index, unit='m')

    # Resampling into weekly averages. The removed `how=` keyword is replaced
    # by calling the aggregation on the resampler.
    dataset_1_weekly = dataset_1_indexed.resample('W').mean(numeric_only=True)

    # Subsetting the weekly data for the requested spring
    dataset_1_year_weekly = dataset_1_weekly.loc[first:last]

    # producing an output to test the function
    return dataset_1_year_weekly.iloc[1, 4]


In [119]:
def test_organize_4():
    '''Check organize_4 against a manual weekly resample for spring 2011.

    Reads the module-level ``gwv_data`` table and raises AssertionError when
    the value returned by organize_4 differs from the manually computed one.
    '''
    # Changing the data index (time) into pandas datetime format, on a copy so
    # that running the test leaves gwv_data itself untouched
    gwv_indexed = gwv_data.copy()
    gwv_indexed.index = pd.to_datetime(gwv_data.index, unit='m')

    # Collapsing the data so that all data are averaged over a weekly period
    # (the removed `how=` keyword is replaced by the resampler method)
    gwv_weekly = gwv_indexed.resample('W').mean(numeric_only=True)

    # Subsetting the gwv_data for the spring of 2011
    gwv_year_weekly = gwv_weekly.loc['2011-03-07':'2011-06-30']

    # Testing my manual method of data resampling against the automated
    # function by comparing the value at row position 1, column position 4
    expected = gwv_year_weekly.iloc[1, 4]
    obs = organize_4(gwv_data, 2011)

    # NaN never compares equal to itself, so two missing values are accepted
    # explicitly as agreement between the two methods
    assert expected == obs or (pd.isna(expected) and pd.isna(obs)), (
        'organize_4 returned %r, manual resample gave %r' % (obs, expected)
    )

In [120]:
# Re-run every test written so far; each one finishes silently when its
# assertion holds
test_organize_1()
test_organize_2()
test_organize_3()
test_organize_4()

In [136]:
def organize_5(dataset_1, year, dataset_2):
    '''Average two date-indexed datasets per week for one spring, 2009 to 2012.

    Parameters
    ----------
    dataset_1 : pandas.DataFrame
        First table (index holds the observation times). It is resampled to
        weekly means, cut to the spring window of ``year`` and stripped of
        rows containing NaN.
    year : int
        Spring to extract: 2009, 2010, 2011 or 2012.
    dataset_2 : pandas.DataFrame
        Second table (index holds the observation times). Its rows for
        ``year`` are resampled to weekly means.

    Neither input table is modified; index conversions are done on copies.
    NOTE(review): the indexes are converted with ``unit='m'`` (dataset_1) and
    ``unit='d'`` (dataset_2), which is only meaningful for numeric index
    values -- confirm against the raw data.

    Returns
    -------
    scalar
        The weekly mean found at ``.iloc[1, 4]`` (row position 1, column
        position 4) of the weekly ``dataset_2`` table. Only numeric columns
        are averaged. The weekly ``dataset_1`` table is prepared but not
        returned yet.

    Raises
    ------
    ValueError
        If ``year`` has no spring window defined.
    KeyError
        If ``dataset_2`` holds no rows for ``year``.
    IndexError
        If the weekly ``dataset_2`` table has fewer than 2 rows or fewer than
        5 columns.
    '''
    # Spring plankton bloom window (first day, last day) per supported year
    spring_windows = {
        2009: ('2009-02-24', '2009-07-05'),
        2010: ('2010-03-03', '2010-6-28'),
        2011: ('2011-03-07', '2011-06-30'),
        2012: ('2012-03-30', '2012-06-25'),
    }
    if year not in spring_windows:
        # Previously an unsupported year fell through to unbound variables
        raise ValueError(
            'organize_5 has no spring window for year %r; supported years: %s'
            % (year, sorted(spring_windows))
        )
    first, last = spring_windows[year]

    # Work on copies so the caller's tables are not silently rewritten
    dataset_1_indexed = dataset_1.copy()
    dataset_1_indexed.index = pd.to_datetime(dataset_1.index, unit='m')

    # Resampling dataset_1 into weekly averages. The removed `how=` keyword is
    # replaced by calling the aggregation on the resampler.
    dataset_1_weekly = dataset_1_indexed.resample('W').mean(numeric_only=True)

    # Subsetting dataset_1 for the requested spring and dropping the rows
    # where NaN's are present. Not part of the return value yet.
    dataset_1_year_weekly = dataset_1_weekly.loc[first:last].dropna()

    # Ensuring the dataset_2 index (time) is in pandas datetime format. This
    # converts the argument itself rather than the global `zoo` table.
    dataset_2_indexed = dataset_2.copy()
    dataset_2_indexed.index = pd.to_datetime(dataset_2.index, unit='d')

    # Selecting the rows of the requested year. `.loc` is used because plain
    # `frame['2011']` row lookup is no longer supported by pandas.
    dataset_2_year = dataset_2_indexed.loc[str(int(year))]

    # Now resampling dataset_2 into weekly averages over the desired year
    dataset_2_year_weekly = dataset_2_year.resample('W').mean(numeric_only=True)

    return dataset_2_year_weekly.iloc[1, 4]

In [147]:
def test_organize_5():
    '''Check organize_5 against a manual weekly resample of the 2011 zoo data.

    Reads the module-level ``gwv_data`` and ``zoo`` tables and raises
    AssertionError when the value returned by organize_5 differs from the
    manually computed one.
    '''
    # Resampling gwv_data (environment Canada data) into weekly averages.
    # The index (time) is changed into pandas datetime format on a copy so
    # that running the test leaves gwv_data itself untouched.
    gwv_indexed = gwv_data.copy()
    gwv_indexed.index = pd.to_datetime(gwv_data.index, unit='m')

    # Collapsing the data so that all data are averaged over a weekly period
    # (the removed `how=` keyword is replaced by the resampler method)
    gwv_weekly = gwv_indexed.resample('W').mean(numeric_only=True)

    # Subsetting the gwv_data for the spring of 2011. Not compared below,
    # because organize_5 does not return its weekly dataset_1 table yet.
    gwv_2011_weekly = gwv_weekly.loc['2011-03-07':'2011-06-30']

    # Subsetting dataset_2 for the 2011 season. `.loc` is used because plain
    # `zoo['2011']` row lookup is no longer supported by pandas.
    zoo_2011 = zoo.loc['2011']

    # Resampling subset of dataset_2 into weekly averages
    zoo_2011_weekly = zoo_2011.resample('W').mean(numeric_only=True)

    # Testing my manual method of data resampling against the automated
    # function by comparing the value at row position 1, column position 4
    expected = zoo_2011_weekly.iloc[1, 4]
    obs = organize_5(gwv_data, 2011, zoo)

    # NaN never compares equal to itself, so two missing values are accepted
    # explicitly as agreement between the two methods
    assert expected == obs or (pd.isna(expected) and pd.isna(obs)), (
        'organize_5 returned %r, manual resample gave %r' % (obs, expected)
    )

In [148]:
# Re-run the full test sequence; each test finishes silently when its
# assertion holds
test_organize_1()
test_organize_2()
test_organize_3()
test_organize_4()
test_organize_5()