Build a function that takes the training data and splits it into groups of 3 folds, where each fold corresponds to one year of data. The first two folds are the training data and the last fold is the test data.

| Training | Test |
|--|--|
|2006/2007| 2008 |
|2007/2008| 2009 |
|2008/2009| 2010 |
| ... | ... |
|2013/2014| 2015 |

Prototype steps:
1. get all valid data for a single crossing
2. split data into 8 groups

In [8]:
from dbhelper import PgDB, get_crossings, get_crossingdata, pd_query
import pandas as pd

In [6]:
# Load the table of known crossings (one row per location / lane / direction
# combination, as the output below shows) and display it for reference.
crossings = get_crossings()
crossings

Unnamed: 0_level_0,location_id,lane_id,direction_id,location_name,lane_name,direction_name
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
15,136,0,0,Lynden,Car,Northbound
16,136,0,1,Lynden,Car,Southbound
17,136,4,0,Lynden,Truck,Northbound
18,136,4,1,Lynden,Truck,Southbound
8,135,2,0,Pacific Highway,Bus,Northbound
7,135,2,1,Pacific Highway,Bus,Southbound
6,135,0,0,Pacific Highway,Car,Northbound
5,135,0,1,Pacific Highway,Car,Southbound
10,135,3,0,Pacific Highway,Fast,Northbound
9,135,3,1,Pacific Highway,Fast,Southbound


In [28]:
# Pull only the timestamps of the valid rows for crossing 1, sorted by date.
# NOTE(review): the empty tuple is presumably the (unused) query-parameter
# list expected by pd_query -- confirm against dbhelper.
df = pd_query('select date from crossingdata where valid=1 and crossing_id=1 order by date;', ())

In [29]:
df.head()

Unnamed: 0,date
0,2007-01-01 00:00:00
1,2007-01-01 00:05:00
2,2007-01-01 00:10:00
3,2007-01-01 00:15:00
4,2007-01-01 00:20:00


In [54]:
import numpy as np
import datetime

def cv_folds(X, years=3):
    min_year = X.date.min().year
    max_year = X.date.max().year
        
    cv = []
    test_year = min_year + years - 1
    
    while test_year <= max_year:
        train = X[(X.date >= datetime.date(test_year - years + 1, 1, 1)) & (X.date < datetime.date(test_year, 1, 1))]
        test = X[(X.date >= datetime.date(test_year, 1, 1)) & (X.date < datetime.date(test_year + 1, 1, 1))]
        cv.append((list(train.index), list(test.index)))
        
        test_year += 1

    return cv

cv = cv_folds(df)

In [55]:
def print_top(cv, n=10):
    """Print a short summary of every cross-validation fold.

    For each fold a ``*** <fold number>`` header is printed, followed by one
    line for the training indices and one for the test indices, each showing
    the total count and the first ``n`` index values.

    Parameters
    ----------
    cv : sequence of (train_index, test_index)
        Folds as produced by ``cv_folds``.
    n : int, optional
        How many leading indices to show per line.  Defaults to 10.
    """
    for fold_number, fold in enumerate(cv):
        # A single pre-formatted string keeps the output identical whether
        # ``print`` is the Python 2 statement or the Python 3 function.
        print("*** %d" % fold_number)
        for indices in (fold[0], fold[1]):
            # Honour ``n`` instead of the previously hard-coded 5.
            print("%d -- %s" % (len(indices), indices[:n]))
            
print_top(cv, 5)

*** 0
168412 -- [0, 1, 2, 3, 4]
104557 -- [168412, 168413, 168414, 168415, 168416]
*** 1
180713 -- [92256, 92257, 92258, 92259, 92260]
101491 -- [272969, 272970, 272971, 272972, 272973]
*** 2
206048 -- [168412, 168413, 168414, 168415, 168416]
101973 -- [374460, 374461, 374462, 374463, 374464]
*** 3
203464 -- [272969, 272970, 272971, 272972, 272973]
97619 -- [476433, 476434, 476435, 476436, 476437]
*** 4
199592 -- [374460, 374461, 374462, 374463, 374464]
100605 -- [574052, 574053, 574054, 574055, 574056]
*** 5
198224 -- [476433, 476434, 476435, 476436, 476437]
103579 -- [674657, 674658, 674659, 674660, 674661]
*** 6
204184 -- [574052, 574053, 574054, 574055, 574056]
94340 -- [778236, 778237, 778238, 778239, 778240]


In [50]:
# Repeat the split with the modelhelper implementation and a 5-year window;
# the fold sizes printed below correspond to 4 training years + 1 test year.
# NOTE(review): cvfolds is presumably the packaged version of cv_folds
# defined above -- confirm in modelhelper.
from modelhelper import cvfolds
cv = cvfolds(df, 5)

In [53]:
print_top(cv, 5)

*** 0
374460 -- [0, 1, 2, 3, 4]
101973 -- [374460, 374461, 374462, 374463, 374464]
*** 1
384177 -- [92256, 92257, 92258, 92259, 92260]
97619 -- [476433, 476434, 476435, 476436, 476437]
*** 2
405640 -- [168412, 168413, 168414, 168415, 168416]
100605 -- [574052, 574053, 574054, 574055, 574056]
*** 3
401688 -- [272969, 272970, 272971, 272972, 272973]
103579 -- [674657, 674658, 674659, 674660, 674661]
*** 4
403776 -- [374460, 374461, 374462, 374463, 374464]
94340 -- [778236, 778237, 778238, 778239, 778240]
