# Split script

## Get df

In [1]:
import pandas as pd

# read file temp (from data script)
def read_local(path_filename):
    """
    read_local(path_filename)
    Load a csv file into a DataFrame indexed by its 'Date' column.

    path_filename: path (or anything else pd.read_csv accepts) of a csv file
                   that contains a 'Date' column
    returns: the DataFrame with 'Date' converted by pd.to_datetime and moved
             from the columns into the index
    """
    raw = pd.read_csv(path_filename)
    # assign() overwrites the existing 'Date' column in place, keeping column order
    parsed = raw.assign(Date=pd.to_datetime(raw['Date']))
    return parsed.set_index('Date')
    
# Load play.csv (path is relative to the notebook's working directory) into a
# Date-indexed frame; later cells read this notebook-level `df`.
df = read_local("play.csv")
# Bare last expression: renders the frame as the cell output.
df


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2017-01-03,28.950001,29.082500,28.690001,29.037500,27.332472,115127600
2017-01-04,28.962500,29.127501,28.937500,29.004999,27.301876,84472400
2017-01-05,28.980000,29.215000,28.952499,29.152500,27.440716,88774400
2017-01-06,29.195000,29.540001,29.117500,29.477501,27.746637,127007600
2017-01-09,29.487499,29.857500,29.485001,29.747499,28.000778,134247600
...,...,...,...,...,...,...
2020-12-24,131.320007,133.460007,131.100006,131.970001,131.161407,54930100
2020-12-28,133.990005,137.339996,133.509995,136.690002,135.852509,124486200
2020-12-29,138.050003,138.789993,134.339996,134.869995,134.043640,121047300
2020-12-30,135.580002,135.990005,133.399994,133.720001,132.900696,96452100


##  OOP

In [3]:
# Keep only the rows whose index date is on or before 2019-12-31.
# NOTE(review): the table rendered under this cell runs through 2020-06-29 and
# an assignment produces no output, so that output looks stale - re-run to confirm.
play = df[df.index<='2019-12-31']



Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2017-01-03,28.950001,29.082500,28.690001,29.037500,27.332472,115127600
2017-01-04,28.962500,29.127501,28.937500,29.004999,27.301876,84472400
2017-01-05,28.980000,29.215000,28.952499,29.152500,27.440716,88774400
2017-01-06,29.195000,29.540001,29.117500,29.477501,27.746637,127007600
2017-01-09,29.487499,29.857500,29.485001,29.747499,28.000778,134247600
...,...,...,...,...,...,...
2020-06-24,91.250000,92.197502,89.629997,90.014999,89.148659,192623200
2020-06-25,90.175003,91.250000,89.392502,91.209999,90.332153,137522400
2020-06-26,91.102501,91.330002,88.254997,88.407501,87.556633,205256800
2020-06-29,88.312500,90.542503,87.820000,90.445000,89.574516,130646000


In [3]:
#OOP
from datetime import datetime as dt

class train_val_split:
    """
    Builds train/validation splits over a date-indexed price DataFrame.

    The constructor keeps only the rows whose index lies in [start, end] and
    renumbers them 0..n-1 (reset_index), so every integer index returned by
    this class is a row position in self.df, not in the frame passed in.
    The frame's index is expected to be a datetime index named 'Date',
    because the code reads self.df.Date after reset_index().
    """

    def __init__(self, df, duration=30, window=1, prediction_period=5, start = '2018-01-01', end = '2020-03-31'):
        """
        df: DataFrame whose index is named 'Date' and is comparable with date strings
        duration: length of one training window, counted in rows of self.df
        window: number of rows the rolling window advances per step
        prediction_period: prediction horizon, counted in rows of self.df
        start, end: inclusive date bounds used to filter df
        """
        self.duration = duration
        self.window = window
        self.prediction_period = prediction_period
        self.start = start
        self.end = end
        self.df = df.loc[(df.index >= start) & (df.index <= end)].reset_index()
        # First row on/after start; 0 for a non-empty frame, nan when no row survives the filter.
        self.start_ind = self.df[self.df.Date>=self.start].index.min()

    @staticmethod
    def _strfdate(date):
        """Format a datetime-like value as 'YYYY-MM-DD'."""
        return dt.strftime(date, '%Y-%m-%d')

    def _last_ind_on_or_before(self, date):
        """Return the largest row index whose Date is <= date, or -1 when there is none."""
        matches = self.df.index[(self.df.Date <= date).to_numpy()]
        return matches.max() if len(matches) else -1

    def _date_at(self, ind):
        """Return the Date stored at row ind, or None when ind is outside self.df."""
        if 0 <= ind < len(self.df):
            return self.df.Date.loc[ind]
        return None

    def split_by_date(self):
        """
        Roll a window of `duration` rows over self.df, advancing `window` rows per step.

        returns: list of [start_date, end_date] string pairs ('YYYY-MM-DD'); a window
                 is emitted only while at least `prediction_period` rows remain after
                 its end row position is counted from the end of the frame.
        raises: ValueError when window < 1 (the loop could never terminate).

        The instance's start_ind is left untouched, so the method can be called
        repeatedly and split_by_index() still starts from the first row.
        """
        if self.window < 1:
            raise ValueError("window must be at least 1")

        dates = self.df.Date
        # Never index past the last row, even when prediction_period is 0.
        last_end = min(len(self.df) - self.prediction_period, len(self.df) - 1)

        start_ind = self.start_ind
        end_ind = start_ind + self.duration

        windows = []
        while end_ind <= last_end:
            windows.append([self._strfdate(dates.loc[start_ind]), self._strfdate(dates.loc[end_ind])])
            start_ind += self.window
            end_ind = start_ind + self.duration

        # Kept for backward compatibility: first window end that no longer fits.
        self.end_ind = end_ind
        return windows

    def _chk_split(self, date_split):
        """
        Validate split dates: non-empty, strictly increasing, and inside [start, end].

        Dates are compared as timestamps rather than raw strings; values that
        cannot be parsed make the check fail instead of raising.
        """
        if len(date_split) == 0:
            return False
        try:
            stamps = [pd.Timestamp(d) for d in date_split]
            lower = pd.Timestamp(self.start)
            upper = pd.Timestamp(self.end)
        except (ValueError, TypeError):
            return False

        if any(earlier >= later for earlier, later in zip(stamps, stamps[1:])):
            return False
        return bool(lower <= stamps[0] and stamps[-1] <= upper)

    def split_by_index(self, date_split = ['2018-09-30','2019-06-30','2020-03-31']):
        """
        Build expanding train folds with the validation block that follows each one.

        For every split date the train fold is rows start_ind..(last row on or
        before that date); the validation fold runs from the next row up to the
        last row on or before the following split date (or the last row of
        self.df for the final split date).

        returns: list of (train_indices, validation_indices) tuples of row
                 positions, or the string "Check date split again" when
                 _chk_split rejects date_split. A fold is an empty list when no
                 rows fall in it (e.g. the final split date equals `end`).
        """
        if not self._chk_split(date_split):
            return "Check date split again"

        last_row = len(self.df) - 1
        ind_out = []
        for n, date in enumerate(date_split):
            train_end = self._last_ind_on_or_before(date)
            self.train_end_ind = train_end  # attribute kept for backward compatibility

            if n < len(date_split) - 1:
                val_end = self._last_ind_on_or_before(date_split[n+1])
            else:
                val_end = last_row

            # check boundary dates: train end, validation start, validation end
            # (None is printed for a boundary that has no row, instead of raising)
            print(self._date_at(train_end), self._date_at(train_end + 1), self._date_at(val_end))

            train_inds = list(range(self.start_ind, train_end + 1))
            val_inds = list(range(train_end + 1, val_end + 1))
            ind_out.append((train_inds, val_inds))

        return ind_out

    def get_val_map(self, start='2020-06-01', end='2020-12-31'):
        """
        Map each date in [start, end] to the date `prediction_period` rows before it.

        returns: dict {earlier_date: date} of 'YYYY-MM-DD' strings, one entry per
                 row of self.df dated within [start, end]. Rows with fewer than
                 `prediction_period` rows before them are skipped, and an empty
                 dict is returned when self.df has no row in the range (for
                 example when the instance's `end` precedes `start` here).
        Assumes prediction_period is non-negative.
        """
        on_or_after = self.df.index[(self.df.Date >= start).to_numpy()]
        on_or_before = self.df.index[(self.df.Date <= end).to_numpy()]
        if len(on_or_after) == 0 or len(on_or_before) == 0:
            return {}

        lag = self.prediction_period
        # Start no earlier than `lag` so that n - lag is always a valid row.
        first = max(on_or_after.min(), lag)
        last = on_or_before.max()

        dates = self.df.Date
        return {self._strfdate(dates.loc[n - lag]): self._strfdate(dates.loc[n]) for n in range(first, last + 1)}
            

## Test

In [34]:
# Ivan and Derrick

# Rolling date windows with the class defaults
# (duration=30, window=1, prediction_period=5, 2018-01-01 to 2020-03-31).
split = train_val_split(df)
# Each entry of the result is a [window start date, window end date] pair.
split.split_by_date()

[['2018-01-02', '2018-02-14'],
 ['2018-01-03', '2018-02-15'],
 ['2018-01-04', '2018-02-16'],
 ['2018-01-05', '2018-02-20'],
 ['2018-01-08', '2018-02-21'],
 ['2018-01-09', '2018-02-22'],
 ['2018-01-10', '2018-02-23'],
 ['2018-01-11', '2018-02-26'],
 ['2018-01-12', '2018-02-27'],
 ['2018-01-16', '2018-02-28'],
 ['2018-01-17', '2018-03-01'],
 ['2018-01-18', '2018-03-02'],
 ['2018-01-19', '2018-03-05'],
 ['2018-01-22', '2018-03-06'],
 ['2018-01-23', '2018-03-07'],
 ['2018-01-24', '2018-03-08'],
 ['2018-01-25', '2018-03-09'],
 ['2018-01-26', '2018-03-12'],
 ['2018-01-29', '2018-03-13'],
 ['2018-01-30', '2018-03-14'],
 ['2018-01-31', '2018-03-15'],
 ['2018-02-01', '2018-03-16'],
 ['2018-02-02', '2018-03-19'],
 ['2018-02-05', '2018-03-20'],
 ['2018-02-06', '2018-03-21'],
 ['2018-02-07', '2018-03-22'],
 ['2018-02-08', '2018-03-23'],
 ['2018-02-09', '2018-03-26'],
 ['2018-02-12', '2018-03-27'],
 ['2018-02-13', '2018-03-28'],
 ['2018-02-14', '2018-03-29'],
 ['2018-02-15', '2018-04-02'],
 ['2018-

In [4]:
# Brendan

# Index-based train/validation folds over the range extended to 2020-12-31.
split = train_val_split(df, end = '2020-12-31')
splits = split.split_by_index()

print()

# split_by_index returns a message string when the split dates are rejected;
# looping over that string would print it one character per line.
if isinstance(splits, str):
    print(splits)
else:
    for x in splits:
        print(x)

2018-09-28 00:00:00 2018-10-01 00:00:00 2019-06-28 00:00:00
2019-06-28 00:00:00 2019-07-01 00:00:00 2020-03-31 00:00:00
2020-03-31 00:00:00 2020-04-01 00:00:00 2020-12-31 00:00:00

([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 18

In [4]:
# Extend the range to 2020-12-31 so it covers get_val_map's default window
# (2020-06-01 to 2020-12-31).
test = train_val_split(df, end='2020-12-31')

# Dict keyed by the date five rows earlier, valued by each date in the window.
test.get_val_map()

{'2020-05-22': '2020-06-01',
 '2020-05-26': '2020-06-02',
 '2020-05-27': '2020-06-03',
 '2020-05-28': '2020-06-04',
 '2020-05-29': '2020-06-05',
 '2020-06-01': '2020-06-08',
 '2020-06-02': '2020-06-09',
 '2020-06-03': '2020-06-10',
 '2020-06-04': '2020-06-11',
 '2020-06-05': '2020-06-12',
 '2020-06-08': '2020-06-15',
 '2020-06-09': '2020-06-16',
 '2020-06-10': '2020-06-17',
 '2020-06-11': '2020-06-18',
 '2020-06-12': '2020-06-19',
 '2020-06-15': '2020-06-22',
 '2020-06-16': '2020-06-23',
 '2020-06-17': '2020-06-24',
 '2020-06-18': '2020-06-25',
 '2020-06-19': '2020-06-26',
 '2020-06-22': '2020-06-29',
 '2020-06-23': '2020-06-30',
 '2020-06-24': '2020-07-01',
 '2020-06-25': '2020-07-02',
 '2020-06-26': '2020-07-06',
 '2020-06-29': '2020-07-07',
 '2020-06-30': '2020-07-08',
 '2020-07-01': '2020-07-09',
 '2020-07-02': '2020-07-10',
 '2020-07-06': '2020-07-13',
 '2020-07-07': '2020-07-14',
 '2020-07-08': '2020-07-15',
 '2020-07-09': '2020-07-16',
 '2020-07-10': '2020-07-17',
 '2020-07-13':