In [None]:
import pandas as pd
import datetime
import math

In [None]:
# Time-of-day windows as (begin, end) 'HH:MM:SS' string pairs.
# Index 0 is the morning window and index 1 the afternoon window; the
# Features getters pick one by their `ampm` argument and hand it to cut_time.
intervals_train = [('06:00:00', '08:00:00'), ('15:00:00', '17:00:00')]

In [None]:
def date_parser_wea(strs):
    """Parse '%Y-%m-%d' strings into datetimes floored to a 20-minute mark.

    Serves as the ``date_parser`` callback when the weather table is read.
    The format carries no time-of-day, so every parsed value sits at
    midnight and the flooring step leaves it there.

    Args:
        strs: iterable of date strings formatted as 'YYYY-MM-DD'.

    Returns:
        A list of ``datetime.datetime`` objects, one per input, in order.

    Raises:
        ValueError: if a string does not match the expected format.
    """
    parsed = (datetime.datetime.strptime(text, '%Y-%m-%d') for text in strs)
    return [
        # minute - minute % 20 snaps to 0, 20 or 40.
        stamp.replace(minute=stamp.minute - stamp.minute % 20, second=0, microsecond=0)
        for stamp in parsed
    ]

def date_parser_vol(strs):
    """Parse '%Y-%m-%d %H:%M:%S' strings and floor each to a 20-minute mark.

    Serves as the ``date_parser`` callback for the volume and trajectory
    tables: the minute is rounded down to 0, 20 or 40 and the seconds are
    dropped.

    Args:
        strs: iterable of timestamp strings formatted as
            'YYYY-MM-DD HH:MM:SS'.

    Returns:
        A list of ``datetime.datetime`` objects, one per input, in order.

    Raises:
        ValueError: if a string does not match the expected format.
    """
    parsed = (
        datetime.datetime.strptime(text, '%Y-%m-%d %H:%M:%S') for text in strs
    )
    return [
        # minute - minute % 20 snaps to the start of the 20-minute window.
        stamp.replace(minute=stamp.minute - stamp.minute % 20, second=0, microsecond=0)
        for stamp in parsed
    ]

def split_datetime(df):
    """Replace the 'datetime' column of ``df`` with 'date' and 'time' columns.

    The new columns come first ('date', then 'time'), followed by the
    remaining columns of ``df`` in their original order. The input frame
    is not modified.

    The derived columns keep the index of ``df`` as-is. Converting only
    their index to strings would make ``pd.concat`` misalign them against
    any frame whose index is not already made of strings, yielding extra
    rows filled with NaN.

    Args:
        df: DataFrame with a 'datetime' column whose values expose
            ``.date()`` and ``.time()``.

    Returns:
        A new DataFrame with the same rows and index as ``df``.

    Raises:
        KeyError: if ``df`` has no 'datetime' column.
    """
    stamps = df['datetime']
    dates = stamps.apply(lambda value: value.date()).rename('date')
    times = stamps.apply(lambda value: value.time()).rename('time')
    return pd.concat([dates, times, df.drop('datetime', axis=1)], axis=1)

def cut_date(df, begin, end):
    """Return the rows of ``df`` whose 'date' lies in the closed range [begin, end].

    Args:
        df: DataFrame with a 'date' column.
        begin: first day to keep, as a 'YYYY-MM-DD' string (inclusive).
        end: last day to keep, as a 'YYYY-MM-DD' string (inclusive).

    Returns:
        The matching rows of ``df``.
    """
    first_day, last_day = (
        datetime.datetime.strptime(text, '%Y-%m-%d').date()
        for text in (begin, end)
    )
    days = df['date']
    keep = (days >= first_day) & (days <= last_day)
    return df[keep]

def cut_time(df, interval):
    """Return the rows of ``df`` whose 'time' lies in the half-open range [begin, end).

    Args:
        df: DataFrame with a 'time' column.
        interval: pair of 'HH:MM:SS' strings; element 0 is the inclusive
            start and element 1 the exclusive end.

    Returns:
        The matching rows of ``df``.
    """
    start, stop = (
        datetime.datetime.strptime(interval[pos], '%H:%M:%S').time()
        for pos in (0, 1)
    )
    clock = df['time']
    keep = (clock >= start) & (clock < stop)
    return df[keep]

In [None]:
class Features:
    """Load the weather, volume and trajectory tables and slice them.

    The ``read_*`` methods populate ``df_wea``, ``df_vol`` and ``df_tra``;
    the ``get_*`` methods return the part of a loaded table that falls in
    a date range and in the morning ('am') or afternoon (any other value)
    window.

    Args:
        pathname: directory prefix, concatenated as-is with the file
            names (so it should end with a path separator).
        filename_wea: weather CSV file name, relative to ``pathname``.
        filename_vol: volume CSV file name, relative to ``pathname``.
        filename_tra: trajectory CSV. A value containing a directory
            part is used as given; a bare file name is taken relative to
            ``pathname``.
    """

    def __init__(self, pathname, filename_wea, filename_vol, filename_tra):
        self.df_wea = None
        self.df_vol = None
        self.df_tra = None
        self.pathname = pathname
        self.filename_wea = filename_wea
        self.filename_vol = filename_vol
        self.filename_tra = filename_tra

    @staticmethod
    def _as_date_range(dates):
        """Return ``dates`` as a (begin, end) pair; a single date covers one day."""
        if isinstance(dates, (list, tuple)):
            return dates
        return (dates, dates)

    def _tra_path(self):
        """Return the path of the trajectory CSV configured on this instance."""
        import os

        # A value that already carries a directory is a full path; only a
        # bare file name is prefixed, mirroring the other two tables.
        if os.path.dirname(self.filename_tra):
            return self.filename_tra
        return self.pathname + self.filename_tra

    def read_wea(self):
        """Load the weather table into ``df_wea``, parsing column 0 as dates."""
        self.df_wea = pd.read_csv(self.pathname + self.filename_wea, parse_dates=[0], date_parser=date_parser_wea)

    def read_vol(self):
        """Load the volume table into ``df_vol``.

        Missing 'vehicle_type' values are replaced by the column mean,
        'time' is renamed to 'datetime', rows are sorted by it, and the
        column is then split into 'date' and 'time'.
        """
        self.df_vol = pd.read_csv(self.pathname + self.filename_vol, parse_dates=[0], date_parser=date_parser_vol)

        mean = self.df_vol['vehicle_type'].mean()
        # Assign the result back: in-place fillna on a column selection is
        # chained assignment and does not reliably update the frame.
        self.df_vol['vehicle_type'] = self.df_vol['vehicle_type'].fillna(mean)

        self.df_vol.rename(index=str, columns={'time': 'datetime'}, inplace=True)
        self.df_vol.sort_values(by=['datetime'], inplace=True)

        self.df_vol = split_datetime(self.df_vol)

    def read_tra(self):
        """Load the trajectory table into ``df_tra``.

        'starting_time' is renamed to 'datetime' and split into 'date' and
        'time'; 'vehicle_id' and 'travel_seq' are dropped; rows are sorted
        by date, time, intersection and tollgate.
        """
        self.df_tra = pd.read_csv(self._tra_path(), parse_dates=[3], date_parser=date_parser_vol)

        self.df_tra.rename(index=str, columns={'starting_time': 'datetime'}, inplace=True)
        self.df_tra.drop(['vehicle_id', 'travel_seq'], axis=1, inplace=True)
        self.df_tra = split_datetime(self.df_tra)
        self.df_tra.sort_values(by=['date', 'time', 'intersection_id', 'tollgate_id'], inplace=True)

    def read_all(self):
        """Load the weather, volume and trajectory tables, in that order."""
        self.read_wea()
        self.read_vol()
        self.read_tra()

    def get_wea(self, dates, ampm):
        """Return the first weather record for the given dates and half-day.

        Example: ``feat.get_wea(dates=['2016-09-20', '2016-09-26'], ampm='am')``

        Args:
            dates: one 'YYYY-MM-DD' string, or a (begin, end) list/tuple
                of such strings (both ends inclusive).
            ampm: 'am' selects hour 6; any other value selects hour 15.

        Returns:
            The values of the first matching row, without the 'date' and
            'hour' columns. Only the first row is returned even when the
            range spans several days.

        Raises:
            IndexError: if no row matches.
        """
        # NOTE(review): read_wea leaves 'date' as parsed timestamps while
        # cut_date compares against datetime.date objects; recent pandas
        # versions may reject that comparison -- confirm.
        dates = self._as_date_range(dates)

        df = cut_date(self.df_wea, dates[0], dates[1])
        hour = 6 if ampm == 'am' else 15
        # Non-in-place drop: the filtered frame is a slice of df_wea.
        df = df[df['hour'] == hour].drop(['date', 'hour'], axis=1)
        if len(df) == 0:
            raise IndexError(
                'no weather record for dates {!r} at hour {}'.format(dates, hour)
            )
        return df.values[0]

    def get_vol(self, dates, ampm, toll, dire):
        """Return the volume rows for one tollgate and direction.

        Example: ``feat.get_vol(dates=['2016-09-20', '2016-09-26'], ampm='am', toll=1, dire=0)``

        Args:
            dates: one 'YYYY-MM-DD' string, or a (begin, end) list/tuple
                of such strings (both ends inclusive).
            ampm: 'am' selects ``intervals_train[0]``; any other value
                selects ``intervals_train[1]``.
            toll: value matched against 'tollgate_id'.
            dire: value matched against 'direction'.

        Returns:
            The matching rows of ``df_vol``.
        """
        dates = self._as_date_range(dates)

        df = cut_date(self.df_vol, dates[0], dates[1])
        idx = 0 if ampm == 'am' else 1
        df = cut_time(df, intervals_train[idx])
        mask = (df['tollgate_id'] == toll) & (df['direction'] == dire)
        return df[mask]

    def get_tra(self, dates, ampm, inte, toll):
        """Return the trajectory rows for one intersection/tollgate pair.

        Example: ``feat.get_tra(dates=['2016-09-20', '2016-09-26'], ampm='am', inte='A', toll=2)``

        Args:
            dates: one 'YYYY-MM-DD' string, or a (begin, end) list/tuple
                of such strings (both ends inclusive).
            ampm: 'am' selects ``intervals_train[0]``; any other value
                selects ``intervals_train[1]``.
            inte: value matched against 'intersection_id'.
            toll: value matched against 'tollgate_id'.

        Returns:
            The matching rows of ``df_tra``.
        """
        dates = self._as_date_range(dates)

        df = cut_date(self.df_tra, dates[0], dates[1])
        idx = 0 if ampm == 'am' else 1
        df = cut_time(df, intervals_train[idx])
        mask = (df['intersection_id'] == inte) & (df['tollgate_id'] == toll)
        return df[mask]


In [None]:
# Build the loader for the training tables. The weather and volume names
# are relative to `pathname`; the trajectory file is passed as a full path.
feat = Features(
    pathname='../dataSets/training/',
    filename_wea='weather (table 7)_training.csv',
    filename_vol='volume(table 6)_training.csv',
    filename_tra='../dataSets/training/trajectories(table 5)_training.csv',
)

# Read all three tables up front.
feat.read_all()