In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
import datetime as dt

In [3]:
base_path = 'data/I88N-processed/'

# Initialization

## Sample dates, split train/test dataset

In [4]:
dates = pd.read_csv(base_path + 'available_dates.csv')
dates = np.array(dates['0'].values.tolist())
dates = np.array(list(map(lambda x: x.split('-')[1] + '/' + x.split('-')[2] + '/' + x.split('-')[0], dates)))

We want to sample dates in June.

In [5]:
dates = dates[(dates > '05/31/2017') & (dates < '07/01/2017')]

In [6]:
dates_train = np.random.choice(dates, 45, replace=True)
dates_train.sort()
dates_test = np.array(sorted(set(dates).difference(set(dates_train.tolist()))))

In [7]:
len(dates), len(dates_train), len(dates_test), dates_train[0:3], dates_test[0:3]

(29,
 45,
 7,
 array(['06/01/2017', '06/01/2017', '06/01/2017'], dtype='<U10'),
 array(['06/02/2017', '06/04/2017', '06/07/2017'], dtype='<U10'))

## Loading severity data

In [8]:
severity_data = pd.read_csv(base_path + 'severity_params.csv')
severity_data.rename(columns={'Unnamed: 0':'datetime'}, inplace=True)

In [9]:
sev_datetimes = severity_data['datetime'].values

In [10]:
sev_dates = []
sev_times = []
for x in sev_datetimes:
    d, t = x.split(' ')
    sev_dates.append(d)
    sev_times.append(t)

In [11]:
severity_data['Date'] = sev_dates
severity_data['Time'] = sev_times

In [12]:
severity_data = severity_data.loc[~severity_data['Date'].isin(['2017-06-15'])]

In [13]:
lambda_max = severity_data['LambdaMax'].values
sigma = severity_data['Sigma'].values
tau = severity_data['Tau'].values
impact = severity_data['Impact'].values

In [14]:
severity_data.head(3)

Unnamed: 0,datetime,ID,LambdaMax,Sigma,Tau,Impact,Incident,Date,Time
0,2017-06-01 00:00:00,408907,,,,0.021267,0.0,2017-06-01,00:00:00
1,2017-06-01 00:05:00,408907,,,,0.017058,0.0,2017-06-01,00:05:00
2,2017-06-01 00:10:00,408907,,,0.0,0.015338,0.0,2017-06-01,00:10:00


In [15]:
lambda_max = [0 if np.isnan(x) else x for x in lambda_max]
sigma = [0 if np.isnan(x) else x for x in sigma]
tau = [0 if np.isnan(x) else x for x in tau]

In [16]:
severity_data['LambdaMax'] = lambda_max
severity_data['Sigma'] = sigma
severity_data['Tau'] = tau

## Loading speed, flow, occupancy, and stations

In [17]:
raw = pd.read_csv(base_path + 'concat_no_holes/concat.csv')

In [18]:
# select raw that is sampled
raw_all = raw.loc[raw['Date'].isin(dates)]

In [19]:
raw_all['LambdaMax'] = lambda_max
raw_all['Sigma'] = sigma
raw_all['Tau'] = tau
raw_all['Impact'] = impact

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See

In [20]:
raw_test = raw_all.loc[raw['Date'].isin(dates_test)]
stations = np.array(raw_all['Station ID'].unique().tolist())

In [21]:
raw_all.head(3)

Unnamed: 0,Station ID,datetime,Occupancy,Flow,Speed,Date,Time,idx,LambdaMax,Sigma,Tau,Impact
8640,408907,2017-06-01 00:00:00,0.5,22.0,69.5,06/01/2017,00:00,42322,0.0,0.0,0.0,0.021267
8641,408907,2017-06-01 00:05:00,0.5,22.0,69.1,06/01/2017,00:05,42323,0.0,0.0,0.0,0.017058
8642,408907,2017-06-01 00:10:00,0.5,23.0,68.9,06/01/2017,00:10,42324,0.0,0.0,0.0,0.015338


In [22]:
# special construction of raw_train, because the dates are sampled with replacement
raw_train = pd.DataFrame()
duplicate_id = 0
for i in range(0, len(dates_train)):
    if i > 0:
        if dates_train[i] == dates_train[i-1]:
            duplicate_id += 1
        else:
            duplicate_id = 0
    df_date = raw_all.loc[raw_all['Date'] == dates_train[i]]
    df_date = df_date.assign(duplicateIdx=duplicate_id)
    raw_train = raw_train.append(df_date)
sorterIdx = dict( zip(stations, range(len(stations))) )
raw_train['stationSorterIdx'] = raw_train['Station ID'].map(sorterIdx)
raw_train = raw_train.sort_values(['stationSorterIdx', 'duplicateIdx', 'datetime'], ascending=[True, True, True])
raw_train.drop(['duplicateIdx', 'stationSorterIdx', 'idx'], axis=1, inplace=True)

In [23]:
len(raw_train.index)

1321920

In [24]:
var_names = ['Speed', 'Flow', 'Occupancy']

### Construct road segments

In [25]:
road_segments = list()
for i in range(len(stations) - 1):
    road_segments.append(tuple([stations[i], stations[i+1]]))

## Loading incidents

In [26]:
raw_incidents = pd.read_csv(base_path + 'valid_incidents.csv')

In [27]:
raw_incidents_all = raw_incidents.loc[raw_incidents['Date'].isin(dates)]
raw_incidents_train = raw_incidents_all.loc[raw_incidents_all['Date'].isin(dates_train)]
raw_incidents_test = raw_incidents_all.loc[raw_incidents_all['Date'].isin(dates_test)]

In [28]:
svm_pos_timestamps = pd.read_csv(base_path + 'svm_pos_instances.csv')

In [29]:
svm_pos_timestamps_train = svm_pos_timestamps.loc[svm_pos_timestamps['Date'].isin(dates_train)]
svm_pos_timestamps_test = svm_pos_timestamps.loc[svm_pos_timestamps['Date'].isin(dates_test)]

In [30]:
svm_pos_timestamps.head(3)

Unnamed: 0,Upstream,Downstream,Date,Time
0,408907,400951,01/22/2017,20:30
1,408907,400951,01/22/2017,20:35
2,408907,400951,01/22/2017,20:40


In [31]:
svm_incident_dates_train = svm_pos_timestamps_train['Date'].unique().tolist()
svm_normal_dates_train = list(set(dates_train).difference(svm_incident_dates_train))

In [32]:
len(svm_incident_dates_train), len(svm_normal_dates_train)

(22, 0)

## Progress message formatting

In [33]:
def fraction_msg(present, total):
    return '[{}/{}]'.format(present, total)

# Train: TSA-DES forecasting

In [34]:
def DES_rmse(alpha, var_series):
    len_series = len(var_series)
    
    beta = round(1. - alpha, 3)

    sse = 0.
    s1 = np.mean(var_series[:10])
    s2 = s1
    
    for i in range(11, len_series - 1):
        s1 = alpha * var_series[i] + beta * s1
        s2 = alpha * s1 + beta * s2
        y_next = 2 * s1 - s2 + alpha / beta * (s1 - s2)
        sse += (var_series[i+1] - y_next) ** 2
    
    return np.sqrt( sse / (len_series - 12) )

## Tune best alphas for each station

In [35]:
import multiprocessing as mp

In [36]:
# input: stations, raw training data (includ. incidents), rule to update alphas
# output: a dictionary containing stations, and stations' best alphas
def compute_best_alphas(stations, raw_train, raw_incidents_train, dates_train, num_grids, DES_rmse, fraction_msg):
    best_alphas = {
    'Station ID': [],
    'Speed': [],
    'Flow': [],
    'Occupancy': []
    }
    pid = mp.current_process().pid
    for i, station in enumerate(stations):
        best_alphas['Station ID'].append(station)

        # update current training station dataframe, the training data is normal day's data
        abnormal_dates_station = raw_incidents_train.loc[(raw_incidents_train['Upstream'] == station) | (raw_incidents_train['Downstream'] == str(station))]['Date'].unique()
        normal_dates_train = np.array(list(set(dates_train).difference(set(abnormal_dates_station))))
        df_train_station = raw_train.loc[(raw_train['Station ID'] == station) & (raw_train['Date'].isin(normal_dates_train))]

        print("{} {} Tuning alphas for station {}...".format(pid, fraction_msg(i+1, len(stations)), station))
        for var_name in var_names:
            # print("    " + var_name + "...")
            var_series = df_train_station[var_name].values
            len_series = len(var_series)

            # setting up alphas
            alphas = np.arange(num_grids) * 1. / num_grids

            # save the historical best alpha by rmse
            best_rmse = float("inf")
            best_alpha = 0.

            # for each alpha, perform exponential smoothing, and compute RMSE
            for alpha in alphas:
                rmse = DES_rmse(alpha, var_series)

                # compare, and decide whether to update best alpha
                if rmse < best_rmse:
                    best_rmse = rmse
                    best_alpha = alpha

            # finally, save the best alpha for the variable at this station
            best_alphas[var_name].append(best_alpha)

        # print trained alphas for each station
        # print(best_alphas['Station ID'][i], best_alphas['Speed'][i], best_alphas['Flow'][i], best_alphas['Occupancy'][i])
    print("Process {} has finished alpha tuning.".format(pid))
    return best_alphas

In [37]:
pool = mp.Pool(processes=8)
num_grids = 100
results = [pool.apply_async(compute_best_alphas, args=(stations[13 * i: 13 * i + 13], raw_train, raw_incidents_train, dates_train, num_grids, DES_rmse, fraction_msg)) for i in range(0, 8)]
pool.close()
pool.join()

19242 [1/13] Tuning alphas for station 408907...
19243 [1/13] Tuning alphas for station 400088...
19244 [1/13] Tuning alphas for station 408755...
19245 [1/13] Tuning alphas for station 400137...
19246 [1/13] Tuning alphas for station 400611...
19247 [1/13] Tuning alphas for station 400275...
19248 [1/13] Tuning alphas for station 400333...
19249 [1/11] Tuning alphas for station 400980...
19243 [2/13] Tuning alphas for station 402288...
19242 [2/13] Tuning alphas for station 400951...
19245 [2/13] Tuning alphas for station 400716...
19244 [2/13] Tuning alphas for station 402802...
19247 [2/13] Tuning alphas for station 400939...
19246 [2/13] Tuning alphas for station 400928...
19248 [2/13] Tuning alphas for station 410363...
19249 [2/11] Tuning alphas for station 401333...
19243 [3/13] Tuning alphas for station 413026...
19242 [3/13] Tuning alphas for station 400057...
19245 [3/13] Tuning alphas for station 401545...
19249 [3/11] Tuning alphas for station 404746...
19244 [3/13] Tuning 

In [38]:
exec_results = []
for proc in results:
    exec_results.append(proc.get())

In [39]:
best_alphas = {
    'Station ID': [],
    'Speed': [],
    'Flow': [],
    'Occupancy': []
}
for dict_best_alphas in exec_results:
    for key in best_alphas.keys():
        best_alphas[key].extend(dict_best_alphas[key])

In [40]:
best_alphas

{'Station ID': [408907,
  400951,
  400057,
  400147,
  400343,
  401560,
  400045,
  400122,
  401541,
  402281,
  402283,
  402285,
  402286,
  400088,
  402288,
  413026,
  401464,
  401489,
  401538,
  402290,
  402292,
  401643,
  402800,
  402828,
  407219,
  402789,
  408755,
  402802,
  408756,
  400189,
  400309,
  400417,
  400249,
  401639,
  400662,
  400141,
  400761,
  400490,
  401888,
  400137,
  400716,
  401545,
  401011,
  400674,
  400539,
  400534,
  401062,
  401529,
  401613,
  400536,
  400488,
  401561,
  400611,
  400928,
  400284,
  400041,
  408133,
  408135,
  417665,
  412637,
  417666,
  408134,
  400685,
  401003,
  400898,
  400275,
  400939,
  400180,
  400529,
  400990,
  400515,
  400252,
  400788,
  401517,
  401871,
  400574,
  401629,
  400422,
  400333,
  410363,
  400360,
  400955,
  400495,
  400608,
  400949,
  400678,
  400341,
  400607,
  400094,
  400682,
  408138,
  400980,
  401333,
  404746,
  401142,
  400218,
  400983,
  400765,
  4008

In [41]:
best_alphas_df = pd.DataFrame(best_alphas)
best_alphas_df.to_csv(base_path + 'smaller_sample/best_alphas.csv', index=False)

## Using the tuned alphas to predict training traffic variables

In [42]:
# initialization
# initialize prediction dictionary
pred_dict_train = dict()
for var_name in var_names:
    pred_dict_train[var_name] = []

for i, station in enumerate(stations):
    print("{} Start time series prediction (DES) at station {}...".format(fraction_msg(i+1, len(stations)), station))
    df_train_station = raw_train.loc[raw_train["Station ID"] == station]
    
    # formulate predictions of speed, flow and occupancy for the station
    for var_name in var_names:
        print("    {}...".format(var_name))
        var_series = df_train_station[var_name].values
        len_series = len(var_series)
        # initialize s1, s2, and y
        s1 = np.mean(var_series[:10])
        s2 = s1
        y = [0.] * len_series
        # get the best alpha
        var_best_alpha = best_alphas_df.loc[best_alphas_df["Station ID"] == station][var_name].values[0]
        beta = 1. - var_best_alpha

        for t in range(11, len_series - 1):
            s1 = var_best_alpha * var_series[t] + beta * s1
            s2 = var_best_alpha * s1 + beta * s2
            y[t+1] = round(2 * s1 - s2 + var_best_alpha / beta * (s1 - s2), 2)

        # save the predictions to a dictionary
        pred_dict_train[var_name].extend(y)
    print("End prediction at station {}.".format(station))

[1/102] Start time series prediction (DES) at station 408907...
    Speed...
    Flow...
    Occupancy...
End prediction at station 408907.
[2/102] Start time series prediction (DES) at station 400951...
    Speed...
    Flow...
    Occupancy...
End prediction at station 400951.
[3/102] Start time series prediction (DES) at station 400057...
    Speed...
    Flow...
    Occupancy...
End prediction at station 400057.
[4/102] Start time series prediction (DES) at station 400147...
    Speed...
    Flow...
    Occupancy...
End prediction at station 400147.
[5/102] Start time series prediction (DES) at station 400343...
    Speed...
    Flow...
    Occupancy...
End prediction at station 400343.
[6/102] Start time series prediction (DES) at station 401560...
    Speed...
    Flow...
    Occupancy...
End prediction at station 401560.
[7/102] Start time series prediction (DES) at station 400045...
    Speed...
    Flow...
    Occupancy...
End prediction at station 400045.
[8/102] Start time s

End prediction at station 417665.
[60/102] Start time series prediction (DES) at station 412637...
    Speed...
    Flow...
    Occupancy...
End prediction at station 412637.
[61/102] Start time series prediction (DES) at station 417666...
    Speed...
    Flow...
    Occupancy...
End prediction at station 417666.
[62/102] Start time series prediction (DES) at station 408134...
    Speed...
    Flow...
    Occupancy...
End prediction at station 408134.
[63/102] Start time series prediction (DES) at station 400685...
    Speed...
    Flow...
    Occupancy...
End prediction at station 400685.
[64/102] Start time series prediction (DES) at station 401003...
    Speed...
    Flow...
    Occupancy...
End prediction at station 401003.
[65/102] Start time series prediction (DES) at station 400898...
    Speed...
    Flow...
    Occupancy...
End prediction at station 400898.
[66/102] Start time series prediction (DES) at station 400275...
    Speed...
    Flow...
    Occupancy...
End predictio

In [43]:
raw_train = raw_train.assign(Pred_Speed=pred_dict_train['Speed'], Pred_Flow=pred_dict_train['Flow'], Pred_Occupancy=pred_dict_train['Occupancy'])

## Using the tuned alphas to predict testing traffic variables

In [44]:
# initialization
# initialize prediction dictionary
pred_dict_test = dict()
for var_name in var_names:
    pred_dict_test[var_name] = []

for i, station in enumerate(stations):
    print("{} Start time series prediction (DES) at station {}...".format(fraction_msg(i+1, len(stations)), station))
    df_test_station = raw_test.loc[raw_test["Station ID"] == station]
    
    # formulate predictions of speed, flow and occupancy for the station
    for var_name in var_names:
        print("    {}...".format(var_name))
        var_series = df_test_station[var_name].values
        len_series = len(var_series)
        # initialize s1, s2, and y
        s1 = np.mean(var_series[:10])
        s2 = s1
        y = [0.] * len_series
        # get the best alpha
        var_best_alpha = best_alphas_df.loc[best_alphas_df["Station ID"] == station][var_name].values[0]
        beta = 1. - var_best_alpha

        num_batches = int(len_series / 288)
        for j in range(num_batches):
            base_idx = 288 * j
            for t in range(base_idx + 11, base_idx + 287):
                s1 = var_best_alpha * var_series[t] + beta * s1
                s2 = var_best_alpha * s1 + beta * s2
                y[t+1] = round(2 * s1 - s2 + var_best_alpha / beta * (s1 - s2), 2)

        # save the predictions to a dictionary
        pred_dict_test[var_name].extend(y)
    print("Finished forecasting at station {}.".format(station))
print("Finished forecasting for the test dataset.")

[1/102] Start time series prediction (DES) at station 408907...
    Speed...
    Flow...
    Occupancy...
Finished forecasting at station 408907.
[2/102] Start time series prediction (DES) at station 400951...
    Speed...
    Flow...
    Occupancy...
Finished forecasting at station 400951.
[3/102] Start time series prediction (DES) at station 400057...
    Speed...
    Flow...
    Occupancy...
Finished forecasting at station 400057.
[4/102] Start time series prediction (DES) at station 400147...
    Speed...
    Flow...
    Occupancy...
Finished forecasting at station 400147.
[5/102] Start time series prediction (DES) at station 400343...
    Speed...
    Flow...
    Occupancy...
Finished forecasting at station 400343.
[6/102] Start time series prediction (DES) at station 401560...
    Speed...
    Flow...
    Occupancy...
Finished forecasting at station 401560.
[7/102] Start time series prediction (DES) at station 400045...
    Speed...
    Flow...
    Occupancy...
Finished forecasti

Finished forecasting at station 412637.
[61/102] Start time series prediction (DES) at station 417666...
    Speed...
    Flow...
    Occupancy...
Finished forecasting at station 417666.
[62/102] Start time series prediction (DES) at station 408134...
    Speed...
    Flow...
    Occupancy...
Finished forecasting at station 408134.
[63/102] Start time series prediction (DES) at station 400685...
    Speed...
    Flow...
    Occupancy...
Finished forecasting at station 400685.
[64/102] Start time series prediction (DES) at station 401003...
    Speed...
    Flow...
    Occupancy...
Finished forecasting at station 401003.
[65/102] Start time series prediction (DES) at station 400898...
    Speed...
    Flow...
    Occupancy...
Finished forecasting at station 400898.
[66/102] Start time series prediction (DES) at station 400275...
    Speed...
    Flow...
    Occupancy...
Finished forecasting at station 400275.
[67/102] Start time series prediction (DES) at station 400939...
    Speed...


In [45]:
raw_test = raw_test.assign(Pred_Speed=pred_dict_test['Speed'], Pred_Flow=pred_dict_test['Flow'], Pred_Occupancy=pred_dict_test['Occupancy'])

In [46]:
raw_test.tail(3)

Unnamed: 0,Station ID,datetime,Occupancy,Flow,Speed,Date,Time,idx,LambdaMax,Sigma,Tau,Impact,Pred_Speed,Pred_Flow,Pred_Occupancy
6532125,401471,2017-06-28 23:45:00,0.7,23.0,67.3,06/28/2017,23:45,3201453,0.038882,0.0,0.0,0.06484,67.22,23.99,0.69
6532126,401471,2017-06-28 23:50:00,0.6,23.0,67.2,06/28/2017,23:50,3201454,0.038882,0.0,0.0,0.064408,67.29,21.89,0.67
6532127,401471,2017-06-28 23:55:00,0.6,19.0,67.1,06/28/2017,23:55,3201455,0.038882,0.0,0.0,0.063334,67.24,21.68,0.58


## Weekday/weekend Indicator

In [47]:
def date_to_day(date):
    # date: a date string in the format of "yyyy-mm-dd"
    # return: an int w/t Monday being 0 and Sunday being 6.
    if date.find('-') != -1:
        y, m, d = date.split('-')
    else:
        m, d, y = date.split('/')
    return dt.datetime(int(y), int(m), int(d)).weekday()

In [48]:
raw_train_dates = raw_train['Date'].values

In [49]:
def is_weekday(weekday):
    if weekday - 5 >= 0:
        return 0
    else:
        return 1

In [50]:
raw_train_weekday = [is_weekday(date_to_day(date)) for date in raw_train_dates]

In [51]:
raw_train['weekday'] = raw_train_weekday

In [52]:
raw_train.head(3)

Unnamed: 0,Station ID,datetime,Occupancy,Flow,Speed,Date,Time,LambdaMax,Sigma,Tau,Impact,Pred_Speed,Pred_Flow,Pred_Occupancy,weekday
8640,408907,2017-06-01 00:00:00,0.5,22.0,69.5,06/01/2017,00:00,0.0,0.0,0.0,0.021267,0.0,0.0,0.0,1
8641,408907,2017-06-01 00:05:00,0.5,22.0,69.1,06/01/2017,00:05,0.0,0.0,0.0,0.017058,0.0,0.0,0.0,1
8642,408907,2017-06-01 00:10:00,0.5,23.0,68.9,06/01/2017,00:10,0.0,0.0,0.0,0.015338,0.0,0.0,0.0,1


# Train: SVM

Note that we need to rescale train and test dataset with the same factors.

## Train: feature vectors

### Train: feature vectors - negative

In [53]:
neg_times = raw_train['Time'].unique().tolist()[14:]

In [54]:
neg_sample_dates = np.random.choice(dates_train, 14, replace=False)

In [55]:
svm_incidents_sample = svm_pos_timestamps_train.loc[svm_pos_timestamps_train['Date'].isin(neg_sample_dates)]

In [56]:
X_neg_train = []
num_segments = len(road_segments)
count_date = 0
for neg_sample_date in neg_sample_dates:
    count_date += 1
    print("{} Negative feature vectors at date {}:".format(fraction_msg(count_date, len(neg_sample_dates)), neg_sample_date))
    
    for i, seg in enumerate(road_segments):
        B, E = seg
        df_neg_train_BE = raw_train.loc[((raw_train["Station ID"] == B) | (raw_train["Station ID"] == E)) & (raw_train["Date"] == neg_sample_date)]
        svm_incidents_sample_BE = svm_incidents_sample.loc[svm_incidents_sample['Upstream'] == B]
        sample_neg_times = np.random.choice(neg_times, 24)
        
        if (i+1) % 20 == 0:
            print("    {} Start constructing feature vectors for road segment s_{},{}...".format(fraction_msg(i+1, num_segments), B, E))
            print("        Total number of vectors: {}".format(len(sample_neg_times)))
        
        for neg_t in sample_neg_times:
            # check if current time is incident time
            if len(svm_incidents_sample_BE.loc[svm_incidents_sample_BE['Time'] == neg_t].index) != 0:
                continue

            feature_t = []
            neg_dt_timestamp = pd.Timestamp(neg_sample_date + ' ' + neg_t + ':00')

            B_lags = []
            for j in range(5):
                B_lags.append(neg_dt_timestamp - dt.timedelta(minutes=j*5))
            B_lags = list(map(lambda x: x.strftime('%H:%M') , B_lags))
            E_lags = B_lags[0:3]

            # upstream features
            for t_lag in B_lags:
                df_dt_lag = df_neg_train_BE.loc[(df_neg_train_BE["Station ID"] == B) & (df_neg_train_BE["Time"] == t_lag)]

                speed_B_t = df_dt_lag["Speed"].values[0]
                flow_B_t = df_dt_lag["Flow"].values[0]
                occ_B_t = df_dt_lag["Occupancy"].values[0]

                speed_pred_B_t = df_dt_lag["Pred_Speed"].values[0]
                flow_pred_B_t = df_dt_lag["Pred_Flow"].values[0]
                occ_pred_B_t = df_dt_lag["Pred_Occupancy"].values[0]
                
                lambda_max_B_t = df_dt_lag["LambdaMax"].values[0]
                sigma_B_t = df_dt_lag["Sigma"].values[0]
                tau_B_t = df_dt_lag["Tau"].values[0]
                impact_B_t = df_dt_lag["Impact"].values[0]

                weekday_B_t = df_dt_lag["weekday"].values[0]
                
                feature_t.extend([speed_B_t, flow_B_t, occ_B_t, 
                                  speed_pred_B_t, flow_pred_B_t, occ_pred_B_t, 
                                  speed_B_t - speed_pred_B_t, flow_B_t - flow_pred_B_t, occ_B_t - occ_pred_B_t, 
                                  lambda_max_B_t, sigma_B_t, tau_B_t, impact_B_t,
                                  weekday_B_t])

            # downstream features
            for t_lag in E_lags:
                df_dt_lag = df_neg_train_BE.loc[(df_neg_train_BE["Station ID"] == E) & (df_neg_train_BE["Time"] == t_lag)]

                speed_E_t = df_dt_lag["Speed"].values[0]
                flow_E_t = df_dt_lag["Flow"].values[0]
                occ_E_t = df_dt_lag["Occupancy"].values[0]

                speed_pred_E_t = df_dt_lag["Pred_Speed"].values[0]
                flow_pred_E_t = df_dt_lag["Pred_Flow"].values[0]
                occ_pred_E_t = df_dt_lag["Pred_Occupancy"].values[0]

                lambda_max_E_t = df_dt_lag["LambdaMax"].values[0]
                sigma_E_t = df_dt_lag["Sigma"].values[0]
                tau_E_t = df_dt_lag["Tau"].values[0]
                impact_E_t = df_dt_lag["Impact"].values[0]
                
                weekday_E_t = df_dt_lag["weekday"].values[0]
                
                feature_t.extend([speed_E_t, flow_E_t, occ_E_t, 
                                  speed_pred_E_t, flow_pred_E_t, occ_pred_E_t, 
                                  speed_E_t - speed_pred_E_t, flow_E_t - flow_pred_E_t, occ_E_t - occ_pred_E_t, 
                                  lambda_max_E_t, sigma_E_t, tau_E_t, impact_E_t, 
                                  weekday_E_t])
            
            X_neg_train.append(feature_t)

        if (i+1) % 20 == 0:
            print("        Feature vector at date and time {} {} is done.".format(neg_sample_date, neg_t))
            print("    ...Completed construction of feature vectors for road segment s_{},{}.".format(B, E))

[1/14] Negative feature vectors at date 06/09/2017:
    [20/101] Start constructing feature vectors for road segment s_402290,402292...
        Total number of vectors: 24
        Feature vector at date and time 06/09/2017 02:30 is done.
    ...Completed construction of feature vectors for road segment s_402290,402292.
    [40/101] Start constructing feature vectors for road segment s_400137,400716...
        Total number of vectors: 24
        Feature vector at date and time 06/09/2017 08:50 is done.
    ...Completed construction of feature vectors for road segment s_400137,400716.
    [60/101] Start constructing feature vectors for road segment s_412637,417666...
        Total number of vectors: 24
        Feature vector at date and time 06/09/2017 15:00 is done.
    ...Completed construction of feature vectors for road segment s_412637,417666.
    [80/101] Start constructing feature vectors for road segment s_410363,400360...
        Total number of vectors: 24
        Feature vecto

        Feature vector at date and time 06/10/2017 02:55 is done.
    ...Completed construction of feature vectors for road segment s_400923,401143.
[7/14] Negative feature vectors at date 06/19/2017:
    [20/101] Start constructing feature vectors for road segment s_402290,402292...
        Total number of vectors: 24
        Feature vector at date and time 06/19/2017 05:15 is done.
    ...Completed construction of feature vectors for road segment s_402290,402292.
    [40/101] Start constructing feature vectors for road segment s_400137,400716...
        Total number of vectors: 24
        Feature vector at date and time 06/19/2017 22:35 is done.
    ...Completed construction of feature vectors for road segment s_400137,400716.
    [60/101] Start constructing feature vectors for road segment s_412637,417666...
        Total number of vectors: 24
        Feature vector at date and time 06/19/2017 17:20 is done.
    ...Completed construction of feature vectors for road segment s_412637,

    [100/101] Start constructing feature vectors for road segment s_400923,401143...
        Total number of vectors: 24
        Feature vector at date and time 06/06/2017 05:45 is done.
    ...Completed construction of feature vectors for road segment s_400923,401143.
[13/14] Negative feature vectors at date 06/30/2017:
    [20/101] Start constructing feature vectors for road segment s_402290,402292...
        Total number of vectors: 24
        Feature vector at date and time 06/30/2017 10:15 is done.
    ...Completed construction of feature vectors for road segment s_402290,402292.
    [40/101] Start constructing feature vectors for road segment s_400137,400716...
        Total number of vectors: 24
        Feature vector at date and time 06/30/2017 10:50 is done.
    ...Completed construction of feature vectors for road segment s_400137,400716.
    [60/101] Start constructing feature vectors for road segment s_412637,417666...
        Total number of vectors: 24
        Feature vec

In [57]:
len(X_neg_train)

32298

In [58]:
y_neg_train = [-1] * len(X_neg_train)

### Train: feature vectors - positive

In [59]:
working_time = raw_train['Time'].unique().tolist()[14:]

In [60]:
svm_pos_timestamps_train = svm_pos_timestamps_train.loc[svm_pos_timestamps_train['Time'].isin(working_time)]

In [61]:
X_pos_train = []
for i, seg in enumerate(road_segments):
    B, E = seg
    print("{} Start constructing positive feature vectors for road segment s_{},{}... ".format(fraction_msg(i+1, len(road_segments)), B, E))
    progress_count = 0
    
    # construct segment-specific pos_times
    pos_times = []
    df_seg_incidents = svm_pos_timestamps_train.loc[svm_pos_timestamps_train["Upstream"] == B]
    seg_dates = df_seg_incidents['Date'].values.tolist()
    seg_times = df_seg_incidents['Time'].values.tolist()
    num_seg_instances = len(seg_dates)
    for i in range(num_seg_instances):
        pos_times.append(tuple([seg_dates[i], seg_times[i]]))
    
    # select the relevant training data for segment B, E 
    df_train_BE = raw_train.loc[(raw_train["Station ID"] == B) | (raw_train["Station ID"] == E)]
    
    
    print("    Total number of vectors: {}".format(num_seg_instances))
    for pos_dt in pos_times:
        pos_d, pos_t = pos_dt
        feature_t = []
        pos_dt_timestamp = pd.Timestamp(pos_d + ' ' + pos_t + ':00')

        # upstream and downstream time lags
        B_lags = []
        for j in range(5):
            B_lags.append(pos_dt_timestamp - dt.timedelta(minutes=j*5))
        B_lags = list(map(lambda x: (x.strftime('%m/%d/%Y'), x.strftime('%H:%M')) , B_lags))
        E_lags = B_lags[0:3]

        # upstream features
        for dt_lag in B_lags:
            d_lag, t_lag = dt_lag
            df_dt_lag = df_train_BE.loc[(df_train_BE["Station ID"] == B) & (df_train_BE["Date"] == d_lag) & (df_train_BE["Time"] == t_lag)]
            if df_dt_lag.empty:
                print(d_lag, t_lag)
            
            speed_B_t = df_dt_lag["Speed"].values[0]
            flow_B_t = df_dt_lag["Flow"].values[0]
            occ_B_t = df_dt_lag["Occupancy"].values[0]

            speed_pred_B_t = df_dt_lag["Pred_Speed"].values[0]
            flow_pred_B_t = df_dt_lag["Pred_Flow"].values[0]
            occ_pred_B_t = df_dt_lag["Pred_Occupancy"].values[0]
            
            lambda_max_B_t = df_dt_lag["LambdaMax"].values[0]
            sigma_B_t = df_dt_lag["Sigma"].values[0]
            tau_B_t = df_dt_lag["Tau"].values[0]
            impact_B_t = df_dt_lag["Impact"].values[0]
            
            weekday_B_t = df_dt_lag["weekday"].values[0]

            feature_t.extend([speed_B_t, flow_B_t, occ_B_t, 
                              speed_pred_B_t, flow_pred_B_t, occ_pred_B_t, 
                              speed_B_t - speed_pred_B_t, flow_B_t - flow_pred_B_t, occ_B_t - occ_pred_B_t, 
                              lambda_max_B_t, sigma_B_t, tau_B_t, impact_B_t,
                              weekday_B_t])

        # downstream features
        for dt_lag in E_lags:
            d_lag, t_lag = dt_lag
            df_dt_lag = df_train_BE.loc[(df_train_BE["Station ID"] == E) & (df_train_BE["Date"] == d_lag) & (df_train_BE["Time"] == t_lag)]

            speed_E_t = df_dt_lag["Speed"].values[0]
            flow_E_t = df_dt_lag["Flow"].values[0]
            occ_E_t = df_dt_lag["Occupancy"].values[0]

            speed_pred_E_t = df_dt_lag["Pred_Speed"].values[0]
            flow_pred_E_t = df_dt_lag["Pred_Flow"].values[0]
            occ_pred_E_t = df_dt_lag["Pred_Occupancy"].values[0]
            
            lambda_max_E_t = df_dt_lag["LambdaMax"].values[0]
            sigma_E_t = df_dt_lag["Sigma"].values[0]
            tau_E_t = df_dt_lag["Tau"].values[0]
            impact_E_t = df_dt_lag["Impact"].values[0]
            
            weekday_E_t = df_dt_lag["weekday"].values[0]
            
            feature_t.extend([speed_E_t, flow_E_t, occ_E_t, 
                              speed_pred_E_t, flow_pred_E_t, occ_pred_E_t, 
                              speed_E_t - speed_pred_E_t, flow_E_t - flow_pred_E_t, occ_E_t - occ_pred_E_t, 
                              lambda_max_E_t, sigma_E_t, tau_E_t, impact_E_t,
                              weekday_E_t,])
        
        X_pos_train.append(feature_t)
        
        progress_count += 1
        if progress_count % 100 == 0:
            print("    {} Feature vector at date and time {} {} is done.".format(fraction_msg(progress_count, num_seg_instances), pos_d, pos_t))

print("...Completed construction of feature vectors for road segment s_{},{}.".format(B, E))

[1/101] Start constructing positive feature vectors for road segment s_408907,400951... 
    Total number of vectors: 0
[2/101] Start constructing positive feature vectors for road segment s_400951,400057... 
    Total number of vectors: 41
[3/101] Start constructing positive feature vectors for road segment s_400057,400147... 
    Total number of vectors: 42
[4/101] Start constructing positive feature vectors for road segment s_400147,400343... 
    Total number of vectors: 32
[5/101] Start constructing positive feature vectors for road segment s_400343,401560... 
    Total number of vectors: 32
[6/101] Start constructing positive feature vectors for road segment s_401560,400045... 
    Total number of vectors: 14
[7/101] Start constructing positive feature vectors for road segment s_400045,400122... 
    Total number of vectors: 17
[8/101] Start constructing positive feature vectors for road segment s_400122,401541... 
    Total number of vectors: 5
[9/101] Start constructing positiv

[66/101] Start constructing positive feature vectors for road segment s_400275,400939... 
    Total number of vectors: 18
[67/101] Start constructing positive feature vectors for road segment s_400939,400180... 
    Total number of vectors: 0
[68/101] Start constructing positive feature vectors for road segment s_400180,400529... 
    Total number of vectors: 0
[69/101] Start constructing positive feature vectors for road segment s_400529,400990... 
    Total number of vectors: 0
[70/101] Start constructing positive feature vectors for road segment s_400990,400515... 
    Total number of vectors: 0
[71/101] Start constructing positive feature vectors for road segment s_400515,400252... 
    Total number of vectors: 103
    [100/103] Feature vector at date and time 06/29/2017 19:20 is done.
[72/101] Start constructing positive feature vectors for road segment s_400252,400788... 
    Total number of vectors: 66
[73/101] Start constructing positive feature vectors for road segment s_40078

In [62]:
len(X_pos_train)

2637

In [63]:
y_pos_train = [1] * len(X_pos_train)

## Train: Merging feature vectors together

In [64]:
X_neg_train = np.array(X_neg_train)

In [65]:
X_neg_train_balanced = X_neg_train[np.random.choice(len(X_neg_train), len(X_pos_train), replace=False)].tolist()

In [66]:
y_neg_train_balanced = [-1] * len(X_neg_train_balanced)

In [67]:
X_train = X_neg_train_balanced + X_pos_train

In [68]:
y_train = y_neg_train_balanced + y_pos_train

In [69]:
len(X_train), len(y_train)

(5274, 5274)

## Test: feature vectors

In [70]:
raw_test = raw_test.assign(Diff_Speed=lambda x:x['Speed']-x['Pred_Speed'], Diff_Flow=lambda x: x['Flow']-x['Pred_Flow'], Diff_Occupancy=lambda x: x['Occupancy']-x['Pred_Occupancy'])

In [71]:
raw_test.tail(3)

Unnamed: 0,Station ID,datetime,Occupancy,Flow,Speed,Date,Time,idx,LambdaMax,Sigma,Tau,Impact,Pred_Speed,Pred_Flow,Pred_Occupancy,Diff_Speed,Diff_Flow,Diff_Occupancy
6532125,401471,2017-06-28 23:45:00,0.7,23.0,67.3,06/28/2017,23:45,3201453,0.038882,0.0,0.0,0.06484,67.22,23.99,0.69,0.08,-0.99,0.01
6532126,401471,2017-06-28 23:50:00,0.6,23.0,67.2,06/28/2017,23:50,3201454,0.038882,0.0,0.0,0.064408,67.29,21.89,0.67,-0.09,1.11,-0.07
6532127,401471,2017-06-28 23:55:00,0.6,19.0,67.1,06/28/2017,23:55,3201455,0.038882,0.0,0.0,0.063334,67.24,21.68,0.58,-0.14,-2.68,0.02


In [72]:
raw_test_dates = raw_test['Date'].values

In [73]:
raw_test_weekday = [is_weekday(date_to_day(date)) for date in raw_test_dates]

In [74]:
raw_test['weekday'] = raw_test_weekday

In [75]:
feature_names = ['Speed', 'Flow', 'Occupancy', 
                 'Pred_Speed', 'Pred_Flow', 'Pred_Occupancy', 
                 'Diff_Speed', 'Diff_Flow', 'Diff_Occupancy']

In [76]:
feature_names.extend(['LambdaMax', 'Sigma', 'Tau', 'Impact'])

In [77]:
feature_names.extend(['weekday'])

In [78]:
num_features = len(feature_names)
k_B = 4
k_E = 2

In [79]:
X_test = []
y_test = []
for seg_idx, seg in enumerate(road_segments):
    B, E = seg
    print("{} Constructing feature vector for segment s_{},{}...".format(fraction_msg(seg_idx+1, len(road_segments)), B, E))
    df_BE_test = raw_test.loc[((raw_test["Station ID"] == B) | (raw_test["Station ID"] == E))]
    df_incidents_BE_test = svm_pos_timestamps_test.loc[svm_pos_timestamps_test["Upstream"] == B]
    incidents_BE_date = df_incidents_BE_test["Date"].values
    incidents_BE_time = df_incidents_BE_test["Time"].values
    
    incidents_BE_dt = set()
    for i in range(len(incidents_BE_date)):
        incidents_BE_dt.add(incidents_BE_date[i] + ' ' + incidents_BE_time[i])
    
    # change to access by indices, to make program faster
    features_BE_dict = dict()
    features_BE_dict[B] = dict()
    features_BE_dict[E] = dict()
    for feature_name in feature_names:
        features_BE_dict[B][feature_name] = df_BE_test.loc[df_BE_test["Station ID"] == B][feature_name].values.tolist()
        features_BE_dict[E][feature_name] = df_BE_test.loc[df_BE_test["Station ID"] == E][feature_name].values.tolist()
    
    total_count = len(dates_test) * len(neg_times)
    count = 0
    print("    Total number of instances: {}".format(total_count)) 
    
    for i, d in enumerate(dates_test):
        for j, t in enumerate(neg_times):
            # construct vector Z(s_BE, dt)
            feature_BE_t = [0.] * (k_B + k_E + 2) * num_features
            base_idx = i * 288 + 14 + j
            for k, feature_name in enumerate(feature_names):
                # feature_k_B_t: [t-4, t-3, ..., t] -> need to be reversed and made consistent with order of SVM features. Same to E.
                feature_k_B_t = features_BE_dict[B][feature_name][base_idx-k_B:base_idx+1]
                feature_k_E_t = features_BE_dict[E][feature_name][base_idx-k_E:base_idx+1]
                feature_k_B_t.reverse()
                feature_k_E_t.reverse()
                feature_k_BE_t = feature_k_B_t + feature_k_E_t
                feature_BE_t[k:(k_B + k_E + 2)*num_features:num_features] = feature_k_BE_t
            
            X_test.append(feature_BE_t)
            # label data
            if d + ' ' + t in incidents_BE_dt:
                y_test.append(1)
            else:
                y_test.append(-1)
        count += 1
        if count % 50 == 0:
            print("    Progress: {}" + fraction_msg(count * len(neg_times), total_count))
    print("...Finished construction for segment s_{},{}.".format(B, E))

[1/101] Constructing feature vector for segment s_408907,400951...
    Total number of instances: 1918
...Finished construction for segment s_408907,400951.
[2/101] Constructing feature vector for segment s_400951,400057...
    Total number of instances: 1918
...Finished construction for segment s_400951,400057.
[3/101] Constructing feature vector for segment s_400057,400147...
    Total number of instances: 1918
...Finished construction for segment s_400057,400147.
[4/101] Constructing feature vector for segment s_400147,400343...
    Total number of instances: 1918
...Finished construction for segment s_400147,400343.
[5/101] Constructing feature vector for segment s_400343,401560...
    Total number of instances: 1918
...Finished construction for segment s_400343,401560.
[6/101] Constructing feature vector for segment s_401560,400045...
    Total number of instances: 1918
...Finished construction for segment s_401560,400045.
[7/101] Constructing feature vector for segment s_400045,4

...Finished construction for segment s_400284,400041.
[56/101] Constructing feature vector for segment s_400041,408133...
    Total number of instances: 1918
...Finished construction for segment s_400041,408133.
[57/101] Constructing feature vector for segment s_408133,408135...
    Total number of instances: 1918
...Finished construction for segment s_408133,408135.
[58/101] Constructing feature vector for segment s_408135,417665...
    Total number of instances: 1918
...Finished construction for segment s_408135,417665.
[59/101] Constructing feature vector for segment s_417665,412637...
    Total number of instances: 1918
...Finished construction for segment s_417665,412637.
[60/101] Constructing feature vector for segment s_412637,417666...
    Total number of instances: 1918
...Finished construction for segment s_412637,417666.
[61/101] Constructing feature vector for segment s_417666,408134...
    Total number of instances: 1918
...Finished construction for segment s_417666,408134

In [80]:
len(X_test), len(y_test)

(193718, 193718)

## SVM preprocessing: merge train/test and normalize

In [81]:
from sklearn import preprocessing

In [82]:
X_normalized = preprocessing.scale(X_train + X_test)

In [83]:
X_train_normalized = X_normalized[:len(X_train)]
X_test_normalized = X_normalized[len(X_train):]

## SVM training

In [84]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

In [85]:
param_grid = {
    'C': [2 ** i for i in range(-3, 16, 2)],
    'gamma': [2 ** i for i in range(-15, 4, 2)]
}

In [86]:
svm_grid_search = GridSearchCV(SVC(kernel='rbf'), n_jobs=8, 
                               param_grid=param_grid, cv=5, 
                               scoring='accuracy', verbose=2)

In [87]:
svm_grid_search.fit(X_train_normalized, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  25 tasks      | elapsed:   36.2s
[Parallel(n_jobs=8)]: Done 146 tasks      | elapsed:  2.9min
[Parallel(n_jobs=8)]: Done 349 tasks      | elapsed:  6.9min
[Parallel(n_jobs=8)]: Done 500 out of 500 | elapsed: 13.0min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=8,
       param_grid={'C': [0.125, 0.5, 2, 8, 32, 128, 512, 2048, 8192, 32768], 'gamma': [3.0517578125e-05, 0.0001220703125, 0.00048828125, 0.001953125, 0.0078125, 0.03125, 0.125, 0.5, 2, 8]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=2)

In [88]:
svm_grid_search.best_params_

{'C': 128, 'gamma': 3.0517578125e-05}

In [89]:
svm_grid_search.best_score_

0.6103526734926052

In [90]:
len(svm_grid_search.best_estimator_.support_vectors_)

4274

In [91]:
svm_grid_search.best_estimator_

SVC(C=128, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=3.0517578125e-05,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [92]:
y_train_pred = svm_grid_search.predict(X_train_normalized)

In [93]:
accuracy_score(y_train_pred, y_train)

0.6399317406143344

In [94]:
len(X_test_normalized)

193718

In [95]:
# parallelize prediction
def predict_func(X, clf):
    print("Process {} is predicting ...".format(mp.current_process().pid))
    return clf.predict(X)

In [96]:
pred_pool = mp.Pool(8)
num_instances = int(np.ceil(len(X_test_normalized) / 8))
y_test_pred_jobs = [pred_pool.apply_async(predict_func, args=(X_test_normalized[i*num_instances:(i+1)*num_instances, :], svm_grid_search)) for i in range(0, 8)]
pred_pool.close()
pred_pool.join()

Process 19812 is predicting ...
Process 19813 is predicting ...
Process 19814 is predicting ...
Process 19815 is predicting ...
Process 19816 is predicting ...
Process 19817 is predicting ...
Process 19818 is predicting ...
Process 19819 is predicting ...


In [97]:
y_test_pred = []
for y_test_pred_job in y_test_pred_jobs:
    y_test_pred.extend(y_test_pred_job.get())

In [98]:
len(X_test_normalized), len(y_test_pred)

(193718, 193718)

In [99]:
accuracy_score(y_test_pred, y_test)

0.8168213588824993

In [100]:
num_dt_segments = int(len(y_test_pred) / (288-14))

## Decision Tree Learning

In [101]:
from sklearn.tree import DecisionTreeClassifier

In [102]:
clf_gini = DecisionTreeClassifier(criterion='gini', random_state=None)

In [103]:
clf_gini.fit(X_train_normalized, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [104]:
y_train_pred_gini = clf_gini.predict(X_train_normalized)

In [105]:
accuracy_score(y_train_pred_gini, y_train)

1.0

In [106]:
y_test_pred_gini = clf_gini.predict(X_test_normalized)

In [107]:
accuracy_score(y_test_pred_gini, y_test)

0.6583848687267059

# Evaluation Metrics

Rush hours: 6:30 AM - 9:00 AM (idx 78 - idx 108), 3:30 PM - 6:30 PM (idx 186 - idx 222).

In [108]:
rush_hours_idx = list(range(78, 109)) + list(range(186, 223))

In [109]:
PT_thresholds = [2 ** i for i in range(5)]

## Detection rate (DR)

In [110]:
def DR(y_test, y_test_pred, PT_thresholds, num_dt_segments):
    DRs = []
    for PT_threshold in PT_thresholds:
        num_detected_incidents = 0
        total_num_incidents = 0
        for i in range(num_dt_segments):
            base_idx = i * 274
            max_base_offset = (i + 1) * 274
            start_idx = base_idx
            end_idx = start_idx
            while start_idx < max_base_offset:
                while start_idx < max_base_offset and y_test[start_idx] == -1:
                    start_idx += 1
                if start_idx == max_base_offset:
                    break
                # an incident happens
                # time span of the incident
                end_idx = start_idx
                while end_idx < max_base_offset and y_test[end_idx] == 1:
                    end_idx += 1
                if (end_idx - start_idx >= PT_threshold):
                    total_num_incidents += 1
                    # the incident is detected
                    if 1 in y_test_pred[start_idx:end_idx]:
                        num_detected_incidents += 1

                start_idx = end_idx + 1
        DRs.append(round(num_detected_incidents / total_num_incidents * 100, 2))
        print("PT_{}".format(PT_threshold))
        print("# of detected incidents: {}".format(num_detected_incidents))
        print("Total # of incidents: {}".format(total_num_incidents))
        print("Detection rate: {}".format(DRs[-1]))

### SVM

In [111]:
DR(y_test, y_test_pred, PT_thresholds[0:3], num_dt_segments)

PT_1
# of detected incidents: 43
Total # of incidents: 71
Detection rate: 60.56
PT_2
# of detected incidents: 42
Total # of incidents: 68
Detection rate: 61.76
PT_4
# of detected incidents: 34
Total # of incidents: 54
Detection rate: 62.96


### Decision Tree with Gini Index

In [112]:
DR(y_test, y_test_pred_gini, PT_thresholds[0:3], num_dt_segments)

PT_1
# of detected incidents: 59
Total # of incidents: 71
Detection rate: 83.1
PT_2
# of detected incidents: 56
Total # of incidents: 68
Detection rate: 82.35
PT_4
# of detected incidents: 47
Total # of incidents: 54
Detection rate: 87.04


## Mean time to detect (MTTD)

In [113]:
def MTTD(y_test, y_test_pred, PT_thresholds, num_dt_segments):
    MTTDs = []
    for PT_threshold in PT_thresholds:
        h = 0
        sum_ttd = 0
        for i in range(num_dt_segments):
            base_idx = i * 274
            max_base_offset = (i + 1) * 274
            start_idx = base_idx
            end_idx = start_idx
            while start_idx < max_base_offset:
                while start_idx < max_base_offset and y_test[start_idx] == -1:
                    start_idx += 1
                if start_idx == max_base_offset:
                    break
                # an incident happens
                # time span of the incident
                end_idx = start_idx
                while end_idx < max_base_offset and y_test[end_idx] == 1:
                    end_idx += 1

                if end_idx - start_idx >= PT_threshold:
                    # the incident is detected
                    if 1 in y_test_pred[start_idx:end_idx]:
                        h += 1
                        incident_idx = start_idx
                        detection_idx = incident_idx
                        while y_test_pred[detection_idx] == -1:
                            detection_idx += 1
                        sum_ttd += (detection_idx - incident_idx) * 5

                start_idx = end_idx + 1
        MTTDs.append(round(sum_ttd / h, 2))

        print("PT_{}".format(PT_threshold))
        print("Total number of detected incidents: {}".format(h))
        print("Total time of detection lags (min): {}".format(sum_ttd))
        print("Mean time to detect (MTTD): {}".format(MTTDs[-1]))

### SVM

In [114]:
MTTD(y_test, y_test_pred, PT_thresholds[0:3], num_dt_segments)

PT_1
Total number of detected incidents: 43
Total time of detection lags (min): 335
Mean time to detect (MTTD): 7.79
PT_2
Total number of detected incidents: 42
Total time of detection lags (min): 335
Mean time to detect (MTTD): 7.98
PT_4
Total number of detected incidents: 34
Total time of detection lags (min): 335
Mean time to detect (MTTD): 9.85


### Decision Tree with Gini Index

In [115]:
MTTD(y_test, y_test_pred_gini, PT_thresholds[0:3], num_dt_segments)

PT_1
Total number of detected incidents: 59
Total time of detection lags (min): 485
Mean time to detect (MTTD): 8.22
PT_2
Total number of detected incidents: 56
Total time of detection lags (min): 485
Mean time to detect (MTTD): 8.66
PT_4
Total number of detected incidents: 47
Total time of detection lags (min): 475
Mean time to detect (MTTD): 10.11


## False positive rate

$$ \frac{FP}{N} = \frac{FP}{FP + TN}, $$ where $FP$ = False positive, and $TN$ = True negative.

In [116]:
def FP(y_test, y_test_pred):
    num_false_positives = 0
    num_true_negatives = 0
    total_num_detections = len(y_test_pred)
    for i in range(total_num_detections):
        if y_test_pred[i] == 1 and y_test[i] == -1:
            num_false_positives += 1
        elif y_test_pred[i] == -1 and y_test[i] == -1:
            num_true_negatives += 1
    print("The false positive rate is {}.".format(round(num_false_positives / (num_false_positives + num_true_negatives) * 100, 4)))

In [117]:
FP(y_test, y_test_pred)

The false positive rate is 18.1681.


In [118]:
FP(y_test, y_test_pred_gini)

The false positive rate is 34.0683.


## False alarm rate

In [119]:
def false_alarm_rate(y_test, y_test_pred):
    num_false_alarms = 0
    total_num_detections = len(y_test_pred)
    for i in range(total_num_detections):
        if y_test_pred[i] != y_test[i]:
            num_false_alarms += 1
    print("The false alarm rate is {}.".format(round(num_false_alarms / total_num_detections * 100, 4)))

In [120]:
false_alarm_rate(y_test, y_test_pred)

The false alarm rate is 18.3179.


In [121]:
false_alarm_rate(y_test, y_test_pred_gini)

The false alarm rate is 34.1615.
