In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime as dt
%matplotlib inline

In [2]:
base_path = 'data/I88N-processed/'

# Initialization

## Sample dates, split train/test dataset

In [3]:
def date_to_day(date):
    # date: a date string in the format of "yyyy-mm-dd"
    # return: an int w/t Monday being 0 and Sunday being 6.
    if date.find('-') != -1:
        y, m, d = date.split('-')
    else:
        m, d, y = date.split('/')
    return dt.datetime(int(y), int(m), int(d)).weekday()

In [4]:
dates = pd.read_csv(base_path + 'available_dates.csv')
dates = np.array(dates['0'].values.tolist())
dates = np.array(list(map(lambda x: x.split('-')[1] + '/' + x.split('-')[2] + '/' + x.split('-')[0], dates)))

We want to sample dates in June.

In [5]:
dates = dates[(dates > '05/31/2017') & (dates < '07/01/2017')]

In [6]:
dates_weekend = [date for date in dates if date_to_day(date) >= 5]
dates_weekday = list(set(dates).difference(set(dates_weekend)))

In [7]:
len(dates_weekend), len(dates_weekday)

(8, 21)

In [8]:
dates_train = dates_weekday[:14] + dates_weekend[:3]
dates_test = dates_weekday[14:] + dates_weekend[3:]
dates_train.sort()
dates_test.sort()

In [9]:
len(dates), len(dates_train), len(dates_test), dates_train[0:3], dates_test[0:3]

(29,
 17,
 12,
 ['06/01/2017', '06/02/2017', '06/03/2017'],
 ['06/05/2017', '06/07/2017', '06/11/2017'])

## Loading severity data

In [10]:
severity_data = pd.read_csv(base_path + 'severity_params.csv')
severity_data.rename(columns={'Unnamed: 0':'datetime'}, inplace=True)

In [11]:
sev_datetimes = severity_data['datetime'].values

In [12]:
sev_dates = []
sev_times = []
for x in sev_datetimes:
    d, t = x.split(' ')
    sev_dates.append(d)
    sev_times.append(t)

In [13]:
severity_data['Date'] = sev_dates
severity_data['Time'] = sev_times

In [14]:
severity_data = severity_data.loc[~severity_data['Date'].isin(['2017-06-15'])]

In [15]:
lambda_max = severity_data['LambdaMax'].values
sigma = severity_data['Sigma'].values
tau = severity_data['Tau'].values
impact = severity_data['Impact'].values

In [16]:
severity_data.head(3)

Unnamed: 0,datetime,ID,LambdaMax,Sigma,Tau,Impact,Incident,Date,Time
0,2017-06-01 00:00:00,408907,,,,0.021267,0.0,2017-06-01,00:00:00
1,2017-06-01 00:05:00,408907,,,,0.017058,0.0,2017-06-01,00:05:00
2,2017-06-01 00:10:00,408907,,,0.0,0.015338,0.0,2017-06-01,00:10:00


In [17]:
lambda_max = [0 if np.isnan(x) else x for x in lambda_max]
sigma = [0 if np.isnan(x) else x for x in sigma]
tau = [0 if np.isnan(x) else x for x in tau]

In [18]:
severity_data['LambdaMax'] = lambda_max
severity_data['Sigma'] = sigma
severity_data['Tau'] = tau

## Loading speed, flow, occupancy, and stations

In [19]:
raw = pd.read_csv(base_path + 'concat_no_holes/concat.csv')

In [20]:
# select raw that is sampled
raw_all = raw.loc[raw['Date'].isin(dates)]

In [21]:
raw_all['LambdaMax'] = lambda_max
raw_all['Sigma'] = sigma
raw_all['Tau'] = tau
raw_all['Impact'] = impact

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See

In [22]:
raw_test = raw_all.loc[raw['Date'].isin(dates_test)]
stations = np.array(raw_all['Station ID'].unique().tolist())

In [23]:
raw_all.head(3)

Unnamed: 0,Station ID,datetime,Occupancy,Flow,Speed,Date,Time,idx,LambdaMax,Sigma,Tau,Impact
8640,408907,2017-06-01 00:00:00,0.5,22.0,69.5,06/01/2017,00:00,42322,0.0,0.0,0.0,0.021267
8641,408907,2017-06-01 00:05:00,0.5,22.0,69.1,06/01/2017,00:05,42323,0.0,0.0,0.0,0.017058
8642,408907,2017-06-01 00:10:00,0.5,23.0,68.9,06/01/2017,00:10,42324,0.0,0.0,0.0,0.015338


In [24]:
# special construction of raw_train, because the dates are sampled with replacement
raw_train = pd.DataFrame()
duplicate_id = 0
for i in range(0, len(dates_train)):
    if i > 0:
        if dates_train[i] == dates_train[i-1]:
            duplicate_id += 1
        else:
            duplicate_id = 0
    df_date = raw_all.loc[raw_all['Date'] == dates_train[i]]
    df_date = df_date.assign(duplicateIdx=duplicate_id)
    raw_train = raw_train.append(df_date)
sorterIdx = dict( zip(stations, range(len(stations))) )
raw_train['stationSorterIdx'] = raw_train['Station ID'].map(sorterIdx)
raw_train = raw_train.sort_values(['stationSorterIdx', 'duplicateIdx', 'datetime'], ascending=[True, True, True])
raw_train.drop(['duplicateIdx', 'stationSorterIdx', 'idx'], axis=1, inplace=True)

In [25]:
len(raw_train.index)

499392

In [26]:
var_names = ['Speed', 'Flow', 'Occupancy']

### Construct road segments

In [27]:
road_segments = list()
for i in range(len(stations) - 1):
    road_segments.append(tuple([stations[i], stations[i+1]]))

## Loading incidents

In [28]:
raw_incidents = pd.read_csv(base_path + 'valid_incidents.csv')

In [29]:
raw_incidents_all = raw_incidents.loc[raw_incidents['Date'].isin(dates)]
raw_incidents_train = raw_incidents_all.loc[raw_incidents_all['Date'].isin(dates_train)]
raw_incidents_test = raw_incidents_all.loc[raw_incidents_all['Date'].isin(dates_test)]

In [30]:
svm_pos_timestamps = pd.read_csv(base_path + 'svm_pos_instances.csv')

In [31]:
svm_pos_timestamps_train = svm_pos_timestamps.loc[svm_pos_timestamps['Date'].isin(dates_train)]
svm_pos_timestamps_test = svm_pos_timestamps.loc[svm_pos_timestamps['Date'].isin(dates_test)]

In [32]:
svm_pos_timestamps.head(3)

Unnamed: 0,Upstream,Downstream,Date,Time
0,408907,400951,01/22/2017,20:30
1,408907,400951,01/22/2017,20:35
2,408907,400951,01/22/2017,20:40


In [33]:
svm_incident_dates_train = svm_pos_timestamps_train['Date'].unique().tolist()
svm_normal_dates_train = list(set(dates_train).difference(svm_incident_dates_train))

In [34]:
len(svm_incident_dates_train), len(svm_normal_dates_train)

(17, 0)

## Progress message formatting

In [35]:
def fraction_msg(present, total):
    return '[{}/{}]'.format(present, total)

# Train: TSA-DES forecasting

In [36]:
def DES_rmse(alpha, var_series):
    len_series = len(var_series)
    
    beta = round(1. - alpha, 3)

    sse = 0.
    s1 = np.mean(var_series[:10])
    s2 = s1
    
    for i in range(11, len_series - 1):
        s1 = alpha * var_series[i] + beta * s1
        s2 = alpha * s1 + beta * s2
        y_next = 2 * s1 - s2 + alpha / beta * (s1 - s2)
        sse += (var_series[i+1] - y_next) ** 2
    
    return np.sqrt( sse / (len_series - 12) )

## Tune best alphas for each station

In [37]:
import multiprocessing as mp

In [38]:
# input: stations, raw training data (includ. incidents), rule to update alphas
# output: a dictionary containing stations, and stations' best alphas
def compute_best_alphas(stations, raw_train, raw_incidents_train, dates_train, num_grids, DES_rmse, fraction_msg):
    best_alphas = {
    'Station ID': [],
    'Speed': [],
    'Flow': [],
    'Occupancy': []
    }
    pid = mp.current_process().pid
    for i, station in enumerate(stations):
        best_alphas['Station ID'].append(station)

        # update current training station dataframe, the training data is normal day's data
        abnormal_dates_station = raw_incidents_train.loc[(raw_incidents_train['Upstream'] == station) | (raw_incidents_train['Downstream'] == str(station))]['Date'].unique()
        normal_dates_train = np.array(list(set(dates_train).difference(set(abnormal_dates_station))))
        df_train_station = raw_train.loc[(raw_train['Station ID'] == station) & (raw_train['Date'].isin(normal_dates_train))]

        print("{} {} Tuning alphas for station {}...".format(pid, fraction_msg(i+1, len(stations)), station))
        for var_name in var_names:
            # print("    " + var_name + "...")
            var_series = df_train_station[var_name].values
            len_series = len(var_series)

            # setting up alphas
            alphas = np.arange(num_grids) * 1. / num_grids

            # save the historical best alpha by rmse
            best_rmse = float("inf")
            best_alpha = 0.

            # for each alpha, perform exponential smoothing, and compute RMSE
            for alpha in alphas:
                rmse = DES_rmse(alpha, var_series)

                # compare, and decide whether to update best alpha
                if rmse < best_rmse:
                    best_rmse = rmse
                    best_alpha = alpha

            # finally, save the best alpha for the variable at this station
            best_alphas[var_name].append(best_alpha)

        # print trained alphas for each station
        # print(best_alphas['Station ID'][i], best_alphas['Speed'][i], best_alphas['Flow'][i], best_alphas['Occupancy'][i])
    print("Process {} has finished alpha tuning.".format(pid))
    return best_alphas

In [39]:
pool = mp.Pool(processes=8)
num_grids = 100
results = [pool.apply_async(compute_best_alphas, args=(stations[13 * i: 13 * i + 13], raw_train, raw_incidents_train, dates_train, num_grids, DES_rmse, fraction_msg)) for i in range(0, 8)]
pool.close()
pool.join()

28966 [1/13] Tuning alphas for station 408907...
28967 [1/13] Tuning alphas for station 400088...
28968 [1/13] Tuning alphas for station 408755...
28969 [1/13] Tuning alphas for station 400137...
28970 [1/13] Tuning alphas for station 400611...
28971 [1/13] Tuning alphas for station 400275...
28972 [1/13] Tuning alphas for station 400333...
28973 [1/11] Tuning alphas for station 400980...
28967 [2/13] Tuning alphas for station 402288...
28966 [2/13] Tuning alphas for station 400951...
28968 [2/13] Tuning alphas for station 402802...
28969 [2/13] Tuning alphas for station 400716...
28971 [2/13] Tuning alphas for station 400939...
28970 [2/13] Tuning alphas for station 400928...
28972 [2/13] Tuning alphas for station 410363...
28973 [2/11] Tuning alphas for station 401333...
28966 [3/13] Tuning alphas for station 400057...
28967 [3/13] Tuning alphas for station 413026...
28969 [3/13] Tuning alphas for station 401545...
28968 [3/13] Tuning alphas for station 408756...
28973 [3/11] Tuning 

In [40]:
exec_results = []
for proc in results:
    exec_results.append(proc.get())

In [41]:
best_alphas = {
    'Station ID': [],
    'Speed': [],
    'Flow': [],
    'Occupancy': []
}
for dict_best_alphas in exec_results:
    for key in best_alphas.keys():
        best_alphas[key].extend(dict_best_alphas[key])

In [42]:
best_alphas

{'Station ID': [408907,
  400951,
  400057,
  400147,
  400343,
  401560,
  400045,
  400122,
  401541,
  402281,
  402283,
  402285,
  402286,
  400088,
  402288,
  413026,
  401464,
  401489,
  401538,
  402290,
  402292,
  401643,
  402800,
  402828,
  407219,
  402789,
  408755,
  402802,
  408756,
  400189,
  400309,
  400417,
  400249,
  401639,
  400662,
  400141,
  400761,
  400490,
  401888,
  400137,
  400716,
  401545,
  401011,
  400674,
  400539,
  400534,
  401062,
  401529,
  401613,
  400536,
  400488,
  401561,
  400611,
  400928,
  400284,
  400041,
  408133,
  408135,
  417665,
  412637,
  417666,
  408134,
  400685,
  401003,
  400898,
  400275,
  400939,
  400180,
  400529,
  400990,
  400515,
  400252,
  400788,
  401517,
  401871,
  400574,
  401629,
  400422,
  400333,
  410363,
  400360,
  400955,
  400495,
  400608,
  400949,
  400678,
  400341,
  400607,
  400094,
  400682,
  408138,
  400980,
  401333,
  404746,
  401142,
  400218,
  400983,
  400765,
  4008

In [43]:
best_alphas_df = pd.DataFrame(best_alphas)
best_alphas_df.to_csv(base_path + 'smaller_sample/best_alphas.csv', index=False)

## Using the tuned alphas to predict training traffic variables

In [44]:
# initialization
# initialize prediction dictionary
pred_dict_train = dict()
for var_name in var_names:
    pred_dict_train[var_name] = []

for i, station in enumerate(stations):
    print("{} Start time series prediction (DES) at station {}...".format(fraction_msg(i+1, len(stations)), station))
    df_train_station = raw_train.loc[raw_train["Station ID"] == station]
    
    # formulate predictions of speed, flow and occupancy for the station
    for var_name in var_names:
        print("    {}...".format(var_name))
        var_series = df_train_station[var_name].values
        len_series = len(var_series)
        # initialize s1, s2, and y
        s1 = np.mean(var_series[:10])
        s2 = s1
        y = [0.] * len_series
        # get the best alpha
        var_best_alpha = best_alphas_df.loc[best_alphas_df["Station ID"] == station][var_name].values[0]
        beta = 1. - var_best_alpha

        for t in range(11, len_series - 1):
            s1 = var_best_alpha * var_series[t] + beta * s1
            s2 = var_best_alpha * s1 + beta * s2
            y[t+1] = round(2 * s1 - s2 + var_best_alpha / beta * (s1 - s2), 2)

        # save the predictions to a dictionary
        pred_dict_train[var_name].extend(y)
    print("End prediction at station {}.".format(station))

[1/102] Start time series prediction (DES) at station 408907...
    Speed...
    Flow...
    Occupancy...
End prediction at station 408907.
[2/102] Start time series prediction (DES) at station 400951...
    Speed...
    Flow...
    Occupancy...
End prediction at station 400951.
[3/102] Start time series prediction (DES) at station 400057...
    Speed...
    Flow...
    Occupancy...
End prediction at station 400057.
[4/102] Start time series prediction (DES) at station 400147...
    Speed...
    Flow...
    Occupancy...
End prediction at station 400147.
[5/102] Start time series prediction (DES) at station 400343...
    Speed...
    Flow...
    Occupancy...
End prediction at station 400343.
[6/102] Start time series prediction (DES) at station 401560...
    Speed...
    Flow...
    Occupancy...
End prediction at station 401560.
[7/102] Start time series prediction (DES) at station 400045...
    Speed...
    Flow...
    Occupancy...
End prediction at station 400045.
[8/102] Start time s

End prediction at station 412637.
[61/102] Start time series prediction (DES) at station 417666...
    Speed...
    Flow...
    Occupancy...
End prediction at station 417666.
[62/102] Start time series prediction (DES) at station 408134...
    Speed...
    Flow...
    Occupancy...
End prediction at station 408134.
[63/102] Start time series prediction (DES) at station 400685...
    Speed...
    Flow...
    Occupancy...
End prediction at station 400685.
[64/102] Start time series prediction (DES) at station 401003...
    Speed...
    Flow...
    Occupancy...
End prediction at station 401003.
[65/102] Start time series prediction (DES) at station 400898...
    Speed...
    Flow...
    Occupancy...
End prediction at station 400898.
[66/102] Start time series prediction (DES) at station 400275...
    Speed...
    Flow...
    Occupancy...
End prediction at station 400275.
[67/102] Start time series prediction (DES) at station 400939...
    Speed...
    Flow...
    Occupancy...
End predictio

In [45]:
raw_train = raw_train.assign(Pred_Speed=pred_dict_train['Speed'], Pred_Flow=pred_dict_train['Flow'], Pred_Occupancy=pred_dict_train['Occupancy'])

In [46]:
raw_train['Diff_Speed'] = raw_train['Speed'] - raw_train['Pred_Speed']
raw_train['Diff_Flow'] = raw_train['Flow'] - raw_train['Pred_Flow']
raw_train['Diff_Occupancy'] = raw_train['Occupancy'] - raw_train['Pred_Occupancy']

## Using the tuned alphas to predict testing traffic variables

In [47]:
# initialization
# initialize prediction dictionary
pred_dict_test = dict()
for var_name in var_names:
    pred_dict_test[var_name] = []

for i, station in enumerate(stations):
    print("{} Start time series prediction (DES) at station {}...".format(fraction_msg(i+1, len(stations)), station))
    df_test_station = raw_test.loc[raw_test["Station ID"] == station]
    
    # formulate predictions of speed, flow and occupancy for the station
    for var_name in var_names:
        print("    {}...".format(var_name))
        var_series = df_test_station[var_name].values
        len_series = len(var_series)
        # initialize s1, s2, and y
        s1 = np.mean(var_series[:10])
        s2 = s1
        y = [0.] * len_series
        # get the best alpha
        var_best_alpha = best_alphas_df.loc[best_alphas_df["Station ID"] == station][var_name].values[0]
        beta = 1. - var_best_alpha

        num_batches = int(len_series / 288)
        for j in range(num_batches):
            base_idx = 288 * j
            for t in range(base_idx + 11, base_idx + 287):
                s1 = var_best_alpha * var_series[t] + beta * s1
                s2 = var_best_alpha * s1 + beta * s2
                y[t+1] = round(2 * s1 - s2 + var_best_alpha / beta * (s1 - s2), 2)

        # save the predictions to a dictionary
        pred_dict_test[var_name].extend(y)
    print("Finished forecasting at station {}.".format(station))
print("Finished forecasting for the test dataset.")

[1/102] Start time series prediction (DES) at station 408907...
    Speed...
    Flow...
    Occupancy...
Finished forecasting at station 408907.
[2/102] Start time series prediction (DES) at station 400951...
    Speed...
    Flow...
    Occupancy...
Finished forecasting at station 400951.
[3/102] Start time series prediction (DES) at station 400057...
    Speed...
    Flow...
    Occupancy...
Finished forecasting at station 400057.
[4/102] Start time series prediction (DES) at station 400147...
    Speed...
    Flow...
    Occupancy...
Finished forecasting at station 400147.
[5/102] Start time series prediction (DES) at station 400343...
    Speed...
    Flow...
    Occupancy...
Finished forecasting at station 400343.
[6/102] Start time series prediction (DES) at station 401560...
    Speed...
    Flow...
    Occupancy...
Finished forecasting at station 401560.
[7/102] Start time series prediction (DES) at station 400045...
    Speed...
    Flow...
    Occupancy...
Finished forecasti

Finished forecasting at station 417665.
[60/102] Start time series prediction (DES) at station 412637...
    Speed...
    Flow...
    Occupancy...
Finished forecasting at station 412637.
[61/102] Start time series prediction (DES) at station 417666...
    Speed...
    Flow...
    Occupancy...
Finished forecasting at station 417666.
[62/102] Start time series prediction (DES) at station 408134...
    Speed...
    Flow...
    Occupancy...
Finished forecasting at station 408134.
[63/102] Start time series prediction (DES) at station 400685...
    Speed...
    Flow...
    Occupancy...
Finished forecasting at station 400685.
[64/102] Start time series prediction (DES) at station 401003...
    Speed...
    Flow...
    Occupancy...
Finished forecasting at station 401003.
[65/102] Start time series prediction (DES) at station 400898...
    Speed...
    Flow...
    Occupancy...
Finished forecasting at station 400898.
[66/102] Start time series prediction (DES) at station 400275...
    Speed...


In [48]:
raw_test = raw_test.assign(Pred_Speed=pred_dict_test['Speed'], Pred_Flow=pred_dict_test['Flow'], Pred_Occupancy=pred_dict_test['Occupancy'])

In [49]:
raw_test.drop(columns=['idx'], inplace=True)

In [50]:
raw_test['Diff_Speed'] = raw_test['Speed'] - raw_test['Pred_Speed']
raw_test['Diff_Flow'] = raw_test['Flow'] - raw_test['Pred_Flow']
raw_test['Diff_Occupancy'] = raw_test['Occupancy'] - raw_test['Pred_Occupancy']

Before training/testing, we want to normalize data for each station.  That is,
1. Using **training** data, compute the ***mean*** and ***standard deviation*** for occ, speed, flow, and their predictions at each station;
2. Compute the z-scores as the new occ, speed, flow and their time series predictions;
3. Keep the means and standard deviations.  After training SVM, compute z-scores using the means and sds from training data for the testing data.  Then perform prediction.

Why does it make sense to compute z-scores?  Checkout `data_analysis/descriptive_statistics.ipynb` for more details.

In [51]:
raw_train.head(3)

Unnamed: 0,Station ID,datetime,Occupancy,Flow,Speed,Date,Time,LambdaMax,Sigma,Tau,Impact,Pred_Speed,Pred_Flow,Pred_Occupancy,Diff_Speed,Diff_Flow,Diff_Occupancy
8640,408907,2017-06-01 00:00:00,0.5,22.0,69.5,06/01/2017,00:00,0.0,0.0,0.0,0.021267,0.0,0.0,0.0,69.5,22.0,0.5
8641,408907,2017-06-01 00:05:00,0.5,22.0,69.1,06/01/2017,00:05,0.0,0.0,0.0,0.017058,0.0,0.0,0.0,69.1,22.0,0.5
8642,408907,2017-06-01 00:10:00,0.5,23.0,68.9,06/01/2017,00:10,0.0,0.0,0.0,0.015338,0.0,0.0,0.0,68.9,23.0,0.5


In [52]:
mean_sd_dict = dict()
pred_var_names = []
diff_var_names = []
for var in var_names:
    mean_sd_dict[var + "_mean"] = []
    mean_sd_dict[var + "_sd"] = []
    pred_var_names.append("Pred_" + var)
    diff_var_names.append("Diff_" + var)
for var in pred_var_names:
    mean_sd_dict[var + "_mean"] = []
    mean_sd_dict[var + "_sd"] = []
for var in diff_var_names:
    mean_sd_dict[var + "_mean"] = []
    mean_sd_dict[var + "_sd"] = []
mean_sd_dict['station'] = stations

for i, s in enumerate(stations):
    s_df = raw_train[raw_train['Station ID'] == s]
    for j, var in enumerate(var_names):
        var_values = s_df[var].values
        mean_sd_dict[var + "_mean"].append(np.mean(var_values))
        mean_sd_dict[var + "_sd"].append(np.std(var_values))
    for j, var in enumerate(pred_var_names):
        var_values = s_df[var].values
        mean_sd_dict[var + "_mean"].append(np.mean(var_values))
        mean_sd_dict[var + "_sd"].append(np.std(var_values))
    for j, var in enumerate(diff_var_names):
        var_values = s_df[var].values
        mean_sd_dict[var + "_mean"].append(np.mean(var_values))
        mean_sd_dict[var + "_sd"].append(np.std(var_values))

In [53]:
mean_sd_dict.keys()

dict_keys(['Speed_mean', 'Speed_sd', 'Flow_mean', 'Flow_sd', 'Occupancy_mean', 'Occupancy_sd', 'Pred_Speed_mean', 'Pred_Speed_sd', 'Pred_Flow_mean', 'Pred_Flow_sd', 'Pred_Occupancy_mean', 'Pred_Occupancy_sd', 'Diff_Speed_mean', 'Diff_Speed_sd', 'Diff_Flow_mean', 'Diff_Flow_sd', 'Diff_Occupancy_mean', 'Diff_Occupancy_sd', 'station'])

In [54]:
stations_train = raw_train['Station ID'].values

In [55]:
station_idx_dict = dict()
for i, s in enumerate(stations):
    station_idx_dict[s] = i

In [56]:
columns = ['Speed', 'Flow', 'Occupancy', 
           'Pred_Speed', 'Pred_Flow', 'Pred_Occupancy', 
           'Diff_Speed', 'Diff_Flow', 'Diff_Occupancy']

In [57]:
z_dict = dict()
for col in columns:
    z_scores = []
    col_train = raw_train[col].values
    for i, val in enumerate(col_train):
        sid = stations_train[i]
        sidx = station_idx_dict[sid]
        z_score = (val - mean_sd_dict[col+'_mean'][sidx]) / mean_sd_dict[col+'_sd'][sidx]
        z_scores.append(z_score)
    z_dict[col] = z_scores
    print("Finished z-score computation: {}.".format(col))

Finished z-score computation: Speed.
Finished z-score computation: Flow.
Finished z-score computation: Occupancy.
Finished z-score computation: Pred_Speed.
Finished z-score computation: Pred_Flow.
Finished z-score computation: Pred_Occupancy.
Finished z-score computation: Diff_Speed.
Finished z-score computation: Diff_Flow.
Finished z-score computation: Diff_Occupancy.


In [58]:
for key in z_dict.keys():
    raw_train[key] = z_dict[key]

In [59]:
raw_train.head(3)

Unnamed: 0,Station ID,datetime,Occupancy,Flow,Speed,Date,Time,LambdaMax,Sigma,Tau,Impact,Pred_Speed,Pred_Flow,Pred_Occupancy,Diff_Speed,Diff_Flow,Diff_Occupancy
8640,408907,2017-06-01 00:00:00,-1.008772,-1.404213,0.633301,06/01/2017,00:00,0.0,0.0,0.0,0.021267,-10.449204,-1.613481,-1.065,18.058879,0.99093,0.323992
8641,408907,2017-06-01 00:05:00,-1.008772,-1.404213,0.551932,06/01/2017,00:05,0.0,0.0,0.0,0.017058,-10.449204,-1.613481,-1.065,17.954691,0.99093,0.323992
8642,408907,2017-06-01 00:10:00,-1.008772,-1.393785,0.511248,06/01/2017,00:10,0.0,0.0,0.0,0.015338,-10.449204,-1.613481,-1.065,17.902598,1.036034,0.323992


In [60]:
stations_test = raw_test['Station ID'].values
z_dict_test = dict()
for col in columns:
    z_scores = []
    col_test = raw_test[col].values
    for i, val in enumerate(col_test):
        sid = stations_test[i]
        sidx = station_idx_dict[sid]
        z_score = (val - mean_sd_dict[col+'_mean'][sidx]) / mean_sd_dict[col+'_sd'][sidx]
        z_scores.append(z_score)
    z_dict_test[col] = z_scores
    print("Finished z-score computation: {}.".format(col))

Finished z-score computation: Speed.
Finished z-score computation: Flow.
Finished z-score computation: Occupancy.
Finished z-score computation: Pred_Speed.
Finished z-score computation: Pred_Flow.
Finished z-score computation: Pred_Occupancy.
Finished z-score computation: Diff_Speed.
Finished z-score computation: Diff_Flow.
Finished z-score computation: Diff_Occupancy.


In [61]:
for key in z_dict.keys():
    raw_test[key] = z_dict_test[key]

In [62]:
raw_test.head(3)

Unnamed: 0,Station ID,datetime,Occupancy,Flow,Speed,Date,Time,LambdaMax,Sigma,Tau,Impact,Pred_Speed,Pred_Flow,Pred_Occupancy,Diff_Speed,Diff_Flow,Diff_Occupancy
9792,408907,2017-06-05 00:00:00,-0.980169,-1.341644,0.511248,06/05/2017,00:00,0.0,0.0,0.0,0.012451,-10.449204,-1.613481,-1.065,17.902598,1.261557,0.388933
9793,408907,2017-06-05 00:05:00,-1.008772,-1.393785,0.185775,06/05/2017,00:05,0.0,0.0,0.0,0.009436,-10.449204,-1.613481,-1.065,17.485848,1.036034,0.323992
9794,408907,2017-06-05 00:10:00,-0.980169,-1.341644,0.450222,06/05/2017,00:10,0.0,0.0,0.0,0.010917,-10.449204,-1.613481,-1.065,17.824457,1.261557,0.388933


# Train: SVM

Note that we need to scale train and test dataset with the same factors.

## Train: feature vectors

### Train: feature vectors - negative

In [63]:
neg_times = raw_train['Time'].unique().tolist()[14:]

In [64]:
neg_sample_dates = dates_train

In [65]:
svm_incidents_sample = svm_pos_timestamps_train.loc[svm_pos_timestamps_train['Date'].isin(neg_sample_dates)]

In [66]:
X_neg_weekday_train = []
X_neg_weekend_train = []
num_segments = len(road_segments)
count_date = 0
for neg_sample_date in neg_sample_dates:
    count_date += 1
    print("{} Negative feature vectors at date {}:".format(fraction_msg(count_date, len(neg_sample_dates)), neg_sample_date))
    
    for i, seg in enumerate(road_segments):
        B, E = seg
        df_neg_train_BE = raw_train.loc[((raw_train["Station ID"] == B) | (raw_train["Station ID"] == E)) & (raw_train["Date"] == neg_sample_date)]
        svm_incidents_sample_BE = svm_incidents_sample.loc[svm_incidents_sample['Upstream'] == B]
        sample_neg_times = np.random.choice(neg_times, 24)
        
        if (i+1) % 20 == 0:
            print("    {} Start constructing feature vectors for road segment s_{},{}...".format(fraction_msg(i+1, num_segments), B, E))
            print("        Total number of vectors: {}".format(len(sample_neg_times)))
        
        for neg_t in sample_neg_times:
            # check if current time is incident time
            if len(svm_incidents_sample_BE.loc[svm_incidents_sample_BE['Time'] == neg_t].index) != 0:
                continue

            feature_t = []
            neg_dt_timestamp = pd.Timestamp(neg_sample_date + ' ' + neg_t + ':00')

            B_lags = []
            for j in range(5):
                B_lags.append(neg_dt_timestamp - dt.timedelta(minutes=j*5))
            B_lags = list(map(lambda x: x.strftime('%H:%M') , B_lags))
            E_lags = B_lags[0:3]

            # upstream features
            for t_lag in B_lags:
                df_dt_lag = df_neg_train_BE.loc[(df_neg_train_BE["Station ID"] == B) & (df_neg_train_BE["Time"] == t_lag)]

                speed_B_t = df_dt_lag["Speed"].values[0]
                flow_B_t = df_dt_lag["Flow"].values[0]
                occ_B_t = df_dt_lag["Occupancy"].values[0]

                speed_pred_B_t = df_dt_lag["Pred_Speed"].values[0]
                flow_pred_B_t = df_dt_lag["Pred_Flow"].values[0]
                occ_pred_B_t = df_dt_lag["Pred_Occupancy"].values[0]
                
                speed_diff_B_t = df_dt_lag["Diff_Speed"].values[0]
                flow_diff_B_t = df_dt_lag["Diff_Flow"].values[0]
                occ_diff_B_t = df_dt_lag["Diff_Occupancy"].values[0]
                
                lambda_max_B_t = df_dt_lag["LambdaMax"].values[0]
                sigma_B_t = df_dt_lag["Sigma"].values[0]
                tau_B_t = df_dt_lag["Tau"].values[0]
                impact_B_t = df_dt_lag["Impact"].values[0]

                feature_t.extend([speed_B_t, flow_B_t, occ_B_t, 
                                  speed_pred_B_t, flow_pred_B_t, occ_pred_B_t, 
                                  speed_diff_B_t, flow_diff_B_t, occ_diff_B_t, 
                                  lambda_max_B_t, sigma_B_t, tau_B_t, impact_B_t])

            # downstream features
            for t_lag in E_lags:
                df_dt_lag = df_neg_train_BE.loc[(df_neg_train_BE["Station ID"] == E) & (df_neg_train_BE["Time"] == t_lag)]

                speed_E_t = df_dt_lag["Speed"].values[0]
                flow_E_t = df_dt_lag["Flow"].values[0]
                occ_E_t = df_dt_lag["Occupancy"].values[0]

                speed_pred_E_t = df_dt_lag["Pred_Speed"].values[0]
                flow_pred_E_t = df_dt_lag["Pred_Flow"].values[0]
                occ_pred_E_t = df_dt_lag["Pred_Occupancy"].values[0]

                speed_diff_E_t = df_dt_lag["Diff_Speed"].values[0]
                flow_diff_E_t = df_dt_lag["Diff_Flow"].values[0]
                occ_diff_E_t = df_dt_lag["Diff_Occupancy"].values[0]
                
                lambda_max_E_t = df_dt_lag["LambdaMax"].values[0]
                sigma_E_t = df_dt_lag["Sigma"].values[0]
                tau_E_t = df_dt_lag["Tau"].values[0]
                impact_E_t = df_dt_lag["Impact"].values[0]

                feature_t.extend([speed_E_t, flow_E_t, occ_E_t, 
                                  speed_pred_E_t, flow_pred_E_t, occ_pred_E_t, 
                                  speed_diff_E_t, flow_diff_E_t, occ_diff_E_t, 
                                  lambda_max_E_t, sigma_E_t, tau_E_t, impact_E_t])
            
            if date_to_day(neg_sample_date) >= 5:
                X_neg_weekend_train.append(feature_t)
            else:
                X_neg_weekday_train.append(feature_t)

        if (i+1) % 20 == 0:
            print("        Feature vector at date and time {} {} is done.".format(neg_sample_date, neg_t))
            print("    ...Completed construction of feature vectors for road segment s_{},{}.".format(B, E))

[1/17] Negative feature vectors at date 06/01/2017:
    [20/101] Start constructing feature vectors for road segment s_402290,402292...
        Total number of vectors: 24
        Feature vector at date and time 06/01/2017 01:10 is done.
    ...Completed construction of feature vectors for road segment s_402290,402292.
    [40/101] Start constructing feature vectors for road segment s_400137,400716...
        Total number of vectors: 24
        Feature vector at date and time 06/01/2017 23:45 is done.
    ...Completed construction of feature vectors for road segment s_400137,400716.
    [60/101] Start constructing feature vectors for road segment s_412637,417666...
        Total number of vectors: 24
        Feature vector at date and time 06/01/2017 18:40 is done.
    ...Completed construction of feature vectors for road segment s_412637,417666.
    [80/101] Start constructing feature vectors for road segment s_410363,400360...
        Total number of vectors: 24
        Feature vecto

        Feature vector at date and time 06/08/2017 12:05 is done.
    ...Completed construction of feature vectors for road segment s_400923,401143.
[7/17] Negative feature vectors at date 06/09/2017:
    [20/101] Start constructing feature vectors for road segment s_402290,402292...
        Total number of vectors: 24
        Feature vector at date and time 06/09/2017 07:35 is done.
    ...Completed construction of feature vectors for road segment s_402290,402292.
    [40/101] Start constructing feature vectors for road segment s_400137,400716...
        Total number of vectors: 24
        Feature vector at date and time 06/09/2017 05:15 is done.
    ...Completed construction of feature vectors for road segment s_400137,400716.
    [60/101] Start constructing feature vectors for road segment s_412637,417666...
        Total number of vectors: 24
        Feature vector at date and time 06/09/2017 08:50 is done.
    ...Completed construction of feature vectors for road segment s_412637,

        Feature vector at date and time 06/21/2017 08:45 is done.
    ...Completed construction of feature vectors for road segment s_400923,401143.
[13/17] Negative feature vectors at date 06/22/2017:
    [20/101] Start constructing feature vectors for road segment s_402290,402292...
        Total number of vectors: 24
        Feature vector at date and time 06/22/2017 21:55 is done.
    ...Completed construction of feature vectors for road segment s_402290,402292.
    [40/101] Start constructing feature vectors for road segment s_400137,400716...
        Total number of vectors: 24
        Feature vector at date and time 06/22/2017 09:40 is done.
    ...Completed construction of feature vectors for road segment s_400137,400716.
    [60/101] Start constructing feature vectors for road segment s_412637,417666...
        Total number of vectors: 24
        Feature vector at date and time 06/22/2017 10:00 is done.
    ...Completed construction of feature vectors for road segment s_412637

In [67]:
len(X_neg_weekday_train), len(X_neg_weekend_train)

(31716, 6788)

In [68]:
y_neg_weekday_train = [-1] * len(X_neg_weekday_train)
y_neg_weekend_train = [-1] * len(X_neg_weekend_train)

### Train: feature vectors - positive

In [None]:
working_time = raw_train['Time'].unique().tolist()[14:]

In [None]:
svm_pos_timestamps_train = svm_pos_timestamps_train.loc[svm_pos_timestamps_train['Time'].isin(working_time)]

In [None]:
X_pos_weekday_train = []
X_pos_weekend_train = []
for i, seg in enumerate(road_segments):
    B, E = seg
    print("{} Start constructing positive feature vectors for road segment s_{},{}... ".format(fraction_msg(i+1, len(road_segments)), B, E))
    progress_count = 0
    
    # construct segment-specific pos_times
    pos_times = []
    df_seg_incidents = svm_pos_timestamps_train.loc[svm_pos_timestamps_train["Upstream"] == B]
    seg_dates = df_seg_incidents['Date'].values.tolist()
    seg_times = df_seg_incidents['Time'].values.tolist()
    num_seg_instances = len(seg_dates)
    for i in range(num_seg_instances):
        pos_times.append(tuple([seg_dates[i], seg_times[i]]))
    
    # select the relevant training data for segment B, E 
    df_train_BE = raw_train.loc[(raw_train["Station ID"] == B) | (raw_train["Station ID"] == E)]
    
    
    print("    Total number of vectors: {}".format(num_seg_instances))
    for pos_dt in pos_times:
        pos_d, pos_t = pos_dt
        feature_t = []
        pos_dt_timestamp = pd.Timestamp(pos_d + ' ' + pos_t + ':00')

        # upstream and downstream time lags
        B_lags = []
        for j in range(5):
            B_lags.append(pos_dt_timestamp - dt.timedelta(minutes=j*5))
        B_lags = list(map(lambda x: (x.strftime('%m/%d/%Y'), x.strftime('%H:%M')) , B_lags))
        E_lags = B_lags[0:3]

        # upstream features
        for dt_lag in B_lags:
            d_lag, t_lag = dt_lag
            df_dt_lag = df_train_BE.loc[(df_train_BE["Station ID"] == B) & (df_train_BE["Date"] == d_lag) & (df_train_BE["Time"] == t_lag)]
            if df_dt_lag.empty:
                print(d_lag, t_lag)
            
            speed_B_t = df_dt_lag["Speed"].values[0]
            flow_B_t = df_dt_lag["Flow"].values[0]
            occ_B_t = df_dt_lag["Occupancy"].values[0]

            speed_pred_B_t = df_dt_lag["Pred_Speed"].values[0]
            flow_pred_B_t = df_dt_lag["Pred_Flow"].values[0]
            occ_pred_B_t = df_dt_lag["Pred_Occupancy"].values[0]
            
            speed_diff_B_t = df_dt_lag["Diff_Speed"].values[0]
            flow_diff_B_t = df_dt_lag["Diff_Flow"].values[0]
            occ_diff_B_t = df_dt_lag["Diff_Occupancy"].values[0]
            
            lambda_max_B_t = df_dt_lag["LambdaMax"].values[0]
            sigma_B_t = df_dt_lag["Sigma"].values[0]
            tau_B_t = df_dt_lag["Tau"].values[0]
            impact_B_t = df_dt_lag["Impact"].values[0]

            feature_t.extend([speed_B_t, flow_B_t, occ_B_t, 
                              speed_pred_B_t, flow_pred_B_t, occ_pred_B_t, 
                              speed_diff_B_t, flow_diff_B_t, occ_diff_B_t, 
                              lambda_max_B_t, sigma_B_t, tau_B_t, impact_B_t])

        # downstream features
        for dt_lag in E_lags:
            d_lag, t_lag = dt_lag
            df_dt_lag = df_train_BE.loc[(df_train_BE["Station ID"] == E) & (df_train_BE["Date"] == d_lag) & (df_train_BE["Time"] == t_lag)]

            speed_E_t = df_dt_lag["Speed"].values[0]
            flow_E_t = df_dt_lag["Flow"].values[0]
            occ_E_t = df_dt_lag["Occupancy"].values[0]

            speed_pred_E_t = df_dt_lag["Pred_Speed"].values[0]
            flow_pred_E_t = df_dt_lag["Pred_Flow"].values[0]
            occ_pred_E_t = df_dt_lag["Pred_Occupancy"].values[0]
            
            speed_diff_E_t = df_dt_lag["Diff_Speed"].values[0]
            flow_diff_E_t = df_dt_lag["Diff_Flow"].values[0]
            occ_diff_E_t = df_dt_lag["Diff_Occupancy"].values[0]
            
            lambda_max_E_t = df_dt_lag["LambdaMax"].values[0]
            sigma_E_t = df_dt_lag["Sigma"].values[0]
            tau_E_t = df_dt_lag["Tau"].values[0]
            impact_E_t = df_dt_lag["Impact"].values[0]
            
            feature_t.extend([speed_E_t, flow_E_t, occ_E_t, 
                              speed_pred_E_t, flow_pred_E_t, occ_pred_E_t, 
                              speed_diff_E_t, flow_diff_E_t, occ_diff_E_t, 
                              lambda_max_E_t, sigma_E_t, tau_E_t, impact_E_t])
        
        if date_to_day(pos_d) >= 5:
            X_pos_weekend_train.append(feature_t)
        else:
            X_pos_weekday_train.append(feature_t)
        
        progress_count += 1
        if progress_count % 100 == 0:
            print("    {} Feature vector at date and time {} {} is done.".format(fraction_msg(progress_count, num_seg_instances), pos_d, pos_t))

print("...Completed construction of feature vectors for road segment s_{},{}.".format(B, E))

[1/101] Start constructing positive feature vectors for road segment s_408907,400951... 
    Total number of vectors: 0
[2/101] Start constructing positive feature vectors for road segment s_400951,400057... 
    Total number of vectors: 43
[3/101] Start constructing positive feature vectors for road segment s_400057,400147... 
    Total number of vectors: 22
[4/101] Start constructing positive feature vectors for road segment s_400147,400343... 
    Total number of vectors: 21
[5/101] Start constructing positive feature vectors for road segment s_400343,401560... 
    Total number of vectors: 12
[6/101] Start constructing positive feature vectors for road segment s_401560,400045... 
    Total number of vectors: 14
[7/101] Start constructing positive feature vectors for road segment s_400045,400122... 
    Total number of vectors: 0
[8/101] Start constructing positive feature vectors for road segment s_400122,401541... 
    Total number of vectors: 5
[9/101] Start constructing positive

[72/101] Start constructing positive feature vectors for road segment s_400252,400788... 
    Total number of vectors: 64
[73/101] Start constructing positive feature vectors for road segment s_400788,401517... 
    Total number of vectors: 56
[74/101] Start constructing positive feature vectors for road segment s_401517,401871... 
    Total number of vectors: 0
[75/101] Start constructing positive feature vectors for road segment s_401871,400574... 
    Total number of vectors: 22
[76/101] Start constructing positive feature vectors for road segment s_400574,401629... 
    Total number of vectors: 24
[77/101] Start constructing positive feature vectors for road segment s_401629,400422... 
    Total number of vectors: 64
[78/101] Start constructing positive feature vectors for road segment s_400422,400333... 
    Total number of vectors: 0
[79/101] Start constructing positive feature vectors for road segment s_400333,410363... 
    Total number of vectors: 7
[80/101] Start constructing

In [None]:
len(X_pos_weekday_train), len(X_pos_weekend_train)

(1644, 253)

In [None]:
y_pos_weekday_train = [1] * len(X_pos_weekday_train)
y_pos_weekend_train = [1] * len(X_pos_weekend_train)

## Train: Merging feature vectors together

### Weekday

In [None]:
X_neg_weekday_train = np.array(X_neg_weekday_train)
X_neg_weekend_train = np.array(X_neg_weekend_train)

In [None]:
X_neg_weekday_train_balanced = X_neg_weekday_train[np.random.choice(len(X_neg_weekday_train), len(X_pos_weekday_train), replace=False)].tolist()

In [None]:
y_neg_weekday_train_balanced = [-1] * len(X_neg_weekday_train_balanced)

In [None]:
X_train_weekday = X_neg_weekday_train_balanced + X_pos_weekday_train
#X_train_weekday_unbalanced = np.concatenate((X_neg_weekday_train, X_pos_weekday_train))

In [None]:
y_train_weekday = y_neg_weekday_train_balanced + y_pos_weekday_train
#y_train_weekday_unbalanced = y_neg_weekday_train + y_pos_weekday_train

In [None]:
#len(X_train_weekday_unbalanced), len(y_train_weekday_unbalanced)

In [None]:
len(X_train_weekday), len(y_train_weekday)

(3288, 3288)

In [None]:
len(X_train_weekday[0])

104

### Weekend

In [None]:
X_neg_weekend_train_balanced = X_neg_weekend_train[np.random.choice(len(X_neg_weekend_train), len(X_pos_weekend_train), replace=False)].tolist()

In [None]:
y_neg_weekend_train_balanced = [-1] * len(X_neg_weekend_train_balanced)

In [None]:
X_train_weekend = X_neg_weekend_train_balanced + X_pos_weekend_train
#X_train_weekend_unbalanced = np.concatenate((X_neg_weekend_train, X_pos_weekend_train))

In [None]:
y_train_weekend = y_neg_weekend_train_balanced + y_pos_weekend_train
#y_train_weekend_unbalanced = y_neg_weekend_train + y_pos_weekend_train

In [None]:
#len(X_train_weekend_unbalanced), len(y_train_weekend_unbalanced)

In [None]:
len(X_train_weekend), len(y_train_weekend)

(506, 506)

## Test: feature vectors

In [None]:
raw_test.tail(3)

Unnamed: 0,Station ID,datetime,Occupancy,Flow,Speed,Date,Time,LambdaMax,Sigma,Tau,Impact,Pred_Speed,Pred_Flow,Pred_Occupancy,Diff_Speed,Diff_Flow,Diff_Occupancy
6532125,401471,2017-06-28 23:45:00,-1.149112,-1.207491,0.885074,06/28/2017,23:45,0.038882,0.0,0.0,0.06484,0.295204,-1.181611,-1.153521,-0.025416,-0.300943,0.187732
6532126,401471,2017-06-28 23:50:00,-1.217922,-1.207491,0.790361,06/28/2017,23:50,0.038882,0.0,0.0,0.064408,0.315483,-1.220174,-1.160351,-0.076551,0.317241,-0.712596
6532127,401471,2017-06-28 23:55:00,-1.217922,-1.283552,0.695648,06/28/2017,23:55,0.038882,0.0,0.0,0.063334,0.300998,-1.225277,-1.214997,-0.091591,-0.813065,0.087695


In [None]:
feature_names = ['Speed', 'Flow', 'Occupancy', 
                 'Pred_Speed', 'Pred_Flow', 'Pred_Occupancy', 
                 'Diff_Speed', 'Diff_Flow', 'Diff_Occupancy']

In [None]:
feature_names.extend(['LambdaMax', 'Sigma', 'Tau', 'Impact'])

In [None]:
num_features = len(feature_names)
k_B = 4
k_E = 2

In [None]:
X_test_weekday, X_test_weekend = [], []
y_test_weekday, y_test_weekend = [], []
for seg_idx, seg in enumerate(road_segments):
    B, E = seg
    print("{} Constructing feature vector for segment s_{},{}...".format(fraction_msg(seg_idx+1, len(road_segments)), B, E))
    df_BE_test = raw_test.loc[((raw_test["Station ID"] == B) | (raw_test["Station ID"] == E))]
    df_incidents_BE_test = svm_pos_timestamps_test.loc[svm_pos_timestamps_test["Upstream"] == B]
    incidents_BE_date = df_incidents_BE_test["Date"].values
    incidents_BE_time = df_incidents_BE_test["Time"].values
    
    incidents_BE_dt = set()
    for i in range(len(incidents_BE_date)):
        incidents_BE_dt.add(incidents_BE_date[i] + ' ' + incidents_BE_time[i])
    
    # change to access by indices, to make program faster
    features_BE_dict = dict()
    features_BE_dict[B] = dict()
    features_BE_dict[E] = dict()
    for feature_name in feature_names:
        features_BE_dict[B][feature_name] = df_BE_test.loc[df_BE_test["Station ID"] == B][feature_name].values.tolist()
        features_BE_dict[E][feature_name] = df_BE_test.loc[df_BE_test["Station ID"] == E][feature_name].values.tolist()
    
    total_count = len(dates_test) * len(neg_times)
    count = 0
    print("    Total number of instances: {}".format(total_count)) 
    
    for i, d in enumerate(dates_test):
        for j, t in enumerate(neg_times):
            # construct vector Z(s_BE, dt)
            feature_BE_t = [0.] * (k_B + k_E + 2) * num_features
            base_idx = i * 288 + 14 + j
            for k, feature_name in enumerate(feature_names):
                # feature_k_B_t: [t-4, t-3, ..., t] -> need to be reversed and made consistent with order of SVM features. Same to E.
                feature_k_B_t = features_BE_dict[B][feature_name][base_idx-k_B:base_idx+1]
                feature_k_E_t = features_BE_dict[E][feature_name][base_idx-k_E:base_idx+1]
                feature_k_B_t.reverse()
                feature_k_E_t.reverse()
                feature_k_BE_t = feature_k_B_t + feature_k_E_t
                feature_BE_t[k:(k_B + k_E + 2)*num_features:num_features] = feature_k_BE_t
            
            if date_to_day(d) >= 5:
                X_test_weekend.append(feature_BE_t)
                # label data
                if d + ' ' + t in incidents_BE_dt:
                    y_test_weekend.append(1)
                else:
                    y_test_weekend.append(-1)
            else:
                X_test_weekday.append(feature_BE_t)
                # label data
                if d + ' ' + t in incidents_BE_dt:
                    y_test_weekday.append(1)
                else:
                    y_test_weekday.append(-1)
        count += 1
        if count % 50 == 0:
            print("    Progress: {}" + fraction_msg(count * len(neg_times), total_count))
    print("...Finished construction for segment s_{},{}.".format(B, E))

[1/101] Constructing feature vector for segment s_408907,400951...
    Total number of instances: 3288
...Finished construction for segment s_408907,400951.
[2/101] Constructing feature vector for segment s_400951,400057...
    Total number of instances: 3288
...Finished construction for segment s_400951,400057.
[3/101] Constructing feature vector for segment s_400057,400147...
    Total number of instances: 3288
...Finished construction for segment s_400057,400147.
[4/101] Constructing feature vector for segment s_400147,400343...
    Total number of instances: 3288
...Finished construction for segment s_400147,400343.
[5/101] Constructing feature vector for segment s_400343,401560...
    Total number of instances: 3288
...Finished construction for segment s_400343,401560.
[6/101] Constructing feature vector for segment s_401560,400045...
    Total number of instances: 3288
...Finished construction for segment s_401560,400045.
[7/101] Constructing feature vector for segment s_400045,4

    Total number of instances: 3288
...Finished construction for segment s_400284,400041.
[56/101] Constructing feature vector for segment s_400041,408133...
    Total number of instances: 3288
...Finished construction for segment s_400041,408133.
[57/101] Constructing feature vector for segment s_408133,408135...
    Total number of instances: 3288
...Finished construction for segment s_408133,408135.
[58/101] Constructing feature vector for segment s_408135,417665...
    Total number of instances: 3288
...Finished construction for segment s_408135,417665.
[59/101] Constructing feature vector for segment s_417665,412637...
    Total number of instances: 3288
...Finished construction for segment s_417665,412637.
[60/101] Constructing feature vector for segment s_412637,417666...
    Total number of instances: 3288
...Finished construction for segment s_412637,417666.
[61/101] Constructing feature vector for segment s_417666,408134...
    Total number of instances: 3288
...Finished cons

In [None]:
len(X_test_weekday), len(X_test_weekend), len(y_test_weekday), len(y_test_weekend)

(193718, 138370, 193718, 138370)

## SVM preprocessing: merge train/test and normalize

In [None]:
from sklearn import preprocessing

### Weekday

In [None]:
#X_weekday_normalized = preprocessing.scale(X_train_weekday + X_test_weekday)
X_weekday_normalized = X_train_weekday + X_test_weekday
#X_train_weekday_normalized_unbalanced = preprocessing.scale(X_train_weekday_unbalanced)
#X_test_weekday_normalized_unbalanced = preprocessing.scale(X_test_weekday)

In [None]:
len(X_train_weekday), len(X_test_weekday)

(3288, 193718)

In [None]:
X_weekday_normalized_df = pd.DataFrame(X_train_weekday + X_test_weekday)

In [None]:
X_train_weekday_normalized = X_weekday_normalized[:len(X_train_weekday)]
X_test_weekday_normalized = X_weekday_normalized[len(X_train_weekday):]
#X_train_weekday_normalized_unbalanced = X_weekday_normalized_unbalanced[:len(X_train_weekday_unbalanced)]
#X_test_weekday_normalized_unbalanced = X_weekday_normalized_unbalanced[len(X_train_weekday_unbalanced):]

In [None]:
X_train_weekday_normalized_df = pd.DataFrame(X_train_weekday_normalized)
X_test_weekday_normalized_df = pd.DataFrame(X_test_weekday_normalized)

### Weekend

In [None]:
#X_weekend_normalized = preprocessing.scale(X_train_weekend + X_test_weekend)
X_weekend_normalized = X_train_weekend + X_test_weekend
#X_train_weekend_normalized_unbalanced = preprocessing.scale(X_train_weekend_unbalanced)
#X_test_weekend_normalized_unbalanced = preprocessing.scale(X_test_weekend)

In [None]:
X_train_weekend_normalized = X_weekend_normalized[:len(X_train_weekend)]
X_test_weekend_normalized = X_weekend_normalized[len(X_train_weekend):]
#X_train_weekend_normalized_unbalanced = X_weekend_normalized_unbalanced[:len(X_train_weekend_unbalanced)]
#X_test_weekend_normalized_unbalanced = X_weekend_normalized_unbalanced[len(X_train_weekend_unbalanced):]

In [None]:
X_train_weekend_normalized_df = pd.DataFrame(X_train_weekend_normalized)
X_test_weekend_normalized_df = pd.DataFrame(X_test_weekend_normalized)

## SVM training

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

In [None]:
param_grid = {
    'C': [10 ** i for i in range(-4, 3)],
    'gamma': [10 ** i for i in range(-4, 4, 2)]
}

### Weekday

In [None]:
svm_grid_search_weekday = GridSearchCV(SVC(kernel='rbf'), n_jobs=8, 
                               param_grid=param_grid, cv=5, 
                               scoring='accuracy', verbose=5)

In [None]:
svm_grid_search_weekday.fit(X_train_weekday_normalized, y_train_weekday)

Fitting 5 folds for each of 28 candidates, totalling 140 fits


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 tasks      | elapsed:   11.2s
[Parallel(n_jobs=8)]: Done  56 tasks      | elapsed:  3.2min
[Parallel(n_jobs=8)]: Done 140 out of 140 | elapsed:  8.7min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=8,
       param_grid={'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100], 'gamma': [0.0001, 0.01, 1, 100]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=5)

In [None]:
svm_grid_search_weekday.best_params_

{'C': 0.1, 'gamma': 0.01}

In [None]:
svm_grid_search_weekday.best_score_

0.6201338199513382

In [None]:
len(svm_grid_search_weekday.best_estimator_.support_vectors_)

2696

In [None]:
svm_grid_search_weekday.best_estimator_

SVC(C=0.1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.01, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [None]:
y_train_weekday_pred = svm_grid_search_weekday.predict(X_train_weekday_normalized)

In [None]:
accuracy_score(y_train_weekday_pred, y_train_weekday)

0.6362530413625304

In [None]:
len(X_test_weekday_normalized)

193718

In [None]:
# parallelize prediction
def predict_func(X, clf):
    print("Process {} is predicting ...".format(mp.current_process().pid))
    return clf.predict(X)

In [None]:
pred_pool = mp.Pool(8)
num_instances = int(np.ceil(len(X_test_weekday_normalized) / 8))
y_test_weekday_pred_jobs = [pred_pool.apply_async(predict_func, args=(X_test_weekday_normalized[i*num_instances:(i+1)*num_instances], svm_grid_search_weekday)) for i in range(0, 8)]
pred_pool.close()
pred_pool.join()

Process 30167 is predicting ...
Process 30168 is predicting ...
Process 30169 is predicting ...
Process 30170 is predicting ...
Process 30171 is predicting ...
Process 30172 is predicting ...
Process 30173 is predicting ...
Process 30174 is predicting ...


In [None]:
y_test_weekday_pred = []
for y_test_weekday_pred_job in y_test_weekday_pred_jobs:
    y_test_weekday_pred.extend(y_test_weekday_pred_job.get())

In [None]:
len(X_test_weekday_normalized), len(y_test_weekday_pred)

(193718, 193718)

In [None]:
accuracy_score(y_test_weekday_pred, y_test_weekday)

0.8336292961934358

In [None]:
num_dt_segments_weekday = int(len(y_test_weekday_pred) / (288-14))

### Weekend

In [None]:
svm_grid_search_weekend = GridSearchCV(SVC(kernel='rbf'), n_jobs=8, 
                               param_grid=param_grid, cv=5, 
                               scoring='accuracy', verbose=2)

In [None]:
svm_grid_search_weekend.fit(X_train_weekend_normalized, y_train_weekend)

Fitting 5 folds for each of 28 candidates, totalling 140 fits


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  25 tasks      | elapsed:   13.0s
[Parallel(n_jobs=8)]: Done 140 out of 140 | elapsed:  1.2min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=8,
       param_grid={'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100], 'gamma': [0.0001, 0.01, 1, 100]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=2)

In [None]:
svm_grid_search_weekday.best_params_, svm_grid_search_weekday.best_score_

({'C': 0.1, 'gamma': 0.01}, 0.6201338199513382)

In [None]:
len(svm_grid_search_weekend.best_estimator_.support_vectors_)

390

In [None]:
svm_grid_search_weekend.best_estimator_

SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.01, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [None]:
y_train_weekend_pred = svm_grid_search_weekend.predict(X_train_weekend_normalized)

In [None]:
accuracy_score(y_train_weekend_pred, y_train_weekend)

0.733201581027668

In [None]:
len(X_test_weekend_normalized)

138370

In [None]:
pred_pool = mp.Pool(8)
num_instances = int(np.ceil(len(X_test_weekend_normalized) / 8))
y_test_weekend_pred_jobs = [pred_pool.apply_async(predict_func, args=(X_test_weekend_normalized[i*num_instances:(i+1)*num_instances], svm_grid_search_weekend)) for i in range(0, 8)]
pred_pool.close()
pred_pool.join()

Process 30205 is predicting ...
Process 30206 is predicting ...
Process 30207 is predicting ...
Process 30208 is predicting ...
Process 30209 is predicting ...
Process 30210 is predicting ...
Process 30211 is predicting ...
Process 30212 is predicting ...


In [None]:
y_test_weekend_pred = []
for y_test_weekend_pred_job in y_test_weekend_pred_jobs:
    y_test_weekend_pred.extend(y_test_weekend_pred_job.get())

In [None]:
len(X_test_weekend_normalized), len(y_test_weekend_pred)

(138370, 138370)

In [None]:
accuracy_score(y_test_weekend_pred, y_test_weekend)

0.7040904820409049

In [None]:
num_dt_segments_weekend = int(len(y_test_weekend_pred) / (288-14))

## Decision Tree Learning

In [None]:
from sklearn.tree import DecisionTreeClassifier

### Weekday

In [None]:
clf_gini_weekday = DecisionTreeClassifier(criterion='gini', random_state=None)

In [None]:
clf_gini_weekday.fit(X_train_weekday_normalized, y_train_weekday)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [None]:
y_train_weekday_pred_gini = clf_gini_weekday.predict(X_train_weekday_normalized)

In [None]:
accuracy_score(y_train_weekday_pred_gini, y_train_weekday)

1.0

In [None]:
y_test_weekday_pred_gini = clf_gini_weekday.predict(X_test_weekday_normalized)

In [None]:
accuracy_score(y_test_weekday_pred_gini, y_test_weekday)

0.6066550346379789

### Weekend

In [None]:
clf_gini_weekend = DecisionTreeClassifier(criterion='gini', random_state=None)

In [None]:
clf_gini_weekend.fit(X_train_weekend_normalized, y_train_weekend)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [None]:
y_train_weekend_pred_gini = clf_gini_weekend.predict(X_train_weekend_normalized)

In [None]:
accuracy_score(y_train_weekend_pred_gini, y_train_weekend)

1.0

In [None]:
y_test_weekend_pred_gini = clf_gini_weekend.predict(X_test_weekend_normalized)

In [None]:
accuracy_score(y_test_weekend_pred_gini, y_test_weekend)

0.6823733468237335

## Knn Classifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix

### Weekday

In [None]:
for i in range(1, 10, 2):
    knnClf = KNeighborsClassifier(n_neighbors=i)
    knnClf.fit(X_train_weekday_normalized,y_train_weekday)
    y_test_weekday_pred_knn = knnClf.predict(X_test_weekday_normalized)
    print(accuracy_score(y_test_weekday_pred_knn, y_test_weekday))
    print(confusion_matrix(y_test_weekday_pred_knn, y_test_weekday))

0.6202159840593027
[[119658    469]
 [ 73102    489]]
0.6423099557088139
[[123959    490]
 [ 68801    468]]
0.6610846694679896
[[127607    501]
 [ 65153    457]]
0.6743875117438751
[[130198    515]
 [ 62562    443]]
0.685253822566824
[[132321    533]
 [ 60439    425]]


In [None]:
knnClf = KNeighborsClassifier(n_neighbors=3, n_jobs=8)
knnClf.fit(X_train_weekday_normalized,y_train_weekday)
y_test_weekday_pred_knn = knnClf.predict(X_test_weekday_normalized)

In [None]:
'''for i in range(1,10):
    knnClf = KNeighborsClassifier(n_neighbors=i,n_jobs=8)
    knnClf.fit(X_train_weekday_normalized_unbalanced,y_train_weekday_unbalanced)
    y_test_weekday_pred_knn = knnClf.predict(X_test_weekday_normalized)
    print(accuracy_score(y_test_weekday_pred_knn, y_test_weekday))
    print(confusion_matrix(y_test_weekday_pred_knn, y_test_weekday))'''

'for i in range(1,10):\n    knnClf = KNeighborsClassifier(n_neighbors=i,n_jobs=8)\n    knnClf.fit(X_train_weekday_normalized_unbalanced,y_train_weekday_unbalanced)\n    y_test_weekday_pred_knn = knnClf.predict(X_test_weekday_normalized)\n    print(accuracy_score(y_test_weekday_pred_knn, y_test_weekday))\n    print(confusion_matrix(y_test_weekday_pred_knn, y_test_weekday))'

### Weekend

In [None]:
for i in range(1, 10, 2):
    knnClf = KNeighborsClassifier(n_neighbors=i,n_jobs=8)
    knnClf.fit(X_train_weekend_normalized,y_train_weekend)
    y_test_weekend_pred_knn = knnClf.predict(X_test_weekend_normalized)
    print(accuracy_score(y_test_weekend_pred_knn, y_test_weekend))
    print(confusion_matrix(y_test_weekend_pred_knn, y_test_weekend))

0.6496061284960613
[[89691   269]
 [48215   195]]
0.6556912625569127
[[90538   274]
 [47368   190]]
0.6558430295584303
[[90560   275]
 [47346   189]]
0.6557346245573462
[[90542   272]
 [47364   192]]
0.6575196935751969
[[90797   280]
 [47109   184]]


In [None]:
knnClf = KNeighborsClassifier(n_neighbors=3,n_jobs=8)
knnClf.fit(X_train_weekend_normalized,y_train_weekend)
y_test_weekend_pred_knn = knnClf.predict(X_test_weekend_normalized)

In [None]:
'''for i in range(1,10):
    knnClf = KNeighborsClassifier(n_neighbors=i,n_jobs=8)
    knnClf.fit(X_train_weekend_normalized_unbalanced,y_train_weekend_unbalanced)
    y_test_weekend_pred_knn = knnClf.predict(X_test_weekend_normalized)
    print(accuracy_score(y_test_weekend_pred_knn, y_test_weekend))
    print(confusion_matrix(y_test_weekend_pred_knn, y_test_weekend))'''

'for i in range(1,10):\n    knnClf = KNeighborsClassifier(n_neighbors=i,n_jobs=8)\n    knnClf.fit(X_train_weekend_normalized_unbalanced,y_train_weekend_unbalanced)\n    y_test_weekend_pred_knn = knnClf.predict(X_test_weekend_normalized)\n    print(accuracy_score(y_test_weekend_pred_knn, y_test_weekend))\n    print(confusion_matrix(y_test_weekend_pred_knn, y_test_weekend))'

## Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

### Weekday

In [None]:
((sum(y_test_weekday)+len(y_test_weekday))/2)/((sum(y_test_weekday)-len(y_test_weekday))/2)

-0.004969910769869267

In [None]:
for n in range(1, 100, 5):
    for d in [2 ** i for i in range(1, 3)]:
        print("{}, {}:".format(n, d))
        clf_rf_weekday = RandomForestClassifier(n_estimators=n, max_depth=d,random_state=0)
        clf_rf_weekday.fit(X_train_weekday_normalized, y_train_weekday)
        y_test_weekday_pred_rf = clf_rf_weekday.predict(X_test_weekday_normalized)
        print("{}".format(accuracy_score(y_test_weekday_pred_rf,y_test_weekday)))
        print("{}".format(confusion_matrix(y_test_weekday_pred_rf,y_test_weekday)))

1, 2:
0.8816010902445823
[[170493    669]
 [ 22267    289]]
1, 4:
0.8604517907473751
[[166361    634]
 [ 26399    324]]
6, 2:
0.8016704694452761
[[154916    576]
 [ 37844    382]]
6, 4:
0.7810270599531277
[[150878    537]
 [ 41882    421]]
11, 2:
0.8193817817652463
[[158364    593]
 [ 34396    365]]
11, 4:
0.7921514779215147
[[153056    560]
 [ 39704    398]]
16, 2:
0.8224274460814173
[[158960    599]
 [ 33800    359]]
16, 4:
0.802439628738682
[[155056    567]
 [ 37704    391]]
21, 2:
0.8241825746703971
[[159304    603]
 [ 33456    355]]
21, 4:
0.8090574959477178
[[156348    577]
 [ 36412    381]]
26, 2:
0.8279767497083389
[[160043    607]
 [ 32717    351]]
26, 4:
0.8128465088427508
[[157089    584]
 [ 35671    374]]
31, 2:
0.8329117583291176
[[161002    610]
 [ 31758    348]]
31, 4:
0.8166768188810539
[[157833    586]
 [ 34927    372]]
36, 2:
0.8379345233793453
[[161978    613]
 [ 30782    345]]
36, 4:
0.8198205639124914
[[158437    581]
 [ 34323    377]]
41, 2:
0.8414396184143962
[[1

In [None]:
clf_rf_weekday = RandomForestClassifier(n_estimators=96)
clf_rf_weekday.fit(X_train_weekday_normalized, y_train_weekday)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=96, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [None]:
y_test_weekday_pred_rf = clf_rf_weekday.predict(X_test_weekday_normalized)

In [None]:
accuracy_score(y_test_weekday, y_test_weekday_pred_rf)

0.7846715328467153

### Weekend

In [None]:
for n in range(10, 100, 5):
    for d in [2 ** i for i in range(1, 3)]:
        print("{}, {}:".format(n, d))
        clf_rf_weekend = RandomForestClassifier(n_estimators=n, max_depth=d,random_state=0)
        clf_rf_weekend.fit(X_train_weekend_normalized, y_train_weekend)
        y_test_weekend_pred_rf = clf_rf_weekend.predict(X_test_weekend_normalized)
        print(accuracy_score(y_test_weekend_pred_rf,y_test_weekend))
        print(confusion_matrix(y_test_weekend_pred_rf,y_test_weekend))

10, 2:
0.7817879598178796
[[108079    367]
 [ 29827     97]]
10, 4:
0.7586615595866156
[[104864    352]
 [ 33042    112]]
15, 2:
0.751232203512322
[[103837    353]
 [ 34069    111]]
15, 4:
0.7705066127050662
[[106501    350]
 [ 31405    114]]
20, 2:
0.7721543687215436
[[106726    347]
 [ 31180    117]]
20, 4:
0.7767796487677965
[[107368    349]
 [ 30538    115]]
25, 2:
0.7848883428488834
[[108506    365]
 [ 29400     99]]
25, 4:
0.7968201199682012
[[110159    367]
 [ 27747     97]]
30, 2:
0.8079352460793524
[[111710    380]
 [ 26196     84]]
30, 4:
0.8021825540218256
[[110904    370]
 [ 27002     94]]
35, 2:
0.826096697260967
[[114231    388]
 [ 23675     76]]
35, 4:
0.8146635831466358
[[112639    378]
 [ 25267     86]]
40, 2:
0.8236901062369011
[[113899    389]
 [ 24007     75]]
40, 4:
0.8182698561826985
[[113142    382]
 [ 24764     82]]
45, 2:
0.8305557563055576
[[114855    395]
 [ 23051     69]]
45, 4:
0.8249909662499096
[[114070    380]
 [ 23836     84]]
50, 2:
0.8288718652887187


In [None]:
clf_rf_weekend = RandomForestClassifier(n_estimators=20)
clf_rf_weekend.fit(X_test_weekend_normalized, y_test_weekend)

In [None]:
y_test_weekend_pred_rf = clf_rf_weekend.predict(X_test_weekend_normalized)

In [None]:
accuracy_score(y_test_weekend, y_test_weekend_pred_rf)

## MLP Classifier

In [None]:
from sklearn.neural_network import MLPClassifier

### Weekday

In [None]:
activations = ['identity', 'logistic', 'tanh', 'relu']
for a in activations:
    clf_mlp_weekday = MLPClassifier(learning_rate_init=0.001, max_iter=300, activation=a)
    clf_mlp_weekday.fit(X_train_weekday_normalized, y_train_weekday)
    y_test_weekday_pred_mlp = clf_mlp_weekday.predict(X_test_weekday_normalized)
    print("Activation method:" + a)
    print(accuracy_score(y_test_weekday_pred_mlp, y_test_weekday))
    print(confusion_matrix(y_test_weekday_pred_mlp, y_test_weekday))

In [None]:
clf_mlp_weekday = MLPClassifier(learning_rate_init=0.001, max_iter=500, activation='tanh')
clf_mlp_weekday.fit(X_train_weekday_normalized, y_train_weekday)

In [None]:
y_test_weekday_pred_mlp = clf_mlp_weekday.predict(X_test_weekday_normalized)

In [None]:
accuracy_score(y_test_weekday, y_test_weekday_pred_mlp)

### Weekend

In [None]:
activations = ['identity', 'logistic', 'tanh', 'relu']
for a in activations:
    clf_mlp_weekend = MLPClassifier(learning_rate_init=0.001, max_iter=300, activation=a)
    clf_mlp_weekend.fit(X_train_weekend_normalized, y_train_weekend)
    y_test_weekend_pred_mlp = clf_mlp_weekend.predict(X_test_weekend_normalized)
    print("Activation method:"+a)
    print(accuracy_score(y_test_weekend_pred_mlp,y_test_weekend))
    print(confusion_matrix(y_test_weekend_pred_mlp,y_test_weekend))

In [None]:
clf_mlp_weekend = MLPClassifier(learning_rate_init=0.001, max_iter=1000, activation='tanh')
clf_mlp_weekend.fit(X_train_weekend_normalized, y_train_weekend)
y_test_weekend_pred_mlp = clf_mlp_weekend.predict(X_test_weekend_normalized)
print("Activation method:"+'tanh')
print(accuracy_score(y_test_weekend_pred_mlp,y_test_weekend))
print(confusion_matrix(y_test_weekend_pred_mlp,y_test_weekend))

## Adaboost 

### Weekday

In [None]:
from sklearn.ensemble import AdaBoostClassifier
ns = range(1,100,5)
for i in ns:
    clf_ab_weekday = AdaBoostClassifier(n_estimators=i)
    clf_ab_weekday.fit(X_train_weekday_normalized, y_train_weekday)
    y_test_weekday_pred_ab = clf_ab_weekday.predict(X_test_weekday_normalized)
    print("N estimators: "+ str(i))
    print(accuracy_score(y_test_weekday_pred_ab,y_test_weekday))
    print(confusion_matrix(y_test_weekday_pred_ab,y_test_weekday))

In [None]:
clf_ab_weekday = AdaBoostClassifier(n_estimators=90)
clf_ab_weekday.fit(X_train_weekday_normalized, y_train_weekday)
y_test_weekday_pred_ab = clf_ab_weekday.predict(X_test_weekday_normalized)
print("N estimators: "+ str(90))
print(accuracy_score(y_test_weekday_pred_ab,y_test_weekday))
print(confusion_matrix(y_test_weekday_pred_ab,y_test_weekday))

### Weekend

In [None]:
ns = range(1,100,5)
for i in ns:
    clf_ab_weekend = AdaBoostClassifier(n_estimators=i)
    clf_ab_weekend.fit(X_train_weekend_normalized, y_train_weekend)
    y_test_weekend_pred_ab = clf_ab_weekend.predict(X_test_weekend_normalized)
    print("N estimators: "+ str(i))
    print(accuracy_score(y_test_weekend_pred_ab,y_test_weekend))
    print(confusion_matrix(y_test_weekend_pred_ab,y_test_weekend))

In [None]:
clf_ab_weekend = AdaBoostClassifier(n_estimators=90)
clf_ab_weekend.fit(X_train_weekend_normalized, y_train_weekend)
y_test_weekend_pred_ab = clf_ab_weekend.predict(X_test_weekend_normalized)
print("N estimators: "+ str(90))
print(accuracy_score(y_test_weekend_pred_ab,y_test_weekend))
print(confusion_matrix(y_test_weekend_pred_ab,y_test_weekend))

## Gradient Boosting Decision Tree

### Weekday

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
ns = range(1,100,5)
for i in ns:
    clf_gb_weekday = GradientBoostingClassifier(n_estimators=i)
    clf_gb_weekday.fit(X_train_weekday_normalized, y_train_weekday)
    y_test_weekday_pred_gb = clf_gb_weekday.predict(X_test_weekday_normalized)
    print("N estimators: "+ str(i))
    print(accuracy_score(y_test_weekday_pred_gb,y_test_weekday))
    print(confusion_matrix(y_test_weekday_pred_gb,y_test_weekday))

In [None]:
clf_gb_weekday = GradientBoostingClassifier(n_estimators=90)
clf_gb_weekday.fit(X_train_weekday_normalized, y_train_weekday)
y_test_weekday_pred_gb = clf_gb_weekday.predict(X_test_weekday_normalized)
print("N estimators: "+ str(90))
print(accuracy_score(y_test_weekday_pred_gb,y_test_weekday))
print(confusion_matrix(y_test_weekday_pred_gb,y_test_weekday))

### Weekend

In [None]:
ns = range(1,100,5)
for i in ns:
    clf_gb_weekend = GradientBoostingClassifier(n_estimators=i)
    clf_gb_weekend.fit(X_train_weekend_normalized, y_train_weekend)
    y_test_weekend_pred_gb = clf_gb_weekend.predict(X_test_weekend_normalized)
    print("N estimators: "+ str(i))
    print(accuracy_score(y_test_weekend_pred_gb,y_test_weekend))
    print(confusion_matrix(y_test_weekend_pred_gb,y_test_weekend))

In [None]:
clf_gb_weekend = GradientBoostingClassifier(n_estimators=50)
clf_gb_weekend.fit(X_train_weekend_normalized, y_train_weekend)
y_test_weekend_pred_gb = clf_gb_weekend.predict(X_test_weekend_normalized)
print("N estimators: "+ str(50))
print(accuracy_score(y_test_weekend_pred_gb,y_test_weekend))
print(confusion_matrix(y_test_weekend_pred_gb,y_test_weekend))

# Evaluation Metrics

Rush hours: 6:30 AM - 9:00 AM (idx 78 - idx 108), 3:30 PM - 6:30 PM (idx 186 - idx 222).

In [None]:
rush_hours_idx = list(range(78, 109)) + list(range(186, 223))

In [None]:
PT_thresholds = [2 ** i for i in range(5)]

In [None]:
results_str = ""

## Detection rate (DR)

In [None]:
def DR(y_test, y_test_pred, PT_thresholds, num_dt_segments):
    DRs = []
    for PT_threshold in PT_thresholds:
        num_detected_incidents = 0
        total_num_incidents = 0
        for i in range(num_dt_segments):
            base_idx = i * 274
            max_base_offset = (i + 1) * 274
            start_idx = base_idx
            end_idx = start_idx
            while start_idx < max_base_offset:
                while start_idx < max_base_offset and y_test[start_idx] == -1:
                    start_idx += 1
                if start_idx == max_base_offset:
                    break
                # an incident happens
                # time span of the incident
                end_idx = start_idx
                while end_idx < max_base_offset and y_test[end_idx] == 1:
                    end_idx += 1
                if (end_idx - start_idx >= PT_threshold):
                    total_num_incidents += 1
                    # the incident is detected
                    if 1 in y_test_pred[start_idx:end_idx]:
                        num_detected_incidents += 1

                start_idx = end_idx + 1
        DRs.append(round(num_detected_incidents / total_num_incidents * 100, 2))
        print("PT_{}".format(PT_threshold))
        print("# of detected incidents: {}".format(num_detected_incidents))
        print("Total # of incidents: {}".format(total_num_incidents))
        print("Detection rate: {}".format(DRs[-1]))
    return DRs

### SVM

In [None]:
y_test = y_test_weekday + y_test_weekend

In [None]:
# glue predictions together
y_test_pred_svm = y_test_weekday_pred + y_test_weekend_pred

In [None]:
confusion_matrix(y_test_pred_svm, y_test)

In [None]:
DRs = DR(y_test, y_test_pred_svm, PT_thresholds[0:3], num_dt_segments_weekday + num_dt_segments_weekend)
results_str += "SVM DR: {}\n".format(DRs[0])

### Decision Tree with Gini Index

In [None]:
# glue predictions together
y_test_pred_gini = list(y_test_weekday_pred_gini) + list(y_test_weekend_pred_gini)

In [None]:
DRs_gini = DR(y_test, y_test_pred_gini, PT_thresholds[0:3], num_dt_segments_weekday + num_dt_segments_weekend)
results_str += "DT DR: {}\n".format(DRs_gini[0])

### Knn Classifier

In [None]:
y_test_pred_knn = list(y_test_weekday_pred_knn) + list(y_test_weekend_pred_knn)
DRs_knn = DR(y_test, y_test_pred_knn, PT_thresholds[0:3], num_dt_segments_weekday + num_dt_segments_weekend)
results_str += "KNN DR: {}\n".format(DRs_knn[0])

### Random Forest

In [None]:
y_test_pred_rf = list(y_test_weekday_pred_rf) + list(y_test_weekend_pred_rf)
DRs_rf = DR(y_test, y_test_pred_rf, PT_thresholds[0:3], num_dt_segments_weekday + num_dt_segments_weekend)
results_str += "RF DR: {}\n".format(DRs_rf[0])

### MLP

In [None]:
y_test_pred_mlp = list(y_test_weekday_pred_mlp) + list(y_test_weekend_pred_mlp)
DRs_mlp = DR(y_test, y_test_pred_mlp, PT_thresholds[0:3], num_dt_segments_weekday + num_dt_segments_weekend)
results_str += "MLP DR: {}\n".format(DRs_mlp[0])

### AdaBoost

In [None]:
y_test_pred_ab = list(y_test_weekday_pred_ab) + list(y_test_weekend_pred_ab)
DRs_ab = DR(y_test, y_test_pred_ab, PT_thresholds[0:3], num_dt_segments_weekday + num_dt_segments_weekend)
results_str += "AB DR: {}\n".format(DRs_ab[0])

### Gradient Boost

In [None]:
y_test_pred_gb = list(y_test_weekday_pred_gb) + list(y_test_weekend_pred_gb)
DRs_gb = DR(y_test, y_test_pred_gb, PT_thresholds[0:3], num_dt_segments_weekday + num_dt_segments_weekend)
results_str += "GB DR: {}\n".format(DRs_gb[0])

## Mean time to detect (MTTD)

In [None]:
def MTTD(y_test, y_test_pred, PT_thresholds, num_dt_segments):
    MTTDs = []
    for PT_threshold in PT_thresholds:
        h = 0
        sum_ttd = 0
        for i in range(num_dt_segments):
            base_idx = i * 274
            max_base_offset = (i + 1) * 274
            start_idx = base_idx
            end_idx = start_idx
            while start_idx < max_base_offset:
                while start_idx < max_base_offset and y_test[start_idx] == -1:
                    start_idx += 1
                if start_idx == max_base_offset:
                    break
                # an incident happens
                # time span of the incident
                end_idx = start_idx
                while end_idx < max_base_offset and y_test[end_idx] == 1:
                    end_idx += 1

                if end_idx - start_idx >= PT_threshold:
                    # the incident is detected
                    if 1 in y_test_pred[start_idx:end_idx]:
                        h += 1
                        incident_idx = start_idx
                        detection_idx = incident_idx
                        while y_test_pred[detection_idx] == -1:
                            detection_idx += 1
                        sum_ttd += (detection_idx - incident_idx) * 5

                start_idx = end_idx + 1
        MTTDs.append(round(sum_ttd / h, 2))

        print("PT_{}".format(PT_threshold))
        print("Total number of detected incidents: {}".format(h))
        print("Total time of detection lags (min): {}".format(sum_ttd))
        print("Mean time to detect (MTTD): {}".format(MTTDs[-1]))
    return MTTDs

### SVM

In [None]:
MTTDs = MTTD(y_test, y_test_pred_svm, PT_thresholds[0:3], num_dt_segments_weekday + num_dt_segments_weekend)
results_str += "SVM MTTD: {}\n".format(MTTDs[0])

### Decision Tree with Gini Index

In [None]:
MTTDs_gini = MTTD(y_test, y_test_pred_gini, PT_thresholds[0:3], num_dt_segments_weekday + num_dt_segments_weekend)
results_str += "DT MTTD: {}\n".format(MTTDs_gini[0])

### Knn Classifier

In [None]:
MTTDs_knn = MTTD(y_test, y_test_pred_knn, PT_thresholds[0:3], num_dt_segments_weekday + num_dt_segments_weekend)
results_str += "KNN MTTD: {}\n".format(MTTDs_knn[0])

### Random Forest

In [None]:
MTTDs_rf = MTTD(y_test, y_test_pred_rf, PT_thresholds[0:3], num_dt_segments_weekday + num_dt_segments_weekend)
results_str += "RF MTTD: {}\n".format(MTTDs_rf[0])

### MLP

In [None]:
MTTDs_mlp = MTTD(y_test, y_test_pred_mlp, PT_thresholds[0:3], num_dt_segments_weekday + num_dt_segments_weekend)
results_str += "MLP MTTD: {}\n".format(MTTDs_mlp[0])

### AdaBoost

In [None]:
MTTDs_ab = MTTD(y_test, y_test_pred_ab, PT_thresholds[0:3], num_dt_segments_weekday + num_dt_segments_weekend)
results_str += "AB MTTD: {}\n".format(MTTDs_ab[0])

### Gradient Boost

In [None]:
MTTDs_gb = MTTD(y_test, y_test_pred_gb, PT_thresholds[0:3], num_dt_segments_weekday + num_dt_segments_weekend)
results_str += "GB MTTD: {}\n".format(MTTDs_gb[0])

## False positive rate

$$ \frac{FP}{N} = \frac{FP}{FP + TN}, $$ where $FP$ = False positive, and $TN$ = True negative.

In [None]:
def FP(y_test, y_test_pred):
    num_false_positives = 0
    num_true_negatives = 0
    total_num_detections = len(y_test_pred)
    for i in range(total_num_detections):
        if y_test_pred[i] == 1 and y_test[i] == -1:
            num_false_positives += 1
        elif y_test_pred[i] == -1 and y_test[i] == -1:
            num_true_negatives += 1
    print("The false positive rate is {}.".format(round(num_false_positives / (num_false_positives + num_true_negatives), 4)))

In [None]:
FP(y_test, y_test_pred_svm)

In [None]:
FP(y_test, y_test_pred_gini)

## False alarm rate

In [None]:
def false_alarm_rate(y_test, y_test_pred):
    num_false_alarms = 0
    total_num_detections = len(y_test_pred)
    for i in range(total_num_detections):
        if y_test_pred[i] != y_test[i]:
            num_false_alarms += 1
    FAR = round(num_false_alarms / total_num_detections * 100, 4)
    print("The false alarm rate is {}.".format(FAR))
    return FAR

### SVM

In [None]:
FAR = false_alarm_rate(y_test, y_test_pred_svm)
results_str += "SVM FAR: {}\n".format(FAR)

### Decision Tree Gini

In [None]:
FAR_gini = false_alarm_rate(y_test, y_test_pred_gini)
results_str += "DT FAR: {}\n".format(FAR_gini)

### Knn

In [None]:
FAR_knn = false_alarm_rate(y_test, y_test_pred_knn)
results_str += "KNN FAR: {}\n".format(FAR_knn)

### Random Forest

In [None]:
FAR_rf = false_alarm_rate(y_test, y_test_pred_rf)
results_str += "RF FAR: {}\n".format(FAR_rf)

### MLP 

In [None]:
FAR_mlp = false_alarm_rate(y_test, y_test_pred_mlp)
results_str += "MLP FAR: {}\n".format(FAR_mlp)

### AdaBoost

In [None]:
FAR_ab = false_alarm_rate(y_test, y_test_pred_ab)
results_str += "AB FAR: {}\n".format(FAR_ab)

### Gradient Boost

In [None]:
FAR_gb = false_alarm_rate(y_test, y_test_pred_gb)
results_str += "GB FAR: {}\n".format(FAR_gb)

In [None]:
import os

In [None]:
def save_result(output_path, eval_result):
    flag = "w"
    if os.path.isfile(output_path):
        flag = "a"
    
    with open(output_path, flag) as f:
        f.write("{} \n".format(eval_result))
        f.close()

In [None]:
save_result('./res.txt', results_str)