In [19]:
%config IPCompleter.greedy=True
%config Completer.use_jedi = False

In [20]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import datetime as dt

import pickle

from patsy import dmatrices

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.tree import export_graphviz

# Read csv file

In [21]:
df = pd.read_csv('STOPS_MODELING_1.csv')

In [22]:
df.shape[0]

7158337

In [23]:
df.dtypes

LINEID               object
DIRECTION             int64
MONTH                 int64
WEEKDAY               int64
HOUR                  int64
startStop             int64
endStop             float64
ARR_ACT               int64
DEP_ACT               int64
JOURNEYTIME         float64
temp                float64
humidity              int64
wind_speed          float64
precipitation_1h    float64
dtype: object

### convert month, weekday, startStop, endStop to type category

In [24]:
df[['MONTH', 'WEEKDAY', 'startStop', 'endStop', 'LINEID']] =\
df[['MONTH', 'WEEKDAY', 'startStop', 'endStop', 'LINEID']].astype('category')

## Make copy of dataframe

In [25]:
df_rev = df

In [26]:
combinedDF = df_rev.groupby(['startStop', 'endStop']).size().reset_index().rename(columns={0:'count'})

In [32]:
combinedDF = combinedDF[combinedDF['count']>100000].reset_index(drop=True)

In [33]:
combinedDF

Unnamed: 0,startStop,endStop,count
0,14,15.0,115647
1,15,17.0,111857
2,17,18.0,112049
3,18,19.0,112075
4,19,21.0,111809
5,44,7603.0,116238
6,45,46.0,115936
7,46,47.0,116864
8,47,48.0,116872
9,48,49.0,115824


## Split the dataframe up by direction

In [34]:
df_rev_1 = df_rev[df_rev['DIRECTION']==1]

In [35]:
df_rev_2 = df_rev[df_rev['DIRECTION']==2]

In [58]:
df_rev_2[(df_rev_2['startStop']==14)&(df_rev_2['JOURNEYTIME']>df_rev_2['JOURNEYTIME'].mean())]

Unnamed: 0,LINEID,DIRECTION,MONTH,WEEKDAY,HOUR,startStop,endStop,ARR_ACT,DEP_ACT,JOURNEYTIME,temp,humidity,wind_speed,precipitation_1h
717,40,2,1,0,10,14,15.0,35946,35961,80.0,5.02,81,9.77,0.0
865,16,2,1,0,12,14,15.0,45369,45369,70.0,7.01,76,12.35,0.0
1006,40,2,1,0,12,14,15.0,44147,44156,64.0,7.01,76,12.35,0.0
1432,16,2,1,0,12,14,15.0,43978,43987,67.0,7.01,76,12.35,0.0
1672,11,2,1,0,14,14,15.0,50601,50624,92.0,6.65,81,12.35,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7157233,13,2,12,0,15,14,15.0,58756,58777,101.0,9.55,76,4.60,0.0
7157246,40,2,12,0,15,14,15.0,55051,55051,84.0,9.55,76,4.60,0.0
7157273,16,2,12,0,15,14,15.0,53930,53940,96.0,9.55,76,4.60,0.0
7158036,11,2,12,0,7,14,15.0,29028,29028,64.0,8.95,81,3.10,0.0


### generate a list of stops for each direction

In [36]:
stops_1 = sorted(list(df_rev_1['startStop'].unique()))

In [37]:
stops_2 = sorted(list(df_rev_2['startStop'].unique()))

# Direction 1 first

### For each stop, make a dataframe and split it into test/train split

In [38]:
# Make dictionaries with keys for each stop with each train/test split value as its values.

X_trainDict_1 = {}
y_trainDict_1 = {}
X_testDict_1 ={}
y_testDict_1 = {}

remove = []

for stop in stops_1:
    
    workingDF = df_rev_1[df_rev_1['startStop']==stop]
    
    if workingDF.shape[0] < 3:
        print(f'{stop} too small')
        remove.append(stop)
        continue
    
    # drop unnecessary columns
    workingDF.drop(columns=['startStop', 'endStop', 'DIRECTION', 'DEP_ACT', 'ARR_ACT', 'LINEID'], inplace=True)
    
    # y is the target
    y = workingDF["JOURNEYTIME"]
    # X is everything else
    X = workingDF.drop(["JOURNEYTIME"],1)
    # Split the dataset into two datasets: 70% training and 30% test
    
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1,  test_size=0.3)
    
    X_trainDict_1[stop] = X_train
    X_testDict_1[stop] = X_test
    y_trainDict_1[stop] = y_train
    y_testDict_1[stop] = y_test
    
    # need to reset the index to allow contatenation with predicted values otherwise not joining on same index...
    X_train.reset_index(drop=True, inplace=True)
    y_train.reset_index(drop=True, inplace=True)
    X_test.reset_index(drop=True, inplace=True)
    y_test.reset_index(drop=True, inplace=True)
    
for i in remove:
    stops_1.remove(i)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [39]:
len(list(X_trainDict_1.keys()))

46

### For each stop, make a linReg model

In [59]:
modelDict_1 = {}

for stop in stops_1:
    print(f'Now modelling stop number {stop}.')
    
    X_train = X_trainDict_1[stop]
#     X_test = X_testDict_1[stop]
    y_train = y_trainDict_1[stop]
#     y_test = y_testDict_1[stop]
    
    rfr = RandomForestRegressor(n_estimators=50, max_features='auto', oob_score=True, random_state=1)
    rfr.fit(X_train, y_train)
    
    modelDict_1[stop] = rfr
    
    endStop = int(df_rev_1[df_rev_1['startStop']==stop]['endStop'].mode())
    
    filename = f'randomForestsModels/dir1/FROM_{stop}_TO_{endStop}.sav'
    pickle.dump(rfr, open(filename, 'wb'))

Now modelling stop number 14.
Now modelling stop number 15.
Now modelling stop number 17.
Now modelling stop number 18.
Now modelling stop number 19.
Now modelling stop number 44.
Now modelling stop number 45.
Now modelling stop number 46.
Now modelling stop number 47.
Now modelling stop number 48.
Now modelling stop number 49.
Now modelling stop number 51.
Now modelling stop number 495.
Now modelling stop number 496.
Now modelling stop number 515.
Now modelling stop number 516.
Now modelling stop number 519.
Now modelling stop number 521.
Now modelling stop number 522.
Now modelling stop number 614.
Now modelling stop number 615.
Now modelling stop number 616.
Now modelling stop number 617.
Now modelling stop number 618.
Now modelling stop number 619.
Now modelling stop number 664.
Now modelling stop number 665.
Now modelling stop number 666.
Now modelling stop number 667.
Now modelling stop number 668.
Now modelling stop number 672.
Now modelling stop number 1016.
Now modelling stop 

### Evaluate model on the training data

In [60]:
for stop in stops_1:  
    X_train = X_trainDict_1[stop]
    y_train = list(y_trainDict_1[stop])
    rfr = modelDict_1[stop]
    
    print(f'Now modelling for stop number {stop}.')
    
    rfr_predictions_train = list(rfr.predict(X_train))
    with open(f'randomForestsMetrics/randomForest_trainMetrics_dir1.csv', 'a') as fh:
        fh.write('\n\n=============================================================================='+\
                f'\nMetrics for stop model number {stop}:'
                f'\nMAE: {metrics.mean_absolute_error(y_train, rfr_predictions_train)}' +\
                f'\nMAPE: {metrics.mean_absolute_percentage_error(y_train, rfr_predictions_train)}'+\
                f'\nMSE: {metrics.mean_squared_error(y_train, rfr_predictions_train)}'+\
                f'\nRMSE: {metrics.mean_squared_error(y_train, rfr_predictions_train)**(0.5)}'+\
                f'\nR2: {metrics.r2_score(y_train, rfr_predictions_train)}')

Now modelling for stop number 14.
Now modelling for stop number 15.
Now modelling for stop number 17.
Now modelling for stop number 18.
Now modelling for stop number 19.
Now modelling for stop number 44.
Now modelling for stop number 45.
Now modelling for stop number 46.
Now modelling for stop number 47.
Now modelling for stop number 48.
Now modelling for stop number 49.
Now modelling for stop number 51.
Now modelling for stop number 495.
Now modelling for stop number 496.
Now modelling for stop number 515.
Now modelling for stop number 516.
Now modelling for stop number 519.
Now modelling for stop number 521.
Now modelling for stop number 522.
Now modelling for stop number 614.
Now modelling for stop number 615.
Now modelling for stop number 616.
Now modelling for stop number 617.
Now modelling for stop number 618.
Now modelling for stop number 619.
Now modelling for stop number 664.
Now modelling for stop number 665.
Now modelling for stop number 666.
Now modelling for stop number 66

### Evaluate model on the test data

In [61]:
for stop in stops_1:  
    X_test = X_testDict_1[stop]
    y_test = list(y_testDict_1[stop])
    rfr = modelDict_1[stop]
    
    print(f'Now modelling for stop number {stop}.')
    
    rfr_predictions_test = list(rfr.predict(X_test))
    
    with open(f'randomForestsMetrics/randomForest_testMetrics_dir1.csv', 'a') as fh:
        fh.write('\n\n=============================================================================='+\
                f'\nMetrics for stop model number {stop}:'
                f'\nMAE: {metrics.mean_absolute_error(y_test, rfr_predictions_test)}' +\
                f'\nMAPE: {metrics.mean_absolute_percentage_error(y_test, rfr_predictions_test)}'+\
                f'\nMSE: {metrics.mean_squared_error(y_test, rfr_predictions_test)}'+\
                f'\nRMSE: {metrics.mean_squared_error(y_test, rfr_predictions_test)**(0.5)}'+\
                f'\nR2: {metrics.r2_score(y_test, rfr_predictions_test)}')

Now modelling for stop number 14.
Now modelling for stop number 15.
Now modelling for stop number 17.
Now modelling for stop number 18.
Now modelling for stop number 19.
Now modelling for stop number 44.
Now modelling for stop number 45.
Now modelling for stop number 46.
Now modelling for stop number 47.
Now modelling for stop number 48.
Now modelling for stop number 49.
Now modelling for stop number 51.
Now modelling for stop number 495.
Now modelling for stop number 496.
Now modelling for stop number 515.
Now modelling for stop number 516.
Now modelling for stop number 519.
Now modelling for stop number 521.
Now modelling for stop number 522.
Now modelling for stop number 614.
Now modelling for stop number 615.
Now modelling for stop number 616.
Now modelling for stop number 617.
Now modelling for stop number 618.
Now modelling for stop number 619.
Now modelling for stop number 664.
Now modelling for stop number 665.
Now modelling for stop number 666.
Now modelling for stop number 66

# Direction 2 next

### For each stop, make a dataframe and split it into test/train split

In [62]:
# Make dictionaries with keys for each stop with each train/test split value as its values.

X_trainDict_2 = {}
y_trainDict_2 = {}
X_testDict_2 ={}
y_testDict_2 = {}

remove = []

for stop in stops_2:
    workingDF = df_rev_2[df_rev_2['startStop']==stop]
    
    # drop unnecessary columns
    workingDF.drop(columns=['startStop', 'endStop', 'DIRECTION', 'DEP_ACT', 'ARR_ACT', 'LINEID'], inplace=True)
    
    if workingDF.shape[0] < 3:
        print(f'{stop} too small')
        remove.append(stop)
        continue
    
    else:

        # y is the target
        y = workingDF["JOURNEYTIME"]
        # X is everything else
        X = workingDF.drop(["JOURNEYTIME"],1)
        # Split the dataset into two datasets: 70% training and 30% test
        X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1,  test_size=0.3)

        X_trainDict_2[stop] = X_train
        X_testDict_2[stop] = X_test
        y_trainDict_2[stop] = y_train
        y_testDict_2[stop] = y_test

        # need to reset the index to allow contatenation with predicted values otherwise not joining on same index...
        X_train.reset_index(drop=True, inplace=True)
        y_train.reset_index(drop=True, inplace=True)
        X_test.reset_index(drop=True, inplace=True)
        y_test.reset_index(drop=True, inplace=True)
        
for i in remove:
    stops_2.remove(i)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


### For each stop, make a linReg model

In [63]:
modelDict_2 = {}

for stop in stops_2:
    print(f'Now modelling stop number {stop}.')
    
    X_train = X_trainDict_2[stop]
#     X_test = X_testDict_2[stop]
    y_train = y_trainDict_2[stop]
#     y_test = y_testDict_2[stop]
    
    rfr = RandomForestRegressor(n_estimators=50, max_features='auto', oob_score=True, random_state=1)
    rfr.fit(X_train, y_train)
    
    modelDict_2[stop] = rfr
    
    endStop = int(df_rev_2[df_rev_2['startStop']==stop]['endStop'].mode())
    
    filename = f'randomForestsModels/dir2/FROM_{stop}_TO_{endStop}.sav'
    pickle.dump(rfr, open(filename, 'wb'))

Now modelling stop number 14.
Now modelling stop number 15.
Now modelling stop number 17.
Now modelling stop number 18.
Now modelling stop number 19.
Now modelling stop number 44.
Now modelling stop number 45.
Now modelling stop number 46.
Now modelling stop number 47.
Now modelling stop number 48.
Now modelling stop number 49.
Now modelling stop number 51.
Now modelling stop number 495.
Now modelling stop number 496.
Now modelling stop number 515.
Now modelling stop number 516.
Now modelling stop number 519.
Now modelling stop number 521.
Now modelling stop number 522.
Now modelling stop number 614.
Now modelling stop number 615.
Now modelling stop number 616.
Now modelling stop number 617.
Now modelling stop number 618.
Now modelling stop number 619.
Now modelling stop number 664.
Now modelling stop number 665.
Now modelling stop number 666.
Now modelling stop number 667.
Now modelling stop number 668.
Now modelling stop number 672.
Now modelling stop number 1069.
Now modelling stop 

### Evaluate model on the training data

In [64]:
for stop in stops_2:  
    X_train = X_trainDict_2[stop]
    y_train = list(y_trainDict_2[stop])
    rfr = modelDict_2[stop]
    
    print(f'Now modelling for stop number {stop}.')
    
    rfr_predictions_train = list(rfr.predict(X_train))
    with open(f'randomForestsMetrics/randomForest_trainMetrics_dir2.csv', 'a') as fh:
        fh.write('\n\n=============================================================================='+\
                f'\nMetrics for stop model number {stop}:'
                f'\nMAE: {metrics.mean_absolute_error(y_train, rfr_predictions_train)}' +\
                f'\nMAPE: {metrics.mean_absolute_percentage_error(y_train, rfr_predictions_train)}'+\
                f'\nMSE: {metrics.mean_squared_error(y_train, rfr_predictions_train)}'+\
                f'\nRMSE: {metrics.mean_squared_error(y_train, rfr_predictions_train)**(0.5)}'+\
                f'\nR2: {metrics.r2_score(y_train, rfr_predictions_train)}')

Now modelling for stop number 14.
Now modelling for stop number 15.
Now modelling for stop number 17.
Now modelling for stop number 18.
Now modelling for stop number 19.
Now modelling for stop number 44.
Now modelling for stop number 45.
Now modelling for stop number 46.
Now modelling for stop number 47.
Now modelling for stop number 48.
Now modelling for stop number 49.
Now modelling for stop number 51.
Now modelling for stop number 495.
Now modelling for stop number 496.
Now modelling for stop number 515.
Now modelling for stop number 516.
Now modelling for stop number 519.
Now modelling for stop number 521.
Now modelling for stop number 522.
Now modelling for stop number 614.
Now modelling for stop number 615.
Now modelling for stop number 616.
Now modelling for stop number 617.
Now modelling for stop number 618.
Now modelling for stop number 619.
Now modelling for stop number 664.
Now modelling for stop number 665.
Now modelling for stop number 666.
Now modelling for stop number 66

### Evaluate model on the test data

In [65]:
for stop in stops_2:  
    X_test = X_testDict_2[stop]
    y_test = list(y_testDict_2[stop])
    rfr = modelDict_2[stop]
    
    print(f'Now modelling for stop number {stop}.')
    
    rfr_predictions_test = list(rfr.predict(X_test))
    
    with open(f'randomForestsMetrics/randomForest_testMetrics_dir2.csv', 'a') as fh:
        fh.write('\n\n=============================================================================='+\
                f'\nMetrics for stop model number {stop}:'
                f'\nMAE: {metrics.mean_absolute_error(y_test, rfr_predictions_test)}' +\
                f'\nMAPE: {metrics.mean_absolute_percentage_error(y_test, rfr_predictions_test)}'+\
                f'\nMSE: {metrics.mean_squared_error(y_test, rfr_predictions_test)}'+\
                f'\nRMSE: {metrics.mean_squared_error(y_test, rfr_predictions_test)**(0.5)}'+\
                f'\nR2: {metrics.r2_score(y_test, rfr_predictions_test)}')

Now modelling for stop number 14.
Now modelling for stop number 15.
Now modelling for stop number 17.
Now modelling for stop number 18.
Now modelling for stop number 19.
Now modelling for stop number 44.
Now modelling for stop number 45.
Now modelling for stop number 46.
Now modelling for stop number 47.
Now modelling for stop number 48.
Now modelling for stop number 49.
Now modelling for stop number 51.
Now modelling for stop number 495.
Now modelling for stop number 496.
Now modelling for stop number 515.
Now modelling for stop number 516.
Now modelling for stop number 519.
Now modelling for stop number 521.
Now modelling for stop number 522.
Now modelling for stop number 614.
Now modelling for stop number 615.
Now modelling for stop number 616.
Now modelling for stop number 617.
Now modelling for stop number 618.
Now modelling for stop number 619.
Now modelling for stop number 664.
Now modelling for stop number 665.
Now modelling for stop number 666.
Now modelling for stop number 66