In [1]:
%config IPCompleter.greedy=True
%config Completer.use_jedi = False

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import datetime as dt

import pickle

from patsy import dmatrices

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.tree import export_graphviz

# Read csv file

In [3]:
routeNum = '270'
df = pd.read_csv(f'lines/{routeNum}/{routeNum}_MODELING.csv')

In [4]:
# for stop in stops_1:
#     print(stop)
#     workingDF = df[df['startStop']==stop]
#     print(workingDF['endStop'].value_counts())

In [5]:
df.dtypes

DIRECTION             int64
MONTH                 int64
WEEKDAY               int64
HOUR                  int64
startStop             int64
endStop             float64
ARR_ACT               int64
DEP_ACT               int64
JOURNEYTIME           int64
temp                float64
humidity              int64
wind_speed          float64
precipitation_1h    float64
dtype: object

### convert month, weekday, startStop, endStop to type category

In [6]:
df[['MONTH', 'WEEKDAY', 'startStop', 'endStop']] =\
df[['MONTH', 'WEEKDAY', 'startStop', 'endStop']].astype('category')

## Make copy of dataframe

In [7]:
df_rev = df

## Split the dataframe up by direction

In [8]:
df_rev_1 = df_rev[df_rev['DIRECTION']==1]

In [9]:
df_rev_2 = df_rev[df_rev['DIRECTION']==2]

### generate a list of stops for each direction

In [10]:
stops_1 = sorted(list(df_rev_1['startStop'].unique()))

In [11]:
stops_2 = sorted(list(df_rev_2['startStop'].unique()))

In [12]:
# stops_2

# Direction 1 first

### For each stop, make a dataframe and split it into test/train split

In [13]:
# Make dictionaries with keys for each stop with each train/test split value as its values.

X_trainDict_1 = {}
y_trainDict_1 = {}
X_testDict_1 ={}
y_testDict_1 = {}

remove = []

for stop in stops_1:
    
    workingDF = df_rev_1[df_rev_1['startStop']==stop]
    
    if workingDF.shape[0] < 3:
        print(f'{stop} too small')
        remove.append(stop)
        continue
    
    # drop unnecessary columns
    workingDF.drop(columns=['startStop', 'endStop', 'DIRECTION', 'DEP_ACT', 'ARR_ACT'], inplace=True)
    
    # y is the target
    y = workingDF["JOURNEYTIME"]
    # X is everything else
    X = workingDF.drop(["JOURNEYTIME"],1)
    # Split the dataset into two datasets: 70% training and 30% test
    
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0,  test_size=0.3)
    
    X_trainDict_1[stop] = X_train
    X_testDict_1[stop] = X_test
    y_trainDict_1[stop] = y_train
    y_testDict_1[stop] = y_test
    
    # need to reset the index to allow contatenation with predicted values otherwise not joining on same index...
    X_train.reset_index(drop=True, inplace=True)
    y_train.reset_index(drop=True, inplace=True)
    X_test.reset_index(drop=True, inplace=True)
    y_test.reset_index(drop=True, inplace=True)
    
for i in remove:
    stops_1.remove(i)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


### For each stop, make a linReg model

In [14]:
modelDict_1 = {}

for stop in stops_1:
    print(f'Now modelling stop number {stop}.')
    
    X_train = X_trainDict_1[stop]
    X_test = X_testDict_1[stop]
    y_train = y_trainDict_1[stop]
    y_test = y_testDict_1[stop]
    
    linReg = LinearRegression().fit(X_train, y_train)
    
    modelDict_1[stop] = linReg
    
    endStop = int(df_rev_1[df_rev_1['startStop']==stop]['endStop'].mode())
    
    filename = f'lines/{routeNum}/{routeNum}_models/dir1/FROM_{stop}_TO_{endStop}.sav'
    pickle.dump(linReg, open(filename, 'wb'))

Now modelling stop number 3328.
Now modelling stop number 3329.
Now modelling stop number 3330.
Now modelling stop number 3331.
Now modelling stop number 3332.
Now modelling stop number 3351.
Now modelling stop number 4323.
Now modelling stop number 4324.
Now modelling stop number 4325.
Now modelling stop number 4765.
Now modelling stop number 4766.
Now modelling stop number 4767.
Now modelling stop number 4768.
Now modelling stop number 4769.
Now modelling stop number 4770.
Now modelling stop number 7026.


### Evaluate model on the training data

In [15]:
for stop in stops_1:  
    X_train = X_trainDict_1[stop]
    y_train = list(y_trainDict_1[stop])
    linReg = modelDict_1[stop]
    
    print(f'Now modelling for stop number {stop}.')
    
    linReg_predictions_train = list(linReg.predict(X_train))
    with open(f'lines/{routeNum}/{routeNum}_dir1_linReg_trainMetrics.csv', 'a') as fh:
        fh.write('\n\n=============================================================================='+\
                f'\nMetrics for stop model number {stop}:'
                f'\nMAE: {metrics.mean_absolute_error(y_train, linReg_predictions_train)}' +\
                f'\nMAPE: {metrics.mean_absolute_percentage_error(y_train, linReg_predictions_train)}'+\
                f'\nMSE: {metrics.mean_squared_error(y_train, linReg_predictions_train)}'+\
                f'\nRMSE: {metrics.mean_squared_error(y_train, linReg_predictions_train)**(0.5)}'+\
                f'\nR2: {metrics.r2_score(y_train, linReg_predictions_train)}')

Now modelling for stop number 3328.
Now modelling for stop number 3329.
Now modelling for stop number 3330.
Now modelling for stop number 3331.
Now modelling for stop number 3332.
Now modelling for stop number 3351.
Now modelling for stop number 4323.
Now modelling for stop number 4324.
Now modelling for stop number 4325.
Now modelling for stop number 4765.
Now modelling for stop number 4766.
Now modelling for stop number 4767.
Now modelling for stop number 4768.
Now modelling for stop number 4769.
Now modelling for stop number 4770.
Now modelling for stop number 7026.


### Evaluate model on the test data

In [16]:
for stop in stops_1:  
    X_test = X_testDict_1[stop]
    y_test = list(y_testDict_1[stop])
    linReg = modelDict_1[stop]
    
    print(f'Now modelling for stop number {stop}.')
    
    linReg_predictions_test = list(linReg.predict(X_test))
    
    with open(f'lines/{routeNum}/{routeNum}_dir1_linReg_testMetrics.csv', 'a') as fh:
        fh.write('\n\n=============================================================================='+\
                f'\nMetrics for stop model number {stop}:'
                f'\nMAE: {metrics.mean_absolute_error(y_test, linReg_predictions_test)}' +\
                f'\nMAPE: {metrics.mean_absolute_percentage_error(y_test, linReg_predictions_test)}'+\
                f'\nMSE: {metrics.mean_squared_error(y_test, linReg_predictions_test)}'+\
                f'\nRMSE: {metrics.mean_squared_error(y_test, linReg_predictions_test)**(0.5)}'+\
                f'\nR2: {metrics.r2_score(y_test, linReg_predictions_test)}')

Now modelling for stop number 3328.
Now modelling for stop number 3329.
Now modelling for stop number 3330.
Now modelling for stop number 3331.
Now modelling for stop number 3332.
Now modelling for stop number 3351.
Now modelling for stop number 4323.
Now modelling for stop number 4324.
Now modelling for stop number 4325.
Now modelling for stop number 4765.
Now modelling for stop number 4766.
Now modelling for stop number 4767.
Now modelling for stop number 4768.
Now modelling for stop number 4769.
Now modelling for stop number 4770.
Now modelling for stop number 7026.


# Direction 2 next

### For each stop, make a dataframe and split it into test/train split

In [17]:
# Make dictionaries with keys for each stop with each train/test split value as its values.

X_trainDict_2 = {}
y_trainDict_2 = {}
X_testDict_2 ={}
y_testDict_2 = {}

remove = []

for stop in stops_2:
    workingDF = df_rev_2[df_rev_2['startStop']==stop]
    
    # drop unnecessary columns
    workingDF.drop(columns=['startStop', 'endStop', 'DIRECTION', 'DEP_ACT', 'ARR_ACT'], inplace=True)
    
    if workingDF.shape[0] < 3:
        print(f'{stop} too small')
        remove.append(stop)
        continue
    
    else:

        # y is the target
        y = workingDF["JOURNEYTIME"]
        # X is everything else
        X = workingDF.drop(["JOURNEYTIME"],1)
        # Split the dataset into two datasets: 70% training and 30% test
        X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1,  test_size=0.3)

        X_trainDict_2[stop] = X_train
        X_testDict_2[stop] = X_test
        y_trainDict_2[stop] = y_train
        y_testDict_2[stop] = y_test

        # need to reset the index to allow contatenation with predicted values otherwise not joining on same index...
        X_train.reset_index(drop=True, inplace=True)
        y_train.reset_index(drop=True, inplace=True)
        X_test.reset_index(drop=True, inplace=True)
        y_test.reset_index(drop=True, inplace=True)
        
for i in remove:
    stops_2.remove(i)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


### For each stop, make a linReg model

In [18]:
modelDict_2 = {}

for stop in stops_2:
    print(f'Now modelling stop number {stop}.')
    
    X_train = X_trainDict_2[stop]
    X_test = X_testDict_2[stop]
    y_train = y_trainDict_2[stop]
    y_test = y_testDict_2[stop]
    
    linReg = LinearRegression().fit(X_train, y_train)
    
    modelDict_2[stop] = linReg
    
    endStop = int(df_rev_2[df_rev_2['startStop']==stop]['endStop'].mode())
    
    filename = f'lines/{routeNum}/{routeNum}_models/dir2/FROM_{stop}_TO_{endStop}.sav'
    pickle.dump(linReg, open(filename, 'wb'))

Now modelling stop number 3333.
Now modelling stop number 3334.
Now modelling stop number 3335.
Now modelling stop number 3336.
Now modelling stop number 3337.
Now modelling stop number 3338.
Now modelling stop number 3339.
Now modelling stop number 3340.
Now modelling stop number 3352.
Now modelling stop number 4324.
Now modelling stop number 4326.
Now modelling stop number 4765.
Now modelling stop number 4766.
Now modelling stop number 4767.
Now modelling stop number 4768.
Now modelling stop number 4769.
Now modelling stop number 4770.


### Evaluate model on the training data

In [19]:
for stop in stops_2:  
    X_train = X_trainDict_2[stop]
    y_train = list(y_trainDict_2[stop])
    linReg = modelDict_2[stop]
    
    print(f'Now modelling for stop number {stop}.')
    
    linReg_predictions_train = list(linReg.predict(X_train))
    with open(f'lines/{routeNum}/{routeNum}_dir2_linReg_trainMetrics.csv', 'a') as fh:
        fh.write('\n\n=============================================================================='+\
                f'\nMetrics for stop model number {stop}:'
                f'\nMAE: {metrics.mean_absolute_error(y_train, linReg_predictions_train)}' +\
                f'\nMAPE: {metrics.mean_absolute_percentage_error(y_train, linReg_predictions_train)}'+\
                f'\nMSE: {metrics.mean_squared_error(y_train, linReg_predictions_train)}'+\
                f'\nRMSE: {metrics.mean_squared_error(y_train, linReg_predictions_train)**(0.5)}'+\
                f'\nR2: {metrics.r2_score(y_train, linReg_predictions_train)}')

Now modelling for stop number 3333.
Now modelling for stop number 3334.
Now modelling for stop number 3335.
Now modelling for stop number 3336.
Now modelling for stop number 3337.
Now modelling for stop number 3338.
Now modelling for stop number 3339.
Now modelling for stop number 3340.
Now modelling for stop number 3352.
Now modelling for stop number 4324.
Now modelling for stop number 4326.
Now modelling for stop number 4765.
Now modelling for stop number 4766.
Now modelling for stop number 4767.
Now modelling for stop number 4768.
Now modelling for stop number 4769.
Now modelling for stop number 4770.


### Evaluate model on the test data

In [20]:
for stop in stops_2:  
    X_test = X_testDict_2[stop]
    y_test = list(y_testDict_2[stop])
    linReg = modelDict_2[stop]
    
    print(f'Now modelling for stop number {stop}.')
    
    linReg_predictions_test = list(linReg.predict(X_test))
    
    with open(f'lines/{routeNum}/{routeNum}_dir2_linReg_testMetrics.csv', 'a') as fh:
        fh.write('\n\n=============================================================================='+\
                f'\nMetrics for stop model number {stop}:'
                f'\nMAE: {metrics.mean_absolute_error(y_test, linReg_predictions_test)}' +\
                f'\nMAPE: {metrics.mean_absolute_percentage_error(y_test, linReg_predictions_test)}'+\
                f'\nMSE: {metrics.mean_squared_error(y_test, linReg_predictions_test)}'+\
                f'\nRMSE: {metrics.mean_squared_error(y_test, linReg_predictions_test)**(0.5)}'+\
                f'\nR2: {metrics.r2_score(y_test, linReg_predictions_test)}')

Now modelling for stop number 3333.
Now modelling for stop number 3334.
Now modelling for stop number 3335.
Now modelling for stop number 3336.
Now modelling for stop number 3337.
Now modelling for stop number 3338.
Now modelling for stop number 3339.
Now modelling for stop number 3340.
Now modelling for stop number 3352.
Now modelling for stop number 4324.
Now modelling for stop number 4326.
Now modelling for stop number 4765.
Now modelling for stop number 4766.
Now modelling for stop number 4767.
Now modelling for stop number 4768.
Now modelling for stop number 4769.
Now modelling for stop number 4770.
