In [1]:
import pandas as pd
import datetime as dt
from matplotlib import pyplot as plt

## Read cleaned csv

In [1431]:
routeNum = '270'
df = pd.read_csv(f'lines/{routeNum}/{routeNum}_DQP_cleanedCSV.csv')

### Make sure the features are the correct datatypes

In [1432]:
df

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,ARR/DEP_PLAN,ARR_ACT,DEP_ACT,ROUTEID,DIRECTION
0,01-JAN-18 00:00:00,5962547,1,7026,72600,72589,72589,270_42,1
1,01-JAN-18 00:00:00,5962547,3,4324,72991,72985,72992,270_42,1
2,01-JAN-18 00:00:00,5962547,4,4769,73010,73031,73039,270_42,1
3,01-JAN-18 00:00:00,5962547,5,4770,73040,73080,73080,270_42,1
4,01-JAN-18 00:00:00,5962547,6,4765,73062,73126,73133,270_42,1
...,...,...,...,...,...,...,...,...,...
195918,31-DEC-18 00:00:00,8578509,8,4767,63714,63719,63719,270_42,1
195919,31-DEC-18 00:00:00,8578509,9,4768,63732,63750,63750,270_42,1
195920,31-DEC-18 00:00:00,8578509,10,4325,63809,63835,63835,270_42,1
195921,31-DEC-18 00:00:00,8578509,11,3328,63892,63909,63933,270_42,1


In [1433]:
df['DAYOFSERVICE'] = pd.to_datetime(df['DAYOFSERVICE'])

In [1434]:
# month
df['MONTH'] = df['DAYOFSERVICE'].dt.month

In [1435]:
# Day of the week
df['WEEKDAY'] = df['DAYOFSERVICE'].dt.weekday

In [1436]:
# hour
df['HOUR'] = (df['ARR/DEP_PLAN']//60)//60

### Sort dataframe

In [1437]:
df.sort_values(by=['DAYOFSERVICE', 'TRIPID', 'PROGRNUMBER'], inplace=True, ignore_index=True)

## remove all non-complete journeys
- first find what unique PROGRNUMBER each routeID has for each direction

In [1438]:
# get a list of unique routeids
routeIDs = list(sorted(df['ROUTEID'].unique()))

In [1439]:
# for each routeid, make it a dictionary key
# for each key, add the list of all unique stops for that routeid in the dataframe
routeIDs_dict = {}
for ident in routeIDs:
    routeIDs_dict[ident] = list(sorted(df[df['ROUTEID']==ident]['PROGRNUMBER'].unique()))

### Check that each routeid is consecutive

In [1440]:
for ident in routeIDs:
    for i in range(1, len(routeIDs_dict[ident])):
        if routeIDs_dict[ident][i] - routeIDs_dict[ident][i-1] > 1:
            print(f'{ident} is not consecutive.')
            break

In [1441]:
tripDays = df[(df['TRIPID']==df.shift(-1)['TRIPID']) &\
           (df['DAYOFSERVICE']==df.shift(-1)['DAYOFSERVICE']) &\
           (df['PROGRNUMBER']-df.shift(-1)['PROGRNUMBER']<-1)][['TRIPID', 'DAYOFSERVICE']]

In [1442]:
tripDays.reset_index(inplace=True)

- **if yes, move on to dropping all non-consecutive journeys from the dataframe**
- **if no, find out which PROGRNUMBER values are missing from the non-consecutive routeids**

- **for each entry in the tripDays dataframe, find all rows in the dataframe that have the same TRIPID and DAYOFSERVICE as the current row in the tripDays dataframe**
- **add the indexes of each of these rows to a list**

## If journey is non-consecutive

### x, y and z routeids are non-consecutive

- what stops are missing for the non-consecutive ROUTEIDs?

In [1342]:
for ident in routeIDs:
    for i in range(1, len(routeIDs_dict[ident])):
        if routeIDs_dict[ident][i] - routeIDs_dict[ident][i-1] > 1:
            print(f'{ident}: {routeIDs_dict[ident][i]} is more than 1 away from {routeIDs_dict[ident][i-1]}')

220_14: 59 is more than 1 away from 57


#### We can see from this that
- ROUTEID x is missing PROGRNUMBER y
- ROUTEID a is missing PROGRNUMBER b

### Make exception cases

In [1343]:
exception1 = df[(df['ROUTEID']=='220_14')&(df['PROGRNUMBER']==57)&\
                       (df.shift(-1)['PROGRNUMBER']==59)].index


# when df routeid is x and when the current program number is one less than y AND
# the next program number is one greater than y
# add this index to the exception list.

### When all exception cases have been made , drop each of the exceptions from the tripDays dataframe made before

In [1344]:
# list of indices to drop from the tripDays dataframe
Exception1 = []
for i in exception1:
    Exception1.append(i)

# do this for all exceptions and add to tripDayDrop list

In [1345]:
tripDayDrop = []

for i in Exception1:
    tripDayDrop.append(tripDays[tripDays['index']==i].index)


In [1346]:
TripDayDrop = []
for i in tripDayDrop:
    TripDayDrop.append(i[0])

In [1347]:
tripDays.drop(TripDayDrop, inplace=True)

In [1348]:
tripDays.drop(columns=['index'], inplace=True)

In [1349]:
tripDays.reset_index(drop=True, inplace=True)

In [1350]:
tripDays

Unnamed: 0,TRIPID,DAYOFSERVICE


# Now we can drop incomplete trips from the main dataframe

## drop incomplete trips

In [1443]:
dropIndexList = []

for i in range(tripDays.shape[0]):
#     for debugging:
#     if i % 100 == 0:
#         print(i)
    tripid = tripDays.loc[i, 'TRIPID']
    date = tripDays.loc[i, 'DAYOFSERVICE']
    dropIndex = df[(df['TRIPID']==tripid) & (df['DAYOFSERVICE']==date)].index
    dropIndexList.append(dropIndex)

In [1444]:
# make it one continuous list
DropIndexList = []

for i in dropIndexList:
    for j in i:
        DropIndexList.append(j)

In [1445]:
df.drop(df.index[DropIndexList], inplace=True)

In [1446]:
df.reset_index(drop=True, inplace=True)

### Final check for non-consecutive journeys:
- if there were non-consecutive journeys run from the 'remove all non-complete journeys' heading again

In [1447]:
tripDays = df[(df['TRIPID']==df.shift(-1)['TRIPID']) &\
           (df['DAYOFSERVICE']==df.shift(-1)['DAYOFSERVICE']) &\
           (df['PROGRNUMBER']-df.shift(-1)['PROGRNUMBER']<-1)][['TRIPID', 'DAYOFSERVICE']]

In [1448]:
tripDays

Unnamed: 0,TRIPID,DAYOFSERVICE


## target feature: journey time

In [1449]:
# make a dict
journeyTimes = {}
for i in range(df.shape[0]):
    journeyTimes[i] = 0

In [1450]:
# loop through from 1 to the end
# the value of the index is the df current row actual arrival minus the df previous row actual departure

for i in range(0, df.shape[0]-1):
    if i % 10000 == 0:
        print(i)
    if df.loc[i, 'TRIPID'] == df.loc[i+1, 'TRIPID']:
        journeyTimes[i] = (df.loc[i+1, 'ARR_ACT'] - df.loc[i, 'ARR_ACT'])

0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000


In [1451]:
# make into a dataframe
journeyTimeDF = pd.DataFrame.from_dict(journeyTimes, orient='index', columns=['JOURNEYTIME'])

In [1452]:
# combine with df
df = df.join(journeyTimeDF)

## Check for negative journey times

In [1453]:
df[df['JOURNEYTIME']<0]

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,ARR/DEP_PLAN,ARR_ACT,DEP_ACT,ROUTEID,DIRECTION,MONTH,WEEKDAY,HOUR,JOURNEYTIME
48876,2018-04-13,6627218,2,4323,29852,33095,33105,270_42,1,4,4,8,-2713


## If there negative journeytimes:

### As we can see, we have journeys with negative JOURNEYTIME times. We will find each of these entire trips and drop them all.

In [1454]:
# get a list of the dates with the negative journeytimes
negDays = list(df[df['JOURNEYTIME']<0]['DAYOFSERVICE'])

In [1455]:
# get a list of the trip IDs with the negative journey times
negTripIDs = list(df[df['JOURNEYTIME']<0]['TRIPID'])

In [1456]:
# drop all rows that have the DAYOFSERVICE and TRIPID values at both of their respective indexes
# This means any rows that have a DAYOFSERVICE value at index 0 of the dropDays list AND a TRIPID value 
# at index 0 of the dropTripIDs list

for i in range(len(negDays)):
#     for debugging:
#     if i%10==0:
#         print(i)
    dropindex = df[(df['DAYOFSERVICE']==negDays[i])&\
                           (df['TRIPID']==negTripIDs[i])].index
    df.drop(dropindex, inplace=True)

### Final check for negative journey times

In [1457]:
df[df['JOURNEYTIME']<0]

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,ARR/DEP_PLAN,ARR_ACT,DEP_ACT,ROUTEID,DIRECTION,MONTH,WEEKDAY,HOUR,JOURNEYTIME


In [1458]:
df.reset_index(drop=True, inplace=True)

## Remove outliers in terms of journey time

### remove outliers in terms of the standard deviation

In [1459]:
outliersIndexList = []

for routeid in df['ROUTEID'].unique():
    
    routeidDF = df[df['ROUTEID']==routeid]
    
    for progrnumber in df[df['ROUTEID']==routeid]['PROGRNUMBER'].unique():
        
        workingDF0 = routeidDF[routeidDF['PROGRNUMBER']==progrnumber]
        
        upper = (workingDF0['JOURNEYTIME'].mean())+(3*(workingDF0['JOURNEYTIME'].std()))
        lower = (workingDF0['JOURNEYTIME'].mean())-(3*(workingDF0['JOURNEYTIME'].std()))
        
        outliers = workingDF0[(workingDF0['JOURNEYTIME']>upper)|(workingDF0['JOURNEYTIME']<lower)].index
        outliersIndexList.append(outliers)

In [1460]:
OutliersIndexList = []

for i in outliersIndexList:
    for j in i:
        OutliersIndexList.append(j)

In [1461]:
df.drop(df.index[OutliersIndexList], inplace=True)

In [1462]:
df.reset_index(inplace=True, drop=True)

## add a feature: end stop
- and rename STOPPOINTID to startStop

In [1463]:
df.rename(columns={'STOPPOINTID':'startStop'}, inplace=True)

In [1464]:
df['endStop'] = 'N/A'

In [1465]:
df.loc[(df['TRIPID']==df['TRIPID'].shift(-1)), 'endStop'] = df['startStop'].shift(-1)

## Remove all rows that have an endStop value of 'N/A'

In [1466]:
dropIndex = list(df[df['endStop']=='N/A'].index)

In [1467]:
df.drop(dropIndex, inplace=True)

In [1468]:
df.reset_index(drop=True, inplace=True)

## Merge with weather dataframe
- create DAYHOUR to combine two dataframes on

In [1469]:
df['DAYHOUR'] = df['DAYOFSERVICE'] + pd.to_timedelta(df['HOUR'], unit='H')

In [1470]:
weather = pd.read_csv('weatherData_CLEANED.csv')
weather['DAYHOUR'] = pd.to_datetime(weather['DAYHOUR'])

- drop descriptive weather features

In [1471]:
weather.reset_index(drop=True, inplace=True)

In [1472]:
weather.drop(columns=['weather_main', 'weather_description'], inplace=True)

- add precipitation feature

In [1473]:
# change nan values to 0 for rain and snow
nullRain = weather.loc[pd.isnull(weather['rain_1h']), 'rain_1h'].index
weather.loc[nullRain, 'rain_1h'] = 0

nullSnow = weather.loc[pd.isnull(weather['snow_1h']), 'snow_1h'].index
weather.loc[nullSnow, 'snow_1h'] = 0

In [1474]:
weather['precipitation_1h'] = ((weather['rain_1h']) + (weather['snow_1h']))

In [1475]:
weather.drop(columns=['rain_1h', 'snow_1h'], inplace=True)

### Change precipitation values that are NaN to 0

In [1476]:
df = pd.merge(df, weather, on=['DAYHOUR'])
df.drop(columns=['DAYHOUR'])

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,startStop,ARR/DEP_PLAN,ARR_ACT,DEP_ACT,ROUTEID,DIRECTION,MONTH,WEEKDAY,HOUR,JOURNEYTIME,endStop,temp,humidity,wind_speed,precipitation_1h
0,2018-01-01,5957560,1,3333,38400,38422,38422,270_44,2,1,0,10,54,3334.0,5.02,81,9.77,0.0
1,2018-01-01,5957560,2,3334,38464,38476,38487,270_44,2,1,0,10,36,3352.0,5.02,81,9.77,0.0
2,2018-01-01,5957560,3,3352,38525,38512,38512,270_44,2,1,0,10,46,3335.0,5.02,81,9.77,0.0
3,2018-01-01,5957560,4,3335,38570,38558,38558,270_44,2,1,0,10,38,3336.0,5.02,81,9.77,0.0
4,2018-01-01,5957560,5,3336,38609,38596,38596,270_44,2,1,0,10,22,3337.0,5.02,81,9.77,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
164329,2018-12-31,8579749,12,3329,49465,49707,49707,270_42,1,12,0,13,22,3330.0,9.85,71,4.10,0.0
164330,2018-12-31,8579749,13,3330,49479,49729,49729,270_42,1,12,0,13,48,3331.0,9.85,71,4.10,0.0
164331,2018-12-31,8579749,14,3331,49523,49777,49777,270_42,1,12,0,13,43,3351.0,9.85,71,4.10,0.0
164332,2018-12-31,8579749,15,3351,49558,49820,49830,270_42,1,12,0,13,107,3332.0,9.85,71,4.10,0.0


In [1477]:
df = df[['DIRECTION', 'MONTH', 'WEEKDAY', 'HOUR', 'startStop', 'endStop', 'ARR_ACT', 'DEP_ACT', 'JOURNEYTIME',\
        'temp', 'humidity', 'wind_speed', 'precipitation_1h']]

## Save df as csv file

In [1478]:
df.to_csv(f'lines/{routeNum}/{routeNum}_MODELING.csv', index_label=False)

In [1479]:
routeNum

'270'