In [1]:
import pandas as pd
import datetime as dt
from matplotlib import pyplot as plt

In [53]:
routeNum = '68'
df = pd.read_csv(f'{routeNum}/{routeNum}_DF.csv')

In [54]:
df.columns

Index(['DATASOURCE', 'DAYOFSERVICE', 'TRIPID', 'PROGRNUMBER', 'STOPPOINTID',
       'PLANNEDTIME_ARR', 'PLANNEDTIME_DEP', 'ACTUALTIME_ARR',
       'ACTUALTIME_DEP', 'VEHICLEID', 'PASSENGERS', 'PASSENGERSIN',
       'PASSENGERSOUT', 'DISTANCE', 'SUPPRESSED', 'JUSTIFICATIONID',
       'LASTUPDATE', 'NOTE', 'ROUTEID', 'DIRECTION'],
      dtype='object')

# Prepare a data quality plan for the cleaned CSV file.

## List of issues identified in the data quality Report:
- Constant columns: DATASOURCE has just 1 unique value, so it is a constant column
- Empty columns: PASSENGERS, PASSENGERSIN, PASSENGERSOUT, DISTANCE, and NOTE all have 0 values, so they are empty columns
- Duplicate columns: ARR_PLAN and DEP_PLAN are duplicates of one another
- Missing values: SUPPRESSED and JUSTIFICATIONID are missing over 99% of their values

## Propose solutions to deal with the problems identified.  

#### 1. Constant columns
- DATASOURCE will be dropped as it only contains one value, that is not useful for prediction.

#### 2. Empty columns
- PASSENGERS
- PASSENGERSIN
- PASSENGERSOUT
- DISTANCE
- NOTE
<br>

- **each of these columns will be dropped as they are all completely empty**

#### 3. Duplicate columns
- DEP_PLAN will be dropped as it is a duplicate column of ARR_PLAN.
- ARR_PLAN will be renamed to ARR/DEP_PLAN

#### 4. Missing values
- trips with a SUPPRESSED value of 1.0 will be dropped
    - Less than 0.5% of the entries in the database are suppressed, and suppressed journeys will corrupt the prediction of journey times.
- SUPPRESSED will be dropped as it is missing over 99% of its values
- JUSTIFICATIONID will be dropped as it is missing over 99% of its values

#### 5. Irrelevant values
- The LASTUPDATE value is irrelevant. It only refers to the last day that the data was updated. It is not relevant to predicting journey times, so we will drop the column.
- It is unlikely that we will be able to know which VEHICLEID is scheduled for each day, so it will not have much use in predicting arrival times of buses. Therefore, we will drop the column

In [55]:
# Constant columns
df.drop(columns=['DATASOURCE'], inplace=True)

In [56]:
# Empty columns
df.drop(columns=['PASSENGERS', 'PASSENGERSIN', 'PASSENGERSOUT', 'DISTANCE', 'NOTE'], inplace=True)

In [57]:
# Duplicate columns
df.drop(columns=['PLANNEDTIME_DEP'], inplace=True)

In [58]:
# rename DEP_PLAN column
df.rename(columns={'PLANNEDTIME_ARR':'ARR/DEP_PLAN'}, inplace=True)
df.rename(columns={'ACTUALTIME_ARR':'ARR_ACT'}, inplace=True)
df.rename(columns={'ACTUALTIME_DEP':'DEP_ACT'}, inplace=True)

In [59]:
# first drop any trips that have a SUPPRESSED value of 1.0

# get all DAYOFSERVICE and TRIPID that have a SUPPRESSED value of 1.0
suppressed = df[df['SUPPRESSED']==1.0][['TRIPID', 'DAYOFSERVICE']]

In [60]:
suppressed.reset_index(drop=True, inplace=True)

In [61]:
suppressed.shape[0]

264

In [62]:
dropIndex = []
for i in range(suppressed.shape[0]):
    if i % 10==0:
        print(i)
    workingDF = df[(df['TRIPID']==suppressed.loc[i,'TRIPID'])&\
                  (df['DAYOFSERVICE']==suppressed.loc[i,'DAYOFSERVICE'])]
    dropIndex.append(workingDF.index)

0
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260


In [63]:
DropIndex = []
for i in dropIndex:
    for j in i:
        DropIndex.append(j)  

In [64]:
df.drop(DropIndex, inplace=True)

In [65]:
df.reset_index(drop=True, inplace=True)

In [66]:
# Then drop SUPPRESSED and JUSTIFICATIONID columns
df.drop(columns=['SUPPRESSED', 'JUSTIFICATIONID'], inplace=True)

In [67]:
# drop LASTUPDATE
df.drop(columns=['LASTUPDATE'], inplace=True)

In [68]:
# drop VEHICLEID
df.drop(columns=['VEHICLEID'], inplace=True)

## Save the cleaned dataframe to a csv

In [69]:
df.to_csv(f'{routeNum}/{routeNum}_DQP_cleanedCSV.csv', index_label=False)