In [1]:
import numpy as np
import datetime
import pandas as pd
import matplotlib.pyplot as plot
import seaborn as sns
import plotly.express as px
from plotly import tools
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
from plotly.graph_objs import *

# Initialise plotly's offline notebook mode (imported from plotly.offline above).
# NOTE(review): connected=True presumably loads plotly.js from a CDN instead of
# embedding it in the notebook — confirm against the plotly documentation.
init_notebook_mode(connected=True) 

In [2]:
## Load the aggregated weather data from the course repository and drop the
## 'Unnamed: 0' column that comes along with the CSV.

weather_csv_url = "https://raw.githubusercontent.com/kaberry2/DSCI689/main/datasets/agg_weather.csv"
agg_weather = pd.read_csv(weather_csv_url).drop(columns=['Unnamed: 0'])

In [3]:
# Preview the first rows of the weather table to check the columns that were loaded.
agg_weather.head()


Unnamed: 0,date,temp_mean,temp_min,temp_max,humidity_mean,wind_speed_mean,rain_1h_median,rain_3h_median,snow_1h_median,snow_3h_median
0,2021-01-01,30.593429,23.468,35.708,86.571429,10.958104,0.228346,0.708661,0.344488,
1,2021-01-02,35.883846,33.044,41.9,88.307692,6.814246,0.110236,0.19685,,
2,2021-01-03,34.03,32.9,35.456,90.777778,6.575951,0.098425,,0.098425,
3,2021-01-04,33.7175,31.082,39.974,85.291667,4.085321,,,,
4,2021-01-05,32.103333,29.318,34.898,89.666667,7.928922,0.051181,,0.19685,


## Reading in all Bus-Related Data

In [4]:
# All bus-related CSVs sit in the same repository folder; the placeholder in
# this template is filled with the file name of the table to read.
base_raw_url = "https://raw.githubusercontent.com/kaberry2/DSCI689/main/datasets/{}"

# Reference tables, unpacked in the same order as the file names listed below.
mapping, buses, loops, stops, users = (
    pd.read_csv(base_raw_url.format(csv_name))
    for csv_name in (
        "stop_loop_mapping.csv",
        "buses.csv",
        "loops.csv",
        "stops.csv",
        "users.csv",
    )
)

# entries.csv is the main table the rest of the notebook works on (as `df`).
df = pd.read_csv(base_raw_url.format("entries.csv"))

## Cleaning Up Dates

In [5]:
# 'timestamp' holds text such as "1/16/2021 17:00"; splitting on whitespace
# yields the calendar date and the clock time as two string columns.
timestamp_parts = df['timestamp'].str.split(expand=True)
df[['date', 'time']] = timestamp_parts

# Convert the date strings into datetime.date objects (no time component).
df['date'] = pd.DatetimeIndex(df['date']).date

In [6]:
# Inspect the converted column; it holds datetime.date objects, which pandas
# reports as dtype object.
df['date']

0         2021-01-16
1         2021-01-16
2         2021-01-16
3         2021-01-16
4         2021-01-16
             ...    
458786    2021-12-17
458787    2021-12-17
458788    2021-12-17
458789    2021-12-17
458790    2021-12-17
Name: date, Length: 458791, dtype: object

In [7]:
# Parse the date column once, then derive the numeric month and the day of the
# month from the same index.
entry_dates = pd.DatetimeIndex(df['date'])
df['month'] = entry_dates.month
df['day'] = entry_dates.day

In [8]:
## Replace the numeric month with its English name ('January' ... 'December').
## The name is derived from the 'date' column, which itself is left untouched,
## so the date column stays a date and re-running this cell yields the same
## result.
##
## Why not np.select over the twelve month numbers: its implicit default is the
## integer 0, which has no common dtype with the string choices on recent NumPy
## releases, and on a second run (when 'month' already holds names) none of the
## conditions match, so every row would fall back to that default.

df['month'] = pd.DatetimeIndex(df['date']).month_name()

In [9]:
# Index values for replacement.
# set_index returns a new object rather than modifying the table, so each
# id -> label lookup is bound to a name here; written as bare expressions the
# results were discarded and the cell had no effect.
stop_names = stops.set_index('id')['stop_name']
loop_names = loops.set_index('id')['loop_name']
bus_numbers = buses.set_index('id')['bus_number']
'Indexed!'

'Indexed!'

In [10]:
# Swap the numeric IDs in the entries table for their readable labels.
# Each entry pairs a column of df with the id -> label Series used to
# translate it; values without a match are left as they are by replace().
id_lookups = {
    'stop_id': stops.set_index('id')['stop_name'],
    'loop_id': loops.set_index('id')['loop_name'],
    'bus_id': buses.set_index('id')['bus_number'],
}
for id_column, labels_by_id in id_lookups.items():
    df[id_column] = df[id_column].replace(labels_by_id)

In [11]:
# Clean up column titles and examine current state of main data set.
#
# rename() skips keys that are not present in the frame (e.g. 'loop_name',
# 'day_of_year', 'hour'), so the full mapping can be listed safely.
# drop(..., errors='ignore') keeps the cell re-runnable: once 'is_deleted' is
# gone, an in-place drop of it would raise KeyError on the next run.
#
# NOTE(review): 'is_deleted' is dropped without filtering on it first — confirm
# that rows flagged as deleted are meant to stay in the analysis.
column_titles = {
    'stop_id': 'Stop ID',
    'boarded': 'Students Boarded',
    'loop_name': 'Loop Name',
    'loop_id': 'Loop ID',
    'driver_id': 'Driver ID',
    'id': 'ID',
    'bus_id': 'Bus ID',
    'left_behind': 'Students Left Behind',
    'time': 'Time',
    'month': 'Month',
    'day': 'Day',
    'day_of_year': 'Day of Year',
    'hour': 'Hour',
}
df = df.rename(columns=column_titles).drop(columns=['is_deleted'], errors='ignore')
df

Unnamed: 0,Students Boarded,Stop ID,timestamp,date_added,Loop ID,Driver ID,ID,Students Left Behind,Bus ID,date,Time,Month,Day
0,0,Baseball #1,1/16/2021 17:00,1/16/2021,Green Loop,50,804685,0,903,2021-01-16,17:00,January,16
1,0,Kinghorn,1/16/2021 17:04,1/16/2021,Blue Loop,49,804686,0,906,2021-01-16,17:04,January,16
2,1,Rec Center,1/16/2021 17:06,1/16/2021,Blue Loop,49,804687,0,906,2021-01-16,17:06,January,16
3,0,Shaffer Tower S-B,1/16/2021 17:07,1/16/2021,Blue Loop,49,804688,0,906,2021-01-16,17:07,January,16
4,0,AJ,1/16/2021 17:08,1/16/2021,Blue Loop,49,804689,0,906,2021-01-16,17:08,January,16
...,...,...,...,...,...,...,...,...,...,...,...,...,...
458786,0,Neely & Linden St,12/17/2021 20:01,12/17/2021,Blue Loop,53,1263471,0,906,2021-12-17,20:01,December,17
458787,0,SR-2,12/17/2021 20:01,12/17/2021,Green Loop,36,1263472,0,904,2021-12-17,20:01,December,17
458788,0,SR-3,12/17/2021 20:01,12/17/2021,Green Loop,36,1263473,0,904,2021-12-17,20:01,December,17
458789,1,Music,12/17/2021 20:02,12/17/2021,Green Loop,50,1263474,0,903,2021-12-17,20:02,December,17


In [12]:
# Aggregate shuttle data by day.
# System-wide daily figures: named aggregation sets the final column titles
# directly, and reset_index turns the 'date' group key back into a column.
day_df = (
    df.groupby(['date'])
      .agg(**{
          'Mean Students Boarded': ('Students Boarded', 'mean'),
          'Total Students Boarded': ('Students Boarded', 'sum'),
          'Total Students Left Behind': ('Students Left Behind', 'sum'),
      })
      .reset_index()
)

# Per-loop, per-day mean and sum of both counts. The result keeps its
# two-level column header and ('Loop ID', 'date') index; a later cell assigns
# flat column titles and resets the index.
loop_stats = ['mean', 'sum']
loop_df = df.groupby(['Loop ID', 'date']).agg({'Students Boarded': loop_stats,
                                               'Students Left Behind': loop_stats})

day_df

Unnamed: 0,date,Mean Students Boarded,Total Students Boarded,Total Students Left Behind
0,2021-01-16,0.119266,39,0
1,2021-01-17,0.747989,279,13
2,2021-01-18,0.719638,557,0
3,2021-01-19,0.926695,2364,16
4,2021-01-20,0.944099,2432,13
...,...,...,...,...
190,2021-12-13,1.379477,3428,0
191,2021-12-14,1.003607,2504,0
192,2021-12-15,0.790057,2209,0
193,2021-12-16,0.748798,2024,1


In [13]:
# Check how the weather dates are stored before merging (dtype object in the output).
agg_weather['date']

0      2021-01-01
1      2021-01-02
2      2021-01-03
3      2021-01-04
4      2021-01-05
          ...    
360    2021-12-27
361    2021-12-28
362    2021-12-29
363    2021-12-30
364    2021-12-31
Name: date, Length: 365, dtype: object

In [14]:
# Same check for the daily shuttle table: its 'date' column is also dtype object.
day_df['date']

0      2021-01-16
1      2021-01-17
2      2021-01-18
3      2021-01-19
4      2021-01-20
          ...    
190    2021-12-13
191    2021-12-14
192    2021-12-15
193    2021-12-16
194    2021-12-17
Name: date, Length: 195, dtype: object

In [15]:
# Both tables store 'date' as dtype object; convert each to a pandas datetime
# column so the two share one dtype for the merge that follows.
for frame in (day_df, agg_weather):
    frame['date'] = pd.to_datetime(frame['date'])

In [16]:
# Attach the weather readings to each day of shuttle data. 'date' is the only
# column the two tables share, and the inner join keeps only dates present in both.
merged_df = pd.merge(day_df, agg_weather, how='inner', on='date')

In [17]:
# Display the merged daily shuttle + weather table.
merged_df

Unnamed: 0,date,Mean Students Boarded,Total Students Boarded,Total Students Left Behind,temp_mean,temp_min,temp_max,humidity_mean,wind_speed_mean,rain_1h_median,rain_3h_median,snow_1h_median,snow_3h_median
0,2021-01-16,0.119266,39,0,30.842214,27.878,33.260,89.857143,6.643091,0.118110,,0.110236,0.708661
1,2021-01-17,0.747989,279,13,32.695520,30.416,34.178,88.680000,10.226669,0.127953,,0.096457,0.118110
2,2021-01-18,0.719638,557,0,29.218250,27.068,31.784,84.625000,9.906182,,,0.043307,
3,2021-01-19,0.926695,2364,16,29.165000,20.138,37.832,77.375000,7.610460,,,,
4,2021-01-20,0.944099,2432,13,30.550250,24.782,36.140,71.416667,13.106956,0.104331,,0.039370,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
190,2021-12-13,1.379477,3428,0,39.091250,33.224,49.856,65.375000,5.545896,,,,
191,2021-12-14,1.003607,2504,0,41.861000,31.712,55.256,66.750000,6.132176,,,,
192,2021-12-15,0.790057,2209,0,54.262250,48.470,61.592,63.958333,7.955331,0.250000,,,
193,2021-12-16,0.748798,2024,1,55.232462,45.824,58.208,78.115385,12.318127,0.448819,,,


In [18]:
## Scaling the data for a better visualization: every series is standardized
## with sklearn's preprocessing.scale so that boardings, temperature and
## precipitation can share a single y-axis.

from sklearn import preprocessing

scaled_mean_students = preprocessing.scale(merged_df['Mean Students Boarded'])
scaled_mean_temp = preprocessing.scale(merged_df['temp_mean'])
scaled_1h_rain = preprocessing.scale(merged_df['rain_1h_median'])
scaled_1h_snow = preprocessing.scale(merged_df['snow_1h_median'])

# Legend label for each standardized series. Rain and snow are read from the
# *_median columns, so they are labelled as medians rather than means.
scaled_traces = [
    ("Mean Students Boarded", scaled_mean_students),
    ("Mean Temperature", scaled_mean_temp),
    ("Median 1HR Rain", scaled_1h_rain),
    ("Median 1HR Snow", scaled_1h_snow),
]

fig = go.Figure()
for trace_name, scaled_values in scaled_traces:
    fig.add_trace(go.Scatter(name=trace_name, x=merged_df["date"],
                             y=scaled_values, mode="markers"))

# A title and axis labels let the figure stand on its own when skimmed.
fig.update_layout(title="Daily Ridership and Weather (standardized)",
                  xaxis_title="Date",
                  yaxis_title="Standardized value")

fig.show()

## Scatter Between Number of Students Boarded and Other Weather Variables

In [19]:
# Give the per-loop aggregate flat, readable column titles and move the
# ('Loop ID', 'date') group keys out of the index into ordinary columns.
# The index check makes the cell safe to re-run: once the index has been
# reset the frame has six columns, and assigning the four titles again would
# raise a length-mismatch error.
if isinstance(loop_df.index, pd.MultiIndex):
    loop_df.columns = ['Mean Students Boarded', 'Total Students Boarded',
                       'Mean Students Left Behind', 'Total Students Left Behind']
    loop_df = loop_df.reset_index()


In [20]:
# Convert the per-loop 'date' column to a pandas datetime column, matching the
# conversion applied to the weather table's 'date' column.
loop_df = loop_df.assign(date=pd.to_datetime(loop_df['date']))

In [21]:
# Attach the weather readings to every (loop, day) row. 'date' is the only
# column shared by the two tables; the inner join keeps dates present in both.
merged_loop_df = pd.merge(loop_df, agg_weather, how='inner', on='date')

In [22]:
# Display the per-loop daily table with the weather columns attached.
merged_loop_df

Unnamed: 0,Loop ID,date,Mean Students Boarded,Total Students Boarded,Mean Students Left Behind,Total Students Left Behind,temp_mean,temp_min,temp_max,humidity_mean,wind_speed_mean,rain_1h_median,rain_3h_median,snow_1h_median,snow_3h_median
0,Blue Loop,2021-01-16,0.009804,2,0.000000,0,30.842214,27.878,33.260,89.857143,6.643091,0.118110,,0.110236,0.708661
1,Green Loop,2021-01-16,0.300813,37,0.000000,0,30.842214,27.878,33.260,89.857143,6.643091,0.118110,,0.110236,0.708661
2,Blue Loop,2021-01-17,0.069869,16,0.000000,0,32.695520,30.416,34.178,88.680000,10.226669,0.127953,,0.096457,0.118110
3,Green Loop,2021-01-17,1.826389,263,0.090278,13,32.695520,30.416,34.178,88.680000,10.226669,0.127953,,0.096457,0.118110
4,Blue Loop,2021-01-18,0.097222,21,0.000000,0,29.218250,27.068,31.784,84.625000,9.906182,,,0.043307,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
864,Sunday Orange,2021-11-28,1.857143,663,0.014006,5,36.239000,31.334,41.774,73.958333,11.170087,,,,
865,Sunday Green,2021-12-05,1.297980,257,0.000000,0,37.144250,29.336,52.304,70.833333,7.699008,0.098425,0.098425,,
866,Sunday Orange,2021-12-05,1.743440,598,0.017493,6,37.144250,29.336,52.304,70.833333,7.699008,0.098425,0.098425,,
867,Sunday Green,2021-12-12,1.371981,284,0.000000,0,35.405750,29.552,46.616,74.083333,7.798741,,,,


In [23]:
# Total boardings per day, one marker colour per loop. The title lets the
# figure be understood without reading the surrounding cells.
fig2 = px.scatter(
    merged_loop_df,
    x="date",
    y="Total Students Boarded",
    color="Loop ID",
    title="Total Students Boarded per Day by Loop",
)

fig2.show()