## BSU Transit System Weather Analysis

First we load in our packages and read our data.

In [None]:
!conda install plotly

In [None]:
import numpy as np
import datetime
import pandas as pd
import matplotlib.pyplot as plot
import seaborn as sns
from pathlib import Path
from plotly import tools
import plotly.express as px
import plotly.graph_objs as go

In [None]:
# Load the shuttle entry log, which makes up the majority of the data.
# pd.read_csv already returns a DataFrame, so no pd.DataFrame(...) wrapper is needed.
df = pd.read_csv("entries.csv")

In [None]:
# Split the whitespace-separated timestamp string into its date and time parts
# and store them as two new columns next to the original timestamp.
timestamp_parts = df['timestamp'].str.split(expand=True)
df[['date', 'time']] = timestamp_parts

In [None]:
# Parse the date strings into real datetimes so the .dt accessors work later on.
df = df.assign(date=pd.to_datetime(df['date']))

In [None]:
# Add month-name and day-of-month columns derived from the parsed date
entry_dates = df['date'].dt
df['Month'] = entry_dates.month_name()
df['day'] = entry_dates.day

In [None]:
# Load in all other data sets from our shuttle data.
# pd.read_csv already returns a DataFrame, so the extra pd.DataFrame(...) wrapper
# around each call was redundant and has been removed.
mapping = pd.read_csv("stop_loop_mapping.csv")
buses = pd.read_csv("buses.csv")
loops = pd.read_csv("loops.csv")
stops = pd.read_csv("stops.csv")
users = pd.read_csv("users.csv")

In [None]:
# Index values for replacement.
# Each expression below builds an id -> name Series but does not assign it to
# anything, so this cell leaves no lasting state behind; it only raises a
# KeyError if the `id` column or the name column is missing. The replacement
# cell that follows rebuilds the same lookups itself.
stops.set_index('id')['stop_name']
loops.set_index('id')['loop_name']
buses.set_index('id')['bus_number']
# Last expression of the cell, so this string is what the notebook displays.
'Indexed!'

In [None]:
# Replace ID values with names for categorical data.
# Each entry pairs a column of `df` with the id -> name Series used to translate it.
id_lookups = {
    'stop_id': stops.set_index('id')['stop_name'],
    'loop_id': loops.set_index('id')['loop_name'],
    'bus_id': buses.set_index('id')['bus_number'],
}
for id_column, name_by_id in id_lookups.items():
    df[id_column] = df[id_column].replace(name_by_id)

In [None]:
# Clean up column titles and examine current state of main data set.
# Keys that do not match an existing column are simply ignored by rename().
column_titles = {
    'stop_id': 'Stop ID',
    'boarded': 'Students Boarded',
    'loop_name': 'Loop Name',
    'loop_id': 'Loop ID',
    'driver_id': 'Driver ID',
    'id': 'ID',
    'bus_id': 'Bus ID',
    'left_behind': 'Students Left Behind',
    'time': 'Time',
    'month': 'Month',
    'day': 'Day',
    'day_of_year': 'Day of Year',
    'hour': 'Hour',
}
df = df.rename(columns=column_titles)
# The soft-delete flag is not used in the analysis.
df = df.drop(columns=['is_deleted'])
df

In [None]:
# Aggregate shuttle data by day: one row per (date, loop) with the mean and
# total boardings and the total number of students left behind.
entries_by_day_and_loop = df.groupby(['date', 'Loop ID'])
day_df = entries_by_day_and_loop.agg(**{
    'Mean Students Boarded': ('Students Boarded', 'mean'),
    'Total Students Boarded': ('Students Boarded', 'sum'),
    'Total Students Left Behind': ('Students Left Behind', 'sum'),
}).reset_index()
day_df

In [None]:
# Ensure categorical data is treated as such: the stop, loop and bus columns
# hold labels, not quantities, so store them with the category dtype.
df = df.astype({label_column: 'category'
                for label_column in ('Stop ID', 'Loop ID', 'Bus ID')})

In [None]:
# Check the data types for each column. Displayed as the cell output; nothing
# is modified here.
df.dtypes

In [None]:
# Load in weather data to begin cleaning it; the CSV is fetched straight from
# the shared GitHub repository, then previewed.
weather_url = "https://raw.githubusercontent.com/kaberry2/DSCI689/main/datasets/muncie_weather.csv"
weather_df = pd.read_csv(weather_url)
weather_df.head(n=5)

In [None]:
# Cleaning and grouping dates within weather data: `dt` holds epoch seconds,
# from which a timestamp plus separate hour and calendar-date columns are derived.
weather_df['timestamp'] = pd.to_datetime(weather_df['dt'], unit='s')
observed_at = weather_df['timestamp'].dt
weather_df['hour'] = observed_at.hour
weather_df['date'] = observed_at.date
weather_df[['hour', 'date']]

In [None]:
# Convert the weather measurements to US customary units.
# NOTE(review): the precipitation divisor was 2.54 (centimetres per inch). The
# column layout (`dt`, Kelvin `temp`, `rain_1h`, `snow_1h`) matches OpenWeather
# exports, which report rain/snow volume in millimetres, so the divisor is now
# 25.4 -- confirm the source units against the data set's documentation.
MM_PER_INCH = 25.4
MPS_TO_MPH = 2.237  # presumably wind_speed arrives in metres/second -- TODO confirm

# Temperature: Kelvin -> Fahrenheit
weather_df['temp'] = (weather_df['temp'] - 273.15) * 1.8 + 32

# Precipitation: depth -> inches
for precip_column in ['rain_1h', 'rain_3h', 'snow_1h', 'snow_3h']:
    weather_df[precip_column] = weather_df[precip_column] / MM_PER_INCH

# Wind speed: -> miles per hour
weather_df['wind_speed'] = weather_df['wind_speed'] * MPS_TO_MPH

In [None]:
# Group based on established aggregation of day: collapse the hourly weather
# observations to one row per calendar date, with flat column names.
aggregated_weather_df = (
    weather_df
    .groupby('date')
    .agg(
        temp_mean=('temp', 'mean'),
        temp_min=('temp', 'min'),
        temp_max=('temp', 'max'),
        humidity_mean=('humidity', 'mean'),
        wind_speed_mean=('wind_speed', 'mean'),
        rain_1h_median=('rain_1h', 'median'),
        rain_3h_median=('rain_3h', 'median'),
        snow_1h_median=('snow_1h', 'median'),
        snow_3h_median=('snow_3h', 'median'),
    )
    .reset_index()
)
aggregated_weather_df

In [None]:
# Export the daily weather aggregate as a new data set.
# The file is written relative to the notebook's working directory instead of a
# user-specific absolute Windows path, so the export works on any machine and
# lands where the later pd.read_csv("agg_weather.csv") cell looks for it.
# The index is still written; it comes back as the 'Unnamed: 0' column that a
# later cell drops.
path = Path("agg_weather.csv")
path.parent.mkdir(parents=True, exist_ok=True)
aggregated_weather_df.to_csv(path)

In [None]:
# Load in cleaned weather data set after it was uploaded to github shared space.
# The path is relative, so a copy of agg_weather.csv has to be present in the
# notebook's working directory when this cell runs.
weather = pd.read_csv("agg_weather.csv")

In [None]:
# Dates come back from the CSV as strings; parse them so they can be matched
# against the shuttle data's datetime column.
weather = weather.assign(date=pd.to_datetime(weather['date']))
weather

In [None]:
# Make sure the shuttle dates are datetimes, then join the daily shuttle
# aggregate with the daily weather on the column(s) the two frames share.
day_df['date'] = day_df['date'].pipe(pd.to_datetime)

merged_df = pd.merge(day_df, weather)

In [None]:
# Alternative kept for reference (not executed): an outer join on 'date', which
# would also keep dates present in only one of the two frames.
#merged_df = pd.merge(day_df, weather, how='outer', on = 'date')
# Display the merged frame produced by the previous cell.
merged_df

In [None]:
# Discard the leftover index column that came in with the weather CSV.
merged_df = merged_df.drop(columns=['Unnamed: 0'])

In [None]:
# Clean up column titles and examine current state of main data set
weather_titles = {
    'date': 'Date',
    'temp_mean': 'Temp Mean',
    'temp_min': 'Temp Low',
    'temp_max': 'Temp High',
    'humidity_mean': 'Humidity Mean',
    'wind_speed_mean': 'Wind Speed Mean',
    'rain_1h_median': 'Rain One Hour Median',
    'rain_3h_median': 'Rain Three Hour Median',
    'snow_1h_median': 'Snow One Hour Median',
    'snow_3h_median': 'Snow Three Hour Median',
}
merged_df = merged_df.rename(columns=weather_titles)
merged_df

In [None]:
# Add the month name to the merged frame, then point `df` at it.
# `df` and `merged_df` are the same object from here on, so columns added
# through either name show up under both.
merged_df['Month'] = merged_df['Date'].dt.month_name()
df = merged_df

In [None]:
# Label each row with its meteorological season; any month value that is not
# one of the twelve names below falls back to 'Other'.
months_by_season = {
    'Winter': ['December', 'January', 'February'],
    'Spring': ['March', 'April', 'May'],
    'Summer': ['June', 'July', 'August'],
    'Fall': ['September', 'October', 'November'],
}
season_by_month = {month: season
                   for season, months in months_by_season.items()
                   for month in months}
df['Season'] = df['Month'].map(season_by_month).fillna('Other')
df

In [None]:
# Export the merged shuttle + weather frame as a new data set.
# Written relative to the notebook's working directory rather than to a
# user-specific absolute Windows path, so the cell runs on any machine.
path = Path("merged_df.csv")
path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(path)

In [None]:
# Total boardings per loop across the whole merged period.
loop_sum = df.groupby('Loop ID')['Total Students Boarded'].sum()

In [None]:
# Greyscale shades (matplotlib reads a numeric string as a grey level).
color_set = ['0.80','0.60','0.40','0.20']
# Share of total boardings carried by each loop.
loop_sum.plot(kind='pie', normalize=True, autopct='%1.1f%%', colors=color_set)
# From this graph we can see the Demand Response loop is the least busy loop.

In [None]:
# Total boardings per season across the whole merged period.
season_sum = df.groupby('Season')['Total Students Boarded'].sum()

In [None]:
# Greyscale shades (matplotlib reads a numeric string as a grey level).
color_set = ['0.80','0.60','0.40','0.20']
# Share of total boardings that falls in each season.
season_sum.plot(kind='pie', normalize=True, autopct='%1.1f%%', colors=color_set)

From this, we can see that the fall 2021 semester was more than twice as busy as the spring 2021 semester.

In [None]:
# Mean of the per-loop daily totals for each (Date, Temp High) pair, kept as a
# one-column DataFrame.
loop_by_day = (
    df.groupby(["Date", "Temp High"])["Total Students Boarded"]
    .mean()
    .to_frame()
)
loop_by_day

In [None]:
# Show the (Date, Temp High) groups ordered from highest to lowest mean
# boardings. The sorted frame is only displayed; loop_by_day itself is not
# reassigned and keeps its original order.
loop_by_day.sort_values("Total Students Boarded", ascending=False)

In [None]:
## Scaling the data for a better visualization: ridership, temperature, rain
## and snow live on very different ranges, so each series is standardised
## before being overlaid on one chart.

from sklearn import preprocessing

scaled_mean_students = preprocessing.scale(merged_df['Mean Students Boarded'])
scaled_mean_temp = preprocessing.scale(merged_df['Temp Mean'])
scaled_1h_rain = preprocessing.scale(merged_df['Rain One Hour Median'])
scaled_1h_snow = preprocessing.scale(merged_df['Snow One Hour Median'])

# (legend label, standardised values) for every trace on the chart.
scaled_traces = [
    ("Mean Students Boarded", scaled_mean_students),
    ("Mean Temperature", scaled_mean_temp),
    ("Rain One Hour Median", scaled_1h_rain),
    ("Snow One Hour Median", scaled_1h_snow),
]

fig = go.Figure()
for trace_name, scaled_values in scaled_traces:
    # The date column was renamed to "Date" earlier in the notebook, so the old
    # lowercase "date" key would raise a KeyError here.
    fig.add_trace(go.Scatter(name=trace_name, x=merged_df["Date"],
                             y=scaled_values, mode="markers"))

fig.show()

In [None]:
# Total daily boardings over time, one colour per loop.
# `merged_loop_df` is never defined in this notebook; `merged_df` already holds
# one row per (date, loop), and its date column is named "Date" after the
# earlier rename, so plot from that frame instead.
fig2 = px.scatter(merged_df, x="Date", y="Total Students Boarded", color="Loop ID")

fig2.show()