In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.linear_model import LinearRegression

In [2]:
# Load the 2019 MBTA station table from the CSV export.
data = pd.read_csv(filepath_or_buffer="data/2019_mbta.csv")

In [3]:
# Preview the first five rows to sanity-check the columns.
data.head(n=5)

Unnamed: 0,ObjectId,station_name,avg_weekday_entries,residents,jobs,potential
0,13.0,Alewife,11514.0,3419.0,1432.0,4851.0
1,35.0,Andrew,5721.0,20312.0,5263.0,25575.0
2,39.0,Aquarium,5130.0,72652.0,123003.0,195655.0
3,30.0,Arlington,6813.0,136870.0,84792.0,221662.0
4,19.0,Ashmont,8841.0,28878.0,3103.0,31981.0


In [13]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
# NOTE(review): the saved output of this cell already lists an `offset` column,
# which this notebook only creates in a later cell - the result depends on run order.
data.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,ObjectId,avg_weekday_entries,residents,jobs,potential,offset
count,55.0,55.0,55.0,55.0,55.0,55.0
mean,28.0,8335.109091,60187.4,37914.8,98102.2,-89767.090909
std,16.02082,5323.085918,55279.191396,55291.736824,106778.899699,104653.192049
min,1.0,521.0,1522.0,368.0,4111.0,-338657.0
25%,14.5,4583.0,18209.0,3126.0,21729.5,-150929.5
50%,28.0,7041.0,34317.0,6293.0,42897.0,-37045.0
75%,41.5,11317.5,97816.0,58182.5,160572.0,-15520.5
max,55.0,24639.0,177919.0,187297.0,355228.0,6663.0


In [14]:
# Missing values: report per-column NaN counts, drop incomplete rows, re-check.
# Only the final expression of a cell is echoed automatically, so the "before"
# counts have to be printed explicitly or they are never shown.
print(data.isnull().sum())

# Drop every row that contains at least one NaN.
data = data.dropna()

# NaN counts after dropping; every column should now report 0.
data.isnull().sum()


ObjectId               0
station_name           0
avg_weekday_entries    0
residents              0
jobs                   0
potential              0
offset                 0
dtype: int64

In [10]:
# Ridership vs. station "potential": compute the gap and chart it per station.


def plot_offset_bars(stations, title):
    """Show an interactive bar chart of the ``offset`` column, one bar per station.

    Parameters
    ----------
    stations : pandas.DataFrame
        Rows to plot; must contain ``station_name`` and ``offset`` columns.
    title : str
        Figure title, so each chart states which subset of stations it shows.

    Returns
    -------
    plotly.graph_objects.Figure
        The figure, after it has been displayed.
    """
    fig = px.bar(stations, x="station_name", y="offset", hover_data=["station_name"])
    fig.update_layout(title=title)
    fig.show()
    return fig


# offset = actual weekday entries minus the station's `potential` column,
# so a positive value means ridership exceeds the potential figure.
data["offset"] = data["avg_weekday_entries"] - data["potential"]

# Split by sign; a station whose offset is exactly 0 lands in neither subset.
positives = data[data["offset"] > 0]
negatives = data[data["offset"] < 0]

# Each chart gets its own title so the two figures can be told apart.
fig = plot_offset_bars(
    positives,
    "Boston MBTA Ridership by Station Potential - Entries Above Potential",
)
fig = plot_offset_bars(
    negatives,
    "Boston MBTA Ridership by Station Potential - Entries Below Potential",
)

# TODO: draw a line of the avg_weekday_entries and potential



In [29]:
# Fit a linear regression: weekday entries as a function of nearby jobs and residents.
x = data[["jobs", "residents"]]
y = data["avg_weekday_entries"]

model = LinearRegression().fit(x, y)

# In-sample predictions (for the same rows the model was fitted on),
# stored next to the actual values.
predicted = model.predict(x)
data["predicted"] = predicted

# Interactive actual-vs-predicted scatter; hovering a point shows the station name.
axis_labels = {
    "avg_weekday_entries": "Actual Ridership",
    "predicted": "Predicted Ridership",
}
fig = px.scatter(
    data,
    x="avg_weekday_entries",
    y="predicted",
    hover_data=["station_name"],
    labels=axis_labels,
)

# Reference diagonal y = x over the observed ridership range:
# a point lying on it is predicted exactly.
actual = data["avg_weekday_entries"]
diagonal = [actual.min(), actual.max()]
fig.add_scatter(
    x=diagonal,
    y=diagonal,
    mode="lines",
    name="Perfect Prediction",
    line={"color": "red", "dash": "dash"},
)

fig.update_layout(
    title="Actual vs Predicted Ridership",
    xaxis_title="Actual Ridership",
    yaxis_title="Predicted Ridership",
)
fig.show()





In [37]:
# Rank stations by the gap between the regression's prediction and observed ridership.
# offset = predicted - actual, so a positive value means the model expects more
# riders than the station actually gets.
# NOTE: this replaces the `offset` column that the bar-chart cell filled with
# avg_weekday_entries - potential.
data["offset"] = data["predicted"] - data["avg_weekday_entries"]

# How many stations to list in each direction.
top_cnt = 8

# Stations the model over-predicts (positive offset) and under-predicts (negative offset).
over_predicted = data["predicted"].gt(data["avg_weekday_entries"])
under_predicted = data["predicted"].lt(data["avg_weekday_entries"])
predicted_positives = data.loc[over_predicted]
predicted_negatives = data.loc[under_predicted]

# Largest positive gaps first / most negative gaps first.
top_positives = predicted_positives.sort_values("offset", ascending=False).iloc[:top_cnt]
top_negatives = predicted_negatives.sort_values("offset", ascending=True).iloc[:top_cnt]

print(top_positives["station_name"])
print(top_negatives["station_name"])





8               Bowdoin
2              Aquarium
14            Chinatown
49        Suffolk Downs
21    Government Center
9              Boylston
54          Wood Island
44           Savin Hill
Name: station_name, dtype: object
23              Harvard
46        South Station
18    Downtown Crossing
12              Central
28          Kendall/MIT
34        North Station
6              Back Bay
20         Forest Hills
Name: station_name, dtype: object


In [34]:
# Inspect the Chinatown row.
is_chinatown = data["station_name"].eq("Chinatown")
chinatown_row = data.loc[is_chinatown]

# Bare last expression so the notebook renders the row as a table.
chinatown_row



Unnamed: 0,ObjectId,station_name,avg_weekday_entries,residents,jobs,potential,offset,predicted
14,34.0,Chinatown,5747.0,158572.0,132902.0,291474.0,6530.872193,12277.872193


In [38]:
# Inspect the Back Bay row (one of the stations listed with the most negative
# predicted-minus-actual offset). The variable is named after the station the
# filter actually selects.
back_bay_row = data[data["station_name"] == "Back Bay"]

back_bay_row





Unnamed: 0,ObjectId,station_name,avg_weekday_entries,residents,jobs,potential,offset,predicted
6,7.0,Back Bay,15646.0,112848.0,71037.0,183885.0,-6135.085569,9510.914431
