In [55]:
# Import Dependencies
import pandas as pd
import plotly.express as plt
import os
import statsmodels.api as sm

In [56]:
# Path to the bike-count CSV, relative to the current working directory
bikecounts_file = os.path.join('../', 'Resources', 'bikecounts.csv')

# Read the CSV and convert the 'Day' column to datetimes in a single chain
bikecounts_df = pd.read_csv(bikecounts_file).assign(
    Day=lambda frame: pd.to_datetime(frame['Day'])
)
bikecounts_df.head()


Unnamed: 0.1,Unnamed: 0,Date,Day,High Temp (°F),Low Temp (°F),Precipitation,Brooklyn Bridge,Manhattan Bridge,Williamsburg Bridge,Queensboro Bridge,Total
0,0,2016-04-01,2016-04-01,78.1,66.0,0.01,1704.0,3126,4115.0,2552.0,11497
1,1,2016-04-02,2016-04-02,55.0,48.9,0.15,827.0,1646,2565.0,1884.0,6922
2,2,2016-04-03,2016-04-03,39.9,34.0,0.09,526.0,1232,1695.0,1306.0,4759
3,3,2016-04-04,2016-04-04,44.1,33.1,0.47 (S),521.0,1067,1440.0,1307.0,4335
4,4,2016-04-05,2016-04-05,42.1,26.1,0,1416.0,2617,3081.0,2357.0,9471


## Hypothesis
1. The number of bikes that cross the Brooklyn Bridge in a day depends on the temperature and the precipitation.
2. The lower the temperature and the higher the precipitation, the lower the number of bikes crossing the Brooklyn Bridge.


## Null Hypothesis
1. The number of bikes crossing the Brooklyn Bridge does not depend on precipitation or temperature; therefore, a lower temperature and a higher precipitation will not reduce the number of bikes crossing the bridge.

In [57]:
# Remove the leftover index column.
# errors='ignore' keeps this cell re-runnable: a second execution no longer
# raises KeyError once 'Unnamed: 0' has already been dropped.
bikecounts_df = bikecounts_df.drop(columns=['Unnamed: 0'], errors='ignore')

# Calculate the median temperature: the midpoint of the daily high and low
# (for two readings the median and the mean coincide).
bikecounts_df['Median Temp (°F)'] = (
    bikecounts_df['High Temp (°F)'] + bikecounts_df['Low Temp (°F)']
) / 2

# Define function to strip non-numeric markers from a raw precipitation value
def remove_unwanted(x):
    """Return ``x`` as text with the markers 'T', '(S)' and spaces removed.

    Parameters
    ----------
    x : str or any scalar
        Raw cell value. Non-string values (e.g. numbers, or NaN for a missing
        cell) are converted with ``str()`` first, so the function can also be
        applied to a column that is already numeric instead of raising
        ``AttributeError``.

    Returns
    -------
    str
        The cleaned text. A value made up only of markers (such as 'T')
        becomes the empty string.

    Notes
    -----
    'T' and '(S)' are presumably the "trace" and "snow" annotations of the
    source weather data — TODO confirm against the dataset description.
    """
    if not isinstance(x, str):
        # Numeric / NaN cells have no .replace(); work on their text form.
        x = str(x)
    for marker in ('T', '(S)', ' '):
        x = x.replace(marker, '')
    return x.strip()
# Clean the Precipitation column in one chain: strip the markers, turn the
# resulting empty strings into 0, then cast the column to float
bikecounts_df['Precipitation'] = (
    bikecounts_df['Precipitation']
    .apply(remove_unwanted)
    .replace('', 0)
    .astype('float64')
)
bikecounts_df.head(10)

Unnamed: 0,Date,Day,High Temp (°F),Low Temp (°F),Precipitation,Brooklyn Bridge,Manhattan Bridge,Williamsburg Bridge,Queensboro Bridge,Total,Median Temp (°F)
0,2016-04-01,2016-04-01,78.1,66.0,0.01,1704.0,3126,4115.0,2552.0,11497,72.05
1,2016-04-02,2016-04-02,55.0,48.9,0.15,827.0,1646,2565.0,1884.0,6922,51.95
2,2016-04-03,2016-04-03,39.9,34.0,0.09,526.0,1232,1695.0,1306.0,4759,36.95
3,2016-04-04,2016-04-04,44.1,33.1,0.47,521.0,1067,1440.0,1307.0,4335,38.6
4,2016-04-05,2016-04-05,42.1,26.1,0.0,1416.0,2617,3081.0,2357.0,9471,34.1
5,2016-04-06,2016-04-06,45.0,30.0,0.0,1885.0,3329,3856.0,2849.0,11919,37.5
6,2016-04-07,2016-04-07,57.0,53.1,0.09,1276.0,2581,3282.0,2457.0,9596,55.05
7,2016-04-08,2016-04-08,46.9,44.1,0.01,1982.0,3455,4113.0,3194.0,12744,45.5
8,2016-04-09,2016-04-09,43.0,37.9,0.09,504.0,997,1507.0,1502.0,4510,40.45
9,2016-04-10,2016-04-10,48.9,30.9,0.0,1447.0,2387,3132.0,2160.0,9126,39.9


 ## Linear Regression Values

In [61]:
# Temperature predictor: midpoint of the daily high and low, stored on the frame
bikecounts_df['Median Temperature (°F)'] = (
    bikecounts_df['High Temp (°F)'].add(bikecounts_df['Low Temp (°F)']).div(2)
)

# Predictors (temperature and precipitation) and response (Brooklyn Bridge count)
independent_variable = bikecounts_df[['Median Temperature (°F)', 'Precipitation']]
dependent_variable = bikecounts_df['Brooklyn Bridge']

# Prepend a constant column so the fitted model includes an intercept term
X = sm.add_constant(independent_variable)

# Ordinary least squares: Brooklyn Bridge ~ const + temperature + precipitation
regression_model = sm.OLS(dependent_variable, X)
results = regression_model.fit()

In [62]:
# Print the fitted model's summary as plain text
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:        Brooklyn Bridge   R-squared:                       0.605
Model:                            OLS   Adj. R-squared:                  0.576
Method:                 Least Squares   F-statistic:                     20.70
Date:                Fri, 08 Oct 2021   Prob (F-statistic):           3.55e-06
Time:                        22:36:53   Log-Likelihood:                -235.22
No. Observations:                  30   AIC:                             476.4
Df Residuals:                      27   BIC:                             480.6
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
const                    -

In [63]:
# Show the summary2() report of the same fit as this cell's output value
results.summary2()

0,1,2,3
Model:,OLS,Adj. R-squared:,0.576
Dependent Variable:,Brooklyn Bridge,AIC:,476.4342
Date:,2021-10-08 22:36,BIC:,480.6378
No. Observations:,30,Log-Likelihood:,-235.22
Df Model:,2,F-statistic:,20.7
Df Residuals:,27,Prob (F-statistic):,3.55e-06
R-squared:,0.605,Scale:,420260.0

0,1,2,3,4,5,6
,Coef.,Std.Err.,t,P>|t|,[0.025,0.975]
const,-237.9800,697.7538,-0.3411,0.7357,-1669.6526,1193.6925
Median Temperature (°F),51.1740,12.4783,4.1010,0.0003,25.5705,76.7774
Precipitation,-4395.3591,1208.1137,-3.6382,0.0011,-6874.2036,-1916.5145

0,1,2,3
Omnibus:,3.614,Durbin-Watson:,1.016
Prob(Omnibus):,0.164,Jarque-Bera (JB):,2.743
Skew:,-0.741,Prob(JB):,0.254
Kurtosis:,3.026,Condition No.:,571.0


## Conclusion
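A p-value below the chosen significance level (conventionally 0.05) lets us reject the null hypothesis. In our linear regression results, the p-values for both predictors are close to zero: 0.0011 for Precipitation and 0.0003 for Median Temperature. Based on these values, we reject the null hypothesis. The signs of the coefficients also match the hypothesis: the number of bikes rises with temperature (about +51 crossings per °F) and falls with precipitation (a coefficient of about -4395). There is a clear relationship between temperature and precipitation on the one hand and the number of bikes crossing the Brooklyn Bridge on the other.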