# MODS206 - Data analysis in economics 2 - Applied Econometrics

### Leonardo Hannas de Carvalho Santos
### Yuri de Sene Alvizi 
### João Lucas Furtado Melga

* Why is the column 'hour' filled with zeros?
* What does the Day of the week (dow) column represent?
* What does the scale of the 'weather' feature represent?
* The R-squared got a bizarre value of zero when trying to predict the 3 features. What could be the reason for that?


* Do we need to clean the traffic data?

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [2]:
!pip install arrow



In [3]:
# Load the traffic observations from a Feather file; the path is relative to
# the notebook's working directory.
traffic = pd.read_feather('codedDataset/trafficDataset.feather')

In [4]:
# Display the loaded frame (the rendered output shows the first and last rows).
traffic

Unnamed: 0,city,station,weather,dow,datetime,hour,speed,vehicleType,energyConsumption,trafficCongestion,carCrash,pedestrianDeath
0,1,1,3,4,2021-04-01,0,31,2,9.35676,1,False,False
1,1,1,3,4,2021-04-01,0,38,2,8.35296,1,False,False
2,1,1,3,4,2021-04-01,0,29,3,11.29870,1,False,False
3,1,1,3,4,2021-04-01,0,42,1,6.56640,1,False,False
4,1,1,3,4,2021-04-01,0,41,2,7.99116,1,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...
43952647,6,10,1,6,2021-07-31,0,40,1,6.75600,3,False,False
43952648,6,10,1,6,2021-07-31,0,48,2,7.30656,3,False,False
43952649,6,10,1,6,2021-07-31,0,43,2,7.77276,3,False,False
43952650,6,10,1,6,2021-07-31,0,44,3,8.94880,3,False,False


In [5]:
# OLS regression of carCrash on speed, energyConsumption and trafficCongestion.
# NOTE(review): in the preview above carCrash holds True/False values, so this
# fits a linear probability model on an indicator rather than a crash count —
# confirm that this is the intended specification.
X = traffic.drop(
    columns=['city', 'station', 'weather', 'dow', 'datetime', 'hour',
             'vehicleType', 'carCrash', 'pedestrianDeath']
)
y = traffic['carCrash']

# 70/30 split with a fixed seed so the fit is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)
# sm.OLS does not include an intercept by itself, so add the constant column explicitly.
X_train = sm.add_constant(X_train)
# The held-out split is prepared here but not used further in this cell.
X_test = sm.add_constant(X_test)

model = sm.OLS(y_train, X_train)
results = model.fit()

# `results.params` is a label-indexed Series; use .iloc for positional access
# (plain integer keys such as params[0] are deprecated on such a Series).
print("Intercept:", results.params.iloc[0])

# Everything after the first entry is a slope; its index carries the feature names.
slopes = results.params.iloc[1:]
df_coefs = pd.DataFrame({'Feature': slopes.index, 'Slope': slopes.to_numpy()})
df_coefs

Intercept: -0.002523591215205078


Unnamed: 0,Feature,Slope
0,speed,6.7e-05
1,energyConsumption,-8e-06
2,trafficCongestion,0.000957


In [6]:
# Print the full statsmodels summary table for the fit currently held in `results`.
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:               carCrash   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                  0.000
Method:                 Least Squares   F-statistic:                     2008.
Date:                Sun, 10 Mar 2024   Prob (F-statistic):               0.00
Time:                        16:18:18   Log-Likelihood:             4.7859e+07
No. Observations:            30766856   AIC:                        -9.572e+07
Df Residuals:                30766852   BIC:                        -9.572e+07
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
const                -0.0025      0.00

In [7]:
# OLS regression of pedestrianDeath on speed, energyConsumption and trafficCongestion.
# NOTE(review): in the data preview pedestrianDeath holds True/False values, so
# this fits a linear probability model on an indicator rather than a death
# count — confirm that this is the intended specification.
X = traffic.drop(
    columns=['city', 'station', 'weather', 'dow', 'datetime', 'hour',
             'vehicleType', 'carCrash', 'pedestrianDeath']
)
y = traffic['pedestrianDeath']

# 70/30 split with a fixed seed so the fit is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)
# sm.OLS does not include an intercept by itself, so add the constant column explicitly.
X_train = sm.add_constant(X_train)
# The held-out split is prepared here but not used further in this cell.
X_test = sm.add_constant(X_test)

model = sm.OLS(y_train, X_train)
results = model.fit()

# `results.params` is a label-indexed Series; use .iloc for positional access
# (plain integer keys such as params[0] are deprecated on such a Series).
print("Intercept:", results.params.iloc[0])

# Everything after the first entry is a slope; its index carries the feature names.
slopes = results.params.iloc[1:]
df_coefs = pd.DataFrame({'Feature': slopes.index, 'Slope': slopes.to_numpy()})
df_coefs

Intercept: -0.0003265372462370252


Unnamed: 0,Feature,Slope
0,speed,8e-06
1,energyConsumption,-2e-06
2,trafficCongestion,0.000136


In [8]:
# Print the full statsmodels summary table for the fit currently held in `results`.
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:        pedestrianDeath   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                  0.000
Method:                 Least Squares   F-statistic:                     275.4
Date:                Sun, 10 Mar 2024   Prob (F-statistic):          9.64e-179
Time:                        16:18:57   Log-Likelihood:             7.8759e+07
No. Observations:            30766856   AIC:                        -1.575e+08
Df Residuals:                30766852   BIC:                        -1.575e+08
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
const                -0.0003   4.24e-0

In [9]:
# OLS regression of trafficCongestion on weather, speed and energyConsumption.
# NOTE(review): weather enters as one numeric regressor; if it is a categorical
# code, this imposes a linear ordering on its levels — confirm the encoding.
X = traffic.drop(
    columns=['city', 'station', 'dow', 'datetime', 'hour', 'vehicleType',
             'trafficCongestion', 'carCrash', 'pedestrianDeath']
)
y = traffic['trafficCongestion']

# 70/30 split with a fixed seed so the fit is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)
# sm.OLS does not include an intercept by itself, so add the constant column explicitly.
X_train = sm.add_constant(X_train)
# The held-out split is prepared here but not used further in this cell.
X_test = sm.add_constant(X_test)

model = sm.OLS(y_train, X_train)
results = model.fit()

# `results.params` is a label-indexed Series; use .iloc for positional access
# (plain integer keys such as params[0] are deprecated on such a Series).
print("Intercept:", results.params.iloc[0])

# Everything after the first entry is a slope; its index carries the feature names.
slopes = results.params.iloc[1:]
df_coefs = pd.DataFrame({'Feature': slopes.index, 'Slope': slopes.to_numpy()})
df_coefs

Intercept: 2.9565477669747393


Unnamed: 0,Feature,Slope
0,weather,0.00886
1,speed,-0.005853
2,energyConsumption,0.002121


In [13]:
# Print the full statsmodels summary table for the fit currently held in `results`.
# NOTE(review): this cell's execution count (13) is out of sequence with its
# neighbours (9 and 10), so it was run out of order; re-run the notebook top to
# bottom to make sure `results` is the model fitted in the cell just above.
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:      trafficCongestion   R-squared:                       0.005
Model:                            OLS   Adj. R-squared:                  0.005
Method:                 Least Squares   F-statistic:                 5.287e+04
Date:                Sun, 10 Mar 2024   Prob (F-statistic):               0.00
Time:                        16:19:42   Log-Likelihood:            -2.7796e+07
No. Observations:            30766856   AIC:                         5.559e+07
Df Residuals:                30766852   BIC:                         5.559e+07
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
const                 2.9565      0.00

In [10]:
# plt.figure(figsize=(12, 8))
# sns.heatmap(traffic.isnull(), cmap='coolwarm', cbar=False)
# plt.show()

In [11]:
# Correlation matrix
# plt.figure(figsize=(12, 8))
# sns.heatmap(traffic.corr(), annot=False, cmap='viridis')

In [12]:
# Residuals
# NOTE(review): `predictions` is not defined in any visible cell; it would have
# to be computed (presumably from the fitted model on X_test) before
# uncommenting the line below.
# sns.histplot((y_test - predictions), bins=50)