## Python Regression Example

In [None]:
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
import statsmodels.api as sm_api
import statsmodels.formula.api as sm
import sklearn.linear_model
import scipy, scipy.stats
import matplotlib.pyplot as plt
from matplotlib.pyplot import scatter
%matplotlib inline

In [None]:
# Environment Canada bulk-download endpoint for Toronto weather, 2015.
# The query string requests CSV output for station 31688 and Year=2015;
# timeframe=2 is presumably the "daily" granularity -- confirm against the service docs.
url = (
    "http://climate.weather.gc.ca/climate_data/bulk_data_e.html"
    "?format=csv&stationID=31688&Year=2015&timeframe=2&submit=Download+Data"
)

In [None]:
# Fetch the CSV straight from the URL into a DataFrame.
# The first 25 lines of the response are skipped before parsing
# (presumably a station-metadata preamble above the real header -- confirm).
n_preamble_rows = 25
toronto_island_weather_2015 = pd.read_csv(url, skiprows=n_preamble_rows)
toronto_island_weather_2015

In [None]:
# Convert the "Date/Time" column to pandas datetimes and store it as "Date",
# the column later used as the merge key.
toronto_island_weather_2015["Date"] = pd.to_datetime(toronto_island_weather_2015["Date/Time"])
# Spot-check the converted value in the row labelled 0.
# .loc replaces the .ix indexer, which is deprecated and no longer available in
# current pandas; for an integer index .ix was label-based, so .loc keeps the meaning.
toronto_island_weather_2015.loc[0, "Date"]

In [None]:
# Load the Toronto watermain-break records from the local workbook.
# Only the first spreadsheet column is read, and it is labelled "Date".
breaks = pd.read_excel(
    "WatermainBreaks.xlsx",
    names=["Date"],
    usecols=[0],
)
breaks

In [None]:
# Count the breaks per day and limit the dataset to 2015
counts = breaks.Date.value_counts()

# Align the counts to every calendar day of 2015 with reindex(): days that do not
# appear in `counts` come back as NaN. Plain indexing with a list of labels
# (counts[date_range]) raises KeyError in current pandas when any label is missing.
days_2015 = pd.date_range(start="2015-01-01", end="2015-12-31")
counts_2015 = counts.reindex(days_2015)
# NOTE(review): days with no recorded break are NaN here and are removed by the
# dropna() ahead of the regression. If they should count as zero breaks, use
# counts.reindex(days_2015, fill_value=0) -- confirm the intended treatment.
counts_2015

In [None]:
# Turn the per-day counts Series into a two-column frame: "Date" (from the index)
# and "Count" (the values). The names are set explicitly rather than renaming
# whatever reset_index() produces, because the default Series/index names given
# by value_counts() differ between pandas versions.
break_counts = counts_2015.rename_axis("Date").rename("Count").reset_index()
break_counts

In [None]:
# Left-join the weather table onto the daily break counts by "Date", so every
# row of break_counts is kept, then retain only the date, the break count and
# the minimum temperature.
kept_columns = ["Date", "Count", "Min Temp (°C)"]
merged = pd.merge(
    break_counts,
    toronto_island_weather_2015,
    how="left",
    left_on="Date",
    right_on="Date",
)
break_counts_and_temps = merged[kept_columns]
break_counts_and_temps

In [None]:
# Scatter plot: break count (y) against minimum temperature (x).
min_temps = break_counts_and_temps["Min Temp (°C)"]
daily_counts = break_counts_and_temps["Count"]
plt.scatter(x=min_temps, y=daily_counts)

In [None]:
# Set up the regression design matrix for statsmodels.
# Rows containing NaN are dropped first (statsmodels does not handle NaN here);
# .copy() makes dm an independent frame, so the column assignment below does not
# write into a view of break_counts_and_temps.
dm = break_counts_and_temps.dropna().copy()
# The array-based OLS fits no intercept unless a constant column is supplied.
dm["Intercept"] = 1.0
# Use a plain-ASCII column name; later cells refer to "Min Temp (C)".
dm = dm.rename(columns={"Min Temp (°C)": "Min Temp (C)"})

Y = dm["Count"]
X = dm[["Min Temp (C)", "Intercept"]]

# Run the regression.
# OLS is taken from statsmodels.api (imported as sm_api): statsmodels.formula.api
# is the formula-string interface and does not reliably expose the upper-case,
# array-based OLS class across statsmodels versions.
result = sm_api.OLS(Y, X).fit()

# Show the results
result.summary()

In [None]:
# Scatter of the data with the fitted OLS line drawn on top.
plt.scatter(x=break_counts_and_temps["Min Temp (°C)"], y=break_counts_and_temps["Count"])
# Compute the line from the fitted coefficients instead of hand-copied numbers,
# so the plot stays in step with the regression whenever the data change.
# result.params is indexed by the design-matrix column names.
line_x = np.array([-25, 20])
line_y = result.params["Intercept"] + result.params["Min Temp (C)"] * line_x
plt.plot(line_x, line_y, 'k-')

In [None]:
# scikit-learn needs the data organized as numpy vectors:
# the single feature is reshaped into a 2-D array with one column.
Y = dm["Count"]
X = dm["Min Temp (C)"].values.reshape(-1, 1)
model = sklearn.linear_model.LinearRegression().fit(X, Y)
# print(...) with a single argument behaves the same on Python 2 and Python 3;
# the bare `print x` statement is a syntax error on Python 3.
print(model.coef_)
print(model.intercept_)

In [None]:
# Residual plot
# Residuals follow the usual convention, observed minus predicted, so points
# above the zero line are days with more breaks than the model predicts.
residuals = Y - model.predict(X)
plt.scatter(X, residuals)
plt.xlabel("Min Temp (C)")
plt.ylabel("Residuals")
# Horizontal reference line at zero residual.
plt.plot([-25, 20], [0, 0], 'k-')