# Time series with Houston weather data

I pulled the average monthly temperature for the Houston area from 2002 to 2020 from the [National Weather Service](https://w2.weather.gov/climate/xmacis.php?wfo=hgx).

In [240]:
import pandas as pd
import numpy as np

%matplotlib inline
import plotly.express as px 

In [4]:
df = pd.read_excel("/Users/jnesnky/Downloads/temp.xlsx")

In [5]:
# Using the head method we see that the data is formatted with columns for each month. 
# This formatting is not optimized for visualization, so we "melt" the data into a tidy format. 

df.head()

Unnamed: 0,Year,1,2,3,4,5,6,7,8,9,10,11,12
0,2000,56.5,61.7,66.4,67.9,78.1,81.3,85.2,84.8,79.4,70.9,57.6,47.6
1,2001,49.3,59.3,56.4,71.7,75.9,80.5,83.5,83.5,77.0,66.9,63.4,55.9
2,2002,54.5,50.7,61.3,73.5,77.0,81.6,84.5,83.9,79.7,71.5,58.8,54.5
3,2003,50.1,53.8,61.4,70.5,80.7,82.7,83.3,84.7,77.7,71.7,65.0,53.8
4,2004,54.7,53.5,67.3,69.5,76.9,81.1,84.6,83.1,81.2,77.5,62.0,53.8


In [9]:
# Melting the data transitions the data from "wide" format to "long" format

df1 = df.melt(id_vars=['Year'], var_name='Month')

In [10]:
df1

Unnamed: 0,Year,Month,value
0,2000,1,56.5
1,2001,1,49.3
2,2002,1,54.5
3,2003,1,50.1
4,2004,1,54.7
...,...,...,...
247,2016,12,58.6
248,2017,12,53.7
249,2018,12,55.6
250,2019,12,57.6


In [30]:
# Initial plot of temperature 

fig = px.line(df1, x='Month', y='value', color='Year' )
fig.add_shape(
        # Line reference to the axes
            type="line",
            xref="x",
            yref="y",
            x0=1,
            y0=df1.value.mean(),
            x1=12,
            y1=df1.value.mean(),
            line=dict(
                color="black",
                width=3,
            ),
        )

In [42]:
# We use a simple linear regression to forecast tempreature. 

from sklearn.linear_model import LinearRegression

g = df1.dropna()
X = g[['Month']]
y = g[['value']]

clf = LinearRegression()
clf.fit(X,y)


g["pred_lin_regr"] = clf.predict(X) 



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [47]:
g.head()

Unnamed: 0,Year,Month,value,pred_lin_regr
0,2000,1,56.5,66.528057
1,2001,1,49.3,66.528057
2,2002,1,54.5,66.528057
3,2003,1,50.1,66.528057
4,2004,1,54.7,66.528057


In [60]:
val = list(range(1,13))
val = pd.DataFrame(val)

In [64]:
val["pred"] = clf.predict(val) 

In [237]:
colour = ['lightgrey', 'lightgrey', 'lightgrey', 'lightgrey', 'lightgrey', 'lightgrey', 'lightgrey', 'lightgrey', 'lightgrey', 'lightgrey', 'lightgrey', 'lightgrey', 'lightgrey', 'lightgrey', 'lightgrey', 'lightgrey', 'lightgrey', 'lightgrey', 'lightgrey']

In [239]:
fig = px.line(df1, x='Month', y='value', color='Year', color_discrete_sequence = colour )

fig2 = px.line(df2, x="month", y="value", color = "variable")

fig.add_trace(fig2.data[0])
fig.add_trace(fig2.data[1])
fig.add_trace(fig2.data[2])
fig.add_trace(fig2.data[3])
fig.add_trace(fig2.data[4])

fig.write_html("houstontemp.html")

In [213]:
df2.reset_index(inplace=True)

In [152]:
fig2

fig3 = px.scatter(df1, x="Month", y="value")

fig2.add_trace(fig3.data[0])

In [97]:
val.columns = ["month", "pred", "mean", "mean+1"]

In [80]:
a = pd.pivot_table(df1, values='value', index=['Month'], aggfunc=np.mean)

In [95]:
val["a"] = val["mean"].shift(1)

In [85]:
a =a.reset_index()

In [109]:
val["overall_mean"] = df1["value"].mean()

In [89]:
a

Unnamed: 0,Month,value
0,1,53.47619
1,2,57.314286
2,3,63.985714
3,4,70.180952
4,5,77.314286
5,6,82.97619
6,7,84.804762
7,8,85.333333
8,9,80.561905
9,10,71.93


In [146]:
df2 = val.melt(id_vars="month")

In [111]:
df2

Unnamed: 0,month,variable,value
0,1,pred,66.528057
1,2,pred,67.257934
2,3,pred,67.987812
3,4,pred,68.717689
4,5,pred,69.447567
5,6,pred,70.177444
6,7,pred,70.907322
7,8,pred,71.637199
8,9,pred,72.367077
9,10,pred,73.096954


In [122]:
dumb = pd.get_dummies(df1["Month"], prefix="month")

df3 = df1.merge(dumb, left_index=True, right_index=True, how='outer')

In [131]:
from sklearn.linear_model import LinearRegression

g = df3.dropna()
X = g[['month_1', 'month_2', 'month_3', 'month_4', 'month_5', 'month_6',
       'month_7', 'month_8', 'month_9', 'month_10', 'month_11', 'month_12']]
y = g[['value']]

clf = LinearRegression()
clf.fit(X,y)


g["pred_lin_regr"] = clf.predict(X) 



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [132]:
g

Unnamed: 0,Year,Month,value,month_1,month_2,month_3,month_4,month_5,month_6,month_7,month_8,month_9,month_10,month_11,month_12,pred_lin_regr
0,2000,1,56.5,1,0,0,0,0,0,0,0,0,0,0,0,54.0000
1,2001,1,49.3,1,0,0,0,0,0,0,0,0,0,0,0,54.0000
2,2002,1,54.5,1,0,0,0,0,0,0,0,0,0,0,0,54.0000
3,2003,1,50.1,1,0,0,0,0,0,0,0,0,0,0,0,54.0000
4,2004,1,54.7,1,0,0,0,0,0,0,0,0,0,0,0,54.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
246,2015,12,59.4,0,0,0,0,0,0,0,0,0,0,0,1,54.6875
247,2016,12,58.6,0,0,0,0,0,0,0,0,0,0,0,1,54.6875
248,2017,12,53.7,0,0,0,0,0,0,0,0,0,0,0,1,54.6875
249,2018,12,55.6,0,0,0,0,0,0,0,0,0,0,0,1,54.6875


In [139]:
dum = pd.get_dummies(val["month"])

In [242]:
dum["pred"] = clf.predict(dum)

ValueError: matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 12 is different from 13)

In [144]:
val["dum"] = dum["pred"]

In [145]:
val

Unnamed: 0,month,pred,mean,mean+1,overall_mean,dum
0,1,66.528057,53.47619,,70.502811,54.0
1,2,67.257934,57.314286,53.47619,70.502811,58.4375
2,3,67.987812,63.985714,57.314286,70.502811,63.6875
3,4,68.717689,70.180952,63.985714,70.502811,70.5625
4,5,69.447567,77.314286,70.180952,70.502811,77.25
5,6,70.177444,82.97619,77.314286,70.502811,82.625
6,7,70.907322,84.804762,82.97619,70.502811,84.4375
7,8,71.637199,85.333333,84.804762,70.502811,85.3125
8,9,72.367077,80.561905,85.333333,70.502811,80.1875
9,10,73.096954,71.93,80.561905,70.502811,71.8125


In [149]:
df2

Unnamed: 0,month,variable,value
0,1,pred,66.528057
1,2,pred,67.257934
2,3,pred,67.987812
3,4,pred,68.717689
4,5,pred,69.447567
5,6,pred,70.177444
6,7,pred,70.907322
7,8,pred,71.637199
8,9,pred,72.367077
9,10,pred,73.096954


In [150]:
fig = px.scatter(df1, x="Month", y="value")
fig.show()

In [None]:
from sklearn.metrics import mean_squared_error



In [177]:
df4 = df2.pivot_table(index=["month"], 
                    columns='variable', 
                    values='value')

In [195]:
df4

variable,dum,mean,mean+1,overall_mean,pred
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,54.0,53.47619,54.99,70.502811,66.528057
2,58.4375,57.314286,53.47619,70.502811,67.257934
3,63.6875,63.985714,57.314286,70.502811,67.987812
4,70.5625,70.180952,63.985714,70.502811,68.717689
5,77.25,77.314286,70.180952,70.502811,69.447567
6,82.625,82.97619,77.314286,70.502811,70.177444
7,84.4375,84.804762,82.97619,70.502811,70.907322
8,85.3125,85.333333,84.804762,70.502811,71.637199
9,80.1875,80.561905,85.333333,70.502811,72.367077
10,71.8125,71.93,80.561905,70.502811,73.096954


In [196]:
from sklearn.metrics import mean_squared_error

lit = []

for i in list(df4.columns):
    try: 
        lit.append([mean_squared_error(df4["mean"], df4[i])])
    except:
        break

In [204]:
a = list(df4.columns)

In [236]:
"'lightgrey', "*19

"'lightgrey', 'lightgrey', 'lightgrey', 'lightgrey', 'lightgrey', 'lightgrey', 'lightgrey', 'lightgrey', 'lightgrey', 'lightgrey', 'lightgrey', 'lightgrey', 'lightgrey', 'lightgrey', 'lightgrey', 'lightgrey', 'lightgrey', 'lightgrey', 'lightgrey', "