In [1]:
import pandas as pd
raw_data = pd.read_csv("C:/Users/Kaden/a_weather_project/weatherdata_grabber/weather_data.csv")

In [2]:
raw_data['Datetime'] = (raw_data['Date'] + ' ' + raw_data['Time'])
raw_data['Datetime'] = pd.to_datetime(raw_data['Datetime'])

In [3]:
weather_data = raw_data[['Datetime','Temperature']]
weather_data = weather_data.drop_duplicates()

In [4]:
from datetime import timedelta

start_d = weather_data.Datetime.max()
end_d = weather_data.Datetime.min()

dates = pd.Series(pd.date_range(start=start_d, end=end_d, freq='-1H'))
missing_dates = dates.loc[~dates.isin(weather_data.Datetime)]

h = timedelta(hours=1)
missing_rows = []

for date in missing_dates:
    prev_date = date + h
    next_date = date - h
    while(weather_data.loc[weather_data['Datetime'] == prev_date].empty):
        prev_date = prev_date + h
    row_dict = {}
    row_dict = weather_data.loc[weather_data['Datetime'] == prev_date].to_dict('records')
    row_dict[0].update(Datetime=date)
    missing_rows.append(row_dict[0])

weather_data = weather_data.append(missing_rows)

In [5]:
weather_data = weather_data.sort_values('Datetime', ascending=True)
weather_data = weather_data.drop_duplicates()
weather_data.shape

(19848, 2)

In [6]:
weather_data['Day'] = weather_data.Datetime.dt.day
weather_data['Month'] = weather_data.Datetime.dt.month
weather_data['Hour'] = weather_data.Datetime.dt.hour
weather_data['DayofYear'] = weather_data.Datetime.dt.dayofyear
weather_data['WeekofYear'] = weather_data.Datetime.dt.weekofyear
weather_data['Year'] = weather_data.Datetime.dt.year

In [7]:
from sklearn.model_selection import train_test_split


weather_data['Prev_Temp'] = weather_data['Temperature'].shift()
weather_data['Prev_Temp_Diff'] = weather_data.loc[:,'Prev_Temp'].diff()
weather_data = weather_data.dropna()

features = ['Hour','DayofYear','Month', 'Day','Year','Prev_Temp','Prev_Temp_Diff']
X = pd.get_dummies(weather_data[features])
y = weather_data.Temperature

train_X,test_X,train_y, test_y = train_test_split(X,y,test_size=.2, random_state=23, shuffle=False)

In [8]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from math import sqrt


# in this case it would be better to consider the RMSE value
# so I will chose the model with the lowest RMSE
def model_eval(model, train_X, train_y, test_X, test_y):
    model.fit(train_X,train_y)
    predictions = model.predict(test_X)
    
    print("mean squared error %.5f" % (mean_squared_error(test_y,predictions)))
    print("mean absolute error %.5f" %(mean_absolute_error(test_y,predictions)))
    print("root mean squared error %.5f" %(sqrt(mean_squared_error(test_y,predictions))))
    return predictions

In [9]:
model = RandomForestRegressor(n_estimators=500, n_jobs=-1, random_state=0)
p4 = model_eval(model,train_X,train_y,test_X,test_y)

mean squared error 8.46012
mean absolute error 2.03353
root mean squared error 2.90863


In [10]:
from lightgbm import LGBMRegressor

model = LGBMRegressor(n_estimatorts=1000,learning_rate=0.1)
p3 = model_eval(model,train_X,train_y,test_X,test_y)

mean squared error 7.38865
mean absolute error 1.89531
root mean squared error 2.71821


In [11]:
model = LGBMRegressor(learning_rate=0.07, num_leaves=30)
p3 = model_eval(model,train_X,train_y,test_X,test_y)

mean squared error 7.51447
mean absolute error 1.91563
root mean squared error 2.74125


In [12]:
from sklearn.linear_model import SGDRegressor
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(train_X)
trainX = scaler.transform(train_X)
testX = scaler.transform(test_X)

model = SGDRegressor()
p4 = model_eval(model,trainX,train_y,testX,test_y)

mean squared error 11.90373
mean absolute error 2.40167
root mean squared error 3.45018


In [13]:
from sklearn.linear_model import Lasso

model = Lasso(alpha=0.1, fit_intercept=True, normalize=False, precompute=False, copy_X=True, max_iter=1000, 
              tol=0.0001, warm_start=False, positive=False, random_state=26, selection='random')
p5 = model_eval(model,train_X,train_y,test_X,test_y)

mean squared error 11.88383
mean absolute error 2.41373
root mean squared error 3.44729


In [14]:
from sklearn.linear_model import Ridge

model = Ridge()
p5 = model_eval(model,train_X,train_y,test_X,test_y)

mean squared error 11.92462
mean absolute error 2.42212
root mean squared error 3.45320


In [15]:
from sklearn.linear_model import ElasticNet

model = ElasticNet()
p5 = model_eval(model,train_X,train_y,test_X,test_y)

mean squared error 11.92010
mean absolute error 2.43672
root mean squared error 3.45255


In [18]:
final_model = LGBMRegressor(n_estimatorts=1000,learning_rate=0.1) #model wiht lowest RMSE
final_model.fit(X,y)

one_hour = timedelta(hours=1)
to_predict = pd.DataFrame()
temp = weather_data[['Temperature','Datetime']].tail(1)
temp['Datetime'] = temp['Datetime'] + one_hour
temp.reset_index(inplace=True,drop=True)

to_predict['Hour'] = temp['Datetime'].dt.hour
to_predict['DayofYear'] = temp['Datetime'].dt.dayofyear
to_predict['Month'] = temp['Datetime'].dt.month
to_predict['Day'] = temp['Datetime'].dt.day
to_predict['Year'] = temp['Datetime'].dt.year

to_predict['Prev_Temp'] = temp['Temperature']
to_predict['Prev_Temp_Diff'] = weather_data['Temperature'].values[-1] - weather_data['Temperature'].values[-2]

tom_temps = final_model.predict(to_predict)

for i in range(1,24):
    to_predict.loc[i] = to_predict.loc[i-1]
    to_predict.loc[i].Hour = i
    to_predict.loc[i].Prev_Temp = round(tom_temps.item(i-1))
    to_predict.loc[i].Prev_Temp_Diff = to_predict.loc[i].Prev_Temp - to_predict.loc[i-1].Prev_Temp
    tom_temps = final_model.predict(to_predict)

predicted_temps = pd.DataFrame()
predicted_temps['Predicted_Temp'] = tom_temps
# index represents hour
predicted_temps

Unnamed: 0,Predicted_Temp
0,75.827464
1,71.977378
2,70.63625
3,69.424185
4,67.459637
5,65.844344
6,67.295793
7,70.190242
8,74.140618
9,77.904876
