In [1]:
import pandas as pd
raw_data = pd.read_csv("C:/Users/Kaden/a_weather_project/weatherdata_grabber/weather_data.csv")

In [2]:
raw_data['Datetime'] = (raw_data['Date'] + ' ' + raw_data['Time'])
raw_data['Datetime'] = pd.to_datetime(raw_data['Datetime'])

In [3]:
weather_data = raw_data[['Datetime','Temperature']]
weather_data = weather_data.drop_duplicates()

In [4]:
from datetime import timedelta

start_d = weather_data.Datetime.max()
end_d = weather_data.Datetime.min()

dates = pd.Series(pd.date_range(start=start_d, end=end_d, freq='-1H'))
missing_dates = dates.loc[~dates.isin(weather_data.Datetime)]

h = timedelta(hours=1)
missing_rows = []

for date in missing_dates:
    prev_date = date + h
    next_date = date - h
    while(weather_data.loc[weather_data['Datetime'] == prev_date].empty):
        prev_date = prev_date + h
    row_dict = {}
    row_dict = weather_data.loc[weather_data['Datetime'] == prev_date].to_dict('records')
    row_dict[0].update(Datetime=date)
    missing_rows.append(row_dict[0])

weather_data = weather_data.append(missing_rows)

In [49]:
weather_data = weather_data.sort_values('Datetime', ascending=True)
weather_data = weather_data.drop_duplicates()
weather_data.shape

(19750, 10)

In [6]:
weather_data['Day'] = weather_data.Datetime.dt.day
weather_data['Month'] = weather_data.Datetime.dt.month
weather_data['Hour'] = weather_data.Datetime.dt.hour
weather_data['DayofYear'] = weather_data.Datetime.dt.dayofyear
weather_data['WeekofYear'] = weather_data.Datetime.dt.weekofyear
weather_data['Year'] = weather_data.Datetime.dt.year

In [7]:
from sklearn.model_selection import train_test_split


weather_data['Prev_Temp'] = weather_data['Temperature'].shift()
weather_data['Prev_Temp_Diff'] = weather_data.loc[:,'Prev_Temp'].diff()
weather_data = weather_data.dropna()

features = ['Hour','DayofYear','Month', 'Day','Year','Prev_Temp','Prev_Temp_Diff']
X = pd.get_dummies(weather_data[features])
y = weather_data.Temperature

train_X,test_X,train_y, test_y = train_test_split(X,y,test_size=.2, random_state=23, shuffle=False)

In [8]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error


def model_eval(model, train_X, train_y, test_X, test_y):
    model.fit(train_X,train_y)
    predictions = model.predict(test_X)
    
    print("mean squared error %.5f" % (mean_squared_error(test_y,predictions)))
    print("mean absolute error %.5f" %(mean_absolute_error(test_y,predictions)))
    return predictions

In [9]:
model = RandomForestClassifier(n_estimators=555, n_jobs=-1, random_state=0)
p1 = model_eval(model,train_X,train_y,test_X,test_y)

mean squared error 17.64481
mean absolute error 2.95367


In [10]:
model = RandomForestRegressor(n_estimators=500, n_jobs=-1, random_state=0)
p4 = model_eval(model,train_X,train_y,test_X,test_y)

mean squared error 8.94347
mean absolute error 2.10390


In [11]:
from lightgbm import LGBMRegressor

model = LGBMRegressor(n_estimatorts=1000,learning_rate=0.1)
# model
p3 = model_eval(model,train_X,train_y,test_X,test_y)

mean squared error 7.82083
mean absolute error 1.98407


In [12]:
model = LGBMRegressor(learning_rate=0.07, num_leaves=30)
p3 = model_eval(model,train_X,train_y,test_X,test_y)

mean squared error 7.46213
mean absolute error 1.90615


In [13]:
from sklearn.linear_model import SGDRegressor
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(train_X)
trainX = scaler.transform(train_X)
testX = scaler.transform(test_X)

model = SGDRegressor()
p4 = model_eval(model,trainX,train_y,testX,test_y)

mean squared error 11.86167
mean absolute error 2.39909


In [48]:
from sklearn.linear_model import Lasso

model = Lasso(alpha=0.1, fit_intercept=True, normalize=False, precompute=False, copy_X=True, max_iter=1000, 
              tol=0.0001, warm_start=False, positive=False, random_state=26, selection='random')
p5 = model_eval(model,train_X,train_y,test_X,test_y)

mean squared error 11.75175
mean absolute error 2.39753


In [29]:
from sklearn.linear_model import Ridge

model = Ridge()
p5 = model_eval(model,train_X,train_y,test_X,test_y)

mean squared error 11.78900
mean absolute error 2.40567


In [16]:
from sklearn.linear_model import ElasticNet

model = ElasticNet()
p5 = model_eval(model,train_X,train_y,test_X,test_y)

mean squared error 11.78622
mean absolute error 2.42060


In [17]:
final_model = LGBMRegressor(learning_rate=0.07, num_leaves=30)
final_model.fit(X,y)

one_hour = timedelta(hours=1)
to_predict = pd.DataFrame()
temp = weather_data[['Temperature','Datetime']].tail(1)
temp['Datetime'] = temp['Datetime'] + one_hour
temp.reset_index(inplace=True,drop=True)

to_predict['Hour'] = temp['Datetime'].dt.hour
to_predict['DayofYear'] = temp['Datetime'].dt.dayofyear
to_predict['Month'] = temp['Datetime'].dt.month
to_predict['Day'] = temp['Datetime'].dt.day
to_predict['Year'] = temp['Datetime'].dt.year

to_predict['Prev_Temp'] = temp['Temperature']
to_predict['Prev_Temp_Diff'] = weather_data['Temperature'].values[-1] - weather_data['Temperature'].values[-2]

tom_temps = final_model.predict(to_predict)

for i in range(1,24):
    to_predict.loc[i] = to_predict.loc[i-1]
    to_predict.loc[i].Hour = i
    to_predict.loc[i].Prev_Temp = round(tom_temps.item(i-1))
    to_predict.loc[i].Prev_Temp_Diff = to_predict.loc[i].Prev_Temp - to_predict.loc[i-1].Prev_Temp
    tom_temps = final_model.predict(to_predict)

predicted_temps = pd.DataFrame()
predicted_temps['Datetime'] = weather_data['Datetime'].tail(24) + timedelta(days=1)
predicted_temps['Predicted_Temp'] = tom_temps
predicted_temps

Unnamed: 0,Datetime,Predicted_Temp
19653,2020-07-03 00:53:00,70.615279
19654,2020-07-03 01:53:00,69.360883
19655,2020-07-03 02:53:00,67.346504
19656,2020-07-03 03:53:00,65.609156
19657,2020-07-03 04:53:00,65.033893
19658,2020-07-03 05:53:00,63.953582
19659,2020-07-03 06:53:00,66.289754
19660,2020-07-03 07:53:00,69.899588
19661,2020-07-03 08:53:00,74.311514
19662,2020-07-03 09:53:00,77.762385
