In [1]:
import numpy as np
import pandas as pd
import math

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import preprocessing

## Read in the data and remove bad values

In [2]:
df = pd.read_csv('final_ML_data.csv')
filters = (df['Windspeed'] != '#REF!') & (df['Humidity'] != '#REF!') & (df['Pressure3pm'].isnull() == False) # eliminating nonsense rows
df_final = df[filters].reset_index(drop=True)

In [3]:
df_final

Unnamed: 0.1,Unnamed: 0,Date,Location,MinTemp,MaxTemp,AvgTemp,Rainfall,WindGustDir,WindGustSpeed,WindDir9am,...,Pressure,Temp9am,Temp3pm,acq_date,count,latitude,longitude,brightness,bright_t31,confidence
0,0.0,1/01/2015,Albury,11.4,33.5,22.45,0.0,WSW,30.0,ESE,...,1012.25,21.0,32.7,,,,,,,
1,1.0,2/01/2015,Albury,15.5,39.6,27.55,0.0,NE,56.0,ESE,...,1014.20,25.6,38.2,,,,,,,
2,2.0,3/01/2015,Albury,17.1,38.3,27.7,0.0,NNE,48.0,NE,...,1015.10,29.2,37.0,,,,,,,
3,3.0,4/01/2015,Albury,26.0,33.1,29.55,0.0,NNE,41.0,ESE,...,1012.85,27.4,30.9,,,,,,,
4,4.0,5/01/2015,Albury,19.0,35.2,27.1,0.0,E,33.0,SSE,...,1016.05,25.6,32.5,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36678,38528.0,26/12/2016,Hobart,9.1,17.4,11.35,0.0,SW,35.0,E,...,1023.50,10.8,37.3,,,,,,,
36679,38529.0,27/12/2016,Hobart,7.4,21.2,12.9,0.0,SW,35.0,ENE,...,1024.20,11.4,31.5,,,,,,,
36680,38530.0,28/12/2016,Hobart,0.6,16.5,9.2,0.0,S,41.0,N,...,1023.15,14.2,32.6,,,,,,,
36681,38531.0,29/12/2016,Hobart,2.6,17.7,13.25,10.4,SE,33.0,SSE,...,1020.75,17.2,33.4,,,,,,,


## Pick what columns we need

In [4]:
cols_to_pick = [5,6,13,16,19]
all_X = df_final.iloc[:, cols_to_pick] # not accounting for wind directions right now
all_Y = df_final.iloc[:, 28] # confidence data

## Pre-process the data

In [5]:
x = all_X.values #returns a numpy array
min_max_scaler = preprocessing.StandardScaler()
x_scaled = min_max_scaler.fit_transform(x)
all_X = pd.DataFrame(x_scaled)
all_X

Unnamed: 0,0,1,2,3,4
0,0.829868,-0.277960,-0.947168,-2.186020,-0.690328
1,1.706641,-0.277960,-0.947168,-2.254907,-0.392397
2,1.732429,-0.277960,0.501373,-2.358239,-0.254891
3,2.050474,-0.277960,-1.210539,-1.359368,-0.598657
4,1.629279,-0.277960,-1.078854,-0.980486,-0.109745
...,...,...,...,...,...
36678,-1.078404,-0.277960,0.172159,-0.808267,1.028502
36679,-0.811933,-0.277960,-0.486269,-0.601604,1.135452
36680,-1.448024,-0.277960,-1.605596,0.121716,0.975028
36681,-0.751763,1.002858,-0.552111,-0.153834,0.608344


In [6]:
for i in enumerate(all_Y): # setting null confidence values to 0
    if math.isnan(i[1]):
        all_Y[i[0]] = 0

Unnamed: 0,0
0,0.0
1,0.0
2,0.0
3,0.0
4,0.0
...,...
36678,0.0
36679,0.0
36680,0.0
36681,0.0


In [7]:
X_train, X_test, y_train, y_test = train_test_split(all_X, all_Y)

clf = LinearRegression() 
clf.fit(X_train, y_train)
clf.score(X_test, y_test) # this is a surprisingly low score but from what I've seen we shouldn't put too much faith in them
                          # i.e. we shouldn't be too worried

0.0007509138822485717

In [31]:
clf.predict(df.iloc[0:80, cols_to_pick])

array([[3.56903626],
       [3.59532146],
       [3.56280801],
       [3.51560953],
       [3.48011298],
       [3.52666708],
       [3.49369973],
       [3.38968013],
       [3.21527959],
       [3.13102455],
       [3.36681286],
       [3.47131062],
       [3.22557548],
       [3.35889029],
       [3.35268117],
       [3.41273447],
       [3.40077815],
       [3.48368922],
       [3.49526584],
       [3.43003673],
       [3.37528021],
       [3.43232559],
       [3.43821198],
       [3.40334648],
       [3.42154259],
       [3.46672534],
       [3.40955319],
       [3.41729899],
       [3.47729607],
       [3.43880293],
       [3.44396548],
       [3.41840787],
       [3.46282841],
       [3.48418028],
       [3.48566453],
       [3.47195108],
       [3.48570806],
       [3.51769332],
       [3.49164968],
       [3.51089827],
       [3.49974819],
       [3.48559649],
       [3.46259349],
       [3.47015709],
       [3.2546149 ],
       [3.38292914],
       [3.44054096],
       [3.445