In [52]:
import numpy as np
import pandas as pd
import math
import json

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression, ElasticNet
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn import preprocessing
import pickle

## Read in the data and remove bad values

In [53]:
df = pd.read_csv('final_ML_data.csv')
filters = (df['Windspeed'] != '#REF!') & (df['Humidity'] != '#REF!') & (df['Pressure3pm'].isnull() == False) # eliminating nonsense rows
df = df[filters].reset_index(drop=True)
df["confidence"] = df["confidence"].fillna(0)

  interactivity=interactivity, compiler=compiler, result=result)


### Get lat long values from bounding box averaging

In [54]:
coordinates = {}
with open("../../geocoding/bounding_boxes.json") as f:
    coordinates = json.load(f)

df["lat"] = None
df["long"] = None

for k, v in coordinates.items():
    c = coordinates[k]
    df.loc[df["Location"] == k, ["lat",]] = np.mean([c[0], c[1]])
    df.loc[df["Location"] == k, ["long",]] = np.mean([c[2], c[3]])

df


Unnamed: 0.1,Unnamed: 0,Date,Location,MinTemp,MaxTemp,AvgTemp,Rainfall,WindGustDir,WindGustSpeed,WindDir9am,...,Temp3pm,acq_date,count,latitude,longitude,brightness,bright_t31,confidence,lat,long
0,0.0,1/01/2015,Albury,11.4,33.5,22.45,0.0,WSW,30.0,ESE,...,32.7,,,,,,,0.0,-36.0805,146.916
1,1.0,2/01/2015,Albury,15.5,39.6,27.55,0.0,NE,56.0,ESE,...,38.2,,,,,,,0.0,-36.0805,146.916
2,2.0,3/01/2015,Albury,17.1,38.3,27.7,0.0,NNE,48.0,NE,...,37.0,,,,,,,0.0,-36.0805,146.916
3,3.0,4/01/2015,Albury,26.0,33.1,29.55,0.0,NNE,41.0,ESE,...,30.9,,,,,,,0.0,-36.0805,146.916
4,4.0,5/01/2015,Albury,19.0,35.2,27.1,0.0,E,33.0,SSE,...,32.5,,,,,,,0.0,-36.0805,146.916
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36678,38528.0,26/12/2016,Hobart,9.1,17.4,11.35,0.0,SW,35.0,E,...,37.3,,,,,,,0.0,-42.8823,147.329
36679,38529.0,27/12/2016,Hobart,7.4,21.2,12.9,0.0,SW,35.0,ENE,...,31.5,,,,,,,0.0,-42.8823,147.329
36680,38530.0,28/12/2016,Hobart,0.6,16.5,9.2,0.0,S,41.0,N,...,32.6,,,,,,,0.0,-42.8823,147.329
36681,38531.0,29/12/2016,Hobart,2.6,17.7,13.25,10.4,SE,33.0,SSE,...,33.4,,,,,,,0.0,-42.8823,147.329


### Sample 25% of the data cause we don't want to bias the data to only pick 0's

In [55]:
df_non_zero = df[df["confidence"] > 0]
# df_zero = df[df["confidence"] == 0].sample(n=len(df_non_zero.index))
df_zero = df[df["confidence"] == 0].sample(frac=.25, random_state=142)

df_final = pd.concat([df_non_zero, df_zero]).sample(frac=.75, random_state=192)
df_final
# df_final = df

Unnamed: 0.1,Unnamed: 0,Date,Location,MinTemp,MaxTemp,AvgTemp,Rainfall,WindGustDir,WindGustSpeed,WindDir9am,...,Temp3pm,acq_date,count,latitude,longitude,brightness,bright_t31,confidence,lat,long
28564,29698.0,16/05/2017,MountGambier,10.3,33.1,16.7,0.0,E,56.0,E,...,12.8,,,,,,,0.0,-37.8247,140.782
35300,37148.0,15/08/2015,Walpole,16.2,26.4,20.25,0.0,NW,46.0,ESE,...,17.2,,,,,,,0.0,-34.9777,116.731
3586,3720.0,15/08/2015,Moree,1.7,25.7,13.7,0.0,SSE,63.0,ESE,...,18.8,15/08/2015,1.0,-29.1545,149.3511,316.0,302.3,71.0,-29.3053,149.777
8933,9831.0,8/05/2015,SydneyAirport,12.4,16.1,16.3,0.0,WSW,33.0,ENE,...,21.5,,,,,,,0.0,-33.95,151.179
10397,11297.0,26/11/2016,WaggaWagga,8.5,23.4,21.8,0.0,NE,50.0,W,...,14.3,,,,,,,0.0,-35.1859,147.355
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15651,16563.0,23/11/2016,Ballarat,12.3,18.4,17.5,0.0,SSE,24.0,W,...,22.5,,,,,,,0.0,-37.5623,143.861
14661,15573.0,29/08/2016,MountGinini,2.3,15.8,6.75,0.0,SE,57.0,ENE,...,31.8,,,,,,,0.0,-35.5297,148.773
1823,1833.0,19/02/2015,Cobar,21.3,34.5,27.9,0.0,SSW,37.0,N,...,31.7,,,,,,,0.0,-31.4842,145.795
20008,20937.0,1/05/2015,Nhil,9.6,10.9,8.3,0.0,SE,46.0,S,...,32.0,,,,,,,0.0,-35.471,141.306


## Pick what columns we need

In [56]:
cols_to_pick = ["AvgTemp", "Rainfall", 'Windspeed', 'Humidity', 'Pressure', 'lat', 'long']
all_X = df_final.loc[:, cols_to_pick] # not accounting for wind directions right now
all_Y = df_final.iloc[:, 28] # confidence data

## Pre-process the data

In [57]:
x = all_X.values #returns a numpy array
min_max_scaler = preprocessing.StandardScaler()
x_scaled = min_max_scaler.fit_transform(x)
all_X = pd.DataFrame(x_scaled)
all_X

Unnamed: 0,0,1,2,3,4,5,6
0,-0.154226,-0.280970,0.109066,0.815123,0.783057,-0.928562,-0.174714
1,0.452068,-0.280970,-1.652785,-0.287411,-1.428606,-0.359241,-2.218498
2,-0.666588,-0.280970,-0.934994,-1.217673,-0.557574,0.775085,0.589697
3,-0.222541,-0.280970,0.892111,-0.080686,0.540683,-0.153735,0.708771
4,0.716788,-0.280970,-1.000248,-0.942040,-1.012025,-0.400873,0.383867
...,...,...,...,...,...,...,...
7019,-0.017597,-0.280970,0.892111,0.711760,1.161766,-0.876085,0.086922
7020,-1.853558,-0.280970,-0.412964,0.987394,-0.398516,-0.469633,0.504324
7021,1.758589,-0.280970,-0.478218,-2.699203,2.343340,0.339363,0.251254
7022,-1.588838,-0.280970,0.696349,1.814294,0.904244,-0.457900,-0.130195


In [58]:
### Make Y be a binary value

In [59]:
all_Y = all_Y > 0
print(np.count_nonzero(np.array(all_Y) > 0), len(all_Y))

198 7024


In [60]:
### Split and train the model

In [61]:
X_train, X_test, y_train, y_test = train_test_split(all_X, all_Y, random_state=11)

# clf = ElasticNet(random_state=0, solver='liblinear', multi_class='ovr')
clf = ExtraTreesClassifier(random_state=0, n_estimators=1000)
clf.fit(X_train, y_train)
clf.score(X_test, y_test) # this is a surprisingly low score but from what I've seen we shouldn't put too much faith in them
                          # i.e. we shouldn't be too worried

0.9732346241457859

In [62]:
### Score the model on the entire DB (75% of it is untrained on)

In [63]:
all_X_fullsample = df.loc[:, cols_to_pick] # not accounting for wind directions right now
all_Y_fullsample = df.iloc[:, 28] > 0

x_fullsample = all_X_fullsample.values #returns a numpy array
x_scaled_fullsample = min_max_scaler.fit_transform(x_fullsample)
all_X_fullsample = pd.DataFrame(x_scaled_fullsample)
all_X_fullsample

clf.score(all_X_fullsample, all_Y_fullsample)

0.9962653000027261

In [64]:
clf.predict(df.loc[700:1000, cols_to_pick])

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False,

In [65]:
df.loc[0, cols_to_pick]

AvgTemp        22.45
Rainfall           0
Windspeed          9
Humidity        29.5
Pressure     1012.25
lat         -36.0805
long         146.916
Name: 0, dtype: object

In [75]:
a = clf.predict(df.loc[:, cols_to_pick])
a[1:1000]

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False,

In [68]:
if True in a:
    print(True)
else:
    print("hi")

hi


### Save the model into a pickle

In [14]:
# pickle.dump(clf, open("../pickles/binary_extra_trees_2.sav", 'wb'))