In [75]:
import numpy as np
import pandas as pd
import math
import json
import datetime

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression, ElasticNet
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn import preprocessing

## Read in the data and remove bad values

In [76]:
df = pd.read_csv('final_ML_data.csv')
filters = (df['Windspeed'] != '#REF!') & (df['Humidity'] != '#REF!') & (df['Pressure3pm'].isnull() == False) # eliminating nonsense rows
df = df[filters].reset_index(drop=True)
df["confidence"] = df["confidence"].fillna(0)

  interactivity=interactivity, compiler=compiler, result=result)


### Get lat long values from bounding box averaging

In [77]:
coordinates = {}
with open("../../geocoding/bounding_boxes.json") as f:
    coordinates = json.load(f)

df["lat"] = None
df["long"] = None

for k, v in coordinates.items():
    c = coordinates[k]
    df.loc[df["Location"] == k, ["lat",]] = np.mean([c[0], c[1]])
    df.loc[df["Location"] == k, ["long",]] = np.mean([c[2], c[3]])

df


Unnamed: 0.1,Unnamed: 0,Date,Location,MinTemp,MaxTemp,AvgTemp,Rainfall,WindGustDir,WindGustSpeed,WindDir9am,...,Temp3pm,acq_date,count,latitude,longitude,brightness,bright_t31,confidence,lat,long
0,0.0,1/01/2015,Albury,11.4,33.5,22.45,0.0,WSW,30.0,ESE,...,32.7,,,,,,,0.0,-36.0805,146.916
1,1.0,2/01/2015,Albury,15.5,39.6,27.55,0.0,NE,56.0,ESE,...,38.2,,,,,,,0.0,-36.0805,146.916
2,2.0,3/01/2015,Albury,17.1,38.3,27.7,0.0,NNE,48.0,NE,...,37.0,,,,,,,0.0,-36.0805,146.916
3,3.0,4/01/2015,Albury,26.0,33.1,29.55,0.0,NNE,41.0,ESE,...,30.9,,,,,,,0.0,-36.0805,146.916
4,4.0,5/01/2015,Albury,19.0,35.2,27.1,0.0,E,33.0,SSE,...,32.5,,,,,,,0.0,-36.0805,146.916
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36678,38528.0,26/12/2016,Hobart,9.1,17.4,11.35,0.0,SW,35.0,E,...,37.3,,,,,,,0.0,-42.8823,147.329
36679,38529.0,27/12/2016,Hobart,7.4,21.2,12.9,0.0,SW,35.0,ENE,...,31.5,,,,,,,0.0,-42.8823,147.329
36680,38530.0,28/12/2016,Hobart,0.6,16.5,9.2,0.0,S,41.0,N,...,32.6,,,,,,,0.0,-42.8823,147.329
36681,38531.0,29/12/2016,Hobart,2.6,17.7,13.25,10.4,SE,33.0,SSE,...,33.4,,,,,,,0.0,-42.8823,147.329


### Sample 25% of the data cause we don't want to bias the data to only pick 0's

In [78]:
df_non_zero = df[df["confidence"] > 0]
# df_zero = df[df["confidence"] == 0].sample(n=len(df_non_zero.index))
df_zero = df[df["confidence"] == 0].sample(frac=.25, random_state=142)

df_final = pd.concat([df_non_zero, df_zero]).sample(frac=.75, random_state=192)
df_final
# df_final = df

Unnamed: 0.1,Unnamed: 0,Date,Location,MinTemp,MaxTemp,AvgTemp,Rainfall,WindGustDir,WindGustSpeed,WindDir9am,...,Temp3pm,acq_date,count,latitude,longitude,brightness,bright_t31,confidence,lat,long
28564,29698.0,16/05/2017,MountGambier,10.3,33.1,16.7,0.0,E,56.0,E,...,12.8,,,,,,,0.0,-37.8247,140.782
35300,37148.0,15/08/2015,Walpole,16.2,26.4,20.25,0.0,NW,46.0,ESE,...,17.2,,,,,,,0.0,-34.9777,116.731
3586,3720.0,15/08/2015,Moree,1.7,25.7,13.7,0.0,SSE,63.0,ESE,...,18.8,15/08/2015,1.0,-29.1545,149.3511,316.0,302.3,71.0,-29.3053,149.777
8933,9831.0,8/05/2015,SydneyAirport,12.4,16.1,16.3,0.0,WSW,33.0,ENE,...,21.5,,,,,,,0.0,-33.95,151.179
10397,11297.0,26/11/2016,WaggaWagga,8.5,23.4,21.8,0.0,NE,50.0,W,...,14.3,,,,,,,0.0,-35.1859,147.355
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15651,16563.0,23/11/2016,Ballarat,12.3,18.4,17.5,0.0,SSE,24.0,W,...,22.5,,,,,,,0.0,-37.5623,143.861
14661,15573.0,29/08/2016,MountGinini,2.3,15.8,6.75,0.0,SE,57.0,ENE,...,31.8,,,,,,,0.0,-35.5297,148.773
1823,1833.0,19/02/2015,Cobar,21.3,34.5,27.9,0.0,SSW,37.0,N,...,31.7,,,,,,,0.0,-31.4842,145.795
20008,20937.0,1/05/2015,Nhil,9.6,10.9,8.3,0.0,SE,46.0,S,...,32.0,,,,,,,0.0,-35.471,141.306


## Pick what columns we need

In [79]:
cols_to_pick = ["MinTemp", "MaxTemp", "AvgTemp", "Rainfall", 'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm', 'Windspeed',  
'Humidity9am', 'Humidity3pm', 'Humidity', 'Pressure9am', 'Pressure3pm', 'Pressure', 'Temp9am', 'Temp3pm', 'lat', 'long']
all_X = df_final.loc[:, cols_to_pick] # not accounting for wind directions right now
all_Y = df_final.iloc[:, 28] # confidence data

## Pre-process the data

In [80]:
x = all_X.values #returns a numpy array
min_max_scaler = preprocessing.StandardScaler()
x_scaled = min_max_scaler.fit_transform(x)
all_X = pd.DataFrame(x_scaled)
all_X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,-0.279412,1.404578,-0.154226,-0.280970,1.259575,0.371448,-0.600994,0.109066,-0.632516,-1.360436,0.815123,1.112950,0.320572,0.783057,-0.233633,-1.266836,-0.928562,-0.174714
1,0.667040,0.465026,0.452068,-0.280970,0.513425,0.719910,0.073941,-1.652785,-0.357134,0.597650,-0.287411,-1.162224,-1.451087,-1.428606,0.019663,-0.634038,-0.359241,-2.218498
2,-1.658986,0.366864,-0.666588,-0.280970,1.781881,0.603756,-0.151037,-0.934994,-1.238356,-1.703101,-1.217673,-0.413082,-0.606781,-0.557574,-0.455267,-0.403930,0.775085,0.589697
3,0.057461,-0.979359,-0.222541,-0.280970,-0.456571,0.139140,0.636387,0.892111,-1.018051,-0.968819,-0.080686,0.655140,0.334413,0.540683,-0.993520,-0.015623,-0.153735,0.708771
4,-0.568160,0.044331,0.716788,-0.280970,0.811885,-0.906245,1.648789,-1.000248,1.350234,0.891362,-0.942040,-0.773780,-1.077378,-1.012025,1.460282,-1.051110,-0.400873,0.383867
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7019,0.041419,-0.656826,-0.017597,-0.280970,-1.128107,-0.557784,-1.275928,0.892111,-0.742669,0.206032,0.711760,0.946474,1.178719,1.161766,-0.471098,0.128195,-0.876085,0.086922
7020,-1.562737,-1.021428,-1.853558,-0.280970,1.334190,0.719910,0.411408,-0.412964,-0.742669,2.359927,0.987394,-0.343716,-0.385324,-0.398516,-2.196675,1.465699,-0.469633,0.504324
7021,1.485160,1.600902,1.758589,-0.280970,-0.158111,0.603756,-1.388417,-0.478218,-2.835572,-1.311484,-2.699203,2.167298,2.119913,2.343340,1.270310,1.451317,0.339363,0.251254
7022,-0.391703,-1.708563,-1.588838,-0.280970,0.513425,0.139140,1.648789,0.696349,-1.458662,0.793458,1.814294,1.709489,-0.053138,0.904244,-1.658422,1.494463,-0.457900,-0.130195


In [81]:
### Make Y be a binary value

In [82]:
all_Y = all_Y > 0
print(np.count_nonzero(np.array(all_Y) > 0), len(all_Y))

198 7024


In [83]:
### Split and train the model

In [84]:
X_train, X_test, y_train, y_test = train_test_split(all_X, all_Y, random_state=11)

# clf = ElasticNet(random_state=0, solver='liblinear', multi_class='ovr')
clf = ExtraTreesClassifier(random_state=0, n_estimators=1000)
clf.fit(X_train, y_train)
clf.score(X_test, y_test) # this is a surprisingly low score but from what I've seen we shouldn't put too much faith in them
                          # i.e. we shouldn't be too worried

0.9720956719817767

In [85]:
### Score the model on the entire DB (75% of it is untrained on)

In [86]:
all_X_fullsample = df.loc[:, cols_to_pick] # not accounting for wind directions right now
all_Y_fullsample = df.iloc[:, 28] > 0

x_fullsample = all_X_fullsample.values #returns a numpy array
x_scaled_fullsample = min_max_scaler.fit_transform(x_fullsample)
all_X_fullsample = pd.DataFrame(x_scaled_fullsample)
all_X_fullsample

clf.score(all_X_fullsample, all_Y_fullsample)

0.9969195540168471

In [87]:
clf.predict(df.loc[0:80, cols_to_pick])

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False])

**Regression on non-zero confidence rows**

In [118]:
df_non_zero['Month'] = pd.DatetimeIndex(df_non_zero['Date']).month

cols_to_pick_reg = ["AvgTemp", "Rainfall", 'WindGustSpeed', 'Windspeed', 
                    'Humidity', 'Pressure', 'lat', 'long', 'Month']
#cols_to_pick_reg = ["MinTemp", "MaxTemp", "AvgTemp", "Rainfall", 'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm', 'Windspeed',  
#'Humidity9am', 'Humidity3pm', 'Humidity', 'Pressure9am', 'Pressure3pm', 'Pressure', 'Temp9am', 'Temp3pm', 'lat', 'long', 'Month']
all_X_reg = df_non_zero.loc[:, cols_to_pick_reg] # not accounting for wind directions right now
all_Y_reg = df_non_zero.iloc[:, 28] # confidence data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [119]:
all_X_reg

Unnamed: 0,AvgTemp,Rainfall,WindGustSpeed,Windspeed,Humidity,Pressure,lat,long,Month
73,20.5,0.0,37.0,15.5,34,1011.30,-36.0805,146.916,3
88,18.4,0.0,26.0,9,48,1023.55,-36.0805,146.916,3
319,19.15,0.0,24.0,9,47.5,1018.10,-36.0805,146.916,11
376,29.75,0.0,56.0,15,29,1010.30,-36.0805,146.916,12
480,15.5,0.0,15.0,5.5,52,1030.30,-36.0805,146.916,4
...,...,...,...,...,...,...,...,...,...
33273,18.5,0.0,37.0,17.5,27.5,1029.40,-31.9527,115.86,3
33280,17.8,0.0,46.0,19,47.5,1026.00,-31.9527,115.86,10
33648,15.75,0.0,39.0,17,70,1020.75,-31.9527,115.86,1
34618,14.8,0.0,43.0,19.5,57.5,1011.35,-32.9815,121.644,3


In [122]:
x_reg = all_X_reg.values #returns a numpy array
min_max_scaler_reg = preprocessing.StandardScaler()
x_scaled_reg = min_max_scaler_reg.fit_transform(x_reg)
all_X_reg = pd.DataFrame(x_scaled_reg)
all_X_reg

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,0.315828,-0.304572,-0.262898,0.007357,-1.562529,-1.032138,-0.991083,0.058991,-0.942207
1,0.004126,-0.304572,-1.151386,-1.069713,-0.644333,0.928535,-0.991083,0.058991,-0.942207
2,0.115448,-0.304572,-1.312930,-1.069713,-0.677125,0.056236,-0.991083,0.058991,1.409904
3,1.688800,-0.304572,1.271764,-0.075494,-1.890456,-1.192193,-0.991083,0.058991,1.703918
4,-0.426319,-0.304572,-2.039875,-1.649675,-0.381991,2.008906,-0.991083,0.058991,-0.648193
...,...,...,...,...,...,...,...,...,...
254,0.018969,-0.304572,-0.262898,0.338764,-1.988834,1.864856,-0.210555,-4.671461,-0.942207
255,-0.084931,-0.304572,0.464047,0.587319,-0.677125,1.320670,-0.210555,-4.671461,1.115890
256,-0.389212,-0.304572,-0.101354,0.255912,0.798547,0.480381,-0.210555,-4.671461,-1.530234
257,-0.530220,-0.304572,0.221732,0.670170,-0.021271,-1.024135,-0.405097,-3.790518,-0.942207


In [123]:
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(all_X_reg, all_Y_reg, random_state=11)
clf_reg = LinearRegression()
clf_reg.fit(X_train_reg, y_train_reg)
clf_reg.score(X_test_reg, y_test_reg) 

-0.09270106086813558