In [102]:
import pandas as pd
import numpy as np

from sklearn.ensemble import ExtraTreesClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

from pandas.api.types import is_numeric_dtype
import matplotlib.pyplot as plt

orig = pd.read_csv('weatherAUS.csv')

orig.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,45.0,16.0,1017.6,1012.8,,,18.1,26.5,No,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No


In [103]:
df = orig.copy()

# YES/NO => 1/0
df.RainTomorrow = df.RainTomorrow.map(lambda e: int(e == 'Yes'))
df.RainToday = df.RainToday.map(lambda e: int(e == 'Yes'))

# "2020-03-04" => [3], [4]
month = df.Date.map(lambda e : e[5:7]).astype(int)
date = df.Date.map(lambda e : e[8:]).astype(int)
df.insert(value=month, column='month', loc=0)
df.Date = date
df = df.drop(columns=['Date'])

df.head()

Unnamed: 0,month,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,12,Albury,13.4,22.9,0.6,,,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,0,0
1,12,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,0,0
2,12,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,0,0
3,12,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,45.0,16.0,1017.6,1012.8,,,18.1,26.5,0,0
4,12,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,0,0


In [104]:
# map string values to int enums
labelEncoders = {}

for col in df.columns:
    if not is_numeric_dtype(df[col].dtype):
        le = LabelEncoder()
        le.fit(df[col])
        df[col] = le.transform(df[col])

        print(col, le.classes_)
        labelEncoders[col] = le

    df[col] = df[col].fillna(0)

df.head()

Location ['Adelaide' 'Albany' 'Albury' 'AliceSprings' 'BadgerysCreek' 'Ballarat'
 'Bendigo' 'Brisbane' 'Cairns' 'Canberra' 'Cobar' 'CoffsHarbour'
 'Dartmoor' 'Darwin' 'GoldCoast' 'Hobart' 'Katherine' 'Launceston'
 'Melbourne' 'MelbourneAirport' 'Mildura' 'Moree' 'MountGambier'
 'MountGinini' 'Newcastle' 'Nhil' 'NorahHead' 'NorfolkIsland' 'Nuriootpa'
 'PearceRAAF' 'Penrith' 'Perth' 'PerthAirport' 'Portland' 'Richmond'
 'Sale' 'SalmonGums' 'Sydney' 'SydneyAirport' 'Townsville' 'Tuggeranong'
 'Uluru' 'WaggaWagga' 'Walpole' 'Watsonia' 'Williamtown' 'Witchcliffe'
 'Wollongong' 'Woomera']
WindGustDir ['E' 'ENE' 'ESE' 'N' 'NE' 'NNE' 'NNW' 'NW' 'S' 'SE' 'SSE' 'SSW' 'SW' 'W'
 'WNW' 'WSW' nan]
WindDir9am ['E' 'ENE' 'ESE' 'N' 'NE' 'NNE' 'NNW' 'NW' 'S' 'SE' 'SSE' 'SSW' 'SW' 'W'
 'WNW' 'WSW' nan]
WindDir3pm ['E' 'ENE' 'ESE' 'N' 'NE' 'NNE' 'NNW' 'NW' 'S' 'SE' 'SSE' 'SSW' 'SW' 'W'
 'WNW' 'WSW' nan]


Unnamed: 0,month,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,12,2,13.4,22.9,0.6,0.0,0.0,13,44.0,13,...,71.0,22.0,1007.7,1007.1,8.0,0.0,16.9,21.8,0,0
1,12,2,7.4,25.1,0.0,0.0,0.0,14,44.0,6,...,44.0,25.0,1010.6,1007.8,0.0,0.0,17.2,24.3,0,0
2,12,2,12.9,25.7,0.0,0.0,0.0,15,46.0,13,...,38.0,30.0,1007.6,1008.7,0.0,2.0,21.0,23.2,0,0
3,12,2,9.2,28.0,0.0,0.0,0.0,4,24.0,9,...,45.0,16.0,1017.6,1012.8,0.0,0.0,18.1,26.5,0,0
4,12,2,17.5,32.3,1.0,0.0,0.0,13,41.0,1,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,0,0


In [105]:
# Train a decision tree to check feature importance
forest = ExtraTreesClassifier(n_estimators=250,
                              random_state=0)

X = df.drop(columns=['RainTomorrow'])
y = df.RainTomorrow

forest.fit(X, y)


ExtraTreesClassifier(n_estimators=250, random_state=0)

In [106]:
importances = forest.feature_importances_

std = np.std([tree.feature_importances_ for tree in forest.estimators_], axis=0)

indices = np.argsort(importances)[::-1] # desc

featureCnt = X.shape[1]

impDetails = []

for i in range(featureCnt):
    featureName = df.columns[i]
    impDetails.append([i + 1, featureName, importances[indices[i]]])
    
ftImp = pd.DataFrame(data=impDetails, columns=['Ranking', 'Feature', 'Importance'])

ftImp

Unnamed: 0,Ranking,Feature,Importance
0,1,month,0.145094
1,2,Location,0.0581
2,3,MinTemp,0.055207
3,4,MaxTemp,0.051953
4,5,Rainfall,0.047399
5,6,Evaporation,0.04606
6,7,Sunshine,0.043001
7,8,WindGustDir,0.042746
8,9,WindGustSpeed,0.042496
9,10,WindDir9am,0.040457


In [111]:
from sklearn.neural_network import MLPClassifier

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=1)

print('train: ', X_train.shape)
print('validation: ', X_val.shape)
print('test: ', X_test.shape)

clf = MLPClassifier(alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)
clf.fit(X_train, y_train)

train:  (104731, 22)
validation:  (11637, 22)
test:  (29092, 22)


array([1, 0, 0, ..., 0, 1, 0], dtype=int64)

In [112]:
y_predict = clf.predict(X_test)
accuracy_score(y_test, y_predict)

0.8170974838443559