In [64]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import fbeta_score

# 1. Data import

In [2]:
# import the preprocessed data 
data = pd.read_csv('preprocessed_data.csv')
target = pd.read_csv('target.csv')

data.drop(columns ="Unnamed: 0", inplace=True)
target.drop(columns ="Unnamed: 0", inplace=True)

display(data)
display(target)

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,WindGustSpeed,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,RainToday,HumidityDelta,PressureDelta,DeltaTemp
0,21,13.4,22.9,0.6,44.0,71.0,22.0,1007.7,1007.1,0.0,-49.0,-0.6,4.9
1,21,7.4,25.1,0.0,44.0,44.0,25.0,1010.6,1007.8,0.0,-19.0,-2.8,7.1
2,21,12.9,25.7,0.0,46.0,38.0,30.0,1007.6,1008.7,0.0,-8.0,1.1,2.2
3,21,9.2,28.0,0.0,24.0,45.0,16.0,1017.6,1012.8,0.0,-29.0,-4.8,8.4
4,21,17.5,32.3,1.0,41.0,82.0,33.0,1010.8,1006.0,0.0,-49.0,-4.8,11.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...
142188,1,3.5,21.8,0.0,31.0,59.0,27.0,1024.7,1021.2,0.0,-32.0,-3.5,11.5
142189,1,2.8,23.4,0.0,31.0,51.0,24.0,1024.6,1020.3,0.0,-27.0,-4.3,12.3
142190,1,3.6,25.3,0.0,22.0,56.0,21.0,1023.5,1019.1,0.0,-35.0,-4.4,13.6
142191,1,5.4,26.9,0.0,37.0,53.0,24.0,1021.0,1016.8,0.0,-29.0,-4.2,13.6


Unnamed: 0,RainTomorrow
0,0
1,0
2,0
3,0
4,0
...,...
142188,0
142189,0
142190,0
142191,0


In [3]:
# drop rows where there is nan (side effects of our preprocessing script)

index_to_keep = np.where(~data.applymap(np.isnan).apply(any, axis=1))[0]
data = data.iloc[index_to_keep]
target = target.iloc[index_to_keep]

In [4]:
# Train/test split our data

X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.3)

# 2. Decision tree classifier

### 1. Train and predict on test dataset

In [84]:
# train classifier on train dataset
clf =DecisionTreeClassifier()
clf.fit(X_train, y_train)

# prediction on test dataset
y_pred = clf.predict(X_test)

print(accuracy_score(y_pred, y_test)) # prediction accuracy

0.7904761904761904


In [85]:
# estimation of the mean and variance of accuracy for our predictions
scores = cross_val_score(clf, X_test, y_test, cv=5)
str(scores.mean()) + " +/- " +  str(scores.std())

'0.7667084377610693 +/- 0.005843476028108124'

### 2. Results analysis

In [86]:
# Did the classifier learned a specific decision strategy ? (per example : it labels easily to 0, because it's the class of the majority)

# prediction score on the sub set of days with RainTomorrow = 1 :
idx = np.where(y_test==1)[0]

print(accuracy_score(y_pred[idx], y_test.iloc[idx])) 
# accuracy is 30% worse when we consider only the days with "RainTommorow" = 1 

0.545957284515637


In [83]:
# since our class are unbalanced, we may want to use f-beta-measure 

beta = len(np.where(target==1)[0])/len(np.where(target==0)[0])
print("beta = ",beta) # we take beta = the ratio of the minoritary class to the majoritary class

print("f-beta score =", fbeta_score(y_pred, y_test, beta))

beta =  0.2804681065006472
f-beta score = 0.5184024501689455




In [76]:
# Average performances and variance of the model
# We learn the same model multiple times, using 5-fold stratified learning
# We evaluate each model f beta score, after prediction on the test set

skf = StratifiedKFold(n_splits=5,shuffle=True)
scores = []

for train_index, test_index in skf.split(data, target):
    # 4 folds compose the training set, one for the validation set
    X_train, X_test = data.iloc[train_index], data.iloc[test_index]
    y_train, y_test = target.iloc[train_index], target.iloc[test_index]
    
    # train our model
    clf = DecisionTreeClassifier()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    scores.append(fbeta_score(y_pred, y_test, beta = beta))


print(str(np.mean(scores)) + " +/- " + str(np.std(scores)))

0.5457194026575017 +/- 0.006979298937955948


### 3. Model optimisation

In [63]:
# Gridsearch for best estimator



0.2894736842105263