In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
data = pd.read_csv('weatherAUS.csv')
data

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,45.0,16.0,1017.6,1012.8,,,18.1,26.5,No,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145455,2017-06-21,Uluru,2.8,23.4,0.0,,,E,31.0,SE,...,51.0,24.0,1024.6,1020.3,,,10.1,22.4,No,No
145456,2017-06-22,Uluru,3.6,25.3,0.0,,,NNW,22.0,SE,...,56.0,21.0,1023.5,1019.1,,,10.9,24.5,No,No
145457,2017-06-23,Uluru,5.4,26.9,0.0,,,N,37.0,SE,...,53.0,24.0,1021.0,1016.8,,,12.5,26.1,No,No
145458,2017-06-24,Uluru,7.8,27.0,0.0,,,SE,28.0,SSE,...,51.0,24.0,1019.4,1016.5,3.0,2.0,15.1,26.0,No,No


In [3]:
data.dropna(subset=['RainTomorrow'], inplace=True)

In [4]:
year = pd.to_datetime(data.Date).dt.year



In [5]:
data.drop(columns = 'Date', inplace= True)

In [6]:
train = data[year < 2015]
val = data[year == 2015]
test = data[year > 2015]

In [7]:
print(train.shape)
print(val.shape)
print(test.shape)

(98988, 22)
(17231, 22)
(25974, 22)


In [74]:
train_inputs = train.iloc[ : , : -1] ## x_train data 
train_targets = train.iloc[ : , -1]  ## y_train data

In [None]:
train_targets.replace(('Yes', 'No'), (1, 0), inplace=True)

In [None]:
train_targets.tolist()

In [9]:
val_inputs = val.iloc[ : , : -1] ## x_val data
val_targets = val.iloc[ : , -1] ## y_val data

In [113]:
val_targets.replace(('Yes', 'No'), (1, 0), inplace=True)

In [10]:
test_inputs = test.iloc[ : , : -1] ## x_test data 
test_targets = test.iloc[ : , -1] ## x_test data 

In [112]:
test_targets.replace(('Yes', 'No'), (1, 0), inplace=True)

In [13]:
number = train_inputs.select_dtypes(include = np.number).columns.tolist()
number

['MinTemp',
 'MaxTemp',
 'Rainfall',
 'Evaporation',
 'Sunshine',
 'WindGustSpeed',
 'WindSpeed9am',
 'WindSpeed3pm',
 'Humidity9am',
 'Humidity3pm',
 'Pressure9am',
 'Pressure3pm',
 'Cloud9am',
 'Cloud3pm',
 'Temp9am',
 'Temp3pm']

In [12]:
category = train_inputs.select_dtypes(include = 'object').columns.tolist()
category

['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday']

In [18]:
## missing values
train_inputs[number].isnull().sum().sort_values(ascending = False)

Sunshine         40696
Evaporation      37110
Cloud3pm         36766
Cloud9am         35764
Pressure9am       9345
Pressure3pm       9309
WindGustSpeed     6902
Humidity9am       1265
Humidity3pm       1186
WindSpeed3pm      1140
WindSpeed9am      1133
Rainfall          1000
Temp9am            783
Temp3pm            663
MinTemp            434
MaxTemp            198
dtype: int64

In [21]:
## Filling missing values with simple imputer

from sklearn.impute import SimpleImputer

In [23]:
imputer = SimpleImputer(strategy = 'mean').fit(data[number])

In [24]:
train_inputs[number] = imputer.transform(train_inputs[number])
val_inputs[number] = imputer.transform(val_inputs[number])
test_inputs[number] = imputer.transform(test_inputs[number])

In [26]:
train_inputs[number].isnull().sum()

MinTemp          0
MaxTemp          0
Rainfall         0
Evaporation      0
Sunshine         0
WindGustSpeed    0
WindSpeed9am     0
WindSpeed3pm     0
Humidity9am      0
Humidity3pm      0
Pressure9am      0
Pressure3pm      0
Cloud9am         0
Cloud3pm         0
Temp9am          0
Temp3pm          0
dtype: int64

In [27]:
data.describe()

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm
count,141556.0,141871.0,140787.0,81350.0,74377.0,132923.0,140845.0,139563.0,140419.0,138583.0,128179.0,128212.0,88536.0,85099.0,141289.0,139467.0
mean,12.1864,23.226784,2.349974,5.469824,7.624853,39.984292,14.001988,18.637576,68.84381,51.482606,1017.653758,1015.258204,4.437189,4.503167,16.987509,21.687235
std,6.403283,7.117618,8.465173,4.188537,3.781525,13.588801,8.893337,8.803345,19.051293,20.797772,7.105476,7.036677,2.887016,2.720633,6.492838,6.937594
min,-8.5,-4.8,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,980.5,977.1,0.0,0.0,-7.2,-5.4
25%,7.6,17.9,0.0,2.6,4.9,31.0,7.0,13.0,57.0,37.0,1012.9,1010.4,1.0,2.0,12.3,16.6
50%,12.0,22.6,0.0,4.8,8.5,39.0,13.0,19.0,70.0,52.0,1017.6,1015.2,5.0,5.0,16.7,21.1
75%,16.8,28.2,0.8,7.4,10.6,48.0,19.0,24.0,83.0,66.0,1022.4,1020.0,7.0,7.0,21.6,26.4
max,33.9,48.1,371.0,145.0,14.5,135.0,130.0,87.0,100.0,100.0,1041.0,1039.6,9.0,9.0,40.2,46.7


In [31]:
## Scaling our dataset

from sklearn.preprocessing import MinMaxScaler

In [32]:
scaler = MinMaxScaler().fit(data[number])

In [33]:
train_inputs[number] = scaler.transform(train_inputs[number])
val_inputs[number] = scaler.transform(val_inputs[number])
test_inputs[number] = scaler.transform(test_inputs[number])

In [35]:
train_inputs[number].describe()

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm
count,98988.0,98988.0,98988.0,98988.0,98988.0,98988.0,98988.0,98988.0,98988.0,98988.0,98988.0,98988.0,98988.0,98988.0,98988.0,98988.0
mean,0.483557,0.525583,0.006395,0.036934,0.524929,0.265223,0.10847,0.215734,0.686543,0.515015,0.61208,0.608796,0.483856,0.494156,0.506933,0.516827
std,0.14933,0.132006,0.022845,0.021552,0.200488,0.102531,0.068789,0.101451,0.188738,0.206181,0.111257,0.106567,0.254707,0.237394,0.134483,0.130703
min,0.0,0.013233,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0304,0.0,0.0,0.027426,0.005758
25%,0.377358,0.429112,0.0,0.026207,0.517241,0.193798,0.053846,0.149425,0.57,0.37,0.543802,0.5408,0.333333,0.333333,0.409283,0.422265
50%,0.478774,0.514178,0.0,0.037723,0.525852,0.255814,0.1,0.218391,0.69,0.52,0.614112,0.610531,0.493021,0.500352,0.50211,0.506718
75%,0.591981,0.618147,0.002156,0.038621,0.634483,0.310078,0.146154,0.275862,0.83,0.65,0.682645,0.6752,0.666667,0.666667,0.601266,0.604607
max,1.0,1.0,1.0,0.568276,0.986207,1.0,0.669231,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.988484


In [36]:
## encoding our categorical data into columns then droping the original categorical columns
from sklearn.preprocessing import OneHotEncoder

In [40]:
data[category].isnull().sum()

Location           0
WindGustDir     9330
WindDir9am     10013
WindDir3pm      3778
RainToday       1406
dtype: int64

In [41]:
data[category].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 142193 entries, 0 to 145458
Data columns (total 5 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   Location     142193 non-null  object
 1   WindGustDir  132863 non-null  object
 2   WindDir9am   132180 non-null  object
 3   WindDir3pm   138415 non-null  object
 4   RainToday    140787 non-null  object
dtypes: object(5)
memory usage: 6.5+ MB


In [44]:
##replacing the null values with previous row data

data['WindGustDir'].fillna(data['WindGustDir'].mode()[0], inplace=True)
data['WindDir9am'].fillna(data['WindDir9am'].mode()[0], inplace=True)
data['WindDir3pm'].fillna(data['WindDir3pm'].mode()[0], inplace=True)
data['RainToday'].fillna(data['RainToday'].mode()[0], inplace=True)


In [51]:
train_inputs['WindGustDir'].fillna(train_inputs['WindGustDir'].mode()[0], inplace=True)
train_inputs['WindDir9am'].fillna(train_inputs['WindDir9am'].mode()[0], inplace=True)
train_inputs['WindDir3pm'].fillna(train_inputs['WindDir3pm'].mode()[0], inplace=True)
train_inputs['RainToday'].fillna(train_inputs['RainToday'].mode()[0], inplace=True)

In [53]:
val_inputs['WindGustDir'].fillna(val_inputs['WindGustDir'].mode()[0], inplace=True)
val_inputs['WindDir9am'].fillna(val_inputs['WindDir9am'].mode()[0], inplace=True)
val_inputs['WindDir3pm'].fillna(val_inputs['WindDir3pm'].mode()[0], inplace=True)
val_inputs['RainToday'].fillna(val_inputs['RainToday'].mode()[0], inplace=True)

In [52]:
test_inputs['WindGustDir'].fillna(test_inputs['WindGustDir'].mode()[0], inplace=True)
test_inputs['WindDir9am'].fillna(test_inputs['WindDir9am'].mode()[0], inplace=True)
test_inputs['WindDir3pm'].fillna(test_inputs['WindDir3pm'].mode()[0], inplace=True)
test_inputs['RainToday'].fillna(test_inputs['RainToday'].mode()[0], inplace=True)

In [45]:
encoder = OneHotEncoder(sparse=False, handle_unknown='ignore').fit(data[category])

In [46]:
encoded_cols = list(encoder.get_feature_names(category))

In [54]:
train_inputs[encoded_cols] = encoder.transform(train_inputs[category])
val_inputs[encoded_cols] = encoder.transform(val_inputs[category])
test_inputs[encoded_cols] = encoder.transform(test_inputs[category])

In [56]:
X_train = train_inputs[number + encoded_cols]
X_val = val_inputs[number + encoded_cols]
X_test = test_inputs[number + encoded_cols]

In [83]:
## decison Tree

from sklearn.tree import DecisionTreeRegressor
regressor = DecisionTreeRegressor(random_state = 0)
regressor.fit(X_train,train_targets)

DecisionTreeRegressor(random_state=0)

In [82]:
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, train_targets)

DecisionTreeClassifier(random_state=42)

In [84]:
from sklearn.metrics import accuracy_score, confusion_matrix

In [97]:
pred = regressor.predict(X_train)

In [92]:
pred2 = model.predict(X_train)

In [101]:
unique, counts = np.unique(pred, return_counts=True)
dict(zip(unique, counts))

{0.0: 76703, 0.5: 4, 1.0: 22281}

In [105]:
unique, counts = np.unique(pred2, return_counts=True)
dict(zip(unique, counts))

{0: 76707, 1: 22281}

In [106]:
accuracy_score(train_targets, pred2)

0.9999797955307714

In [116]:
model.score(X_val, val_targets)


0.791016191747432