In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import os
import warnings
warnings.filterwarnings('ignore')

### 讀檔

In [2]:
weather = pd.read_csv('weatherAUS.csv') 
weather

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,45.0,16.0,1017.6,1012.8,,,18.1,26.5,No,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145455,2017-06-21,Uluru,2.8,23.4,0.0,,,E,31.0,SE,...,51.0,24.0,1024.6,1020.3,,,10.1,22.4,No,No
145456,2017-06-22,Uluru,3.6,25.3,0.0,,,NNW,22.0,SE,...,56.0,21.0,1023.5,1019.1,,,10.9,24.5,No,No
145457,2017-06-23,Uluru,5.4,26.9,0.0,,,N,37.0,SE,...,53.0,24.0,1021.0,1016.8,,,12.5,26.1,No,No
145458,2017-06-24,Uluru,7.8,27.0,0.0,,,SE,28.0,SSE,...,51.0,24.0,1019.4,1016.5,3.0,2.0,15.1,26.0,No,No


In [3]:
weather['RainTomorrow'].isnull().sum() #有3267個缺失值

3267

In [4]:
weather['RainTomorrow'].unique() #相異值長相

array(['No', 'Yes', nan], dtype=object)

In [5]:
categorical = [var for var in weather.columns if weather[var].dtype=='O']
print('There are {} categorical variables\n'.format(len(categorical)))
print('The categorical variables are :', categorical)

There are 7 categorical variables

The categorical variables are : ['Date', 'Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday', 'RainTomorrow']


In [6]:
weather[categorical].head()

Unnamed: 0,Date,Location,WindGustDir,WindDir9am,WindDir3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,W,W,WNW,No,No
1,2008-12-02,Albury,WNW,NNW,WSW,No,No
2,2008-12-03,Albury,WSW,W,WSW,No,No
3,2008-12-04,Albury,NE,SE,E,No,No
4,2008-12-05,Albury,W,ENE,NW,No,No


In [7]:
for var in categorical:
    print(var, ' contains ', len(weather[var].unique()), ' labels')

Date  contains  3436  labels
Location  contains  49  labels
WindGustDir  contains  17  labels
WindDir9am  contains  17  labels
WindDir3pm  contains  17  labels
RainToday  contains  3  labels
RainTomorrow  contains  3  labels


In [8]:
weather[categorical].isnull().sum() #object之missing value個數

Date                0
Location            0
WindGustDir     10326
WindDir9am      10566
WindDir3pm       4228
RainToday        3261
RainTomorrow     3267
dtype: int64

In [9]:
weather['Date'] = pd.to_datetime(weather['Date'])
weather['Day'] = weather['Date'].dt.day
weather['Month'] = weather['Date'].dt.month
weather['Year'] = weather['Date'].dt.year
weather.drop('Date', axis=1, inplace = True) #date已拆，所以拿掉

In [10]:
#填滿object空值
weather['WindGustDir'].fillna(weather['WindGustDir'].mode()[0], inplace=True)
weather['WindDir9am'].fillna(weather['WindDir9am'].mode()[0], inplace=True)
weather['WindDir3pm'].fillna(weather['WindDir3pm'].mode()[0], inplace=True)
weather['RainToday'].fillna(weather['RainToday'].mode()[0], inplace=True)

In [11]:
numerical = [col for col in weather.columns if weather[col].dtypes != 'O']
numerical

['MinTemp',
 'MaxTemp',
 'Rainfall',
 'Evaporation',
 'Sunshine',
 'WindGustSpeed',
 'WindSpeed9am',
 'WindSpeed3pm',
 'Humidity9am',
 'Humidity3pm',
 'Pressure9am',
 'Pressure3pm',
 'Cloud9am',
 'Cloud3pm',
 'Temp9am',
 'Temp3pm',
 'Day',
 'Month',
 'Year']

In [12]:
#print(round(weather[numerical].describe()),2)

In [13]:
#missing value in x_train
weather[numerical].isnull().sum()

MinTemp           1485
MaxTemp           1261
Rainfall          3261
Evaporation      62790
Sunshine         69835
WindGustSpeed    10263
WindSpeed9am      1767
WindSpeed3pm      3062
Humidity9am       2654
Humidity3pm       4507
Pressure9am      15065
Pressure3pm      15028
Cloud9am         55888
Cloud3pm         59358
Temp9am           1767
Temp3pm           3609
Day                  0
Month                0
Year                 0
dtype: int64

In [14]:
# impute missing values in X_train and X_test with respective column median in X_train

for df1 in [weather]:
    for col in numerical:
        col_median=weather[col].median() #找出欄位之中位數
        df1[col].fillna(col_median, inplace=True) #丟中位數進去空值中

# check again missing values in numerical variables in X_train
weather[numerical].isnull().sum()

MinTemp          0
MaxTemp          0
Rainfall         0
Evaporation      0
Sunshine         0
WindGustSpeed    0
WindSpeed9am     0
WindSpeed3pm     0
Humidity9am      0
Humidity3pm      0
Pressure9am      0
Pressure3pm      0
Cloud9am         0
Cloud3pm         0
Temp9am          0
Temp3pm          0
Day              0
Month            0
Year             0
dtype: int64

In [15]:
weather

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,...,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow,Day,Month,Year
0,Albury,13.4,22.9,0.6,4.8,8.4,W,44.0,W,WNW,...,1007.1,8.0,5.0,16.9,21.8,No,No,1,12,2008
1,Albury,7.4,25.1,0.0,4.8,8.4,WNW,44.0,NNW,WSW,...,1007.8,5.0,5.0,17.2,24.3,No,No,2,12,2008
2,Albury,12.9,25.7,0.0,4.8,8.4,WSW,46.0,W,WSW,...,1008.7,5.0,2.0,21.0,23.2,No,No,3,12,2008
3,Albury,9.2,28.0,0.0,4.8,8.4,NE,24.0,SE,E,...,1012.8,5.0,5.0,18.1,26.5,No,No,4,12,2008
4,Albury,17.5,32.3,1.0,4.8,8.4,W,41.0,ENE,NW,...,1006.0,7.0,8.0,17.8,29.7,No,No,5,12,2008
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145455,Uluru,2.8,23.4,0.0,4.8,8.4,E,31.0,SE,ENE,...,1020.3,5.0,5.0,10.1,22.4,No,No,21,6,2017
145456,Uluru,3.6,25.3,0.0,4.8,8.4,NNW,22.0,SE,N,...,1019.1,5.0,5.0,10.9,24.5,No,No,22,6,2017
145457,Uluru,5.4,26.9,0.0,4.8,8.4,N,37.0,SE,WNW,...,1016.8,5.0,5.0,12.5,26.1,No,No,23,6,2017
145458,Uluru,7.8,27.0,0.0,4.8,8.4,SE,28.0,SSE,N,...,1016.5,3.0,2.0,15.1,26.0,No,No,24,6,2017


In [16]:
# find outliers for Rainfall variable

IQR = weather.Rainfall.quantile(0.75) - weather.Rainfall.quantile(0.25)
Lower_fence = weather.Rainfall.quantile(0.25) - (IQR * 3)
Upper_fence = weather.Rainfall.quantile(0.75) + (IQR * 3)
print('Rainfall outliers are values < {lowerboundary} or > {upperboundary}'.format(lowerboundary=Lower_fence, upperboundary=Upper_fence))

Rainfall outliers are values < -1.7999999999999998 or > 2.4


In [17]:
# find outliers for Evaporation variable

IQR = weather.Evaporation.quantile(0.75) - weather.Evaporation.quantile(0.25)
Lower_fence = weather.Evaporation.quantile(0.25) - (IQR * 3)
Upper_fence = weather.Evaporation.quantile(0.75) + (IQR * 3)
print('Evaporation outliers are values < {lowerboundary} or > {upperboundary}'.format(lowerboundary=Lower_fence, upperboundary=Upper_fence))

Evaporation outliers are values < 0.39999999999999947 or > 8.8


In [18]:
# find outliers for WindSpeed9am variable

IQR = weather.WindSpeed9am.quantile(0.75) - weather.WindSpeed9am.quantile(0.25)
Lower_fence = weather.WindSpeed9am.quantile(0.25) - (IQR * 3)
Upper_fence = weather.WindSpeed9am.quantile(0.75) + (IQR * 3)
print('WindSpeed9am outliers are values < {lowerboundary} or > {upperboundary}'.format(lowerboundary=Lower_fence, upperboundary=Upper_fence))

WindSpeed9am outliers are values < -29.0 or > 55.0


In [19]:
# find outliers for WindSpeed3pm variable

IQR = weather.WindSpeed3pm.quantile(0.75) - weather.WindSpeed3pm.quantile(0.25)
Lower_fence = weather.WindSpeed3pm.quantile(0.25) - (IQR * 3)
Upper_fence = weather.WindSpeed3pm.quantile(0.75) + (IQR * 3)
print('WindSpeed3pm outliers are values < {lowerboundary} or > {upperboundary}'.format(lowerboundary=Lower_fence, upperboundary=Upper_fence))

WindSpeed3pm outliers are values < -20.0 or > 57.0


In [20]:
#去除outlier，MAX取代
def max_value(df3, variable, top):
    return np.where(df3[variable]>top, top, df3[variable])
    #np.where(condition, x, y) : 滿足條件(condition)，輸出x，不滿足輸出y。

for df3 in [weather]:
    df3['Rainfall'] = max_value(df3, 'Rainfall', 2.4) #outlier以最大值代替
    df3['Evaporation'] = max_value(df3, 'Evaporation', 8.8)
    df3['WindSpeed9am'] = max_value(df3, 'WindSpeed9am', 55)
    df3['WindSpeed3pm'] = max_value(df3, 'WindSpeed3pm', 57) 

In [21]:
weather

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,...,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow,Day,Month,Year
0,Albury,13.4,22.9,0.6,4.8,8.4,W,44.0,W,WNW,...,1007.1,8.0,5.0,16.9,21.8,No,No,1,12,2008
1,Albury,7.4,25.1,0.0,4.8,8.4,WNW,44.0,NNW,WSW,...,1007.8,5.0,5.0,17.2,24.3,No,No,2,12,2008
2,Albury,12.9,25.7,0.0,4.8,8.4,WSW,46.0,W,WSW,...,1008.7,5.0,2.0,21.0,23.2,No,No,3,12,2008
3,Albury,9.2,28.0,0.0,4.8,8.4,NE,24.0,SE,E,...,1012.8,5.0,5.0,18.1,26.5,No,No,4,12,2008
4,Albury,17.5,32.3,1.0,4.8,8.4,W,41.0,ENE,NW,...,1006.0,7.0,8.0,17.8,29.7,No,No,5,12,2008
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145455,Uluru,2.8,23.4,0.0,4.8,8.4,E,31.0,SE,ENE,...,1020.3,5.0,5.0,10.1,22.4,No,No,21,6,2017
145456,Uluru,3.6,25.3,0.0,4.8,8.4,NNW,22.0,SE,N,...,1019.1,5.0,5.0,10.9,24.5,No,No,22,6,2017
145457,Uluru,5.4,26.9,0.0,4.8,8.4,N,37.0,SE,WNW,...,1016.8,5.0,5.0,12.5,26.1,No,No,23,6,2017
145458,Uluru,7.8,27.0,0.0,4.8,8.4,SE,28.0,SSE,N,...,1016.5,3.0,2.0,15.1,26.0,No,No,24,6,2017


In [22]:
#correlation = weather.corr()
#Compute pairwise correlation of columns(計算列的成對相關), excluding NA/null values

In [23]:
#plt.figure(figsize=(16,12))
#plt.title('Correlation Heatmap of Rain in Australia Dataset')
#ax = sns.heatmap(correlation, square=True, annot=True, fmt='.2f', linecolor='white')
#ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
#ax.set_yticklabels(ax.get_yticklabels(), rotation=30)           
#plt.show()

Interpretation
From the above correlation heat map, we can conclude that :-

1. MinTemp and MaxTemp variables are highly positively correlated (correlation coefficient = 0.74).

2. MinTemp and Temp3pm variables are also highly positively correlated (correlation coefficient = 0.71).

3. MinTemp and Temp9am variables are strongly positively correlated (correlation coefficient = 0.90).

4. MaxTemp and Temp9am variables are strongly positively correlated (correlation coefficient = 0.89).

5. MaxTemp and Temp3pm variables are also strongly positively correlated (correlation coefficient = 0.98).

6. WindGustSpeed and WindSpeed3pm variables are highly positively correlated (correlation coefficient = 0.69).

7. Pressure9am and Pressure3pm variables are strongly positively correlated (correlation coefficient = 0.96).

8. Temp9am and Temp3pm variables are strongly positively correlated (correlation coefficient = 0.86).

In [24]:
#num_var = ['MinTemp', 'MaxTemp', 'Temp9am', 'Temp3pm', 'WindGustSpeed', 'WindSpeed3pm', 'Pressure9am', 'Pressure3pm']
#sns.pairplot(weather[num_var], kind='scatter', diag_kind='hist', palette='Rainbow')
#plt.show()

In [25]:
weather=weather.dropna()

In [26]:
pd.get_dummies(weather.Location, drop_first=True).head()

Unnamed: 0,Albany,Albury,AliceSprings,BadgerysCreek,Ballarat,Bendigo,Brisbane,Cairns,Canberra,Cobar,...,Townsville,Tuggeranong,Uluru,WaggaWagga,Walpole,Watsonia,Williamtown,Witchcliffe,Wollongong,Woomera
0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
pd.get_dummies(weather.WindGustDir, drop_first=True,).head()

Unnamed: 0,ENE,ESE,N,NE,NNE,NNW,NW,S,SE,SSE,SSW,SW,W,WNW,WSW
0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0


In [28]:
pd.get_dummies(weather.WindDir9am, drop_first=True,).head()

Unnamed: 0,ENE,ESE,N,NE,NNE,NNW,NW,S,SE,SSE,SSW,SW,W,WNW,WSW
0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [29]:
pd.get_dummies(weather.WindDir3pm, drop_first=True,).head()

Unnamed: 0,ENE,ESE,N,NE,NNE,NNW,NW,S,SE,SSE,SSW,SW,W,WNW,WSW
0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0


In [30]:
pd.get_dummies(weather.RainToday, drop_first=True,).head()

Unnamed: 0,Yes
0,0
1,0
2,0
3,0
4,0


In [31]:
import category_encoders as ce
encoder = ce.BinaryEncoder(cols=['RainToday']) #one hot encode
weather = encoder.fit_transform(weather) #轉換過去(fit)

In [32]:
weather = pd.concat([weather['RainTomorrow'],weather[numerical], weather[['RainToday_0', 'RainToday_1']],
                     pd.get_dummies(weather.Location), 
                     pd.get_dummies(weather.WindGustDir),
                     pd.get_dummies(weather.WindDir9am),
                     pd.get_dummies(weather.WindDir3pm)], axis=1) #把以上東東合併

In [33]:
weather

Unnamed: 0,RainTomorrow,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,...,NNW,NW,S,SE,SSE,SSW,SW,W,WNW,WSW
0,No,13.4,22.9,0.6,4.8,8.4,44.0,20.0,24.0,71.0,...,0,0,0,0,0,0,0,0,1,0
1,No,7.4,25.1,0.0,4.8,8.4,44.0,4.0,22.0,44.0,...,0,0,0,0,0,0,0,0,0,1
2,No,12.9,25.7,0.0,4.8,8.4,46.0,19.0,26.0,38.0,...,0,0,0,0,0,0,0,0,0,1
3,No,9.2,28.0,0.0,4.8,8.4,24.0,11.0,9.0,45.0,...,0,0,0,0,0,0,0,0,0,0
4,No,17.5,32.3,1.0,4.8,8.4,41.0,7.0,20.0,82.0,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145454,No,3.5,21.8,0.0,4.8,8.4,31.0,15.0,13.0,59.0,...,0,0,0,0,0,0,0,0,0,0
145455,No,2.8,23.4,0.0,4.8,8.4,31.0,13.0,11.0,51.0,...,0,0,0,0,0,0,0,0,0,0
145456,No,3.6,25.3,0.0,4.8,8.4,22.0,13.0,9.0,56.0,...,0,0,0,0,0,0,0,0,0,0
145457,No,5.4,26.9,0.0,4.8,8.4,37.0,9.0,9.0,53.0,...,0,0,0,0,0,0,0,0,1,0


In [35]:
#定義X,Y
X = weather.drop(['RainTomorrow'], axis=1)
Y = weather['RainTomorrow']

In [36]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 0)

In [37]:
X_train.shape, X_test.shape

((113754, 118), (28439, 118))

In [38]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

In [40]:
Y_train.isnull().sum()

0

In [41]:
from sklearn.linear_model import LogisticRegression
# instantiate the model
logreg = LogisticRegression(solver='liblinear', random_state=0)
# fit the model
logreg.fit(X_train, Y_train)

LogisticRegression(random_state=0, solver='liblinear')

In [42]:
y_pred_test = logreg.predict(X_test)
y_pred_test

array(['No', 'No', 'No', ..., 'No', 'No', 'Yes'], dtype=object)

In [43]:
# probability of getting output as 0 - no rain
logreg.predict_proba(X_test)[:,0]

array([0.92375273, 0.85137057, 0.85638237, ..., 0.98238379, 0.83123121,
       0.36162832])