In [62]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.inspection import permutation_importance

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [53]:
df = pd.read_csv('./data/multicollinearity_data.csv')
df.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,45.0,16.0,1017.6,1012.8,,,18.1,26.5,No,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No


In [54]:
df = df[list(df.columns[2:])]
df = df.drop(['WindGustDir', 'WindDir9am', 'WindDir3pm'], axis=1)
df = df.dropna()


In [55]:
X = df[list(df.columns[:-2])]
Y = df['RainTomorrow']

vif_info = pd.DataFrame()
vif_info['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif_info['Column'] = X.columns
vif_info.sort_values('VIF', ascending=False)

Unnamed: 0,VIF,Column
10,425849.003879,Pressure9am
11,424035.416584,Pressure3pm
15,673.638775,Temp3pm
1,608.073689,MaxTemp
14,208.616616,Temp9am
8,60.741577,Humidity9am
0,57.81633,MinTemp
9,47.893909,Humidity3pm
5,26.320937,WindGustSpeed
4,17.288336,Sunshine


In [56]:
X.values

array([[17.9, 35.2,  0. , ...,  5. , 26.6, 33.4],
       [18.4, 28.9,  0. , ...,  1. , 20.3, 27. ],
       [19.4, 37.6,  0. , ...,  6. , 28.7, 34.9],
       ...,
       [20.7, 32.8,  0. , ...,  0. , 24.8, 32.1],
       [19.5, 31.8,  0. , ...,  1. , 24.8, 29.2],
       [20.2, 31.7,  0. , ...,  5. , 25.4, 31. ]])

In [57]:
# Feature Engineering for mulitcollinearity factors

df['TempDiff'] = df['Temp3pm'] - df['Temp9am']
df['HumidityDiff'] = df['Humidity3pm'] - df['Humidity9am']
df['CloudDiff'] = df['Cloud3pm'] - df['Cloud9am']
df['WindSpeedDiff'] = df['WindSpeed3pm'] - df['WindSpeed9am']
df['PressureDiff'] = df['Pressure3pm'] - df['Pressure9am']

X = df.drop(['Temp3pm', 'Temp9am', 'Humidity3pm', 'Humidity9am', 'Cloud3pm', 'Cloud9am', 'WindSpeed3pm', 'WindSpeed9am', 'Pressure3pm', 'Pressure9am', 'RainToday', 'RainTomorrow'], axis=1)

X.head()


Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,TempDiff,HumidityDiff,CloudDiff,WindSpeedDiff,PressureDiff
6049,17.9,35.2,0.0,12.0,12.3,48.0,6.8,-7.0,3.0,14.0,-1.9
6050,18.4,28.9,0.0,14.8,13.0,37.0,6.7,-22.0,0.0,0.0,-0.8
6052,19.4,37.6,0.0,10.8,10.6,46.0,6.2,-20.0,5.0,-15.0,-3.1
6053,21.9,38.4,0.0,11.4,12.2,31.0,6.5,-15.0,4.0,0.0,-3.6
6054,24.2,41.0,0.0,11.2,8.4,35.0,4.0,-4.0,5.0,-4.0,-3.3


In [58]:
vif_info = pd.DataFrame()
vif_info['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif_info['Column'] = X.columns
vif_info.sort_values('VIF', ascending=False)

Unnamed: 0,VIF,Column
1,91.131365,MaxTemp
0,34.935952,MinTemp
6,14.507768,TempDiff
4,8.94055,Sunshine
5,6.924693,WindGustSpeed
3,5.858551,Evaporation
7,5.583406,HumidityDiff
10,3.786961,PressureDiff
9,1.388523,WindSpeedDiff
2,1.217792,Rainfall


In [61]:
encoder = LabelEncoder()
y_encoded = encoder.fit_transform(Y)
print(encoder.classes_)
print(y_encoded)

X_train, X_test, y_train, y_test = train_test_split(X.values, y_encoded)

model = SVC()
model.fit(X.values, y_encoded)

print(model.score(X_test, y_test))

['No' 'Yes']
[0 0 0 ... 0 0 0]
0.8383942711560972


In [64]:
result = permutation_importance(model, X.values, y_encoded, n_repeats=10, random_state=42)

perm_imp_idx = result.importances_mean.argsort()
plt.boxplot(result.importances[perm_imp_idx].T, vert=False,
            labels=X.columns[perm_imp_idx])
plt.title('Feature Importance from Rain in Australia Dataset')
plt.show()
