In [1]:
#importing required libraries
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import imblearn

In [2]:
#link to the dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00471/Data_for_UCI_named.csv"

In [3]:
#reading the csv file form the url containing the data
grid_df = pd.read_csv(url , encoding= "latin-1")
#Dataset used for analysis
grid_df

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab,stabf
0,2.959060,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,0.055347,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.781760,-0.005957,stable
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.277210,-0.920492,0.163041,0.766689,0.839444,0.109853,0.003471,unstable
3,0.716415,7.669600,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718,0.028871,unstable
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.797110,0.455450,0.656947,0.820923,0.049860,unstable
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,2.930406,9.487627,2.376523,6.187797,3.343416,-0.658054,-1.449106,-1.236256,0.601709,0.779642,0.813512,0.608385,0.023892,unstable
9996,3.392299,1.274827,2.954947,6.894759,4.349512,-1.663661,-0.952437,-1.733414,0.502079,0.567242,0.285880,0.366120,-0.025803,stable
9997,2.364034,2.842030,8.776391,1.008906,4.299976,-1.380719,-0.943884,-1.975373,0.487838,0.986505,0.149286,0.145984,-0.031810,stable
9998,9.631511,3.994398,2.757071,7.821347,2.514755,-0.966330,-0.649915,-0.898510,0.365246,0.587558,0.889118,0.818391,0.037789,unstable


In [5]:
#checking distribution of target variable
grid_df['stabf'].value_counts()

unstable    6380
stable      3620
Name: stabf, dtype: int64

In [6]:
#checking for rows with missing values
grid_df.isna().sum()

tau1     0
tau2     0
tau3     0
tau4     0
p1       0
p2       0
p3       0
p4       0
g1       0
g2       0
g3       0
g4       0
stab     0
stabf    0
dtype: int64

In [7]:
#since theres a direct relationship between columns stab and stabf we drop column stab
grid_df = grid_df.drop(columns= ['stab'])

In [8]:
grid_df.columns

Index(['tau1', 'tau2', 'tau3', 'tau4', 'p1', 'p2', 'p3', 'p4', 'g1', 'g2',
       'g3', 'g4', 'stabf'],
      dtype='object')

In [9]:
#preprocessing
x = grid_df.drop(columns=['stabf'])
y= grid_df['stabf']

In [12]:
#split data into training and testing sets16
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)

In [21]:
y_train.value_counts()


unstable    5092
stable      2908
Name: stabf, dtype: int64

In [22]:
#using SMOTE only on the training data to handle the imbalance
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state = 1)
x_train_balanced, y_balanced = smote.fit_resample(x_train, y_train)

In [25]:
y_balanced.value_counts()

unstable    5092
stable      5092
Name: stabf, dtype: int64

In [26]:
#Using the standard scaler to transform the train set
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
standard_train_df = scaler.fit_transform(x_train_balanced)
standard_train_df = pd.DataFrame(standard_train_df, columns = x_train_balanced.columns)


In [27]:
#Using the standard scaler to transform the test set
standard_test_df = scaler.transform(x_test)
standard_test_df = pd.DataFrame(standard_test_df, columns = x_test.columns)


In [28]:
#Logistic Regression

from sklearn.ensemble import ExtraTreesClassifier
etc = ExtraTreesClassifier(n_estimators = 5, criterion ='entropy', max_features = 2)

etc.fit(standard_train_df, y_balanced)

feature_importance = etc.feature_importances_

In [29]:
feature_importance

array([0.11256985, 0.11181795, 0.10871009, 0.10547743, 0.04450847,
       0.04843365, 0.04928597, 0.0495139 , 0.08831689, 0.09045177,
       0.09976888, 0.09114515])

In [30]:
max(feature_importance)

0.11256984574245

In [31]:
min(feature_importance)

0.044508470162962104