In [2]:
import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
# Load the electrical-grid stability CSV (Data_for_UCI_named.csv) into a DataFrame.
# The path is written as a raw string so that the Windows backslashes stay literal:
# sequences such as "\S", "\D" and "\E" are invalid escapes in a normal string
# literal and trigger a warning on current Python versions.
# NOTE(review): this is an absolute, machine-specific path; consider making it
# relative to a configurable data directory so the notebook runs elsewhere.
data_path = r"E:\Shubham\DATA Scientist\Electrical_Grid\Data_for_UCI_named.csv"
data_set = pd.read_csv(data_path)

In [4]:
# Print the (rows, columns) dimensions, then show the first five records.
n_rows, n_cols = data_set.shape
print((n_rows, n_cols))
data_set.head()

(10000, 14)


Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,0.055347,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,-0.005957,stable
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853,0.003471,unstable
3,0.716415,7.6696,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718,0.028871,unstable
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.79711,0.45545,0.656947,0.820923,0.04986,unstable


In [5]:
# The task is treated as a classification problem. According to the data
# description, "stab" is the numeric stability value: the system is unstable when
# stab > 0 and stable when stab < 0. "stabf" carries the same information as a
# categorical label, so only one of the two columns is needed.
# We keep "stabf" as the target and drop "stab" from further exploration.
#
# The guard makes this cell safe to re-run: without it, a second execution would
# overwrite xgb_data with a frame that has already lost "stab" and then raise a
# KeyError on the drop.
if "stab" in data_set.columns:
    # Untouched copy that still contains "stab"
    # (presumably reserved for a later XGBoost experiment - confirm).
    xgb_data = data_set.copy()
    data_set = data_set.drop(columns=["stab"])


In [6]:
# Re-check the dimensions after removing "stab" (one column fewer than before).
data_set.shape

(10000, 13)

In [7]:
# Count the missing entries in every column.
data_set.isna().sum()

tau1     0
tau2     0
tau3     0
tau4     0
p1       0
p2       0
p3       0
p4       0
g1       0
g2       0
g3       0
g4       0
stabf    0
dtype: int64

In [8]:
# No missing values were found above, so no missing-value handling is needed.
data_set.dtypes # inspect the data type of every column

tau1     float64
tau2     float64
tau3     float64
tau4     float64
p1       float64
p2       float64
p3       float64
p4       float64
g1       float64
g2       float64
g3       float64
g4       float64
stabf     object
dtype: object

In [9]:
# Skewness of every numeric feature (values near 0 indicate a roughly symmetric
# distribution). numeric_only=True is required because data_set still contains the
# object-typed "stabf" column: pandas >= 2.0 no longer drops non-numeric columns
# silently and raises a TypeError instead.
data_set.skew(axis=0, numeric_only=True)

tau1   -5.411644e-06
tau2    2.740300e-06
tau3   -4.799959e-06
tau4   -1.409046e-06
p1     -1.269001e-02
p2      4.145820e-06
p3      4.459361e-06
p4      9.157719e-07
g1      2.956458e-06
g2     -6.137462e-07
g3      3.204326e-06
g4      3.871024e-06
dtype: float64

In [10]:
from sklearn.model_selection import train_test_split
# Separate the label column from the predictors.
target_col = "stabf"
y = data_set[target_col]
X = data_set.drop(columns=[target_col])

In [11]:
# Both splits shuffle the rows and use the same fixed seed.
split_options = {"shuffle": True, "random_state": 50}

# First hold out 20% of all rows as the final test set ...
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, **split_options
)
# ... then set aside 30% of the remaining rows as the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.3, **split_options
)

In [12]:
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report 

In [13]:
# Decision tree split on entropy, limited in depth and minimum leaf size,
# with a fixed seed.
tree_params = {
    "criterion": "entropy",
    "random_state": 50,
    "max_depth": 20,
    "min_samples_leaf": 4,
}
model = DecisionTreeClassifier(**tree_params)

In [14]:
# Fit the decision tree on the training split only.
model.fit(X_train,y_train)

DecisionTreeClassifier(criterion='entropy', max_depth=20, min_samples_leaf=4,
                       random_state=50)

In [15]:
# Predict labels for the validation split with the fitted decision tree.
y_pred = model.predict(X_val)

In [16]:
# Evaluate the decision tree on the validation split: confusion matrix,
# accuracy in percent, and the per-class precision/recall/F1 report.
tree_val_cm = confusion_matrix(y_val, y_pred)
tree_val_accuracy_pct = accuracy_score(y_val, y_pred) * 100
tree_val_report = classification_report(y_val, y_pred)

print("Confusion Matrix: ", tree_val_cm)
print("Accuracy : ", tree_val_accuracy_pct)
print("Report : ", tree_val_report)

Confusion Matrix:  [[ 694  174]
 [ 191 1341]]
Accuracy :  84.79166666666667
Report :                precision    recall  f1-score   support

      stable       0.78      0.80      0.79       868
    unstable       0.89      0.88      0.88      1532

    accuracy                           0.85      2400
   macro avg       0.83      0.84      0.84      2400
weighted avg       0.85      0.85      0.85      2400



In [17]:
# The single decision tree reached only about 85% accuracy on the validation split,
# so next we try a random forest, which can handle more complex and non-linear data.
from sklearn.ensemble import RandomForestClassifier
Forest_model = RandomForestClassifier(random_state=5)

In [18]:
# Fit the random forest on the same training split used for the decision tree.
Forest_model.fit(X_train,y_train)


RandomForestClassifier(random_state=5)

In [19]:
# Predict labels for the validation split with the fitted random forest.
yf_pred = Forest_model.predict(X_val)

In [20]:
# Evaluate the random forest on the validation split: confusion matrix,
# accuracy in percent, and the per-class precision/recall/F1 report.
forest_val_cm = confusion_matrix(y_val, yf_pred)
forest_val_accuracy_pct = accuracy_score(y_val, yf_pred) * 100
forest_val_report = classification_report(y_val, yf_pred)

print("Confusion Matrix: ", forest_val_cm)
print("Accuracy : ", forest_val_accuracy_pct)
print("Report : ", forest_val_report)

Confusion Matrix:  [[ 736  132]
 [  67 1465]]
Accuracy :  91.70833333333334
Report :                precision    recall  f1-score   support

      stable       0.92      0.85      0.88       868
    unstable       0.92      0.96      0.94      1532

    accuracy                           0.92      2400
   macro avg       0.92      0.90      0.91      2400
weighted avg       0.92      0.92      0.92      2400



In [22]:
# Predict labels for the held-out test split with the fitted random forest.
ytest_pred = Forest_model.predict(X_test)

In [23]:
# Accuracy (in percent) of the random forest on the held-out test split.
test_accuracy_pct = accuracy_score(y_test, ytest_pred) * 100
print("Accuracy : ", test_accuracy_pct)

Accuracy :  90.60000000000001


In [None]:
# The accuracy of the random forest classifier is much better than that of the
# single decision tree, as it can handle more complex data.
# An XGBoost classifier could also be tried next.