In [1]:
# importing my libraries

import pandas as pd
import numpy as np

In [2]:
# importing my dataset
df = pd.read_csv("Data_for_UCI_named.csv")

# viewing the head of my dataset
df.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,0.055347,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,-0.005957,stable
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853,0.003471,unstable
3,0.716415,7.6696,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718,0.028871,unstable
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.79711,0.45545,0.656947,0.820923,0.04986,unstable


In [3]:
# dropping the stab column because it is highly correlated with stabf
df.drop("stab", axis = 1, inplace=True)

# viewing only the first row of my dataset
df.head(1)

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,unstable


In [4]:
# importing my encoder
from sklearn.preprocessing import LabelEncoder

# instantiating my model as an object
encoder = LabelEncoder()

In [5]:
# encoding my target variable 

df["stabf"] = encoder.fit_transform(df["stabf"])

#### AFTER ENCODING MY TARGET VARIABLE
- unstable became 1 and then 
- stable became 0

In [6]:
# importing train test split to split my dataset into train and test set

from sklearn.model_selection import train_test_split

In [7]:
# defining my features and target and setting them into new variables

X = df.drop("stabf", axis = 1)
y = df["stabf"]

# splitting my target and features into train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [8]:
# importing my standard scaler 
from sklearn.preprocessing import StandardScaler

# instantiating the standard scaler as an object
scaler = StandardScaler()

In [9]:
# checking the shape of my train and test features before scaling them 

X_train.shape, X_test.shape

((8000, 12), (2000, 12))

In [None]:
# scaling the features of my train dataset using the standard and setting it back into a dataframe

normalized_train_df = pd.DataFrame(scaler.fit_transform(X_train), columns = X_train.columns)

please note that my x_train has now been turned to normalized_train_df

In [11]:
# scaling the features of my test dataset using the standard and setting it back into a dataframe

normalized_test_df = pd.DataFrame(scaler.transform(X_test), columns = X_test.columns)

please note that my x_test has now been turned to normalized_test_df

In [12]:
# checking the shape of my X_train, normalized_train_df, X_test and normalized_test_df for consistency

X_train.shape, normalized_train_df.shape, X_test.shape, normalized_test_df.shape

((8000, 12), (8000, 12), (2000, 12), (2000, 12))

## Using the random forest

In [13]:
#importing the random forest

from sklearn.ensemble import RandomForestClassifier

# instantiating the random forest classifier
rfc = RandomForestClassifier(random_state = 1)

In [14]:
# fitting the random forest to my dataaset
rfc.fit(normalized_train_df, y_train)

RandomForestClassifier(random_state=1)

In [15]:
# using the random forest to make predictions

random_forest_predictions = rfc.predict(normalized_test_df)

In [16]:
# importing the metrics to evaluate the performance of my my model

from sklearn.metrics import classification_report, recall_score, accuracy_score, precision_score, f1_score, confusion_matrix

In [229]:
# using the metrics to evaluate the performace of my model

print("The recall score for the random forest classifiction is: ", round(recall_score(y_test,random_forest_predictions), 4))
print("The accuracy score for the random forest classifiction is: ", round(accuracy_score(y_test, random_forest_predictions), 4))
print("The precision score for the random forest classifiction is: ", round(precision_score(y_test, random_forest_predictions), 4))
print("The f1_score for the random forest classifiction is: ", round(f1_score(y_test, random_forest_predictions), 4))
print("\n")
print("THIS IS THE CLASSIFICATION REPORT OF THE RANDOM FOREST PREDICTION")
print("=============================================================================")
print("\n")
print(classification_report(y_test, random_forest_predictions))

The recall score for the random forest classifiction is:  0.9573
The accuracy score for the random forest classifiction is:  0.929
The precision score for the random forest classifiction is:  0.9341
The f1_score for the random forest classifiction is:  0.9456


THIS IS THE CLASSIFICATION REPORT OF THE RANDOM FOREST PREDICTION


              precision    recall  f1-score   support

           0       0.92      0.88      0.90       712
           1       0.93      0.96      0.95      1288

    accuracy                           0.93      2000
   macro avg       0.93      0.92      0.92      2000
weighted avg       0.93      0.93      0.93      2000



In [179]:
print("\n")
print("THIS IS THE CONFUSION MATRIX FOR THE RANDOM FOREST CLASSIFICATION")
print("====================================================================")
random_forest_cnf = confusion_matrix(y_test, random_forest_predictions)

random_forest_cnf = pd.DataFrame(random_forest_cnf, columns = ["True-Positive", "True-Negative"],
                                 index = ["Predicted-Positive", "Predicted-Negative"])
random_forest_cnf



THIS IS THE CONFUSION MATRIX FOR THE RANDOM FOREST CLASSIFICATION


Unnamed: 0,True-Positive,True-Negative
Predicted-Positive,625,87
Predicted-Negative,55,1233


## Using extra tress classifier 

In [66]:
# importing  the random extra tree 
from sklearn.ensemble import ExtraTreesClassifier

# instantiating the extra tress classifier as an object
extra_tree = ExtraTreesClassifier(random_state = 1)

In [67]:
# fitting the extra tree ot my train dataset
extra_tree.fit(normalized_train_df, y_train)

ExtraTreesClassifier(random_state=1)

In [68]:
# using the extra trees to make predictions
extra_tree_predictions = extra_tree.predict(normalized_test_df)

In [254]:
# Evaluating the metrics of the model

print("The recall score for the extra trees classifiction is: ", round(recall_score(y_test, extra_tree_predictions), 4))
print("The accuracy score for the extra trees classifiction is: ", accuracy_score(y_test, extra_tree_predictions))
print("The precision score for the extra trees classifiction is: ", round(precision_score(y_test, extra_tree_predictions), 4))
print("The f1_score for the extra trees classifiction is: ", round(f1_score(y_test, extra_tree_predictions), 4))
print("\n")
print("THIS IS THE CLASSIFICATION REPORT OF THE EXTRA TREE PREDICTIONS")
print("==================================================================")
print("\n")
print(classification_report(y_test, extra_tree_predictions))

The recall score for the extra trees classifiction is:  0.9705
The accuracy score for the extra trees classifiction is:  0.928
The precision score for the extra trees classifiction is:  0.9218
The f1_score for the extra trees classifiction is:  0.9455


THIS IS THE CLASSIFICATION REPORT OF THE EXTRA TREE PREDICTIONS


              precision    recall  f1-score   support

           0       0.94      0.85      0.89       712
           1       0.92      0.97      0.95      1288

    accuracy                           0.93      2000
   macro avg       0.93      0.91      0.92      2000
weighted avg       0.93      0.93      0.93      2000



In [182]:
print("\n")
print("THIS IS THE CONFUSION MATRIX FOR THE RANDOM FOREST CLASSIFICATION")
extra_tree_cnf = confusion_matrix(y_test, extra_tree_predictions)

extra_tree_cnf = pd.DataFrame(extra_tree_cnf, columns = ["True-Positive", "True-Negative"],
                                 index = ["Predicted-Positive", "Predicted-Negative"])
print("==================================================================")
extra_tree_cnf



THIS IS THE CONFUSION MATRIX FOR THE RANDOM FOREST CLASSIFICATION


Unnamed: 0,True-Positive,True-Negative
Predicted-Positive,606,106
Predicted-Negative,38,1250


## Using the lightgbm

In [202]:
# importing the light gbm
import lightgbm as lgb

# Instantiating the object of the model
lgbm = lgb.LGBMClassifier()

In [203]:
lgbm.fit(normalized_train_df, y_train)

LGBMClassifier()

In [204]:
lgbm_prediction = lgbm.predict(normalized_test_df)

In [230]:
# Evaluating the performance of the model using the metric

print("The recall score for the classifiction is: ", round(recall_score(y_test, lgbm_prediction), 4))
print("The accuracy score for the classifiction is: ", round(accuracy_score(y_test, lgbm_prediction), 4))
print("The precision score for the classifiction is: ", round(precision_score(y_test, lgbm_prediction), 4))
print("The f1_score for the classifiction is: ", round(f1_score(y_test, lgbm_prediction), 4))
print("\n")
print("THIS IS THE CLASSIFICATION REPORT OF THE LIGHTGBM PREDICTIONS")
print("==============================================================")
print("\n")
print(classification_report(y_test, lgbm_prediction))

The recall score for the classifiction is:  0.9627
The accuracy score for the classifiction is:  0.9375
The precision score for the classifiction is:  0.9415
The f1_score for the classifiction is:  0.952


THIS IS THE CLASSIFICATION REPORT OF THE LIGHTGBM PREDICTIONS


              precision    recall  f1-score   support

           0       0.93      0.89      0.91       712
           1       0.94      0.96      0.95      1288

    accuracy                           0.94      2000
   macro avg       0.94      0.93      0.93      2000
weighted avg       0.94      0.94      0.94      2000



In [209]:
print("\n")
print("THIS IS THE CONFUSION MATRIX FOR THE LIGHTGBM")
lightgbm_cnf = confusion_matrix(y_test, lgbm_prediction)

lightgbm_cnf = pd.DataFrame(lightgbm_cnf, columns = ["True-Positive", "True-Negative"],
                                 index = ["Predicted-Positive", "Predicted-Negative"])
print("==================================================")
lightgbm_cnf



THIS IS THE CONFUSION MATRIX FOR THE LIGHTGBM


Unnamed: 0,True-Positive,True-Negative
Predicted-Positive,635,77
Predicted-Negative,48,1240


## Using the XGBoost

In [167]:
# importing my xgb classifier
from xgboost import XGBClassifier

# instantiating the model as an object
xgb = XGBClassifier(random_state = 1)

In [168]:
# fitting our xgb to the trainning set
xgb.fit(normalized_train_df, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=1,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [169]:
# making predictions with our xgboost
xgb_predictions = xgb.predict(normalized_test_df)

In [211]:
# Evaluating the metrics of the model

print("The recall score for the classifiction is: ", round(recall_score(y_test, xgb_predictions), 4))
print("The accuracy score for the classifiction is: ", round(accuracy_score(y_test, xgb_predictions), 4))
print("The precision score for the classifiction is: ", round(precision_score(y_test, xgb_predictions), 4))
print("The f1_score for the classifiction is: ", round(f1_score(y_test, xgb_predictions), 4))
print("\n")
print("THIS IS THE CLASSIFICATION REPORT OF THE GRADIENT BOOST")
print("========================================================")
print("\n")
print(classification_report(y_test, xgb_predictions))

The recall score for the classifiction is:  0.9651
The accuracy score for the classifiction is:  0.9455
The precision score for the classifiction is:  0.951
The f1_score for the classifiction is:  0.958


THIS IS THE CLASSIFICATION REPORT OF THE GRADIENT BOOST


              precision    recall  f1-score   support

           0       0.94      0.91      0.92       712
           1       0.95      0.97      0.96      1288

    accuracy                           0.95      2000
   macro avg       0.94      0.94      0.94      2000
weighted avg       0.95      0.95      0.95      2000



In [213]:
print("\n")
print("THIS IS THE CONFUSION MATRIX FOR THE GRADIENT BOOSTING")
xgb_cnf = confusion_matrix(y_test, xgb_predictions)

xgb_cnf = pd.DataFrame(xgb_cnf, columns = ["True-Positive", "True-Negative"],
                                 index = ["Predicted-Positive", "Predicted-Negative"])
print("========================================================")
xgb_cnf



THIS IS THE CONFUSION MATRIX FOR THE GRADIENT BOOSTING


Unnamed: 0,True-Positive,True-Negative
Predicted-Positive,648,64
Predicted-Negative,45,1243


## improving our extra tress using the randomized cross validation search

In [87]:
# importing the evaluating model to search through our models to get the best parameters

from sklearn.model_selection import RandomizedSearchCV

In [95]:
# creating a dictionary of the tuning the Randomizedsearchcv will use 

param = {"n_estimators" : [50, 100, 300, 500, 1000],
         "min_samples_split" : [2, 3, 5, 7, 9],
         "min_samples_leaf" : [1, 2, 4, 6, 8], 
         "max_features" : ['auto', 'sqrt', 'log2', None],
        } 

In [235]:
# instantiating the randomized search cv and setting my cv to 5 so the randomized cv can ceate five folds and run
# cross validation on each fold and setting my scoring as accuracy
# that is the randomizeed search cv will use the accuracy to score each fold

rscv = RandomizedSearchCV(estimator = extra_tree,
                          param_distributions = param,
                          random_state = 1, 
                          cv = 5,
                          scoring = "accuracy",
                          n_jobs = -1, 
                          verbose = 1,
                         )

In [236]:
# fitting the randomized search cv to our training set
search = rscv.fit(normalized_train_df, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  1.2min finished


In [237]:
# getting the best parameters for the best model
search.best_params_

{'n_estimators': 1000,
 'min_samples_split': 2,
 'min_samples_leaf': 8,
 'max_features': None}

In [238]:
# using the model to make predictions. the randomized search cv will use the best parameters
search_predictions = rscv.predict(normalized_test_df)

In [239]:
# Evaluate the performance of our model using the metrics 

print("The recall score for the classifiction is: ", round(recall_score(y_test, search_predictions), 4))
print("The accuracy score for the classifiction is: ", round(accuracy_score(y_test, search_predictions), 4))
print("The precision score for the classifiction is: ", round(precision_score(y_test, search_predictions), 4))
print("The f1_score for the classifiction is: ", round(f1_score(y_test, search_predictions), 4))
print("\n")
print("THIS IS THE CLASSIFICATION REPORT OF THE RANDOMIZED SEARCH CV")
print("========================================================")
print("\n")
print(classification_report(y_test, search_predictions))

The recall score for the classifiction is:  0.9589
The accuracy score for the classifiction is:  0.927
The precision score for the classifiction is:  0.93
The f1_score for the classifiction is:  0.9442


THIS IS THE CLASSIFICATION REPORT OF THE RANDOMIZED SEARCH CV


              precision    recall  f1-score   support

           0       0.92      0.87      0.89       712
           1       0.93      0.96      0.94      1288

    accuracy                           0.93      2000
   macro avg       0.93      0.91      0.92      2000
weighted avg       0.93      0.93      0.93      2000



In [240]:
print("\n")
print("THIS IS THE CONFUSION MATRIX FOR THE RANDOMIZED SEARCH CV")
search_cnf = confusion_matrix(y_test, search_predictions)

search_cnf = pd.DataFrame(search_cnf, columns = ["True-Positive", "True-Negative"],
                                 index = ["Predicted-Positive", "Predicted-Negative"])
print("=============================================================")
search_cnf



THIS IS THE CONFUSION MATRIX FOR THE RANDOMIZED SEARCH CV


Unnamed: 0,True-Positive,True-Negative
Predicted-Positive,619,93
Predicted-Negative,53,1235


In [244]:
new_extra_tree = ExtraTreesClassifier(n_estimators = 1000,
                                      min_samples_split = 2,
                                      min_samples_leaf = 8,
                                      max_features = None
)

In [246]:
new_extra_tree.fit(normalized_train_df, y_train)

ExtraTreesClassifier(max_features=None, min_samples_leaf=8, n_estimators=1000)

In [252]:
new_tree_preditions = new_extra_tree.predict(normalized_test_df)

In [253]:
print(accuracy_score(y_test, new_tree_preditions ))

0.926


In [250]:
# getting the importamnce of each feature in the the extra_tree 
extra_tree_importance = new_extra_tree.feature_importances_

In [251]:
# putting the features and their importance in a dataframe
pd.DataFrame(extra_tree_importance*100, index = normalized_train_df.columns)

Unnamed: 0,0
tau1,13.770188
tau2,14.01124
tau3,13.422072
tau4,13.556138
p1,0.375025
p2,0.54468
p3,0.540515
p4,0.513865
g1,10.237642
g2,10.752015
