In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# df = pd.read_excel("Data_for_UCI_named.xlsx")
df = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/00471/Data_for_UCI_named.csv")

In [3]:
df['stabf'].value_counts()

unstable    6380
stable      3620
Name: stabf, dtype: int64

In [4]:
df.isna().sum()

tau1     0
tau2     0
tau3     0
tau4     0
p1       0
p2       0
p3       0
p4       0
g1       0
g2       0
g3       0
g4       0
stab     0
stabf    0
dtype: int64

In [5]:
df.drop(columns = 'stab', inplace=True)
x = df.drop(columns = 'stabf')
y = df['stabf']

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
x_train, x_test, y_train, y_test = train_test_split(x,y, 
                                                    train_size= 0.8,
                                                   random_state=1)
y_train.value_counts()

unstable    5092
stable      2908
Name: stabf, dtype: int64

In [8]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

ntrain_df = scaler.fit_transform(x_train)

ntest_df = scaler.transform(x_test)

In [9]:
from sklearn.metrics import recall_score, precision_score, accuracy_score, classification_report, confusion_matrix

In [10]:
from sklearn.ensemble import ExtraTreesClassifier

tree = ExtraTreesClassifier(random_state = 1)

tree.fit(ntrain_df, y_train)

tree_pred = tree.predict(ntest_df)

print(classification_report(y_test, tree_pred, digits = 4))

              precision    recall  f1-score   support

      stable     0.9410    0.8511    0.8938       712
    unstable     0.9218    0.9705    0.9455      1288

    accuracy                         0.9280      2000
   macro avg     0.9314    0.9108    0.9197      2000
weighted avg     0.9287    0.9280    0.9271      2000



In [11]:
from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier(random_state = 1)

tree.fit(ntrain_df, y_train)

tree_pred = tree.predict(ntest_df)

print(classification_report(y_test, tree_pred, digits = 4))

              precision    recall  f1-score   support

      stable     0.9410    0.8511    0.8938       712
    unstable     0.9218    0.9705    0.9455      1288

    accuracy                         0.9280      2000
   macro avg     0.9314    0.9108    0.9197      2000
weighted avg     0.9287    0.9280    0.9271      2000



In [12]:
from xgboost import XGBClassifier

xgb = XGBClassifier(random_state = 1)

xgb.fit(ntrain_df, y_train)

xgb_pred = xgb.predict(ntest_df)

print(classification_report(y_test, xgb_pred, digits = 4))

              precision    recall  f1-score   support

      stable     0.9206    0.8469    0.8822       712
    unstable     0.9190    0.9596    0.9389      1288

    accuracy                         0.9195      2000
   macro avg     0.9198    0.9033    0.9105      2000
weighted avg     0.9195    0.9195    0.9187      2000



In [13]:
from lightgbm import LGBMClassifier

lbm = LGBMClassifier(random_state = 1)

lbm.fit(ntrain_df, y_train)

lbm_pred = lbm.predict(ntest_df)

print(classification_report(y_test, lbm_pred, digits = 4))

              precision    recall  f1-score   support

      stable     0.9297    0.8919    0.9104       712
    unstable     0.9415    0.9627    0.9520      1288

    accuracy                         0.9375      2000
   macro avg     0.9356    0.9273    0.9312      2000
weighted avg     0.9373    0.9375    0.9372      2000



In [14]:
from sklearn.model_selection import RandomizedSearchCV

n_estimators = [50, 100, 300, 500, 1000]
min_samples_split = [2, 3, 5, 7, 9]
min_samples_leaf = [1, 2, 4, 6, 8]
max_features = ['auto', 'sqrt', 'log2', None] 

hyperparameter_grid = {'n_estimators': n_estimators,
                       'min_samples_leaf': min_samples_leaf,
                       'min_samples_split': min_samples_split,
                       'max_features': max_features}

clf = RandomizedSearchCV(ExtraTreesClassifier(random_state=1), hyperparameter_grid, random_state = 1)
clf.fit(ntrain_df, y_train)
print(classification_report(y_test, clf.predict(ntest_df), digits = 4))
print("best parameters: ", clf.best_params_)

              precision    recall  f1-score   support

      stable     0.9211    0.8694    0.8945       712
    unstable     0.9300    0.9589    0.9442      1288

    accuracy                         0.9270      2000
   macro avg     0.9256    0.9141    0.9193      2000
weighted avg     0.9268    0.9270    0.9265      2000

best parameters:  {'n_estimators': 1000, 'min_samples_split': 2, 'min_samples_leaf': 8, 'max_features': None}


In [16]:
tree = ExtraTreesClassifier(random_state = 1)

n_estimators = [1000]
min_samples_split = [2]
min_samples_leaf = [8]
max_features = [None] 

hyperparameter_grid = {'n_estimators': n_estimators,
                       'min_samples_leaf': min_samples_leaf,
                       'min_samples_split': min_samples_split,
                       'max_features': max_features}
tree.fit(ntrain_df, y_train)

tree_pred = tree.predict(ntest_df)

print(classification_report(y_test, tree_pred, digits = 4))

              precision    recall  f1-score   support

      stable     0.9410    0.8511    0.8938       712
    unstable     0.9218    0.9705    0.9455      1288

    accuracy                         0.9280      2000
   macro avg     0.9314    0.9108    0.9197      2000
weighted avg     0.9287    0.9280    0.9271      2000

