In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import ConfusionMatrixDisplay, accuracy_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.tree import ExtraTreeClassifier

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

#### Read the dataset

In [2]:
df = pd.read_csv('../data/Data_for_UCI_named.csv')
df.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,0.055347,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,-0.005957,stable
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853,0.003471,unstable
3,0.716415,7.6696,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718,0.028871,unstable
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.79711,0.45545,0.656947,0.820923,0.04986,unstable


In [3]:
# quick overview the data set
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   tau1    10000 non-null  float64
 1   tau2    10000 non-null  float64
 2   tau3    10000 non-null  float64
 3   tau4    10000 non-null  float64
 4   p1      10000 non-null  float64
 5   p2      10000 non-null  float64
 6   p3      10000 non-null  float64
 7   p4      10000 non-null  float64
 8   g1      10000 non-null  float64
 9   g2      10000 non-null  float64
 10  g3      10000 non-null  float64
 11  g4      10000 non-null  float64
 12  stab    10000 non-null  float64
 13  stabf   10000 non-null  object 
dtypes: float64(13), object(1)
memory usage: 1.1+ MB


In [4]:
# check missing value
df.isnull().sum()

tau1     0
tau2     0
tau3     0
tau4     0
p1       0
p2       0
p3       0
p4       0
g1       0
g2       0
g3       0
g4       0
stab     0
stabf    0
dtype: int64

In [5]:
df.describe()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,5.25,5.250001,5.250004,5.249997,3.75,-1.25,-1.25,-1.25,0.525,0.525,0.525,0.525,0.015731
std,2.742548,2.742549,2.742549,2.742556,0.75216,0.433035,0.433035,0.433035,0.274256,0.274255,0.274255,0.274255,0.036919
min,0.500793,0.500141,0.500788,0.500473,1.58259,-1.999891,-1.999945,-1.999926,0.050009,0.050053,0.050054,0.050028,-0.08076
25%,2.874892,2.87514,2.875522,2.87495,3.2183,-1.624901,-1.625025,-1.62496,0.287521,0.287552,0.287514,0.287494,-0.015557
50%,5.250004,5.249981,5.249979,5.249734,3.751025,-1.249966,-1.249974,-1.250007,0.525009,0.525003,0.525015,0.525002,0.017142
75%,7.62469,7.624893,7.624948,7.624838,4.28242,-0.874977,-0.875043,-0.875065,0.762435,0.76249,0.76244,0.762433,0.044878
max,9.999469,9.999837,9.99945,9.999443,5.864418,-0.500108,-0.500072,-0.500025,0.999937,0.999944,0.999982,0.99993,0.109403


In [6]:
# vertical split
target = "stabf"
X = df.drop(columns=target)
y = df[target]

#### Label Encoding

In [7]:
encoder = LabelEncoder()
y = encoder.fit_transform(y)

#### Split the dataset

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Build Model

#### Random Forest

In [9]:
model_rf = RandomForestClassifier(random_state=42)
model_rf.fit(X_train, y_train)

RandomForestClassifier(random_state=42)

In [10]:
# predict the test set
y_pred = model_rf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {round(accuracy, 4)}')

Accuracy: 0.9995


#### XGBoost

In [11]:
# build the model
model_xgb = XGBClassifier(random_state=42)
model_xgb.fit(X_train, y_train)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=42, ...)

In [12]:
y_pred = model_xgb.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {round(accuracy, 4)}')

Accuracy: 0.9995


#### LGBM

In [13]:
model_lgb = LGBMClassifier()
model_lgb.fit(X_train, y_train)

LGBMClassifier()

In [14]:
y_pred = model_lgb.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {round(accuracy, 4)}')

Accuracy: 1.0


#### Extra Tree

In [15]:
model_et = ExtraTreeClassifier()

In [16]:
params = {
    'max_features': ['auto' 'log2' , None],
    'min_samples_split': [2,5,7],
    'min_samples_leaf': [4,6,8]
}

In [17]:
search = RandomizedSearchCV(
    estimator = model_et,
    param_distributions=params, 
    n_iter = 10, 
    cv = 5, 
    n_jobs=-1, 
    verbose=1, 
    scoring='accuracy', 
    random_state=42
)

search.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.8/dist-packages/sklearn/tree/_classes.py", line 903, in fit
    super().fit(
  File "/usr/local/lib/python3.8/dist-packages/sklearn/tree/_classes.py", line 265, in fit
    raise ValueError("Invalid value for max_features. "
ValueError: Invalid value for max_features. Allowed string values are 'auto', 'sqrt' or 'log2'.

Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.8/dist-packages/sklearn/tree/_classes.py", line 903, in fit
    super().fit(
  File "/usr/local/lib/python3.8/dist-packages/sklearn/tree/_classes.py", line 265, in fit
    raise ValueError("Invalid value for max_fea

RandomizedSearchCV(cv=5, estimator=ExtraTreeClassifier(), n_jobs=-1,
                   param_distributions={'max_features': ['autolog2', None],
                                        'min_samples_leaf': [4, 6, 8],
                                        'min_samples_split': [2, 5, 7]},
                   random_state=42, scoring='accuracy', verbose=1)

In [18]:
print('Best hyperparameters:', search.best_params_)
print('Best score:', search.best_score_)

Best hyperparameters: {'min_samples_split': 7, 'min_samples_leaf': 4, 'max_features': None}
Best score: 0.9973750000000001


In [19]:
model_ett = ExtraTreeClassifier(min_samples_split= 2, min_samples_leaf= 8, max_features= None)
model_ett.fit(X_train, y_train)

ExtraTreeClassifier(max_features=None, min_samples_leaf=8)

In [20]:
feature_importance = pd.DataFrame({'feat'  :model_ett.feature_importances_}, index= X_train.columns).sort_values(by= 'feat')
feature_importance

Unnamed: 0,feat
tau1,0.0
p1,0.0
p4,0.0
g1,0.0
g2,0.0
g4,0.0
g3,9.3e-05
p2,9.3e-05
p3,0.000165
tau2,0.001285
