# Hamoye Stage C

In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv("dataset_used/Data_for_UCI_named.csv")
data.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,0.055347,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,-0.005957,stable
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853,0.003471,unstable
3,0.716415,7.6696,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718,0.028871,unstable
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.79711,0.45545,0.656947,0.820923,0.04986,unstable


In [3]:
# Getting a brief ddescription of the data
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   tau1    10000 non-null  float64
 1   tau2    10000 non-null  float64
 2   tau3    10000 non-null  float64
 3   tau4    10000 non-null  float64
 4   p1      10000 non-null  float64
 5   p2      10000 non-null  float64
 6   p3      10000 non-null  float64
 7   p4      10000 non-null  float64
 8   g1      10000 non-null  float64
 9   g2      10000 non-null  float64
 10  g3      10000 non-null  float64
 11  g4      10000 non-null  float64
 12  stab    10000 non-null  float64
 13  stabf   10000 non-null  object 
dtypes: float64(13), object(1)
memory usage: 1.1+ MB


No missing values

In [4]:
# Based off the instruction 'stab' should be dropped.
data = data.drop(['stab'], axis=1)

In [5]:
# Checking the target variable
data.stabf.value_counts()

unstable    6380
stable      3620
Name: stabf, dtype: int64

In [6]:
# Train_split test
from sklearn.model_selection import train_test_split
x = data.drop(['stabf'], axis=1)
y = data['stabf'].map({'stable':0, 'unstable':1})


train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.2, random_state=1)

In [7]:
# Introducing the standard scaler
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

# Scaling the train and test x (feature)
train_x_scaled = scaler.fit_transform(train_x)
test_x_scaled = scaler.transform(test_x)

In [8]:
!pip install xgboost



In [9]:
!pip install lightgbm



## Importing and training required models

In [10]:
import xgboost as xgb

xgb_model = xgb.XGBClassifier(random_state=1)
xgb_model.fit(train_x_scaled, train_y)
xgb_pred = xgb_model.predict(test_x_scaled)

In [11]:
import lightgbm as lgb

lgb_model = lgb.LGBMClassifier(random_state=1)
lgb_model.fit(train_x_scaled, train_y)
lgb_pred = lgb_model.predict(test_x_scaled)

In [12]:
from sklearn.tree import DecisionTreeClassifier

dtc= DecisionTreeClassifier(random_state=1)
dtc.fit(train_x_scaled, train_y)
dtc_p = dtc.predict(test_x_scaled)

In [13]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(random_state=1)
rfc.fit(train_x_scaled, train_y)
rfc_p = rfc.predict(test_x_scaled)

In [14]:
# Evaluating the models using the F-Measure metric (f1_score())
from sklearn.metrics import f1_score

accuracy = {
    "Random Forest Classifier": f1_score(test_y, rfc_p),
    "Decision Tree Classifier": f1_score(test_y, dtc_p),
    "XgBoost model": f1_score(test_y, xgb_pred),
    "LightGB model":f1_score(test_y, lgb_pred)
}

In [15]:
print(f"\nThe {max(accuracy, key=accuracy.get)} model has the highest accuracy with value {round(max(accuracy.values()), 3) * 100}%")
accuracy


The XgBoost model model has the highest accuracy with value 95.8%


{'Random Forest Classifier': 0.9455521472392638,
 'Decision Tree Classifier': 0.885692068429238,
 'XgBoost model': 0.9579961464354528,
 'LightGB model': 0.9534077782056218}

In [17]:
# Getting the best parameters
from sklearn.model_selection import RandomizedSearchCV


param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10]
}

random_search = RandomizedSearchCV(rfc, param_grid, n_iter=10, cv=5, scoring='accuracy')
random_search.fit(train_x_scaled, train_y)

# Access the best hyperparameters
best_params = random_search.best_params_
best_params

{'n_estimators': 200, 'min_samples_split': 5, 'max_depth': None}

In [18]:
importances = dtc.feature_importances_

# Sort indices of importances in descending order
indices = np.argsort(importances)[::-1]

# Print feature ranking
print("Feature ranking:")

for f in range(x.shape[1]):
    print(f"{f + 1}. Feature {indices[f]}: {importances[indices[f]]}")

Feature ranking:
1. Feature 3: 0.13806139457405991
2. Feature 2: 0.12797255007339323
3. Feature 1: 0.12052930834783229
4. Feature 0: 0.11815866770475282
5. Feature 11: 0.11610706297654676
6. Feature 9: 0.10910419054868475
7. Feature 8: 0.10718716998559208
8. Feature 10: 0.09600198929647318
9. Feature 6: 0.018306221179239447
10. Feature 4: 0.016876561945613742
11. Feature 5: 0.016389399555375734
12. Feature 7: 0.015305483812436068


In [19]:
data.columns

Index(['tau1', 'tau2', 'tau3', 'tau4', 'p1', 'p2', 'p3', 'p4', 'g1', 'g2',
       'g3', 'g4', 'stabf'],
      dtype='object')