In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('data.csv')
df.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,0.055347,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,-0.005957,stable
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853,0.003471,unstable
3,0.716415,7.6696,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718,0.028871,unstable
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.79711,0.45545,0.656947,0.820923,0.04986,unstable


In [3]:
df.drop(columns='stab',inplace=True)

In [4]:
X = df.drop(columns='stabf')
y = df['stabf']

In [5]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=1)

In [6]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## 2. Which features are the most and least important respectively?

In [7]:
et = ExtraTreesClassifier(random_state=1)
et.fit(X_train_scaled,y_train)
importances = sorted(zip(et.feature_importances_,X_train.columns),reverse=True)
importances[0][1],importances[-1][1]

('tau2', 'p1')

## 9. What is the accuracy on the test set using the XGboost classifier? In 4 decimal places.

In [8]:
from sklearn.metrics import accuracy_score
xgb = XGBClassifier(random_state=1)
xgb.fit(X_train_scaled,y_train)
preds = xgb.predict(X_test_scaled)
round(accuracy_score(y_test,preds),4)



0.9455

## 11. What is the accuracy on the test set using the random forest classifier? In 4 decimal places.

In [9]:
rf = RandomForestClassifier(random_state=1)
rf.fit(X_train_scaled,y_train)
preds = rf.predict(X_test_scaled)
round(accuracy_score(y_test,preds),4)

0.929

## 13. What is the accuracy on the test set using the LGBM classifier? In 4 decimal places.

In [10]:
lgb = LGBMClassifier(random_state=1)
lgb.fit(X_train_scaled,y_train)
preds = lgb.predict(X_test_scaled)
round(accuracy_score(y_test,preds),4)

0.9395

## 15.  Is the accuracy of the new optimal model higher or lower than the initial ExtraTreesClassifier model with no hyperparameter tuning?

In [11]:
et = ExtraTreesClassifier(random_state=1)
et.fit(X_train_scaled,y_train)
preds = et.predict(X_test_scaled)
et_accuracy = accuracy_score(y_test,preds)

In [12]:
from sklearn.model_selection import RandomizedSearchCV

param_distribs = {
        'n_estimators': [100,200,300,400,500,600,700,800,900,1000],
        'min_samples_split': [2,4,6,8,10],
        'min_samples_leaf': [2,4,6,8,10],
        'max_features': ["auto", "sqrt", "log2"],
    }

et = ExtraTreesClassifier(random_state=1)
rnd_search = RandomizedSearchCV(et, param_distributions=param_distribs, n_iter=10, cv=5, 
                                scoring='accuracy', n_jobs = -1, verbose=1, random_state=1)
rnd_search.fit(X_train_scaled,y_train)
preds = rnd_search.predict(X_test_scaled)
rnd_accuracy = accuracy_score(y_test,preds)
rnd_accuracy>et_accuracy

Fitting 5 folds for each of 10 candidates, totalling 50 fits


True

## 18. What are the best hyperparameters from the randomized search CV?

In [13]:
rnd_search.best_params_

{'n_estimators': 300,
 'min_samples_split': 4,
 'min_samples_leaf': 2,
 'max_features': 'sqrt'}