In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
from sklearn import utils
from sklearn.model_selection import train_test_split, cross_val_score, KFold, StratifiedKFold, LeaveOneOut, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from imblearn.over_sampling import SMOTE
from sklearn.metrics import f1_score, classification_report, accuracy_score, precision_score, recall_score, confusion_matrix
import warnings
warnings.filterwarnings('ignore')
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, ExtraTreesClassifier, GradientBoostingClassifier, ExtraTreesClassifier
import lightgbm as ltb

In [3]:
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/HDSC/Data_for_UCI_named.csv')
df.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,0.055347,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,-0.005957,stable
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853,0.003471,unstable
3,0.716415,7.6696,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718,0.028871,unstable
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.79711,0.45545,0.656947,0.820923,0.04986,unstable


In [4]:
df['stabf'].value_counts()

unstable    6380
stable      3620
Name: stabf, dtype: int64

In [5]:
df.isna().sum()

tau1     0
tau2     0
tau3     0
tau4     0
p1       0
p2       0
p3       0
p4       0
g1       0
g2       0
g3       0
g4       0
stab     0
stabf    0
dtype: int64

In [6]:
encoder = LabelEncoder()
df['stabf'] = encoder.fit_transform(df['stabf'])

In [7]:
X = df.drop(columns = ['stabf'], axis = 1)
y = df['stabf']

In [8]:
scaler = MinMaxScaler()
X_scale = scaler.fit_transform(X)

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X_scale, y, test_size = 0.3, random_state = 0)

In [10]:
clf = RandomForestClassifier()
clf.fit(X_train, y_train)
y_predict = clf.predict(X_test)
accuracy_score(y_true = y_test, y_pred = clf.predict(X_test))

0.9996666666666667

In [11]:
xgboost = GradientBoostingClassifier()
xgboost.fit(X_train, y_train)
y_predict = xgboost.predict(X_test)
accuracy_score(y_true = y_test, y_pred = xgboost.predict(X_test))

0.9996666666666667

In [12]:
lgmb = ltb.LGBMClassifier()
lgmb.fit(X_train, y_train)
y_predict = lgmb.predict(X_test)
accuracy_score(y_true = y_test, y_pred = lgmb.predict(X_test))

1.0

In [13]:
params = {'n_estimators' : [100, 300, 500, 1000],
          'min_samples_split' : [2, 5 , 7],
          'min_samples_leaf' : [4, 6, 8],
          'max_features' : [None, 'auto', 'log2']}

In [14]:
%%time
etc = ExtraTreesClassifier()
rf = RandomizedSearchCV(etc, params, 
                        random_state = 1,
                        cv = 5,
                        n_iter = 10,
                        n_jobs = -1,
                        verbose = 1,
                        scoring = 'accuracy')

CPU times: user 169 µs, sys: 0 ns, total: 169 µs
Wall time: 201 µs


In [15]:
rf.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


RandomizedSearchCV(cv=5, estimator=ExtraTreesClassifier(), n_jobs=-1,
                   param_distributions={'max_features': [None, 'auto', 'log2'],
                                        'min_samples_leaf': [4, 6, 8],
                                        'min_samples_split': [2, 5, 7],
                                        'n_estimators': [100, 300, 500, 1000]},
                   random_state=1, scoring='accuracy', verbose=1)

In [16]:
rf.best_params_

{'n_estimators': 500,
 'min_samples_split': 5,
 'min_samples_leaf': 6,
 'max_features': 'log2'}

In [17]:
etc = ExtraTreesClassifier()
etc.fit(X_train, y_train)
y_predict = etc.predict(X_test)
accuracy_score(y_true = y_test, y_pred = etc.predict(X_test))

0.9906666666666667

In [21]:
pd.DataFrame(clf.feature_importances_, columns = ["Imp"], index = X.columns)

Unnamed: 0,Imp
tau1,0.042953
tau2,0.04258
tau3,0.032633
tau4,0.033563
p1,0.002611
p2,0.003019
p3,0.003058
p4,0.002462
g1,0.016032
g2,0.019604
