In [103]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import math
from tqdm import tqdm
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold, cross_val_score, train_test_split

from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

from hyperopt import Trials, STATUS_OK, tpe, hp, fmin, STATUS_FAIL, space_eval

import warnings
warnings.filterwarnings('ignore')

## Loading Data

In [3]:
def load_data():

    column_names = ['Temperature', 
                   'RelativeLuminosity',
                   'RelativeRadius',
                   'ApparentMagnitude',
                   'Color',
                   'SpectralClass',
                   'Type']
    return pd.read_csv('Stars.csv', names=column_names, header=0)

df = load_data()
df.head()

Unnamed: 0,Temperature,RelativeLuminosity,RelativeRadius,ApparentMagnitude,Color,SpectralClass,Type
0,3068,0.0024,0.17,16.12,Red,M,0
1,3042,0.0005,0.1542,16.6,Red,M,0
2,2600,0.0003,0.102,18.7,Red,M,0
3,2800,0.0002,0.16,16.65,Red,M,0
4,1939,0.000138,0.103,20.06,Red,M,0


## Splitting the data for training and validation

In [68]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns='Type'), df['Type'],
                                                    test_size=0.30, random_state=1,
                                                    stratify= df['Type'])

## Constructing Custom Transformer

In [5]:
class NumericalFeatureCleaner(BaseEstimator, TransformerMixin):
    #Class Constructor
    def __init__(self):
        self._scalar = RobustScaler()
        return None

        
    # Return self nothing else to do here
    def fit(self, X, y =None):
        X = self._scalar.fit(X)
        return self

    
    def remove_outliers(self, X):
        for col in X.columns:
            Q1 = df[col].quantile(0.2)
            Q3 = df[col].quantile(0.8)
            IQR = Q3 - Q1
            # Adjusting outliers with their percentile values

            low = Q1 - 1.5 * IQR
            high = Q3 + 1.5 * IQR

            X[col] = np.where(X[col] < low, low, X[col])
            X[col] = np.where(X[col] > high, high, X[col])

        return X


        # Method calls several cleaner functions
    def transform(self, X, y = None):
       
        # Removing Outliers
        X = self.remove_outliers(X)

        
        # Scaling Features
        X = pd.DataFrame(self._scalar.transform(X), columns=X.columns, index=X.index)
        
        return X

In [6]:
class CategoricalFeatureCleaner(BaseEstimator, TransformerMixin):
    def __init__(self):
        return None
    
    
    def fit(self, X, y=None):
        return self
    
    
    def get_features(self):
        return self.column_names
    
    
    def catToDummy(self, X):
        X = pd.get_dummies(X)
        return X
    
    
    def __formatColor(self, X):
        X = re.sub("[^A-Za-z]", "", x.lower())
        return X
    
    
    def transform(self, X, y=None):
        
        X = self.catToDummy(X)
        
        return X
        

In [7]:
numerical_columns = ['Temperature', 'RelativeLuminosity', 'RelativeRadius', 'ApparentMagnitude']
categorical_columns = ['SpectralClass']

transformer = ColumnTransformer(transformers=[
    ("numerical_transformer", NumericalFeatureCleaner(), numerical_columns),
    ("categorical_transformer", CategoricalFeatureCleaner(), categorical_columns)
    ], remainder='drop')


In [8]:
model = Pipeline(steps=[("pre-processor", transformer),
                       ("model", KNeighborsClassifier(n_neighbors=10, leaf_size=10))])



## Defining Hyperopts- Objective

In [149]:
# Defining Objective function whose loss we have to minimize
def objective(args):

    pipe = Pipeline(steps=[
        ('model', args['model'])
    ])
    
    pipe.set_params(**args['params'])

    score = cross_val_score(pipe, transformer.fit_transform(X_train), y_train, cv=5, n_jobs=-1, error_score=0.99)
    return {'loss': 1 - np.median(score), 'status': STATUS_OK}

## Defining Hyperopts- Search Space

In [136]:
# Defining Search Space
space = hp.choice('classifiers', [
    {
    'model':KNeighborsClassifier(),
    'params':{
        'model__n_neighbors': hp.choice('knc.n_neighbors', range(2,10)),
        'model__algorithm': hp.choice('knc.algorithm',
                                      ['auto', 'ball_tree', 'kd_tree']),
        'model__metric': hp.choice('knc.metric', ['chebyshev', 'minkowski'])
    }
    },
    {
    'model':SVC(),
    'params':{
        'model__C': hp.choice('C', np.arange(0.005,1.0,0.01)),
        'model__kernel': hp.choice('kernel',['linear', 'rbf', 'sigmoid']),
        'model__degree':hp.choice('degree',[2,3,4]),
        'model__gamma': hp.uniform('gamma',0.001,1000)
    }
    },

    {
    'model': LogisticRegression(verbose=0),
    'params': {
        'model__penalty': hp.choice('lr.penalty', ['l2']),
        'model__C': hp.choice('lr.C', np.arange(0.005,1.0,0.01))

    }
    },
    {
        'model': QuadraticDiscriminantAnalysis(),
        'params': {}
    }
])

## Defining Hyperopts- Trail function

In [137]:
# Putting it together
trials = Trials()

best_classifier = fmin(objective, space, algo=tpe.suggest,
                       max_evals=10, trials=trials)

best_params = space_eval(space, best_classifier)

[0.14705882 0.14705882 0.17647059 0.18181818 0.18181818]
[0.97058824 0.94117647 1.         0.93939394 1.        ]
[0.94117647 0.94117647 0.94117647 0.96969697 0.96969697]                        
[0.26470588 0.32352941 0.32352941 0.33333333 0.33333333]                         
[0.88235294 0.88235294 0.97058824 0.87878788 0.90909091]                         
[1.         0.97058824 1.         1.         1.        ]                         
[0.14705882 0.14705882 0.17647059 0.18181818 0.18181818]                         
[0.35294118 0.35294118 0.5        0.42424242 0.42424242]         
[1.         0.94117647 1.         0.96969697 0.93939394]         
[0.14705882 0.14705882 0.17647059 0.18181818 0.18181818]         
100%|██████████| 10/10 [00:00<00:00, 26.51trial/s, best loss: 0.0]


In [None]:
## Getting the best Model

In [138]:
best_params['model']

SVC(C=0.15499999999999997, degree=2, gamma=391.91004869541047)

## Model Validation with Classification Report

#### Teh objective is to demonstrate Hyperopts :) Yay! it works

In [146]:
X_train_tf = transformer.fit_transform(X_train, y_train)
X_test_tf = transformer.transform(X_test)

model = best_params['model'].fit(X_train_tf, y_train)
y_pred = model.predict(X_train_tf)

# TRAIN -- classification report and store predictions/ probabilities
print('Training Classification Report for estimator: ',
      str(model).split('(')[0])
print('\n', classification_report(y_train, y_pred))

Training Classification Report for estimator:  SVC

               precision    recall  f1-score   support

           0       1.00      0.75      0.86        28
           1       1.00      0.93      0.96        28
           2       1.00      1.00      1.00        28
           3       1.00      1.00      1.00        28
           4       0.76      1.00      0.86        28
           5       1.00      1.00      1.00        28

    accuracy                           0.95       168
   macro avg       0.96      0.95      0.95       168
weighted avg       0.96      0.95      0.95       168



In [147]:
X_test_tf = transformer.transform(X_test)

y_pred = model.predict(X_test_tf)

# TEST -- classification report and store predictions/ probabilities
print('Training Classification Report for estimator: ',
      str(model).split('(')[0])
print('\n', classification_report(y_test, y_pred))

Training Classification Report for estimator:  SVC

               precision    recall  f1-score   support

           0       1.00      0.92      0.96        12
           1       1.00      0.75      0.86        12
           2       1.00      0.42      0.59        12
           3       1.00      0.33      0.50        12
           4       0.28      1.00      0.44        12
           5       0.00      0.00      0.00        12

    accuracy                           0.57        72
   macro avg       0.71      0.57      0.56        72
weighted avg       0.71      0.57      0.56        72

