In [16]:
import numpy as np
import pandas 
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, matthews_corrcoef, confusion_matrix, classification_report, f1_score
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold
from bisect import bisect_left
from tqdm import notebook
import math

In [17]:
def feature_selection(X_train,X_test,vector):
    index=[]
    col_index=0
    assert len(vector)!=62
    for i in vector:
        if i==1:
            index.append(col_index)
        col_index+=1
    if not index:
        index.append(0)
    X_train=X_train[:,index]
    X_test=X_test[:,index]
    return X_train, X_test

In [18]:
def G_measure(y_true,y_pred): #OK
    arr = confusion_matrix(y_true,y_pred)
    pd = arr[1][1]/(arr[1][1]+arr[1][0])
    pf = arr[0][1]/(arr[0][0]+arr[0][1])
    fallout = 1-pf
    g_measure = 2*((pd*fallout)/(pd+fallout))
    return g_measure

In [19]:
# def confusion_matrix(y_true,y_pred): #OK
#     arr = confusion_matrix(y_true,y_pred)
#     return arr

In [20]:
def LIR(X_true, y_true, y_pred):
    TN=FP=FN=TP=0
    for i in range(len(y_true)):
        if y_true[i]==False and y_pred[i]==False:
            TN+=X_true[i][25]
        elif y_true[i]==False and y_pred[i]==True:
            FP+=X_true[i][25]
        elif y_true[i]==True and y_pred[i]==False:
            FN+=X_true[i][25]
        else :
            TP+=X_true[i][25]
    LI = (TP+FP)/(TP+TN+FP+FN)
    PV = (TP)/(TP+FN)
    LIR = (PV-LI)/PV
    return LIR

In [21]:
def FIR(y_true,y_pred):
    arr = confusion_matrix(y_true,y_pred)
    pd = arr[1][1]/(arr[1][1]+arr[1][0])
    fi = (arr[0][1]+arr[1][1])/(arr[0][0]+arr[0][1]+arr[1][1]+arr[1][0])
    FIR = (pd-fi)/pd
    return FIR

In [22]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
def Norm(X_train,X_test,norm):
    if norm==0: # None
        return X_train,X_test
    elif norm==1:#MinMax (all)
        scaler=MinMaxScaler()
        scaler.fit(np.concatenate((X_train, X_test), axis=0))
        X_train_norm=scaler.transform(X_train)
        X_test_norm=scaler.transform(X_test)
        return X_train_norm, X_test_norm
    elif norm==2:#z-score (all)
        scaler=StandardScaler()
        scaler.fit(np.concatenate((X_train, X_test), axis=0))
        X_train_norm=scaler.transform(X_train)
        X_test_norm=scaler.transform(X_test)
        return X_train_norm, X_test_norm
    elif norm==3:#z-score (source distribution)
        scaler=StandardScaler()
        scaler.fit(X_train)
        X_train_norm=scaler.transform(X_train)
        X_test_norm=scaler.transform(X_test)
        return X_train_norm, X_test_norm
    elif norm==4: #z-score (target distribution)
        scaler=StandardScaler()
        scaler.fit(X_test)
        X_train_norm=scaler.transform(X_train)
        X_test_norm=scaler.transform(X_test)
        return X_train_norm, X_test_norm
    elif norm==5: #logarithmic filtering
        X_train_norm=np.log10(np.where(X_train<0.001,0.001,X_train))
        X_test_norm=np.log10(np.where(X_test<0.001,0.001,X_test))
        return X_train_norm, X_test_norm

In [23]:
def classifier(X_train,X_test,y_train,y_test,vector):
    #Vector 0 : Criterion
    print(vector[:(len(vector)-X_train.shape[1])])
    X_true=X_test
    if vector[0]==0:
        criterion = "gini"
    else:
        criterion = "entropy"

    #Vector 0 : Criterion
    if vector[1]==0:
        split = "best"
    else:
        split = "random"

    #Vectro 6: Class Weight
    balance = balance = {True:1,False:vector[8]}

    #Model Construct
    clf = DecisionTreeClassifier(criterion=criterion,
                                 splitter=split,
                                 max_depth=vector[2],
                                 min_samples_split=vector[3],
                                 min_samples_leaf=vector[4],
                                 min_weight_fraction_leaf=vector[5],
                                 ccp_alpha=vector[6],
                                 class_weight=balance)

    #Feature Selection
    X_train,X_test=feature_selection(X_train,
                                     X_test,
                                     vector[(len(vector)-X_train.shape[1]):])

    #Normalization
    X_train,X_test=Norm(X_train,X_test,vector[7])

    #learning
    clf.fit(X_train, y_train)
    preds = clf.predict(X_test)

    g_score = G_measure(y_test,preds)
    lir_score = LIR(X_true,y_test,preds)
    fir_score = FIR(y_test,preds)
    arr= confusion_matrix(y_test,preds)
    pd = arr[1][1]/(arr[1][1]+arr[1][0])
    pf = arr[0][1]/(arr[0][0]+arr[0][1])
    return g_score, lir_score, fir_score, pd, pf

In [24]:
from pyharmonysearch import ObjectiveFunctionInterface, harmony_search
from math import pow
import random
from multiprocessing import cpu_count

"""
 0 : Criterion                |      {"gini","entropy"}
 1 : Splitter                 |      {"best","random"}
 2 : Max Depth                |      [2, 40]
 3 : Min Samples Split        |      [2. 100]
 4 : Min Samples Leaf         |      [1. 200]
 5 : Min Weight Fraction Leaf |      [0.001, 0.1]
 6 : CCP Alpha                |      [0.00001, 0.004] 
 7 : Normalization            |      {"None", "Min-Max", "z-score","z-score(source)", "z-score(target)","logarithmic filtering"}
 8 : Class Weight             |      [0.01, 1]
 9 : Feature selection        |      {"None","Select"}
"""

class ObjectiveFunction(ObjectiveFunctionInterface):
    def __init__(self,X_train,X_test,y_train,y_test):
        #define data
        self.X_train=X_train
        self.X_test=X_test
        self.y_train=y_train
        self.y_test=y_test
        self.lir=[]
        self.fir=[]
        self.g_measure=[]
        self.pd=[]
        self.pf=[]
        
        #define parameters range 
        self._lower_bounds = [None,None,None,None,None,0.001,0.00001,None,0.01]   # criterion / Max Leaf Nodes / Max Depth / Min Samples Split / Random State / Class Weight
        self._upper_bounds = [None,None,None,None,None,0.1,0.004,None,1]
        self._discrete_values = [[x for x in range(0, 2)],
                                 [x for x in range(0, 2)],
                                 [x for x in range(2, 41)],
                                 [x for x in range(2, 100)],
                                 [x for x in range(1, 200)],
                                 None,
                                 None,                                
                                 [x for x in range(0, 6)],
                                 None]
        
        for i in range(X_train.shape[1]): # Feature Weight
            self._lower_bounds.append(None)
            self._upper_bounds.append(None)
            self._discrete_values.append([x for x in range(0, 2)])
            
        self._variable = []
        for i in range(len(self._lower_bounds)):
            self._variable.append(True)

        # define all input parameters
        self._maximize = True  # do we maximize or minimize?
        self._max_imp = 1000  # maximum number of improvisations 1000
        self._hms = 100  # harmony memory size 100
        self._hmcr = 0.8  # harmony memory considering rate 0.8
        self._par = 0.4  # pitch adjusting rate 0.4
        self._mpap = 0.25  # maximum pitch adjustment proportion (new parameter defined in pitch_adjustment()) - used for continuous variables only
        self._mpai = 10  # maximum pitch adjustment index (also defined in pitch_adjustment()) - used for discrete variables only
        self._random_seed = 8675309  # optional random seed for reproducible results
    
    def get_fitness(self, vector):
        # note tune
        
        #Vector 0 : Criterion
        if vector[0]==0:
            criterion = "gini"
        else:
            criterion = "entropy"
        
        #Vector 0 : Criterion
        if vector[1]==0:
            split = "best"
        else:
            split = "random"
        
        #Vectro 6: Class Weight
        balance = balance = {True:1,False:vector[8]}

        #Model Construct
        clf = DecisionTreeClassifier(criterion=criterion,
                                     splitter=split,
                                     max_depth=vector[2],
                                     min_samples_split=vector[3],
                                     min_samples_leaf=vector[4],
                                     min_weight_fraction_leaf=vector[5],
                                     ccp_alpha=vector[6],
                                     class_weight=balance)
        g=[]
        for tr_index, vali_index in kfold.split(self.X_train,self.y_train):
            X_tr, X_vali = self.X_train[tr_index], self.X_train[vali_index]
            y_tr, y_vali = self.y_train[tr_index], self.y_train[vali_index]
            #Feature Selection
            X_tr,X_vali=feature_selection(X_tr,X_vali,vector[(len(self._variable)-self.X_train.shape[1]):])

            #Normalization
            X_tr,X_vali=Norm(X_tr,X_vali,vector[7])

            #learning
            clf.fit(X_tr, y_tr)
            preds = clf.predict(X_vali)
            score = G_measure(y_vali,preds)
            g.append(score)
        g_score=(sum(g)/len(g))
        return g_score

    def get_value(self, i, j=None):
        if self.is_discrete(i):
            if j:
                return self._discrete_values[i][j]
            return self._discrete_values[i][random.randint(0, len(self._discrete_values[i]) - 1)]
        if i==6:
            return round(random.uniform(self._lower_bounds[i], self._upper_bounds[i]),4)
        return round(random.uniform(self._lower_bounds[i], self._upper_bounds[i]),2)

    def get_lower_bound(self, i):
        return self._lower_bounds[i]

    def get_upper_bound(self, i):
        return self._upper_bounds[i]
    
    def get_num_discrete_values(self, i):
        if self.is_discrete(i):
            return len(self._discrete_values[i])
        return float('+inf')

    def get_index(self, i, v):
        """
            Because self.discrete_values is in sorted order, we can use binary search.
        """
        return ObjectiveFunction.binary_search(self._discrete_values[i], v)

    @staticmethod
    def binary_search(a, x):
        """
            Code courtesy Python bisect module: http://docs.python.org/2/library/bisect.html#searching-sorted-lists
        """
        i = bisect_left(a, x)
        if i != len(a) and a[i] == x:
            return i
        raise ValueError

    def is_variable(self, i):
        return self._variable[i]

    def is_discrete(self, i):
        return self._discrete_values[i] is not None

    def get_num_parameters(self):
        return len(self._lower_bounds)

    def use_random_seed(self):
        return hasattr(self, '_random_seed') and self._random_seed

    def get_random_seed(self):
        return self._random_seed

    def get_max_imp(self):
        return self._max_imp

    def get_hmcr(self):
        return self._hmcr

    def get_par(self):
        return self._par

    def get_hms(self):
        return self._hms

    def get_mpai(self):
        return self._mpai

    def get_mpap(self):
        return self._mpap

    def maximize(self):
        return self._maximize
    
    def get_lir(self):
        index=self.get_bestindex()
        return self.lir[index]
    
    def get_fir(self):
        print("b")
#         index=self.get_bestindex()
#         return self.fir[index]
    
    def get_pd_pf(self,y_true,y_pred):
        arr = confusion_matrix(y_true,y_pred)
        pd = arr[1][1]/(arr[1][1]+arr[1][0])
        pf = arr[0][1]/(arr[0][0]+arr[0][1])
        return pd, pf 
    
    def get_bestindex(self):
        return self.g_measure.index(max(self.g_measure))
        

In [25]:
df = pandas.read_csv('../Dataset/AEEEM/EQ.csv')

In [26]:
X = np.array(df.loc[:,'ck_oo_numberOfPrivateMethods':'LDHH_numberOfMethods'])
y = np.array(df['class'])

In [27]:
from sklearn.model_selection import StratifiedKFold
num_split=10
kf = StratifiedKFold(n_splits=num_split,random_state=1,shuffle=False)
kfold = StratifiedKFold(n_splits=(num_split-1),shuffle=False)



In [28]:
g_measure=[]
best_harmony=[]
col = ["Criterion","Splitter","Max Depth","Min Samples Split","Min Samples Leaf","Min Weight Fraction Leaf","CCP Alpha","Normalization",
      "Class Weight(False)"]
for i in range(X.shape[1]):
    col.append(i+1)
for train_index, test_index in notebook.tqdm(kf.split(X,y)):
    print('-------------------------------------------------------------------------------------------------')
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    obj_fun = ObjectiveFunction(X_train=X_train,X_test=X_test,y_train=y_train,y_test=y_test)
    num_processes = 1  # use number of logical CPUs
    num_iterations = 5 # each process does 5 iterations
    results = harmony_search(obj_fun, num_processes, num_iterations)
    index = (len(results.best_harmony)-X_train.shape[1])
    print('Elapsed time: {}\nBest harmony: {}\nNormalization: {}\nClass Weight: {}\nBest fitness: {}'.format(results.elapsed_time, results.best_harmony[:7], results.best_harmony[7],results.best_harmony[8],results.best_fitness))
    g_measure.append(results.best_fitness)
    best_harmony.append(results.best_harmony)
    g,lir,fir,pd,pf = classifier(X_train,X_test,y_train,y_test,results.best_harmony)
    print('G measure: {}\nLIR: {}\nFIR: {}\nPD: {}\nPF: {}'.format(g,lir,fir,pd,pf))

print("G_measure : {}".format(sum(g_measure)/len(g_measure)))
# find best harmony

HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…

-------------------------------------------------------------------------------------------------
Elapsed time: 0:01:37.563002
Best harmony: [0, 0, 3, 99, 12, 0.02, 0.0029]
Normalization: 5
Class Weight: 0.5692229867474252
Best fitness: 0.7836762838306578
[0, 0, 3, 99, 12, 0.02, 0.0029, 5, 0.5692229867474252]
G measure: 0.8275862068965517
LIR: 0.08253707797161719
FIR: 0.441919191919192
PD: 0.9230769230769231
PF: 0.25
-------------------------------------------------------------------------------------------------
Elapsed time: 0:01:39.765502
Best harmony: [0, 0, 4, 13, 3, 0.03, 0.0030968856302910595]
Normalization: 4
Class Weight: 0.64
Best fitness: 0.7980812984643497
[0, 0, 4, 13, 3, 0.03, 0.0030968856302910595, 4, 0.64]
G measure: 0.641399416909621
LIR: 0.14658483504440523
FIR: 0.2515151515151515
PD: 0.7692307692307693
PF: 0.45
-------------------------------------------------------------------------------------------------
Elapsed time: 0:01:38.971223
Best harmony: [1, 0, 22, 18, 16

In [29]:
# df_best_h= pandas.DataFrame(best_harmony,columns=col)
# df_best_g= pandas.DataFrame(g_measure,columns=["G Measure"])
# df_best_harmony = pandas.concat([df_best_h,df_best_g],axis=1)

In [30]:
# import os
# if not os.path.exists('Best Harmony_AEEEM.csv'):
#     df_best_harmony.to_csv('Best Harmony_AEEEM.csv', index=False, mode='w', encoding='utf-8-sig')
# else:
#     df_best_harmony.to_csv('Best Harmony_AEEEM.csv', index=False, mode='a', encoding='utf-8-sig', header=False)