In [1]:
import pandas as pd
import numpy as np

from sklearn import tree
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
    
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

In [2]:
def cost(y_true, y_pred, positive_label='pos', negative_label='neg',
         false_positive_cost=10, false_negative_cost=500):
    """Return the total cost of the misclassifications in ``y_pred``.

    The two inputs are compared row by row *by position* (not by index
    label), so a ``y_true`` that kept its original index after a shuffled
    split can be scored against freshly built predictions.

    Parameters
    ----------
    y_true, y_pred : Series, one-column DataFrame, ndarray or list
        Actual and predicted labels. For a DataFrame only the first column
        is used.
    positive_label, negative_label : hashable, default 'pos' / 'neg'
        Values that mark the positive and the negative class.
    false_positive_cost : number, default 10
        Cost charged for every row wrongly predicted as ``positive_label``.
    false_negative_cost : number, default 500
        Cost charged for every row wrongly predicted as ``negative_label``.

    Returns
    -------
    int
        ``false_positive_cost * n_wrong_positive
        + false_negative_cost * n_wrong_negative``
        (a float if a float cost is supplied).

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_pred`` do not contain the same number of rows.
    """
    def _as_labels(values):
        # A DataFrame is reduced to its first column; anything else is
        # flattened. dtype=object keeps the comparisons below element-wise
        # for any label type (str, int, ...).
        if isinstance(values, pd.DataFrame):
            values = values.iloc[:, 0]
        return np.asarray(values, dtype=object).ravel()

    actual = _as_labels(y_true)
    predicted = _as_labels(y_pred)
    if actual.shape[0] != predicted.shape[0]:
        raise ValueError(
            'y_true and y_pred must have the same number of rows '
            '(got {} and {})'.format(actual.shape[0], predicted.shape[0])
        )

    mismatched = (actual != predicted).astype(bool)
    predicted_positive = (predicted == positive_label).astype(bool)
    predicted_negative = (predicted == negative_label).astype(bool)

    wrong_positive_labels = int(np.count_nonzero(mismatched & predicted_positive))
    wrong_negative_labels = int(np.count_nonzero(mismatched & predicted_negative))

    return (wrong_positive_labels * false_positive_cost
            + wrong_negative_labels * false_negative_cost)
    

In [3]:
def preprocess(df):
    """Build a numeric feature frame from ``df``.

    The ``'class'`` column is removed, the literal string ``'na'`` is turned
    into ``NaN`` and every remaining column is converted with
    ``pd.to_numeric``; values that cannot be parsed become ``NaN``.
    ``df`` itself is not modified.

    Returns
    -------
    pandas.DataFrame
        The feature columns of ``df`` as numeric columns.
    """
    features = df.drop('class', axis=1).replace('na', np.nan)
    numeric_features = features[features.columns].apply(pd.to_numeric, errors='coerce')
    return numeric_features

In [4]:
def impute(df, strat='mean'):
    """Fill missing values in ``df`` with a freshly fitted ``SimpleImputer``.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric feature frame whose missing entries are ``NaN``.
    strat : str, default 'mean'
        Passed to ``SimpleImputer`` as ``strategy``.

    Returns
    -------
    pandas.DataFrame
        The imputed values with a default ``RangeIndex``. Columns that the
        imputer drops because they held no observed value at all are absent
        from the result.
    """
    imp = SimpleImputer(missing_values=np.nan, strategy=strat)
    imputed = imp.fit_transform(df)

    columns = df.columns
    if imputed.shape[1] != len(columns):
        # SimpleImputer can drop features that were entirely missing during
        # fit; labelling the narrower array with every original column name
        # would raise a shape-mismatch error, so keep only the names of the
        # columns that had at least one observed value.
        columns = columns[df.notna().any(axis=0).to_numpy()]

    return pd.DataFrame(imputed, columns=columns)


In [5]:
def get_top(df, y, n):
    """Return the names of the ``n`` columns of ``df`` kept by RFE.

    Recursive feature elimination is run with a random forest
    (150 trees, ``max_depth=5``, ``random_state=1``) as the ranking
    estimator; RFE progress is printed because ``verbose=5``.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature frame.
    y : array-like
        Target labels for the rows of ``df``.
    n : int
        Passed to RFE as ``n_features_to_select``.

    Returns
    -------
    list
        Selected column names, in the order they appear in ``df``.
    """
    forest = RandomForestClassifier(n_estimators=150, max_depth=5, random_state=1)
    selector = RFE(estimator=forest, n_features_to_select=n, verbose=5)
    selector.fit(df, y)

    # support_ holds one boolean per column; True marks a selected feature.
    column_names = df.columns.tolist()
    return [name for name, keep in zip(column_names, selector.support_) if keep]

In [6]:
def balance_data(df, label, over_ratio=0.3, under_ratio=0.5, random_state=None):
    """Rebalance the classes with SMOTE followed by random under-sampling.

    Parameters
    ----------
    df : array-like or DataFrame
        Feature matrix.
    label : array-like
        Class labels for the rows of ``df``.
    over_ratio : float, default 0.3
        ``sampling_strategy`` given to ``SMOTE``.
    under_ratio : float, default 0.5
        ``sampling_strategy`` given to ``RandomUnderSampler``.
    random_state : int, RandomState instance or None, default None
        Seed forwarded to both samplers. Pass an integer to make the
        resampling reproducible; ``None`` keeps the previous unseeded
        behaviour.

    Returns
    -------
    tuple
        ``(df, label)`` after resampling.

    Notes
    -----
    Per the imbalanced-learn documentation a float ``sampling_strategy`` is
    the target minority/majority ratio and is only valid for binary targets.
    """
    over = SMOTE(sampling_strategy=over_ratio, random_state=random_state)
    under = RandomUnderSampler(sampling_strategy=under_ratio, random_state=random_state)
    # Over-sample first, then under-sample the result.
    pipeline = Pipeline(steps=[('o', over), ('u', under)])
    df, label = pipeline.fit_resample(df, label)
    return df, label

In [7]:
def scale_data_minmax(df):
    """Scale ``df`` with a ``MinMaxScaler`` fitted on ``df`` itself.

    Returns
    -------
    tuple
        ``(scaled, scaler)``: the scaled values as a DataFrame carrying the
        column names of ``df`` (with a default index), and the fitted
        scaler so the same transform can be applied to other data.
    """
    scaler = MinMaxScaler()
    scaled_values = scaler.fit_transform(df)
    scaled_frame = pd.DataFrame(scaled_values, columns=df.columns)
    return scaled_frame, scaler


In [8]:
def scale_data_standard(df):
    """Scale ``df`` with a ``StandardScaler`` fitted on ``df`` itself.

    Returns
    -------
    tuple
        ``(scaled, scaler)``: the scaled values as a DataFrame carrying the
        column names of ``df`` (with a default index), and the fitted
        scaler so the same transform can be applied to other data.
    """
    scaler = StandardScaler()
    scaled_values = scaler.fit_transform(df)
    scaled_frame = pd.DataFrame(scaled_values, columns=df.columns)
    return scaled_frame, scaler
    

In [9]:
def LGBM(X_train, y_train, X_test, y_test):
    """Fit a default ``LGBMClassifier`` and return its cost on the test set.

    Parameters
    ----------
    X_train, y_train
        Training features and labels.
    X_test, y_test
        Held-out features and labels used for scoring.

    Returns
    -------
    The value of ``cost(y_test, y_pred)`` for the model's predictions on
    ``X_test``.
    """
    import lightgbm as lgb

    clf = lgb.LGBMClassifier()
    clf.fit(X_train, y_train)

    # Label the predictions with y_test's own index (when it has one) so the
    # two stay row-aligned whether they are later paired by index or by
    # position; a default RangeIndex would not match a shuffled y_test.
    y_pred = pd.DataFrame(clf.predict(X_test), index=getattr(y_test, 'index', None))

    return cost(y_test, y_pred)

In [10]:
def gradient(X_train, y_train, X_test, y_test):
    """Fit a default ``GradientBoostingClassifier`` and return its test cost.

    Parameters
    ----------
    X_train, y_train
        Training features and labels.
    X_test, y_test
        Held-out features and labels used for scoring.

    Returns
    -------
    The value of ``cost(y_test, y_pred)`` for the model's predictions on
    ``X_test``.
    """
    from sklearn.ensemble import GradientBoostingClassifier

    clf = GradientBoostingClassifier()
    clf.fit(X_train, y_train)

    # Label the predictions with y_test's own index (when it has one) so the
    # two stay row-aligned whether they are later paired by index or by
    # position; a default RangeIndex would not match a shuffled y_test.
    y_pred = pd.DataFrame(clf.predict(X_test), index=getattr(y_test, 'index', None))

    return cost(y_test, y_pred)