In [1]:
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
from numpy import mean, std
from sklearn.datasets import make_gaussian_quantiles
#from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, make_scorer, f1_score, recall_score,plot_confusion_matrix
from sklearn.model_selection import cross_val_score,GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

In [2]:
# ---- Class sizes and geometry of the synthetic dataset ----
Minority_Samples = 250    # number of minority-class samples
Majority_Samples = 2000   # number of majority-class samples
Separation = 10           # degree of separation between the two classes
Overlap = 50              # overlap between the two classes, in percent

# ---- Graph axis limits ----
GraphMinLimit = -10
GraphMaxLimit = 20

# ---- 2x2 covariance matrices describing the shape of each class cloud ----

# Pair 1: both classes circular
MajorCov1 = [[8, 0], [0, 8]]
MinorCov1 = [[10, 0], [0, 10]]

# Flat majority vs. inclined minority
MajorCovFlat = [[8, 0], [0, 1]]       # thicker line, flat
MinorCovInclined = [[8, 7], [7, 8]]   # thicker line, inclined at 45 degrees

# Pair 2: both classes slim lines inclined at 45 degrees
MajorCov2 = [[6, 6], [6, 6]]
MinorCov2 = [[4, 4], [4, 4]]

# Pair 3: both classes thicker lines parallel to the x-axis
MajorCov3 = [[4, 0], [0, 1]]
MinorCov3 = [[6, 0], [0, 1]]

# Pair 4: inclined majority vs. flat minority
MajorCov4 = [[8, 7], [7, 8]]   # thicker line, inclined at 45 degrees
MinorCov4 = [[8, 0], [0, 1]]   # thicker line, flat

# Pair 5: both classes thicker lines parallel to the y-axis
MajorCov5 = [[1, 0], [0, 4]]
MinorCov5 = [[1, 0], [0, 6]]

In [3]:
def GenerateDataset(Minority,Majority, Separation, Overlap,MajorCov,MinorCov, seed=0):
    """Draw a two-class, two-feature Gaussian dataset with controllable overlap.

    The majority class is sampled around the origin; the minority class is
    sampled around ``[0.5 + Separation * (100 - Overlap) / 100, 0]``, so a
    larger ``Overlap`` moves the minority centre closer to the majority one.

    Parameters
    ----------
    Minority, Majority : int
        Number of points drawn for the minority / majority class.
    Separation : float
        Scale of the shift applied to the minority centre along Feature1.
    Overlap : float
        Overlap in percent; 100 leaves only the fixed 0.5 offset.
    MajorCov, MinorCov : 2x2 array-like
        Covariance matrices of the majority / minority class.
    seed : int, optional
        Value passed to ``np.random.seed`` before sampling. The default ``0``
        reproduces the previously hard-coded behaviour.

    Returns
    -------
    pd.DataFrame
        Columns ``Feature1``, ``Feature2`` and ``Results`` (0 for majority,
        1 for minority). Majority rows come first, then minority rows.

    Notes
    -----
    This reseeds NumPy's *global* random generator as a side effect.
    """
    # Reseed the global generator so the same arguments always yield the same sample.
    np.random.seed(seed)

    majority_mean = [0, 0]
    # Formula that controls the overlap: the shift shrinks linearly as Overlap grows.
    minority_mean = 0.5 + Separation * (100 - Overlap) / 100

    # Draw order (majority first, then minority) determines the generated values.
    majority_points = np.random.multivariate_normal(majority_mean, MajorCov, Majority)
    minority_points = np.random.multivariate_normal([minority_mean, 0], MinorCov, Minority)

    features = np.r_[majority_points, minority_points]
    labels = np.r_[np.zeros((Majority, 1), dtype=int), np.ones((Minority, 1), dtype=int)]

    # Stacking with the float features makes the label column float as well.
    return pd.DataFrame(np.c_[features, labels], columns=['Feature1', 'Feature2', 'Results'])

In [4]:
def ImbalancePlot(df: pd.DataFrame, x1: str, x2: str, y: str, MinLimit: int, MaxLimit: int, title: str = ''
                       ,save: bool = False,c='YlOrRd', figname='figure.png'):
    """Scatter-plot the two classes of ``df`` on the ``x1`` / ``x2`` plane.

    Rows where ``df[y] == 0`` are drawn in yellow and rows where
    ``df[y] == 1`` in gray, both with black marker edges. ``MinLimit`` and
    ``MaxLimit`` bound both axes. When ``save`` is true the figure is written
    to ``figname`` before it is shown. ``c`` is accepted but not used.
    """
    plt.figure(figsize=(6, 4))

    # One scatter layer per class: (class value, legend label, fill colour).
    class_styles = ((0, '0', 'yellow'), (1, '1', 'gray'))
    for class_value, legend_text, fill in class_styles:
        members = df[df[y] == class_value]
        plt.scatter(x=members[x1], y=members[x2], label=legend_text, c=fill, edgecolors='black')

    plt.title(title, fontsize=10)
    plt.xlabel(x1, fontsize=10)
    plt.ylabel(x2, fontsize=10)

    # The same limits are applied to both axes.
    axis_range = (MinLimit, MaxLimit)
    plt.xlim(*axis_range)
    plt.ylim(*axis_range)
    plt.legend()

    if save:
        plt.savefig(figname, dpi=300, bbox_inches='tight', pad_inches=0)
    plt.show()

In [5]:
def Oversample_Plot(x, y, MinLimit: int, MaxLimit: int, title: str = '', save: bool = False, figname='figure.png'):
    """Scatter-plot a two-feature sample set, one layer per class.

    Parameters
    ----------
    x : array-like with at least two columns
        Feature matrix; column 0 goes on the horizontal axis and column 1 on
        the vertical axis.
    y : array-like
        Class label per row of ``x``; rows labelled 0 and 1 are drawn.
    MinLimit, MaxLimit : number
        Lower / upper bound applied to both axes.
    title : str
        Figure title.
    save : bool
        When true, write the figure to ``figname`` before showing it.
    figname : str
        Output path used when ``save`` is true.
    """
    points = np.asarray(x)
    labels = np.asarray(y).ravel()

    # Axis captions: take the column names when x is a DataFrame, otherwise
    # fall back to generic feature names. (The data itself must never be
    # passed to xlabel/ylabel - it would be rendered as text.)
    if isinstance(x, pd.DataFrame) and x.shape[1] >= 2:
        x_caption, y_caption = (str(col) for col in x.columns[:2])
    else:
        x_caption, y_caption = 'Feature1', 'Feature2'

    plt.figure(figsize=(6, 4))
    # Select the rows of each class with a mask so that every layer contains
    # only its own points and the legend entries are meaningful.
    for class_value, legend_text, fill in ((0, '0', 'yellow'), (1, '1', 'gray')):
        members = labels == class_value
        plt.scatter(points[members, 0], points[members, 1],
                    c=fill, label=legend_text, edgecolors='black')

    plt.title(title, fontsize=10)
    plt.xlabel(x_caption, fontsize=10)
    plt.ylabel(y_caption, fontsize=10)
    plt.xlim(MinLimit, MaxLimit)
    plt.ylim(MinLimit, MaxLimit)
    plt.legend()
    if save:
        plt.savefig(figname, dpi=300, bbox_inches='tight', pad_inches=0)
    plt.show()

In [6]:
#Function to Visualize Plots

def Genrated_points_Proposed_Algo(df: pd.DataFrame, x1: str, x2: str, y: str, MinLimit: int, MaxLimit: int, title: str = ''
                       ,save: bool = False,c='YlOrRd', figname='figure.png'):
    """Scatter-plot three label groups of ``df`` on the ``x1`` / ``x2`` plane.

    Rows where ``df[y]`` equals 0, 1 and 2 are drawn in yellow, gray and red
    respectively, with black marker edges. ``MinLimit`` and ``MaxLimit`` bound
    both axes. When ``save`` is true the figure is written to ``figname``
    before it is shown. ``c`` is accepted but not used.
    """
    plt.figure(figsize=(6, 4))

    # Each layer carries a label so that plt.legend() has entries to show;
    # without labels the legend is empty and matplotlib emits a warning.
    for class_value, legend_text, fill in ((0, '0', 'yellow'), (1, '1', 'gray'), (2, '2', 'red')):
        members = df[df[y] == class_value]
        plt.scatter(x=members[x1], y=members[x2], label=legend_text, c=fill, edgecolors='black')

    plt.title(title, fontsize=10)
    plt.xlabel(x1, fontsize = 10)
    plt.ylabel(x2, fontsize = 10)
    plt.xlim(MinLimit, MaxLimit)
    plt.ylim(MinLimit, MaxLimit)
    plt.legend()
    if save:
        plt.savefig(figname, dpi=300, bbox_inches='tight', pad_inches=0)
    plt.show()

In [7]:
# Relabel samples (1.0 -> 2.0) so they can be plotted in a different colour after oversampling

def get_Graph_Values(X_prev,y_prev,X_algo,y_algo):
    """Relabel rows of ``y_algo`` whose Feature1 matches a minority row of ``X_prev``.

    For every row of ``X_prev`` whose label in ``y_prev`` is 1.0, the
    ``Feature1`` value (rounded to 6 decimals) is looked up in
    ``X_algo['Feature1']`` (also rounded to 6 decimals). If exactly one row of
    ``X_algo`` matches and its current label in ``y_algo`` is 1.0, that label
    is changed to 2.0.

    Parameters
    ----------
    X_prev : pd.DataFrame
        Frame with a ``Feature1`` column.
    y_prev : pd.Series
        Labels for ``X_prev``. Its index labels are used as row *positions*
        into ``X_prev`` - presumably both carry a default RangeIndex; confirm
        against the caller.
    X_algo : pd.DataFrame
        Frame with a ``Feature1`` column, aligned with ``y_algo``.
    y_algo : pd.Series
        Labels for ``X_algo``. Modified in place.

    Returns
    -------
    pd.Series
        The same ``y_algo`` object, after relabelling.
    """
    minority_positions = y_prev[y_prev == 1.0].index
    minority_feature = X_prev.iloc[minority_positions]['Feature1'].sort_index()

    # Loop-invariant: round the comparison column once instead of per lookup.
    algo_feature_rounded = round(X_algo['Feature1'], 6)

    # Iterating the values directly avoids the deprecated positional
    # ``Series[0]`` lookup on a label-indexed row.
    for value in minority_feature:
        matches = y_algo[algo_feature_rounded == round(value, 6)]
        # Only an unambiguous (single) match that is still labelled 1.0 is relabelled.
        if len(matches) == 1 and matches.values[0] == 1.0:
            y_algo[matches.index[0]] = 2.0
    return y_algo

In [8]:
class MainClass(): # change class name to main class
    """Namespace of model-fitting and evaluation helpers.

    None of the helpers use instance state, so they are declared as static
    methods: ``MainClass.helper(...)`` keeps working and calling them on an
    instance no longer fails.
    """

    @staticmethod
    def generate_clf(input_C, input_gamma, X1, y1, X2, y2):
        """Fit an RBF-kernel SVC on ``(X1, y1)`` and print its accuracy on ``(X2, y2)``.

        Returns the fitted classifier.
        """
        classifier = SVC(kernel='rbf', gamma=input_gamma, C=input_C)
        classifier.fit(X1, y1)
        ypred = classifier.predict(X2)
        # NOTE(review): the message says "Training" but the score is computed on
        # (X2, y2); it is a training accuracy only if the caller passes the
        # training set twice.
        print("Training Accuracy = {}".format(accuracy_score(y2, ypred)))
        return classifier

    @staticmethod
    def make_f1_scorer():
        """Return a scorer object computing the macro-averaged F1 score."""
        f1 = make_scorer(f1_score, average='macro')
        return f1

    @staticmethod
    def svc_param_selection(X_train, y_train):
        """Grid-search ``C`` and ``gamma`` for an RBF SVC and return the best parameters.

        Several metrics are recorded, and the winning combination is the one
        with the best recall (``refit='recall'``).
        """
        Cs = [1] #[0.1,1,10,50,100]
        gammas = [500]
        #scoring = make_scorer(recall_score, average='weighted',pos_label=1)
        scoring = {
            'AUCe': 'roc_auc',
            'Accuracy': 'accuracy',
            'prec': 'precision',
            'recall': 'recall',
            'f1s': 'f1',
            # Specificity is the recall of the negative class, hence pos_label=0
            # (pos_label=1 would merely duplicate the 'recall' entry).
            'spec': make_scorer(recall_score, pos_label=0),
        }
        param_grid = {'C': Cs, 'gamma': gammas}
        grid_search = GridSearchCV(SVC(kernel='rbf'), param_grid=param_grid, scoring=scoring, refit='recall',return_train_score = True)
        grid_search.fit(X_train, y_train)
        return grid_search.best_params_

    @staticmethod
    def plot_svm_decision_boundry(X,y, model):
        """Fit ``model`` on ``(X, y)`` and draw its decision regions over the data.

        ``X`` is indexed as ``X[:, 0]`` / ``X[:, 1]``, so it must support
        NumPy-style 2-D indexing (e.g. an ndarray with two feature columns).
        ``model`` is (re)fitted by this call.
        """
        svm_mod= model.fit(X,y)
        x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
        y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
        # Grid step: one hundredth of the horizontal extent, used for both axes.
        h = (x_max - x_min)/100
        xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
        plt.subplot(1, 1, 1)
        # Predict a class for every grid node, then fold back to the grid shape.
        z = svm_mod.predict(np.c_[xx.ravel(), yy.ravel()])
        z = z.reshape(xx.shape)
        plt.contourf(xx, yy, z, cmap=plt.cm.Paired, alpha=0.8)
        plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Paired)
        plt.xlabel('Length')
        plt.ylabel('Width')
        plt.xlim(xx.min(), xx.max())
        plt.title('SVM Decision Boundary')
        plt.show()

    @staticmethod
    def naive_bayes(X_train,y_train,X_test,y_test,name):
        """Fit Gaussian naive Bayes, print a report, and return ``(accuracy, confusion_matrix)``.

        ``name`` is printed in front of the classification report.
        """
        gnb = GaussianNB()
        gnb.fit(X_train, y_train)
        y_pred_gnb = gnb.predict(X_test)
        accuracy_test = accuracy_score(y_test,y_pred_gnb)
        cm_gnb = confusion_matrix(y_test, y_pred_gnb)
        report = classification_report(y_test, y_pred_gnb)
        print(name, report)
        print(cm_gnb)
        return accuracy_test, cm_gnb

    @staticmethod
    def svm_model(X_train,y_train,X_test,y_test,name):
        """Fit a default ``SVC``, print a report, and return ``(accuracy, confusion_matrix)``.

        ``name`` is printed in front of the classification report.
        """
        svc = SVC()
        svc.fit(X_train, y_train)
        y_pred_svc = svc.predict(X_test)
        accuracy_test = accuracy_score(y_test,y_pred_svc)
        cm_svm = confusion_matrix(y_test, y_pred_svc)
        report = classification_report(y_test, y_pred_svc)
        print(name, report)
        print(cm_svm)
        return accuracy_test, cm_svm

In [9]:
def find_outliers_IQR(df):
    '''
    Description: Selects the values lying more than 1.5 * IQR below the
    first quartile or above the third quartile of ``df``.
    '''
    lower_quartile = df.quantile(0.25)
    upper_quartile = df.quantile(0.75)
    spread = upper_quartile - lower_quartile

    # Flag each side of the fence separately, then combine.
    too_low = df < (lower_quartile - 1.5 * spread)
    too_high = df > (upper_quartile + 1.5 * spread)
    return df[too_low | too_high]
def drop_outliers_IQR(df):
    '''
    Description: Removes the values lying more than 1.5 * IQR below the
    first quartile or above the third quartile of ``df``.

    For a DataFrame, any row containing an outlier in at least one column is
    dropped (the outlier cell becomes NaN and ``dropna`` removes the row).
    The result has a fresh index; the previous index is kept as a column
    because ``reset_index`` is called without ``drop=True``.
    '''
    q1=df.quantile(0.25)
    q3=df.quantile(0.75)
    IQR=q3-q1
    not_outliers = df[~((df<(q1-1.5*IQR)) | (df>(q3+1.5*IQR)))]
    # Use the filtered data computed above (the undefined name ``outliers``
    # was referenced here before).
    outliers_dropped = not_outliers.dropna().reset_index()
    return outliers_dropped

In [10]:
def Train_model(X_train,y_train,X_test,y_test,model):
    """Fit ``model``, report its test performance and plot the confusion matrix.

    Parameters
    ----------
    X_train, y_train
        Data the estimator is fitted on.
    X_test, y_test
        Data the estimator is evaluated on.
    model
        Estimator exposing ``fit`` and ``predict``; it is fitted in place.

    Returns
    -------
    tuple
        ``(accuracy, confusion_matrix)`` computed on the test data.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy_test = accuracy_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)
    print(classification_report(y_test, y_pred))

    # plot_confusion_matrix was removed in scikit-learn 1.2; prefer its
    # replacement and fall back to the legacy helper on releases that
    # predate ConfusionMatrixDisplay.from_estimator.
    try:
        from sklearn.metrics import ConfusionMatrixDisplay
        draw_confusion_matrix = ConfusionMatrixDisplay.from_estimator
    except (ImportError, AttributeError):
        draw_confusion_matrix = plot_confusion_matrix
    draw_confusion_matrix(model, X_test, y_test)
    return accuracy_test, cm