In [61]:
import datetime
import warnings
from collections import Counter
from io import StringIO
import pandas as pd
import numpy as np
import statsmodels.api as sm
from joblib import dump
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score, confusion_matrix, roc_curve, auc, \
    roc_auc_score
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

SCORING = {'accuracy': 'accuracy', 'precision': make_scorer(precision_score), 'recall': make_scorer(recall_score),
           'f1': make_scorer(f1_score),
           'AUC': make_scorer(roc_auc_score)}

In [62]:
#importing data and creating test/train features
label_encoder = LabelEncoder()
dfApache = pd.read_excel("C:/Users/chels/Desktop/SampleData/ApacheSample.xlsx")
dfJunit = pd.read_excel("C:/Users/chels/Desktop/SampleData/junitSample.xlsx")
dfOkhttp = pd.read_excel("C:/Users/chels/Desktop/SampleData/okhttpSample.xlsx")
dfRetrofit = pd.read_excel("C:/Users/chels/Desktop/SampleData/retrofitSample.xlsx")
dfSpringBoot = pd.read_excel("C:/Users/chels/Desktop/SampleData/springBootSample.xlsx")
dfApache


trainFeaturesApache, testFeaturesApache, trainLabelsApache, testLabelsApache = train_test_split(
    dfApache['message'], dfApache['label'], test_size=0.25,  random_state=42 
)
trainFeaturesApache = label_encoder.fit_transform(trainFeaturesApache)
testFeaturesApache = label_encoder.fit_transform(testFeaturesApache)

trainFeaturesJunit, testFeaturesJunit, trainLabelsJunit, testLabelsJunit = train_test_split(
    dfJunit['message'], dfJunit['label'], test_size=0.25,  random_state=42 
)
trainFeaturesJunit = label_encoder.fit_transform(trainFeaturesJunit)
testFeaturesJunit = label_encoder.fit_transform(testFeaturesJunit)

trainFeaturesOkhttp, testFeaturesOkhttp, trainLabelsOkhttp, testLabelsOkhttp = train_test_split(
    dfOkhttp['message'], dfOkhttp['label'], test_size=0.25,  random_state=42 
)
trainFeaturesOkhttp = label_encoder.fit_transform(trainFeaturesOkhttp)
testFeaturesOkhttp = label_encoder.fit_transform(testFeaturesOkhttp)

trainFeaturesRetrofit, testFeaturesRetrofit, trainLabelsRetrofit, testLabelsRetrofit = train_test_split(
    dfRetrofit['message'], dfRetrofit['label'], test_size=0.25,  random_state=42 
)
trainFeaturesRetrofit = label_encoder.fit_transform(trainFeaturesRetrofit)
testFeaturesRetrofit = label_encoder.fit_transform(testFeaturesRetrofit)

trainFeaturesSpringBoot, testFeaturesSpringBoot, trainLabelsSpringBoot, testLabelsSpringBoot = train_test_split(
    dfSpringBoot['message'], dfSpringBoot['label'], test_size=0.25,  random_state=42 
)
trainFeaturesSpringBoot = label_encoder.fit_transform(trainFeaturesSpringBoot)
testFeaturesSpringBoot = label_encoder.fit_transform(testFeaturesSpringBoot)

In [73]:
def get_score_by_grid(grid: GridSearchCV):
    print("GridSearchCV is complate!")
    accuRank = grid.cv_results_['rank_test_accuracy']
    preMean = grid.cv_results_['mean_test_precision']
    bestParam = grid.cv_results_['params']
    bestIndex = grid.best_index_
    i = bestIndex
    rank = 1

    while preMean[i] < 0.5:
        rank += 1
        indx = 0
        if rank > 20:
            break
        for num in accuRank:
            if num == rank:
                i = indx
                break
            indx += 1
    bestIndex = i

    res = "refit by:" + str(grid.refit) + " Parameters: " + str(bestParam[bestIndex])
    #logger.info(res)
   # print(res)
    return bestParam[bestIndex]

def KNNClassifier(trainFeatures, trainLabels):
    model = KNeighborsClassifier()
    fold = KFold(n_splits=10, random_state=5, shuffle=True)
    parameter = {'n_neighbors': np.arange(1, 10, 1),
                 'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
                 }
    grid = GridSearchCV(estimator=model, param_grid=parameter, cv=fold,
                        scoring=SCORING, refit="accuracy", n_jobs=25)
    grid.fit(trainFeatures, trainLabels)
    bestParameter = get_score_by_grid(grid)
    print("KNN Best using %s " % (bestParameter))
    model = KNeighborsClassifier(n_neighbors=bestParameter['n_neighbors'], algorithm=bestParameter['algorithm'])
    return model



KNNApacheModel = KNNClassifier(trainFeaturesApache.reshape(-1, 1), trainLabelsApache)
KNNApacheModel.fit(trainFeaturesApache.reshape(-1, 1), trainLabelsApache)


GridSearchCV is complate!
KNN Best using {'algorithm': 'auto', 'n_neighbors': 1} 


 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan]


KNeighborsClassifier(n_neighbors=1)

In [77]:
def binClassify(yValid, yPred, yScore):
    accuracy_mean = accuracy_score(yValid, yPred)
    tn, fp, fn, tp = confusion_matrix(yValid, yPred).ravel()
    f1Score = f1_score(yValid, yPred, average=None)
    recallScore = recall_score(yValid, yPred, average=None)
    precisionScore = precision_score(yValid, yPred, average=None)
    weightedPrecision = precision_score(yValid, yPred, average="weighted")
    weightedRecall = recall_score(yValid, yPred, average="weighted")
    weightedF1 = f1_score(yValid, yPred, average="weighted")

    fpr, tpr, thresholds = roc_curve(yValid, yScore, pos_label=0)
    aucArea = auc(fpr, tpr)
    aucScore = roc_auc_score(y_true=yValid, y_score=yScore)

    return weightedPrecision, weightedRecall, weightedF1, accuracy_mean, tn, fp, fn, tp, f1Score, recallScore, precisionScore, fpr, tpr, aucArea, aucScore

def modelScore(testFeatures, testLabels, trainedModel):
    # logger.info(trainedModel.score(testFeatures, testLabels))
    y_valid = testLabels
    y_pred = trainedModel.predict(testFeatures)
    y_score = trainedModel.predict_proba(testFeatures)[:, 1]
    
    return y_pred, y_score, y_valid

    #weightedPrecision, weightedRecall, weightedF1, accuracy_mean, tn, fp, fn, tp, f1Score, recallScore, precisionScore, fpr, tpr, aucArea, aucScore = binClassify(
       # yvalid, y_pred, y_score)

    #return accuracy_mean, precisionScore, recallScore, f1Score, aucScore, tn, fp, fn, tp, weightedPrecision, weightedRecall, weightedF1


y_pred, y_score,y_valid = modelScore(testFeaturesApache.reshape(-1, 1), testLabelsApache, KNNApacheModel)

def calculate_accuracy(y_pred, y_valid):
    correct_predictions = (y_pred == y_valid).sum()
    total_predictions = len(y_valid)
    accuracy = correct_predictions / total_predictions
    return accuracy

accuracy = calculate_accuracy(y_pred, y_valid)
print("Accuracy:", accuracy)

Accuracy: 0.43902439024390244


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


NotFittedError: This KNeighborsClassifier instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.