In [6]:
import os, platform, pprint, sys
import fastai
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
import yellowbrick as yb

from fastai.tabular.data import TabularDataLoaders, TabularPandas
from fastai.tabular.all import FillMissing, Categorify, Normalize, tabular_learner, accuracy, ClassificationInterpretation, ShowGraphCallback, RandomSplitter, range_of

from sklearn.base import BaseEstimator
from sklearn.metrics import accuracy_score, classification_report
from sklearn.neighbors import KNeighborsClassifier

from yellowbrick.model_selection import CVScores, LearningCurve, ValidationCurve


# random seed handed to fastai's RandomSplitter inside run_experiment so the
# train/validation split is the same on every run
seed: int = 14


# pretty(obj) pretty-prints obj using a 4-space indent and a 30-column width
pretty = pprint.PrettyPrinter(indent=4, width=30).pprint


# file paths for the data we will be working on
#   file_path_1: Benign-vs-DDoS csv from the "baseline" prepared directory
#   file_path_2: Benign-vs-DDoS csv from the "timebased" prepared directory
#   dataPath   : models directory (not referenced by the cells visible in this notebook)
file_path_1: str = '../data/prepared/baseline/Benign_vs_DDoS.csv'
file_path_2: str = '../data/prepared/timebased/Benign_vs_DDoS.csv'
dataPath   : str = './models'


# integer tags naming the two dataset types
# (not referenced by the cells visible in this notebook)
Baseline : int = 0
Timebased: int = 1


# print the python version and the version of every major library used here
# so that results can be tied to a specific environment
print(
    f'''
    python:\t{platform.python_version()}

    \tfastai:\t\t{fastai.__version__}
    \tmatplotlib:\t{mpl.__version__}
    \tnumpy:\t\t{np.__version__}
    \tpandas:\t\t{pd.__version__}
    \tsklearn:\t{sklearn.__version__}
    \tyellowbrick:\t{yb.__version__}
    '''
)


    python:	3.7.10

    	fastai:		2.4.1
    	matplotlib:	3.3.4
    	numpy:		1.20.3
    	pandas:		1.2.5
    	sklearn:	0.24.2
    	yellowbrick:	1.3.post1
    


In [7]:
def load_data(filePath: str) -> pd.DataFrame:
    '''
        Loads the dataset stored at filePath and caches it as a pickle so that
        later calls can skip parsing the csv
        Function will only work when filePath is a .csv file

        The cache file is written to ../data/cache/<relative path>.pickle where
        <relative path> is filePath with a leading '../data/prepared/' removed;
        a filePath outside that directory is used unchanged

        filePath: path of the .csv file to load
        returns : the dataset as a pandas DataFrame
    '''

    # e.g. ../data/prepared/baseline/x.csv -> ../data/cache/baseline/x.csv.pickle
    # (match the prefix explicitly instead of slicing a fixed number of characters,
    #  which silently mangled any other path that started with '..')
    preparedPrefix: str = '../data/prepared/'
    if filePath.startswith(preparedPrefix):
        filePathClean: str = filePath[len(preparedPrefix):]
    else:
        filePathClean: str = filePath
    pickleDump: str = f'../data/cache/{filePathClean}.pickle'

    print(f'Loading Dataset: {filePath}')
    print(f'\tTo Dataset Cache: {pickleDump}\n')

    # check if data already exists within cache
    # if not, load data and cache it
    if os.path.exists(pickleDump):
        # NOTE: unpickling can execute arbitrary code; this is only acceptable
        # because the cache file is written by this function itself
        df = pd.read_pickle(pickleDump)
    else:
        df = pd.read_csv(filePath, low_memory=True)
        # the cache sub-directory may not exist yet on a fresh checkout
        os.makedirs(os.path.dirname(pickleDump), exist_ok=True)
        df.to_pickle(pickleDump)

    return df


class SklearnWrapper(BaseEstimator):
    '''
        A wrapper for fastai learners for creating visualizations using yellowbrick
        code sourced from: 
        forums.fast.ai/t/fastai-with-yellowbrics-how-to-get-roc-curves-more/79408

        The wrapped learner is expected to be trained already: fit() does no
        training, and every prediction is delegated to the learner
    '''
    # tells scikit-learn style tooling to treat this estimator as a classifier
    _estimator_type = "classifier"
        
    def __init__(self, model):
        '''
            model: the fastai learner to wrap
        '''
        self.model = model
        # class values are read from the learner's dataloaders rather than learned in fit()
        self.classes_ = list(self.model.dls.y.unique())
    
    def fit(self, X, y):
        '''
            No-op kept for API compatibility, the wrapped learner is not retrained
            returns self, following the scikit-learn estimator convention
        '''
        return self
        
    def score(self, X, y):
        '''
            returns the accuracy of predict(X) measured against the labels y
        '''
        return accuracy_score(y, self.predict(X))
    
    def get_new_preds(self, X):
        '''
            Runs the wrapped learner on X

            X: feature data, used to build a new dataset from the learner's
               validation dataset via .new(X)
               # NOTE(review): presumably a dataframe that has already been
               # processed like to.train.xs / to.valid.xs - confirm

            returns the 2-tuple (preds, dec_preds) produced by
            get_preds(with_decoded=True)
        '''
        new_to = self.model.dls.valid_ds.new(X)
        # continuous columns are cast to float32 before being handed to the learner
        new_to.conts = new_to.conts.astype(np.float32)
        new_dl = self.model.dls.valid.new(new_to)
        # no_bar() suppresses the learner's progress bar while predicting
        with self.model.no_bar():
            preds,_,dec_preds = self.model.get_preds(dl=new_dl, with_decoded=True)
        return (preds, dec_preds)

    def predict_proba(self, X):
        '''
            returns the first element of get_new_preds(X) as a numpy array
        '''
        return self.get_new_preds(X)[0].numpy()
    
    def predict(self, X):
        '''
            returns the decoded predictions of get_new_preds(X) as a numpy array
        '''
        return self.get_new_preds(X)[1].numpy()

In [8]:
# load (or fetch from the pickle cache) the baseline dataset first, then the timebased one
baseline_df : pd.DataFrame
timebased_df: pd.DataFrame
baseline_df, timebased_df = map(load_data, (file_path_1, file_path_2))

Loading Dataset: ../data/prepared/baseline/Benign_vs_DDoS.csv
	To Dataset Cache: ../data/cache/baseline/Benign_vs_DDoS.csv.pickle

Loading Dataset: ../data/prepared/timebased/Benign_vs_DDoS.csv
	To Dataset Cache: ../data/cache/timebased/Benign_vs_DDoS.csv.pickle



In [9]:
def run_experiment(df: pd.DataFrame, name: str) -> tuple:
    '''
        Run classification on a given dataframe, saving the model as {name}.model
        returns the 7-tuple with the following indices:
        viz_data: tuple = (name, model, classes, X_train, y_train, X_test, y_test)
        where model is the trained learner wrapped in a SklearnWrapper

        df  : dataframe holding the features and a 'Label' column
        name: used for the saved model file and passed through in viz_data

        raises ValueError when the learner's vocab holds fewer than two classes
    '''

    # First we split the features into the dependent variable and 
    # continuous and categorical features
    dep_var: str = 'Label'
    if 'Protocol' in df.columns:
        categorical_features: list = ['Protocol']
    else:
        categorical_features: list = []
    # built in dataframe column order so the feature layout is identical on every
    # run (iterating a set of strings is not order-stable across interpreter sessions)
    continuous_features: list = [
        column for column in df.columns
        if column != dep_var and column not in categorical_features
    ]

    # Next, we set up the feature engineering pipeline, namely filling missing values
    # encoding categorical features, and normalizing the continuous features
    # all within a pipeline to prevent the normalization from leaking details
    # about the test sets through the normalized mapping of the training sets
    procs = [FillMissing, Categorify, Normalize]
    splits = RandomSplitter(valid_pct=0.2, seed=seed)(range_of(df))
    
    # The dataframe is loaded into a fastai datastructure now that 
    # the feature engineering pipeline has been set up
    to = TabularPandas(
        df            , y_names=dep_var                , 
        splits=splits , cat_names=categorical_features ,
        procs=procs   , cont_names=continuous_features , 
    )

    # The dataframe is then converted into a fastai dataset
    dls = to.dataloaders(bs=64)

    # Next, we set up, train, and save the deep neural network
    model = tabular_learner(dls, layers=[50, 28], metrics=accuracy, cbs=ShowGraphCallback)
    model.fit_one_cycle(10)
    model.save(f'{name}.model')

    # We print the results of the training    
    loss, acc = model.validate()
    print('loss {}: accuracy: {:.2f}%'.format(loss, acc*100))

    # A confusion matrix is created to help evaluate the results
    interp = ClassificationInterpretation.from_learner(model)
    interp.plot_confusion_matrix()

    # We extract the training and test datasets from the dataframe
    X_train = to.train.xs.reset_index(drop=True)
    X_test = to.valid.xs.reset_index(drop=True)
    y_train = to.train.ys.values.ravel()
    y_test = to.valid.ys.values.ravel()

    # We wrap our model to make it look like a scikitlearn model
    # for visualization using yellowbrick
    wrapped_model = SklearnWrapper(model)

    # we add a target_type_ attribute to our model so yellowbrick knows how to make the visualizations
    classes = list(model.dls.vocab)
    if len(classes) == 2:
        wrapped_model.target_type_ = 'binary'
    elif len(classes) > 2:  
        wrapped_model.target_type_ = 'multiclass'
    else:
        print('Must be more than one class to perform classification')
        raise ValueError('Wrong number of classes')
    
    # Now that the classifier has been created and trained, we pass out our training values
    # so that yellowbrick can use them to create various visualizations
    viz_data: tuple = (name, wrapped_model, classes, X_train, y_train, X_test, y_test)

    return viz_data


def visualize_confusion_matrix(viz_data: tuple) -> None:
    '''
        Draws a confusion matrix for the wrapped model, scored on the test split

        viz_data is the 7-tuple produced by run_experiment:
        viz_data: tuple = (name, model, classes, X_train, y_train, X_test, y_test)
    '''

    # name the pieces of the tuple that this plot needs
    name, wrapped_model, classes = viz_data[0], viz_data[1], viz_data[2]
    X_test, y_test = viz_data[5], viz_data[6]

    matrix = yb.classifier.ConfusionMatrix(wrapped_model, classes=classes, title=name)
    matrix.score(X_test, y_test)
    matrix.show()


def visualize_roc(viz_data: tuple) -> None:
    '''
        Takes a 7-tuple from the run_experiment function and creates a 
        Receiver Operating Characteristic (ROC) Curve scored on the test split

        viz_data: tuple = (name, model, classes, X_train, y_train, X_test, y_test)
    '''

    visualizer = yb.classifier.ROCAUC(viz_data[1], classes=viz_data[2], title=viz_data[0])
    visualizer.score(viz_data[5], viz_data[6])
    # show() replaces the deprecated poof() alias and matches the other visualize_* helpers
    visualizer.show()


def visualize_pr_curve(viz_data: tuple) -> None:
    '''
        Takes a 7-tuple from the run_experiment function and creates a 
        Precision-Recall Curve scored on the test split

        viz_data: tuple = (name, model, classes, X_train, y_train, X_test, y_test)
    '''

    visualizer = yb.classifier.PrecisionRecallCurve(viz_data[1], title=viz_data[0])
    visualizer.score(viz_data[5], viz_data[6])
    # show() replaces the deprecated poof() alias and matches the other visualize_* helpers
    visualizer.show()


def visualize_report(viz_data: tuple) -> None:
    '''
        Takes a 7-tuple from the run_experiment function and creates a report
        detailing the Precision, Recall, f1, and Support scores for all 
        classification outcomes, scored on the test split

        viz_data: tuple = (name, model, classes, X_train, y_train, X_test, y_test)
    '''

    visualizer = yb.classifier.ClassificationReport(viz_data[1], classes=viz_data[2], title=viz_data[0], support=True)
    visualizer.score(viz_data[5], viz_data[6])
    # show() replaces the deprecated poof() alias and matches the other visualize_* helpers
    visualizer.show()


def visualize_class_balance(viz_data: tuple) -> None:
    '''
        Takes a 7-tuple from the run_experiment function and creates a histogram
        detailing the balance between classification outcomes in the
        training and test targets

        viz_data: tuple = (name, model, classes, X_train, y_train, X_test, y_test)
    '''

    # labels must be the class names (one per target value), not the experiment
    # name; the name is used as the plot title like in the other visualize_* helpers
    visualizer = yb.target.ClassBalance(labels=viz_data[2], title=viz_data[0])
    visualizer.fit(viz_data[4], viz_data[6])
    visualizer.show()

In [11]:
import openai

In [12]:
# read the key from the environment so that no credential is stored in the
# notebook; fails with a KeyError when OPENAI_API_KEY is not set
openai.api_key = os.environ['OPENAI_API_KEY']

In [274]:
# one entry per scored GPT-3 trial: run_gpt3_experiment appends 1 for a correct
# prediction and 0 for an incorrect one
stats: list = list()

In [318]:
def run_gpt3_experiment(df, convert, call_api=False, random_state=None):
    '''
        Builds a few-shot classification prompt from a random draw of df and,
        optionally, asks the OpenAI Classification endpoint to label one row

        151 rows are sampled without replacement: one becomes the query and the
        remaining 150 become the labelled examples, so the query row never
        appears (with its label) among its own examples

        df          : dataframe with a 'Label' column and at least 151 rows
        convert     : callable turning one row (dict of column -> value, label
                      removed) into the text sent to the model, e.g. str
        call_api    : when False (default) nothing is sent to OpenAI and only the
                      prompt pieces are returned; when True the query is also
                      classified, the outcome is printed, and 1 (correct) or
                      0 (incorrect) is appended to the global stats list
        random_state: forwarded to DataFrame.sample for a reproducible draw

        returns the 2-tuple (examples, query) where examples is a list of
        [text, label] pairs and query is the converted query row
    '''
    # df.sample returns the drawn rows in random order, so the first row of the
    # draw is itself a uniformly random pick; splitting by position also stays
    # correct when the dataframe index contains duplicates
    drawn = df.sample(n=151, replace=False, random_state=random_state)
    drawn = drawn.rename(columns={'Label': 'label'})
    query_row, example_rows = drawn.iloc[:1], drawn.iloc[1:]

    true_label = query_row['label'].iloc[0]
    example_labels = example_rows['label'].to_list()

    # the label column is removed before the rows are converted to text
    query = convert(query_row.drop(columns='label').to_dict('records')[0])
    example_records = example_rows.drop(columns='label').to_dict('records')
    examples = [
        [convert(record), label]
        for record, label in zip(example_records, example_labels)
    ]

    if call_api:
        # NOTE(review): confirm the Classification endpoint is still offered by
        # the installed openai client before relying on this branch
        response = openai.Classification.create(
            # model='ada',
            model='davinci',
            examples=examples,
            query=query,
            labels=list(df.Label.unique()),
        )
        predicted = response.label.lower()
        expected = true_label.lower()
        print(f'Result: {predicted}, True: {expected}')
        if predicted == expected:
            print('correct')
            stats.append(1)
        else:
            print('incorrect')
            stats.append(0)

    return (examples, query)



In [319]:
def spaced_by_line(train):
    '''
        Formats one row as text with a ' key: value' entry per line

        train: mapping of column name -> value for a single row
        returns the concatenated text, or '' for an empty mapping
    '''
    # every entry is rendered as ' key: value' followed by a newline and a space
    return ''.join(f' {key}: {train[key]}\n ' for key in train.keys())

In [321]:
# build the labelled examples and the query text from one random draw of the
# baseline dataset, using str() to turn each row into text
vals = run_gpt3_experiment(baseline_df, str)


In [322]:
# the labelled examples: a list of [row text, label] pairs
vals[0]

[["{'Protocol': 6, 'Flow Duration': 1, 'Total Fwd Packets': 2, 'Total Backward Packets': 0, 'Total Length of Fwd Packets': 37.0, 'Total Length of Bwd Packets': 0.0, 'Fwd Packet Length Max': 31.0, 'Fwd Packet Length Min': 6.0, 'Fwd Packet Length Mean': 18.5, 'Fwd Packet Length Std': 17.677669529663692, 'Bwd Packet Length Max': 0.0, 'Bwd Packet Length Min': 0.0, 'Bwd Packet Length Mean': 0.0, 'Bwd Packet Length Std': 0.0, 'Flow Bytes/s': 37000000.0, 'Flow Packets/s': 2000000.0, 'Flow IAT Mean': 1.0, 'Flow IAT Std': 0.0, 'Flow IAT Max': 1.0, 'Flow IAT Min': 1.0, 'Fwd IAT Total': 1.0, 'Fwd IAT Mean': 1.0, 'Fwd IAT Std': 0.0, 'Fwd IAT Max': 1.0, 'Fwd IAT Min': 1.0, 'Bwd IAT Total': 0.0, 'Bwd IAT Mean': 0.0, 'Bwd IAT Std': 0.0, 'Bwd IAT Max': 0.0, 'Bwd IAT Min': 0.0, 'Bwd PSH Flags': 0, 'Fwd Header Length': 40, 'Bwd Header Length': 0, 'Fwd Packets/s': 2000000.0, 'Bwd Packets/s': 0.0, 'Min Packet Length': 6.0, 'Max Packet Length': 31.0, 'Packet Length Mean': 22.666666666666664, 'Packet Length

In [323]:
# the converted text of the query row
vals[1]

"{'Protocol': 17, 'Flow Duration': 20785, 'Total Fwd Packets': 2, 'Total Backward Packets': 2, 'Total Length of Fwd Packets': 70.0, 'Total Length of Bwd Packets': 174.0, 'Fwd Packet Length Max': 35.0, 'Fwd Packet Length Min': 35.0, 'Fwd Packet Length Mean': 35.0, 'Fwd Packet Length Std': 0.0, 'Bwd Packet Length Max': 87.0, 'Bwd Packet Length Min': 87.0, 'Bwd Packet Length Mean': 87.0, 'Bwd Packet Length Std': 0.0, 'Flow Bytes/s': 11739.235025258598, 'Flow Packets/s': 192.44647582391147, 'Flow IAT Mean': 6928.333333333334, 'Flow IAT Std': 11995.89522850768, 'Flow IAT Max': 20780.0, 'Flow IAT Min': 2.0, 'Fwd IAT Total': 3.0, 'Fwd IAT Mean': 3.0, 'Fwd IAT Std': 0.0, 'Fwd IAT Max': 3.0, 'Fwd IAT Min': 3.0, 'Bwd IAT Total': 2.0, 'Bwd IAT Mean': 2.0, 'Bwd IAT Std': 0.0, 'Bwd IAT Max': 2.0, 'Bwd IAT Min': 2.0, 'Bwd PSH Flags': 0, 'Fwd Header Length': 64, 'Bwd Header Length': 40, 'Fwd Packets/s': 96.22323791195574, 'Bwd Packets/s': 96.22323791195574, 'Min Packet Length': 35.0, 'Max Packet Leng

In [312]:
# NOTE(review): the recorded output of this cell is stale - as defined in In [318],
# run_gpt3_experiment returns (examples, query) without reaching the API call
run_gpt3_experiment(baseline_df, str)


Result: ddos, True: benign
incorrect


In [313]:
# NOTE(review): the recorded output of this cell is stale - as defined in In [318],
# run_gpt3_experiment returns (examples, query) without reaching the API call
run_gpt3_experiment(baseline_df, str)


Result: ddos, True: ddos
correct


In [314]:
# NOTE(review): the recorded output of this cell is stale - as defined in In [318],
# run_gpt3_experiment returns (examples, query) without reaching the API call
run_gpt3_experiment(baseline_df, str)


Result: ddos, True: benign
incorrect


In [315]:
# NOTE(review): the recorded output of this cell is stale - as defined in In [318],
# run_gpt3_experiment returns (examples, query) without reaching the API call
run_gpt3_experiment(baseline_df, str)


Result: ddos, True: ddos
correct


In [317]:
# NOTE(review): the recorded error of this cell is stale - as defined in In [318],
# run_gpt3_experiment returns (examples, query) without reaching the API call
run_gpt3_experiment(baseline_df, str)


RateLimitError: you exceeded your current quota, please check your plan and billing details

In [None]:
# NOTE(review): the recorded output of this cell is stale - as defined in In [318],
# run_gpt3_experiment returns (examples, query) without reaching the API call
run_gpt3_experiment(baseline_df, str)


Result: ddos, True: benign
incorrect


In [None]:
# NOTE(review): the recorded output of this cell is stale - as defined in In [318],
# run_gpt3_experiment returns (examples, query) without reaching the API call
run_gpt3_experiment(baseline_df, str)


Result: ddos, True: benign
incorrect


In [None]:
# NOTE(review): the recorded output of this cell is stale - as defined in In [318],
# run_gpt3_experiment returns (examples, query) without reaching the API call
run_gpt3_experiment(baseline_df, str)


Result: ddos, True: benign
incorrect


In [None]:
# NOTE(review): the recorded output of this cell is stale - as defined in In [318],
# run_gpt3_experiment returns (examples, query) without reaching the API call
run_gpt3_experiment(baseline_df, str)


Result: ddos, True: ddos
correct


In [None]:
# NOTE(review): the recorded output of this cell is stale - as defined in In [318],
# run_gpt3_experiment returns (examples, query) without reaching the API call
run_gpt3_experiment(baseline_df, str)


Result: ddos, True: ddos
correct
