In [1]:
import pandas as pd
import numpy as np
import requests
import json
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sqlalchemy import create_engine, text
import logging
import time
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

In [2]:
def fetch_data (url , out):
    try:        
        #get the data from the url
        url_data=requests.get(url)
        #check if there was a response and write into a json file
        if url_data.status_code== 200:
            data=url_data.json()
            with open (out , 'w') as f:
                json.dump(data,f,indent=2)
            #turn the json file into a dataframe
            df = pd.read_json(out)
            
            logging.info("Data successfully fetched.")
            return df
        else:
            logging.error(f"Failed to fetch data, code: {url_data.status_code}")
        
    except requests.exceptions.RequestException as req_err:
        logging.error(f"Requests error: {str(req_err)}")
    
    except json.JSONDecodeError as json_err:
        logging.error(f"JSON decoding error: {str(json_err)}")
    
    except Exception as e:
        logging.exception(f"Error during fetching seen data: {str(e)}")

In [3]:
def fetch_unseen_data (database_name,table):
    try:
        # Create a SQLAlchemy engine
        engine = create_engine(f"mysql+mysqlconnector://root:password@localhost:3306/{database_name}")

        # Use pandas to read the table into a DataFrame
        df = pd.read_sql_table(table, engine)

        # Dispose of the engine
        engine.dispose()
        logging.info("Data successfully fetched.")
        return df
    
    except Exception as e:
        logging.exception(f"Error during fetching unseen data: {str(e)}")

In [4]:
def transform(df ,remove_outliers=True):
    try:
        df.columns = [str(col) for col in df.columns]
        #sort the df attributes
        df=df.sort_index(axis=1)
        #drop duplicate rows
        df = df.drop_duplicates()
        #sort by timestamp
        df = df.sort_values(by='Timestamp')
        #do backward fill
        df=df.bfill()
        #drop not needed columns
        columns_to_drop = ['source_ip', 'timestamp']
        for column in columns_to_drop:
            if column in df.columns:
                df.drop(column, axis=1, inplace=True)
        #get extra attributes from timestamp and drop it
        df['Timestamp'] = pd.to_datetime(df['Timestamp'], format='%d/%m/%Y %H:%M:%S')
        df["hour"] = df["Timestamp"].dt.hour
        df["minute"] = df["Timestamp"].dt.minute
        df["day"] = df["Timestamp"].dt.day
        df["month"] = df["Timestamp"].dt.month
        df["year"] = df["Timestamp"].dt.year
        df['seconds'] = df['Timestamp'].dt.second
        df=df.drop(['Timestamp'], axis=1)
        #remove outliers using iqr
        if remove_outliers:
            for columns in df.columns:
                if columns not in ['Label','hour','minute','seconds','year','month','day']:
                    try:
                        Q1 = df[columns].quantile(0.25)
                        Q3 = df[columns].quantile(0.75)
                        IQR = Q3 - Q1
                        outliers_index = ((df[columns] < (Q1 - 1.5 * IQR)) | (df[columns] > (Q3 + 1.5 * IQR)))
                        df_iqr = df.loc[~outliers_index].reset_index(drop=True)
                    except TypeError as e:
                        print(f"column: {columns} ,error :{e}")
            df = df_iqr
            
        #encode the label
        label_encoder = LabelEncoder() 
        df['Label'] = label_encoder.fit_transform(df['Label'])
        #scale the attributes 
        for columns in df.columns:
            try:     
                if columns not in ['Label', 'Timestamp','minute','hour','seconds','year','month','day','Dst Port']:
                    scaler = StandardScaler()
                    df[columns] = scaler.fit_transform(df[[columns]])
            except TypeError as e:
                print(f"column: {columns} ,error :{e}")
        #create new pkt_ratio attribute from Tot Fwd Pkts and Tot Bwd Pkts
        df['Pkt_Ratio'] = np.divide(df['Tot Fwd Pkts'], df['Tot Bwd Pkts'])
        df['Pkt_Ratio'] = np.nan_to_num(df['Pkt_Ratio'], nan=0)
        
        logging.info("successfully transformed\n")
        return df
            
    except Exception as e:
        logging.exception(f"Error during data transformation: {str(e)}")

In [5]:
def classify(df) :
    try:
        X = df.drop(['Label'], axis=1)
        y = df['Label']
        #train the model using holdout methos
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

        model = RandomForestClassifier(n_estimators=100, random_state=123)
        model.fit(X_train, y_train)
        #predict label on the held out data
        y_pred = model.predict(X_test)
        
        logging.info("Evaluating...\n")
        #evaluating the predictions
        accuracy = accuracy_score(y_test, y_pred)
        classification_report_result = classification_report(y_test, y_pred)

        logging.info(f'known data Accuracy: {accuracy:.5f}\n')
        logging.info('\nClassification Report:\n' + classification_report_result)
        #using cross validation for training and evaluating
        cv_scores = cross_val_score(model, X, y, cv=5)

        logging.info("Cross-Validation Scores for known data: " + str(cv_scores))
        logging.info("seen data Average Accuracy: " + str(np.mean(cv_scores))+"\n")

        y_pred_cv = cross_val_predict(model, X, y, cv=5)
        #print the confusion matrix
        conf_matrix = confusion_matrix(y, y_pred_cv)
        logging.info("Confusion Matrix for seen data:\n" + str(conf_matrix))
        
        return model
    
    except Exception as e:
        logging.error(f"Error during data extraction: {str(e)}")

In [6]:
def prediction(df_unseen_transformed , model):
    try:
        
        #predict the unseen labels
        model_prediction = model.predict(df_unseen_transformed.drop(columns=['Label']))
        #check the accuracy of the model
        accuracy_score_unseen = accuracy_score(df_unseen_transformed['Label'], model_prediction)
        logging.info(f'The accuracy of the model: {accuracy_score_unseen:.5f}+"\n"')
        #create a confusion matrix
        conf_matrix = confusion_matrix(df_unseen_transformed['Label'], model_prediction)
        logging.info("Confusion Matrix for unseen data:\n" + str(conf_matrix))
        
    except Exception as e:
        logging.error(f"Error during prediction: {str(e)}")

In [7]:
def load(df, database_name, table):
    try:
        #create the sql engine
        engine = create_engine("mysql+mysqlconnector://root:password@localhost:3306")
        connection = engine.connect()
        #create the database
        create_database_query = text(f"CREATE DATABASE IF NOT EXISTS {database_name}")
        connection.execute(create_database_query)
        #create the table from the df
        engine = create_engine(f"mysql+mysqlconnector://root:password@localhost:3306/{database_name}")
        df.to_sql(table, con=engine, if_exists='replace', index=False)
        
        logging.info("loaded succsessfully\n")
        
    except Exception as e:
        logging.error(f"Error during data loading: {str(e)}")


In [8]:
intrusion_url ='http://87.236.232.200:5000/data'
output_file_name = "intrusion_dataset.json"
prediction_data= pd.read_csv("intrusion_unseen.csv")

def monitor_pipeline():
    while True:
        try:
            logging.info("Starting pipeline execution...")
            
            logging.info("Extracting seen data...")
            df = fetch_data(intrusion_url , output_file_name)
            
            logging.info("loading unseen data...")
            load(prediction_data,'data','intrusion_unseen')
            
            logging.info("Extracting unseen data...")
            df_unseen=fetch_unseen_data('data','intrusion_unseen')
            
            logging.info("Transforming seen data...")
            df_transformed = transform(df)
            
            logging.info("Transforming unseen data...")
            df_unseen_transformed = transform(df_unseen,False)
            
            logging.info("classifying seen data...")
            model = classify(df_transformed)
            
            logging.info("predicting unseen data...")
            prediction(df_unseen_transformed , model)
            
            logging.info("loading data...")
            load(df_transformed,'data','engineer')
            
        except Exception as e:
            logging.error(f"Pipeline execution failed: {str(e)}")
            
        logging.info("Waiting for the next iteration...")
        time.sleep(15)

In [9]:
monitor_pipeline()

2024-01-10 13:25:15,765 - INFO - Starting pipeline execution...
2024-01-10 13:25:15,766 - INFO - Extracting seen data...
2024-01-10 13:25:17,306 - INFO - Data successfully fetched.
2024-01-10 13:25:17,308 - INFO - loading unseen data...
2024-01-10 13:25:17,821 - INFO - loaded succsessfully

2024-01-10 13:25:17,822 - INFO - Extracting unseen data...
2024-01-10 13:25:17,914 - INFO - Data successfully fetched.
2024-01-10 13:25:17,915 - INFO - Transforming seen data...
2024-01-10 13:25:18,509 - INFO - successfully transformed

2024-01-10 13:25:18,510 - INFO - Transforming unseen data...
2024-01-10 13:25:18,833 - INFO - successfully transformed

2024-01-10 13:25:18,834 - INFO - classifying seen data...
2024-01-10 13:25:19,117 - INFO - Evaluating...

2024-01-10 13:25:19,130 - INFO - known data Accuracy: 1.00000

2024-01-10 13:25:19,131 - INFO - 
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        80
           1    