In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import scipy.stats as stats
import matplotlib.patches as mpatches

# Plot styling: use seaborn's "Set2" palette for every figure drawn afterwards.
sns.set_palette("Set2")

# Printing: remove pandas' row/column display limits so frames print in full.
for _display_option in ("display.max_rows", "display.max_columns"):
    pd.set_option(_display_option, None)

In [2]:
# Lookup table from IP protocol number to a short protocol name.
# Extend the list below to recognise further protocols.
PROTOCOL_MAPPING = dict([
    (1, 'ICMP'),   # Internet Control Message Protocol
    (6, 'TCP'),    # Transmission Control Protocol
    (17, 'UDP'),   # User Datagram Protocol
    (50, 'ESP'),   # Encapsulating Security Payload
    (51, 'AH'),    # Authentication Header
    (8, 'EGP'),    # Exterior Gateway Protocol
])

# Directory that holds the "dapt2020" flow CSV files (default for csv_to_df).
CSV_DIR = "/Users/admin/PycharmProjects/pythonProject3/hakeriot/mine"

In [3]:
def get_protocol_name(protocol_number):
    """Translate a protocol number into its short name.

    Args:
        protocol_number: Key to look up in PROTOCOL_MAPPING.

    Returns:
        str: The mapped name, or 'Other' when the number is not in the table.
    """
    try:
        return PROTOCOL_MAPPING[protocol_number]
    except KeyError:
        return 'Other'

In [4]:
def categorize_ports(port):
    """
    Map a port number onto one of three port-range labels.

    - 'Well-Known Ports':      anything below 1024
    - 'Registered Ports':      1024 through 49151 (inclusive)
    - 'Dynamic/Private Ports': everything else (this is also the result for
      values that compare false against both ranges, such as NaN)

    Args:
        port: Numeric port value.

    Returns:
        str: One of the three labels above.
    """
    if port < 1024:
        category = 'Well-Known Ports'
    elif not (1024 <= port <= 49151):
        category = 'Dynamic/Private Ports'
    else:
        category = 'Registered Ports'
    return category

In [5]:
def add_ports_protocols_col_toDF(df):
    """Add categorical port and protocol columns to ``df`` in place.

    Two columns are written:
    - 'SrcPort_categorical':  categorize_ports() applied to 'Src Port'
    - 'Protocol_categorical': get_protocol_name() applied to 'Protocol'

    Missing source values (None or NaN) yield None in the new column instead
    of being pushed through the categorizers.

    Args:
        df (pandas.DataFrame): Frame with 'Src Port' and 'Protocol' columns.

    Returns:
        None: ``df`` is modified in place.
    """
    # pandas stores missing numbers as NaN, which an `is not None` test never
    # catches; pd.notna covers None and NaN alike.
    df['SrcPort_categorical'] = df['Src Port'].apply(
        lambda port: categorize_ports(port) if pd.notna(port) else None)

    df['Protocol_categorical'] = df['Protocol'].apply(
        lambda protocol: get_protocol_name(protocol) if pd.notna(protocol) else None)

In [6]:
def csv_to_df(csv_file, dir=CSV_DIR):
    """Read ``<dir>/<csv_file>`` into a DataFrame and tag it with its file name.

    Args:
        csv_file: File name of the CSV, relative to ``dir``.
        dir: Directory containing the file (defaults to CSV_DIR).

    Returns:
        pandas.DataFrame: The loaded data. The file name is stored on the
        returned object as the ad-hoc attribute ``_filename``.
    """
    csv_path = "{}/{}".format(dir, csv_file)
    frame = pd.read_csv(csv_path)
    # Remember which capture this frame came from.
    frame._filename = csv_file
    return frame


# Load the ten flow captures from CSV_DIR, two per weekday, in the same order
# as before (first file of each pair is read first).

## Monday
df_pvt_monday, df_monday = map(csv_to_df, (
    "enp0s3-monday-pvt.pcap_Flow.csv",
    "enp0s3-monday.pcap_Flow.csv",
))

## Tuesday
df_pvt_tuesday, df_public_tuesday = map(csv_to_df, (
    "enp0s3-pvt-tuesday.pcap_Flow.csv",
    "enp0s3-public-tuesday.pcap_Flow.csv",
))

## Wednesday
df_pvt_wednesday, df_public_wednesday = map(csv_to_df, (
    "enp0s3-pvt-wednesday.pcap_Flow.csv",
    "enp0s3-public-wednesday.pcap_Flow.csv",
))

## Thursday
df_pvt_thursday, df_public_thursday = map(csv_to_df, (
    "enp0s3-pvt-thursday.pcap_Flow.csv",
    "enp0s3-public-thursday.pcap_Flow.csv",
))

## Friday
df_tcpdump_pvt_friday, df_tcpdump_friday = map(csv_to_df, (
    "enp0s3-tcpdump-pvt-friday.pcap_Flow.csv",
    "enp0s3-tcpdump-friday.pcap_Flow.csv",
))


In [7]:
def describe_df(df):
    """
    Print an overview of a DataFrame that carries per-group source details.

    The report contains, in order:
    - number of dimensions, total element count and shape;
    - ``describe()`` output for the numeric and for the object columns;
    - for each of the columns 'SrcIP_uniq', 'SrcPort_uniq' and 'Protocol_uniq':
      the most frequent value with its count (or 'No data' when ``describe()``
      provides no 'top'/'freq' entry for that column) and the first rows.

    If every cell of ``df`` is null, a single notice is printed instead of the
    report. Any exception raised while building the report is caught and
    printed as "Error processing DataFrame: ..."; it is not re-raised.

    Args:
        df (pandas.DataFrame): The DataFrame to describe. It must contain the
            three ``*_uniq`` columns listed above.

    Returns:
        None: Everything is printed to stdout.
    """
    try:
        # Nothing useful to report for a frame without a single value.
        if df.isnull().all().all():
            print("The entire DataFrame is null.")
            return

        # Basic DataFrame information
        print("DataFrame Description:")
        print(f"Dimensions: {df.ndim}")
        print(f"Total elements: {df.size}")
        print(f"Shape (rows, columns): {df.shape}")

        # Descriptive statistics
        print("\nDescriptive Statistics for Numeric Columns:")
        print(df.describe(include="number"))
        print("\nDescriptive Statistics for Categorical Columns:")
        print(df.describe(include="object"))

        # Source IPs
        UniqueSrcIP = pd.DataFrame({'Src IP': df['SrcIP_uniq']})
        print("Unique Src Details:")
        description_src = UniqueSrcIP['Src IP'].describe(include='all')
        print(f"Uniq Src-IP's Describe:\n{description_src}")
        # 'top'/'freq' only exist for non-numeric data; fall back like the
        # port and protocol sections do instead of aborting the report.
        top_value = description_src.get('top', 'No data')
        freq_value = description_src.get('freq', 'No data')
        print(f"Uniq Src-IP's Top Value:\n{top_value} (Count: {freq_value})\n")
        print(f"Uniq Src-IP's Full List:\n{UniqueSrcIP.head()}\n")

        # Source ports
        UniqueSrcPorts = pd.DataFrame({'Src Port': df['SrcPort_uniq']})
        description_ports = UniqueSrcPorts['Src Port'].describe(include='all')
        top_value = description_ports.get('top', 'No data')
        freq_value = description_ports.get('freq', 'No data')
        print(f"Uniq Src-Ports Top Value:\n{top_value} (Count: {freq_value})\n")
        print(f"Uniq Src-Ports Full List:\n{UniqueSrcPorts.head()}\n")

        # Protocols
        UniqueProtocol = pd.DataFrame({'Protocol': df['Protocol_uniq']})
        description_protocol = UniqueProtocol['Protocol'].describe(include='all')
        top_value = description_protocol.get('top', 'No data')
        freq_value = description_protocol.get('freq', 'No data')
        print(f"Uniq Protocol Top Value:\n{top_value} (Count: {freq_value})\n")
        print(f"Uniq Protocol Full List:\n{UniqueProtocol.head()}\n")

    except Exception as e:
        # Best-effort report: show what went wrong and keep the notebook running.
        print("Error processing DataFrame:", str(e))
    else:
        # Separator printed only after a complete report.
        print("____________________________________________________________________________\n")

In [8]:
def detect_zscore_outliers_iqr(df, column):
    """Z-score ``df[column]`` in place and return fixed outlier thresholds.

    The z-scores are written to ``df['Normalized_<column>']``. The returned
    bounds are NOT derived from the data's quartiles: they are computed from
    the constants 0.25, 0.75 and a spread of 0.5, which gives -0.5 and 1.5 for
    every input.
    NOTE(review): the name says "IQR" but the thresholds are constants;
    confirm this is the intended detector.

    Args:
        df (pandas.DataFrame): Frame holding ``column``; gains the normalized column.
        column (str): Name of the numeric column to normalize.

    Returns:
        dict: 'dict_name', 'column', 'lower_bound', 'upper_bound', 'iqr' and
        'std' (standard deviation of the normalized column).
    """
    normalized_name = f'Normalized_{column}'
    df[normalized_name] = stats.zscore(df[column])

    spread = 0.5
    return {
        "dict_name": "outlier_Normalized_dict",
        "column": column,
        "lower_bound": 0.25 - 1.5 * spread,
        "upper_bound": 0.75 + 1.5 * spread,
        "iqr": spread,
        "std": df[normalized_name].std(),
    }

In [9]:
def detect_outliers_iqr(df, column):
    """Compute IQR-based outlier bounds for ``df[column]`` (raw values).

    The bounds are Q1 - 1.5 * IQR and Q3 + 1.5 * IQR, where Q1 and Q3 are the
    25% and 75% quantiles of the column. ``df`` is not modified.

    Args:
        df (pandas.DataFrame): Frame holding ``column``.
        column (str): Name of the numeric column to analyse.

    Returns:
        dict: 'dict_name', 'column', 'lower_bound', 'upper_bound', 'iqr' and
        'std' (standard deviation of the column).
    """
    values = df[column]
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    iqr = third_quartile - first_quartile

    return {
        "dict_name": "outlier_dict",
        "column": column,
        "lower_bound": first_quartile - 1.5 * iqr,
        "upper_bound": third_quartile + 1.5 * iqr,
        "iqr": iqr,
        "std": values.std(),
    }

In [10]:
def add_ifOutlier_columns(df, column, CSV_filename=None):
    """Flag outliers of ``column`` in ``df`` and report whether any exist.

    Two label columns are added to ``df`` unless they are already present:
    - 'Outlier_Normalized_<column>_U/L': based on the z-scored column and the
      bounds returned by detect_zscore_outliers_iqr;
    - 'Outlier_<column>_U/L': based on the raw column and the bounds returned
      by detect_outliers_iqr. A 'lower_bound' label is only assigned when the
      lower bound itself is non-negative.

    Each cell holds 'lower_bound', 'upper_bound' or None. If any row in either
    column carries a label, one notice is printed.

    Args:
        df (pandas.DataFrame): Grouped data; modified in place.
        column (str): Metric to analyse.
        CSV_filename: Name used in the printed notice only.

    Returns:
        None
    """
    normalized_flag = f'Outlier_Normalized_{column}_U/L'
    raw_flag = f'Outlier_{column}_U/L'

    if normalized_flag not in df:
        bounds = detect_zscore_outliers_iqr(df, column)
        zscores = df[f'Normalized_{column}']
        df[normalized_flag] = np.where(
            zscores < bounds['lower_bound'], 'lower_bound',
            np.where(zscores > bounds['upper_bound'], 'upper_bound', None))

    if raw_flag not in df:
        bounds = detect_outliers_iqr(df, column)
        values = df[column]
        # Element-wise AND; a negative (or NaN) lower bound disables the
        # 'lower_bound' label for the whole column.
        below_lower = (values < bounds['lower_bound']) & (bounds['lower_bound'] >= 0)
        df[raw_flag] = np.where(
            below_lower, 'lower_bound',
            np.where(values > bounds['upper_bound'], 'upper_bound', None))

    # Inspect every row, not just the first one, and report once.
    has_outliers = df[[normalized_flag, raw_flag]].isin(['lower_bound', 'upper_bound']).any().any()
    if has_outliers:
        print(f"There are outlier values for column {column} of grouped DF of {CSV_filename}.")

In [11]:
def group_data(df):
    """Aggregate flow records into 5-minute windows per destination IP.

    Side effects on ``df`` (modified in place): the categorical port/protocol
    columns are added, 'Timestamp' is parsed into a 'Datetime' index, and a
    'group_DosDetect_id' column identifying each (Dst IP, 5-minute window)
    group is added.

    The returned frame has one row per group and combines:
    - summary statistics (request count, unique source count, mean flow
      packets/bytes per second, SYN/ACK/Fwd/Bwd sums and their ratios);
    - outlier label columns added by add_ifOutlier_columns for each metric;
    - source details per group: the list of source records, the unique source
      IPs / ports / protocols, and counts per port category and protocol.

    Args:
        df (pandas.DataFrame): Raw flow data with at least the columns
            'Timestamp', 'Dst IP', 'Src IP', 'Src Port', 'Protocol',
            'Flow Packets/s', 'Flow Bytes/s', 'SYN Flag Count',
            'ACK Flag Count', 'Total Bwd packets' and 'Total Fwd Packet'.

    Returns:
        pandas.DataFrame: Merged summary and source details, ordered by
        'HourTime' and then 'Dst IP'.
    """
    add_ports_protocols_col_toDF(df)

    # Parse the full datetime and use it as the index so pd.Grouper can bucket by time.
    df['Datetime'] = pd.to_datetime(df['Timestamp'], format='%d/%m/%Y %I:%M:%S %p')
    df.set_index('Datetime', inplace=True)

    # 'group_DosDetect_id' is the key that later joins the summary statistics
    # with the per-group source details (Src IP, Src Port, Protocol).
    df['group_DosDetect_id'] = df.groupby(['Dst IP', pd.Grouper(freq='5min')]).ngroup()

    # Aggregate statistics per Dst IP and time interval.
    grouped_summary = df.groupby(['group_DosDetect_id', 'Dst IP', pd.Grouper(freq='5min')]).agg(
        CountRequests=('Src IP', 'count'),
        CountSrc_uniq=('Src IP', 'nunique'),  # Count unique source IPs
        Flow_Packets_s_avg=('Flow Packets/s', 'mean'),
        Flow_Bytes_s_avg=('Flow Bytes/s', 'mean'),
        SYN_count_sum=('SYN Flag Count', 'sum'),
        ACK_count_sum=('ACK Flag Count', 'sum'),
        Bwd_sum=('Total Bwd packets', 'sum'),
        Fwd_sum=('Total Fwd Packet', 'sum'),
    ).reset_index()

    # Ratios are computed after aggregation; the +1 keeps the denominator non-zero.
    grouped_summary['SYN_ACK_Ratio'] = grouped_summary['SYN_count_sum'] / (grouped_summary['ACK_count_sum'] + 1)
    grouped_summary['ACK_SYN_Ratio'] = grouped_summary['ACK_count_sum'] / (grouped_summary['SYN_count_sum'] + 1)
    grouped_summary['Bwd_Fwd_Ratio'] = grouped_summary['Bwd_sum'] / (grouped_summary['Fwd_sum'] + 1)

    # 'HourTime' is an 'HH:MM' string: the window start floored to 30 minutes.
    grouped_summary['HourTime'] = grouped_summary['Datetime'].dt.floor('30min').dt.strftime('%H:%M')
    # One multi-key sort; two consecutive single-key sorts would let the
    # second one discard the ordering of the first.
    grouped_summary = grouped_summary.sort_values(by=['HourTime', 'Dst IP'])

    # Frames not produced by csv_to_df carry no '_filename' attribute.
    source_name = getattr(df, '_filename', None)

    # Detect outliers for every summary metric.
    for col in ['CountRequests', 'Flow_Packets_s_avg', 'Flow_Bytes_s_avg', 'SYN_count_sum', 'ACK_count_sum',
                'SYN_ACK_Ratio', 'ACK_SYN_Ratio', 'Bwd_sum', 'Fwd_sum', 'Bwd_Fwd_Ratio']:
        add_ifOutlier_columns(grouped_summary, col, source_name)

    # Per-group source details:
    # - 'Src Details': one dict per flow with Src IP, Src Port, Protocol and their categories.
    # - 'SrcIP_uniq' / 'SrcPort_uniq' / 'Protocol_uniq': unique raw values within the group.
    # - 'SrcPort_categorical' / 'Protocol_categorical': unique category labels within the group.
    # - '*_Port_Count' / 'Protocol_*_Count': number of flows per port category / protocol.
    grouped_src_details = df.groupby('group_DosDetect_id').apply(
        lambda group: pd.Series({
            'Src Details': group[
                ['Src IP', 'Src Port', 'SrcPort_categorical', 'Protocol', 'Protocol_categorical']].to_dict('records'),
            'SrcIP_uniq': group['Src IP'].unique().tolist(),
            'SrcPort_uniq': group['Src Port'].unique().tolist(),
            'SrcPort_categorical': group['SrcPort_categorical'].unique().tolist(),
            'Protocol_uniq': group['Protocol'].unique().tolist(),
            'Protocol_categorical': group['Protocol_categorical'].unique().tolist(),

            # count of port types
            'Well_Known_Port_Count': (group['SrcPort_categorical'] == 'Well-Known Ports').sum(),
            'Registered_Port_Count': (group['SrcPort_categorical'] == 'Registered Ports').sum(),
            'Dynamic_Private_Port_Count': (group['SrcPort_categorical'] == 'Dynamic/Private Ports').sum(),

            # count of protocol types
            'Protocol_TCP_Count': (group['Protocol_categorical'] == 'TCP').sum(),
            'Protocol_UDP_Count': (group['Protocol_categorical'] == 'UDP').sum(),
            'Protocol_ICMP_Count': (group['Protocol_categorical'] == 'ICMP').sum(),
            'Other_Protocol_Count': (group['Protocol_categorical'] == 'Other').sum(),
        })
    ).reset_index()

    # Each merged row provides both the high-level statistics and the source-level details.
    merged_data = grouped_summary.merge(grouped_src_details, on='group_DosDetect_id', how='inner')
    merged_data = merged_data.sort_values(by=['HourTime', 'Dst IP'])

    return merged_data

In [12]:
def graph_for_outlierCol(df, outlier_dict, normData=None, outlierNormal_dict=None, attack=None):
    """
    Plot one metric over time with its outlier thresholds, plus an optional comparison figure.

    Figure 1 is always drawn: ``df[column]`` against 'HourTime', with the
    column median and the lower/upper bounds from ``outlier_dict``.

    Figure 2 is displayed only when there is something to compare against:
    - ``normData`` is a non-empty frame containing the column: the analysed
      data is overlaid on the normal data together with the normal median and,
      when ``outlierNormal_dict`` is given, the normal bounds;
    - ``attack`` is a non-empty frame containing the column: the attack rows
      are overlaid (on the analysed data and its thresholds when no normal
      data was supplied).
    When neither applies, the second figure is closed without being shown.

    Args:
        df (pandas.DataFrame): Grouped data with 'HourTime', 'Datetime', 'Dst IP'
            and the metric column. Its 'HourTime' column may be rewritten in place.
        outlier_dict (dict): Needs 'column', 'lower_bound' and 'upper_bound'.
        normData (pandas.DataFrame, optional): Normal-traffic data to compare with.
        outlierNormal_dict (dict, optional): Bounds computed on ``normData``.
        attack (pandas.DataFrame, optional): Detected attack rows; needs
            'attack_type' when it contains the metric column.

    Returns:
        None. Nothing is drawn when the metric column is missing from ``df``.
    """
    column = outlier_dict['column']
    if column not in df.columns:
        return None

    # 'HourTime' is an 'HH:MM' label (30-minute buckets) used as the x axis.
    if not pd.api.types.is_datetime64_any_dtype(df['HourTime']):
        df['HourTime'] = pd.to_datetime(df['Datetime']).dt.floor('30min').dt.strftime('%H:%M')
    df = df.sort_values(by=['HourTime', 'Dst IP'])

    col_mid = df[column].median()
    lower_bound = outlier_dict['lower_bound']
    upper_bound = outlier_dict['upper_bound']

    # ---- Figure 1: analysed data with its own thresholds ----
    plt.figure(figsize=(18, 10))
    sns.lineplot(data=df, x='HourTime', y=column, marker='o', color='y', label='Analysed Data')
    plt.axhline(y=col_mid, color='blue', linestyle='dashed', linewidth=2, label='Median')
    plt.axhline(y=lower_bound, color='y', linestyle='dashed', linewidth=2, label='Lower Bound')
    plt.axhline(y=upper_bound, color='y', linestyle='dashed', linewidth=2, label='Upper Bound')
    plt.title(f"{column} Over Time with Horizontal Thresholds", fontsize=14,
              fontweight='bold')
    plt.xlabel("Time (Hour:Minute)", fontsize=12)
    plt.ylabel(f"{column} Values", fontsize=12)
    plt.xticks(rotation=45)
    plt.grid(color='gray', linestyle='--', linewidth=0.5, alpha=0.7)
    plt.legend()  # include the threshold lines in the legend
    plt.tight_layout()
    plt.show()

    # ---- Figure 2: comparison against normal data and/or attack rows ----
    has_normal = normData is not None and not normData.empty
    has_attack = attack is not None and not attack.empty
    show_comparison = False
    plt.figure(figsize=(12, 6))

    if has_normal:
        if column in normData.columns:
            show_comparison = True
            # Check the normal frame's own 'HourTime' before rebuilding it.
            if ('HourTime' not in normData.columns
                    or not pd.api.types.is_datetime64_any_dtype(normData['HourTime'])):
                normData['HourTime'] = pd.to_datetime(normData['Datetime']).dt.floor('30min').dt.strftime('%H:%M')
            normData = normData.sort_values(by=['HourTime', 'Dst IP'])

            sns.lineplot(data=df, x='HourTime', y=column, marker='o', color='y', label='Compared Data')
            sns.lineplot(data=normData, x='HourTime', y=column, marker='o', color='c', label='Normal Data')

            n_col_mid = normData[column].median()
            plt.axhline(y=n_col_mid, color='r', linestyle='dashed', linewidth=2, label='Normal Median')
            # The normal bounds are optional; skip the lines when none were supplied.
            if outlierNormal_dict is not None:
                plt.axhline(y=outlierNormal_dict['lower_bound'], color='g', linestyle='dashed', linewidth=2,
                            label='Lower Normal Bound')
                plt.axhline(y=outlierNormal_dict['upper_bound'], color='g', linestyle='dashed', linewidth=2,
                            label='Upper Normal Bound')
            plt.title(f"{column} Over Time with Horizontal Thresholds:\nNormal Data vs Compared Data", fontsize=14,
                      fontweight='bold')
    else:
        # No normal data: draw the analysed data as the base layer for a possible attack overlay.
        sns.lineplot(data=df, x='HourTime', y=column, marker='o', color="orange", label='Analysed Data')
        plt.axhline(y=col_mid, color='blue', linestyle='dashed', linewidth=2, label='Median')
        plt.axhline(y=lower_bound, color='y', linestyle='dashed', linewidth=2, label='Lower Bound')
        plt.axhline(y=upper_bound, color='y', linestyle='dashed', linewidth=2, label='Upper Bound')
        plt.title(f"{column} Over Time with Horizontal Thresholds", fontsize=14,
                  fontweight='bold')

    if has_attack and column in attack.columns:
        show_comparison = True
        if not pd.api.types.is_datetime64_any_dtype(attack['HourTime']):
            attack['HourTime'] = attack['Datetime'].dt.floor('30min').dt.strftime('%H:%M')
        attack = attack.sort_values(by=['HourTime', 'Dst IP'])
        # Label the line with the distinct attack types rather than the repr
        # of the whole 'attack_type' column.
        attack_label = "attack_" + ", ".join(str(kind) for kind in attack['attack_type'].unique())
        sns.lineplot(data=attack, x='HourTime', y=column, marker='o', color="orange", label=attack_label)

    if show_comparison:
        # There is normal data and/or an attack to show.
        plt.xlabel("Time (Hour:Minute)", fontsize=12)
        plt.xticks(rotation=45)
        plt.ylabel(f"{column} Values", fontsize=12)
        plt.grid(color='gray', linestyle='--', linewidth=0.5, alpha=0.7)
        plt.legend()
        plt.tight_layout()
        plt.show()
    else:
        # Nothing to compare: discard the figure instead of leaving it open.
        plt.close()

    # Horizontal lines mark the statistical thresholds (median and bounds) on
    # the y axis, which helps to spot anomalies in the metric over time.
    return None

In [13]:
def graphs_for_outliers(df, normal_data=None, attack=None):
    """Print a bounds summary and draw graphs for every metric of ``df`` that has outliers.

    A column is treated as an analysed metric when ``df`` has an
    'Outlier_Normalized_<col>_U/L' and/or 'Outlier_<col>_U/L' label column for
    it. For each such metric:

    - if no row is labelled 'lower_bound'/'upper_bound', a "No Outlier values"
      notice is printed;
    - otherwise, when normal data containing the metric is supplied, the
      labelled rows must also fall outside the bounds computed on the normal
      data (raw values against the raw IQR bounds, z-scores against the
      z-score bounds); if none does, the same notice is printed;
    - otherwise the z-score and IQR bound dictionaries are printed side by
      side and graph_for_outlierCol is called once with each of them.

    Args:
        df (pandas.DataFrame): Grouped data with outlier label columns.
            detect_zscore_outliers_iqr writes 'Normalized_<col>' into it.
        normal_data (pandas.DataFrame, optional): Normal-traffic grouped data;
            None or an empty frame disables the baseline comparison.
        attack (pandas.DataFrame, optional): Passed through to graph_for_outlierCol.

    Returns:
        None
    """
    outlier_labels = ['lower_bound', 'upper_bound']
    has_normal = normal_data is not None and not normal_data.empty
    # graph_for_outlierCol receives a frame even when no normal data was given.
    baseline = normal_data if normal_data is not None else pd.DataFrame()

    # Snapshot the column list: the helpers below add columns to df.
    for col in list(df.columns):
        flag_cols = [name for name in (f'Outlier_Normalized_{col}_U/L', f'Outlier_{col}_U/L')
                     if name in df.columns]
        if not flag_cols:
            continue  # not an analysed metric

        flagged = df[flag_cols].isin(outlier_labels).any(axis=1)
        if not flagged.any():
            print(f"No Outlier values for {col} in this DF")
            continue

        zscore_dict = detect_zscore_outliers_iqr(df, col)
        outlier_dict = detect_outliers_iqr(df, col)

        # Bounds of the normal data are recomputed per metric so that values
        # from a previous column are never reused.
        zscore_N_dict = None
        outlier_N_dict = None
        if has_normal and col in normal_data.columns:
            zscore_N_dict = detect_zscore_outliers_iqr(normal_data, col)
            outlier_N_dict = detect_outliers_iqr(normal_data, col)

            raw_values = df.loc[flagged, col]
            z_values = df.loc[flagged, f'Normalized_{col}']
            beyond_normal = (
                (raw_values > outlier_N_dict['upper_bound'])
                | (raw_values < outlier_N_dict['lower_bound'])
                | (z_values > zscore_N_dict['upper_bound'])
                | (z_values < zscore_N_dict['lower_bound'])
            )
            if not beyond_normal.any():
                print(f"No Outlier values for {col} in this DF")
                continue

        # Show both bound dictionaries side by side.
        df1 = pd.DataFrame.from_dict(zscore_dict, orient='index', columns=['zscore_values'])
        df2 = pd.DataFrame.from_dict(outlier_dict, orient='index', columns=['outlier_values'])
        combined_df_horizontal = pd.concat([df1, df2], axis=1)
        print(combined_df_horizontal)

        graph_for_outlierCol(df, zscore_dict, baseline, zscore_N_dict, attack)
        graph_for_outlierCol(df, outlier_dict, baseline, outlier_N_dict, attack)

In [14]:
# Signatures are evaluated in order; the first one that matches decides the label.
# 'flags' are the outlier labels the row must carry, 'ratio_column' is the
# ratio reported for the attack, and 'baseline_columns' are the columns that
# are additionally compared against normal traffic when it is available.
_DOS_SIGNATURES = (
    {
        # SYN Flood: abuse of the TCP three-way handshake. The attacker sends
        # many SYN packets without completing the handshake with ACKs, so
        # connections stay half-open and server resources are exhausted.
        # Indicators: high SYN count, high SYN/ACK ratio, high packet rate.
        "attack_type": 'SYN_Flood',
        "ratio_column": 'SYN_ACK_Ratio',
        "flags": {
            'Outlier_Normalized_SYN_count_sum_U/L': 'upper_bound',
            'Outlier_SYN_ACK_Ratio_U/L': 'upper_bound',
            'Outlier_Flow_Packets_s_avg_U/L': 'upper_bound',
        },
        "baseline_columns": ('Normalized_SYN_count_sum', 'Normalized_SYN_ACK_Ratio', 'Flow_Packets_s_avg'),
    },
    {
        # Reflection (amplification) attack: the attacker spoofs the victim's
        # address so that third-party servers (e.g. DNS, NTP, SSDP) send large
        # responses to the victim. Backward traffic is significantly higher
        # because of the volume of responses.
        "attack_type": 'Reflection_Attack',
        "ratio_column": 'Bwd_Fwd_Ratio',
        "flags": {
            'Outlier_Normalized_CountRequests_U/L': 'upper_bound',
            'Outlier_Normalized_Bwd_Fwd_Ratio_U/L': 'upper_bound',
            'Outlier_Normalized_Bwd_sum_U/L': 'upper_bound',
            'Outlier_Flow_Packets_s_avg_U/L': 'upper_bound',
            'Outlier_Flow_Bytes_s_avg_U/L': 'upper_bound',
        },
        "baseline_columns": ('Normalized_CountRequests', 'Normalized_Bwd_Fwd_Ratio', 'Bwd_sum',
                             'Flow_Packets_s_avg', 'Flow_Bytes_s_avg'),
    },
    {
        # ACK Flood: a high volume of ACK packets, often with little or no
        # payload, that the target still has to process one by one.
        # Indicators: high ACK count, high ACK/SYN ratio, high packet rate and
        # an unusually low SYN count.
        "attack_type": 'ACK_Flood',
        "ratio_column": 'ACK_SYN_Ratio',
        "flags": {
            'Outlier_Normalized_ACK_count_sum_U/L': 'upper_bound',
            'Outlier_ACK_SYN_Ratio_U/L': 'upper_bound',
            'Outlier_Flow_Packets_s_avg_U/L': 'upper_bound',
            'Outlier_SYN_count_sum_U/L': 'lower_bound',
        },
        "baseline_columns": ('Normalized_ACK_count_sum', 'Normalized_ACK_SYN_Ratio', 'Flow_Packets_s_avg'),
    },
)


def _exceeds_normal_baseline(row, normal_data, columns):
    """Return True if any of ``columns`` in ``row`` lies above the upper bound of the normal data."""
    for column in columns:
        # 'Normalized_*' columns are checked against the z-score bounds, raw
        # columns against the IQR bounds. Note that detect_zscore_outliers_iqr
        # writes a 'Normalized_<column>' column into normal_data.
        if "Normalized" in column:
            bounds = detect_zscore_outliers_iqr(normal_data, column)
        else:
            bounds = detect_outliers_iqr(normal_data, column)
        if row[column] > bounds['upper_bound']:
            return True  # any single exceeding column confirms the anomaly
    return False


def classify_attack(row, df_filename, dict_dos_attacks, key, normal_data=None):
    """Classify one grouped row as a DoS attack and record it in ``dict_dos_attacks``.

    The row's outlier labels are matched against the SYN flood, reflection
    and ACK flood signatures (in that order). When normal traffic data is
    supplied (non-empty), a matching signature additionally requires at least
    one of its baseline columns to exceed the normal data's upper bound;
    otherwise the next signature is tried.

    Nothing is recorded for rows that match no signature.

    Args:
        row (pandas.Series): One row of the merged grouped data.
        df_filename: Name of the source capture, stored as 'Source_dfName'.
        dict_dos_attacks (dict): Receives the attack record under ``key``.
        key: Dictionary key and prefix of the record's 'Attack_id'.
        normal_data (pandas.DataFrame, optional): Normal-traffic grouped data;
            None or an empty frame disables the baseline check.

    Returns:
        None: The result is written into ``dict_dos_attacks``.
    """
    has_baseline = normal_data is not None and not normal_data.empty

    attack_type = 'Normal'
    column = None
    Ratio = None

    for signature in _DOS_SIGNATURES:
        flags_match = all(row.get(flag) == expected for flag, expected in signature["flags"].items())
        if not flags_match:
            continue
        if has_baseline and not _exceeds_normal_baseline(row, normal_data, signature["baseline_columns"]):
            continue
        attack_type = signature["attack_type"]
        column = signature["ratio_column"]
        Ratio = row[column]
        break

    if attack_type == 'Normal':
        return

    dict_dos_attacks[key] = {
        "Attack_id": f"{key}-{row['group_DosDetect_id']}",
        "group_DosDetect_id": row['group_DosDetect_id'],
        "Source_dfName": df_filename,
        "attack_type": attack_type,
        # The ratio that characterises the detected attack type.
        column: Ratio,
        "Dst IP": row['Dst IP'],
        "CountRequests": row['CountRequests'],
        "CountSrc_uniq": row['CountSrc_uniq'],  # Count unique source IPs
        "Datetime": row['Datetime'],
        "HourTime": row['HourTime'],
        # Flag and packet totals of the group
        "SYN_count_sum": row['SYN_count_sum'],
        "ACK_count_sum": row['ACK_count_sum'],
        "Fwd_sum": row['Fwd_sum'],
        "Bwd_sum": row['Bwd_sum'],
        # Avg Flow Packets/ Bytes
        "Flow_Packets_s_avg": row['Flow_Packets_s_avg'],
        "Flow_Bytes_s_avg": row['Flow_Bytes_s_avg'],
        # Source details
        "Src Details": row['Src Details'],
        "SrcIP_uniq": row['SrcIP_uniq'],
        "SrcPort_uniq": row['SrcPort_uniq'],
        "SrcPort_categorical": row['SrcPort_categorical'],
        "Protocol_uniq": row['Protocol_uniq'],
        "Protocol_categorical": row['Protocol_categorical'],
        # Port category counts
        'Well_Known_Port_Count': row['Well_Known_Port_Count'],
        'Registered_Port_Count': row['Registered_Port_Count'],
        'Dynamic_Private_Port_Count': row['Dynamic_Private_Port_Count'],
        # Protocol counts
        'Protocol_TCP_Count': row['Protocol_TCP_Count'],
        'Protocol_UDP_Count': row['Protocol_UDP_Count'],
        'Protocol_ICMP_Count': row['Protocol_ICMP_Count'],
        'Other_Protocol_Count': row['Other_Protocol_Count'],
    }

In [15]:
def Reflection_Attack(df_of_attack, df):
    """Plot traffic-volume time series for the detected reflection attacks.

    Parameters
    ----------
    df_of_attack : pandas.DataFrame
        Detected attacks. Only rows whose 'attack_type' equals
        'Reflection_Attack' are plotted. The columns 'Datetime', 'HourTime',
        'Dst IP', 'Flow_Packets_s_avg', 'Flow_Bytes_s_avg', 'Bwd_sum' and
        'Fwd_sum' are read. The frame itself is not modified.
    df : pandas.DataFrame
        Frame drawn as the "All comunication" comparison lines; its
        'HourTime', 'Flow_Packets_s_avg', 'Flow_Bytes_s_avg', 'Bwd_sum' and
        'Fwd_sum' columns are read.

    Returns
    -------
    None
        Six figures are shown with ``plt.show()``; nothing is returned.
    """
    # Work on a private copy so the HourTime assignment below never writes
    # into a slice of the caller's frame (SettingWithCopyWarning).
    reflection_attacks = df_of_attack[df_of_attack['attack_type'] == 'Reflection_Attack'].copy()
    if reflection_attacks.empty:
        # Nothing to draw; also avoids calling `.dt` on an empty object column.
        print("No Reflection_Attack records to plot.")
        return

    if not pd.api.types.is_datetime64_any_dtype(reflection_attacks['HourTime']):
        reflection_attacks['HourTime'] = reflection_attacks['Datetime'].dt.floor('30min').dt.strftime('%H:%M')
    # Time first, destination as tie-breaker, so the order is deterministic.
    reflection_attacks = reflection_attacks.sort_values(by=['HourTime', 'Dst IP'])

    def _line_figure(lines, title, ylabel):
        """Draw one 12x6 figure with a line per (frame, y column, label) triple."""
        plt.figure(figsize=(12, 6))
        for frame, y_col, label in lines:
            label_kw = {} if label is None else {'label': label}
            sns.lineplot(data=frame, x='HourTime', y=y_col, marker='o', **label_kw)
        plt.title(title)
        plt.xlabel("Time (Hour:Minute)")
        plt.ylabel(ylabel)
        plt.xticks(rotation=45)
        plt.show()

    # Flow packets per time bin (attack rows only)
    _line_figure([(reflection_attacks, 'Flow_Packets_s_avg', None)],
                 "Flow Packets Per Hour During Reflection Attacks",
                 "Flow Packets/s (Average)")

    # Flow bytes per time bin (attack rows only)
    _line_figure([(reflection_attacks, 'Flow_Bytes_s_avg', None)],
                 "Flow Bytes Per Hour During Reflection Attacks",
                 "Flow Byte/s (Average)")

    # Packets and bytes together (attack rows only)
    _line_figure([(reflection_attacks, 'Flow_Packets_s_avg', 'Flow Packets'),
                  (reflection_attacks, 'Flow_Bytes_s_avg', 'Flow Bytes')],
                 "Flow Packets and Bytes Over Time During Reflection Attacks",
                 "Flow Packets/s and Byte/s (Average)")

    # Backward vs forward packets (attack rows only)
    _line_figure([(reflection_attacks, 'Bwd_sum', 'Bwd Packet'),
                  (reflection_attacks, 'Fwd_sum', 'Fwd Packet')],
                 "Bwd and Fwd Flow Packets Over Time During Reflection Attacks",
                 "Flow Packets/s and Byte/s (Average)")

    # Attack rows against all communication. The "Flow Packets - All
    # comunication" line previously plotted Flow_Bytes_s_avg by mistake.
    _line_figure([(reflection_attacks, 'Flow_Packets_s_avg', 'Flow Packets - Reflection Attack'),
                  (reflection_attacks, 'Flow_Bytes_s_avg', 'Flow Bytes - Reflection Attack'),
                  (df, 'Flow_Packets_s_avg', 'Flow Packets - All comunication'),
                  (df, 'Flow_Bytes_s_avg', 'Flow Bytes - All comunication')],
                 "Flow Packets and Bytes Over Time During Reflection Attacks",
                 "Flow Packets/s and Byte/s (Average)")

    # Backward vs forward packets for all communication
    _line_figure([(df, 'Bwd_sum', 'Bwd Packets - All comunication'),
                  (df, 'Fwd_sum', 'Fwd Packet - All comunication')],
                 "Bwd and Fwd Flow Packets Over Time During Reflection Attacks",
                 "Flow Packets/s and Byte/s (Average)")

In [16]:
def SYN_Flood(df_of_attack, df):
    """Plot traffic and TCP-flag figures for the detected SYN flood attacks.

    Parameters
    ----------
    df_of_attack : pandas.DataFrame
        Detected attacks. Rows whose 'attack_type' equals 'SYN_Flood' are
        overlaid (in orange) on the first two figures; 'Dst IP', 'HourTime',
        'Flow_Packets_s_avg' and 'Flow_Bytes_s_avg' are read from them.
    df : pandas.DataFrame
        Frame used for every other figure. The columns 'Datetime',
        'HourTime', 'Flow_Packets_s_avg', 'Flow_Bytes_s_avg',
        'CountRequests', 'SYN_count_sum', 'ACK_count_sum',
        'Normalized_SYN_count_sum' and 'Normalized_ACK_count_sum' are read.
        Side effect: when 'HourTime' is not a datetime column it is
        overwritten in place with '%H:%M' labels of 'Datetime' floored to
        30 minutes.

    Returns
    -------
    None
        Figures are shown with ``plt.show()``.
    """
    # Local name differs from the function name to avoid shadowing it.
    syn_flood_rows = df_of_attack[df_of_attack['attack_type'] == 'SYN_Flood']
    if not pd.api.types.is_datetime64_any_dtype(df['HourTime']):
        df['HourTime'] = df['Datetime'].dt.floor('30min').dt.strftime('%H:%M')
    # Time first, destination as tie-breaker, so the order is deterministic.
    syn_flood_rows = syn_flood_rows.sort_values(by=['HourTime', 'Dst IP'])

    def _finish_time_figure(title, ylabel):
        """Apply the shared title / axis decoration and show the current figure."""
        plt.title(title)
        plt.xlabel("Time (Hour:Minute)")
        plt.ylabel(ylabel)
        plt.xticks(rotation=45)
        plt.show()

    # Flow packets: all traffic with the SYN flood rows overlaid
    plt.figure(figsize=(18, 10))
    sns.lineplot(data=df, x='HourTime', y='Flow_Packets_s_avg', marker='o')
    sns.lineplot(data=syn_flood_rows, x='HourTime', y='Flow_Packets_s_avg', marker='o', color='orange')
    _finish_time_figure("Flow Packets Per Hour During SYN Flood Attacks", "Flow Packet/s (Average)")

    # Flow bytes: all traffic with the SYN flood rows overlaid
    plt.figure(figsize=(18, 10))
    sns.lineplot(data=df, x='HourTime', y='Flow_Bytes_s_avg', marker='o')
    sns.lineplot(data=syn_flood_rows, x='HourTime', y='Flow_Bytes_s_avg', marker='o', color='orange')
    _finish_time_figure("Flow Bytes Per Hour During SYN Flood Attacks", "Flow Byte/s (Average)")

    # Flow packets vs flow bytes for all traffic
    plt.figure(figsize=(18, 10))
    sns.lineplot(data=df, x='HourTime', y='Flow_Packets_s_avg', marker='o', label='Flow Packets - All comunication')
    sns.lineplot(data=df, x='HourTime', y='Flow_Bytes_s_avg', marker='o', label='Flow Bytes - All comunication')
    _finish_time_figure("Flow Packets vs Flow Bytes Over Time During SYN Flood Attacks",
                        "Flow Packets/s and Byte/s (Average)")

    # Request count per time bin
    plt.figure(figsize=(18, 10))
    sns.barplot(data=df, x='HourTime', y='CountRequests', errorbar=None)
    _finish_time_figure("Count of Requests Over Time During SYN Flood Attacks", "Count of Requests")

    # SYN flag count per time bin
    plt.figure(figsize=(18, 10))
    sns.barplot(data=df, x='HourTime', y='SYN_count_sum', errorbar=None)
    _finish_time_figure("SYN Count Over Time", "Count of SYN Flags During SYN Flood Attacks")

    # Normalized SYN vs ACK flag sums
    plt.figure(figsize=(18, 10))
    sns.lineplot(data=df, x='HourTime', y='Normalized_SYN_count_sum', marker='o', label='SYN flag sum - All comunication')
    sns.lineplot(data=df, x='HourTime', y='Normalized_ACK_count_sum', marker='o', label='ACK flag sum - All comunication')
    _finish_time_figure("Normalized SYN Flag Sum vs ACK Flag Sum Over Time During SYN Flood Attack",
                        "SYN and ACK flags")

    # Raw SYN vs ACK flag sums
    plt.figure(figsize=(18, 10))
    sns.lineplot(data=df, x='HourTime', y='SYN_count_sum', marker='o', label='SYN flag sum - All comunication')
    sns.lineplot(data=df, x='HourTime', y='ACK_count_sum', marker='o', label='ACK flag sum - All comunication')
    _finish_time_figure("SYN Flag Sum vs ACK Flag Sum Over Time During SYN Flood Attack", "SYN and ACK flags")

    # SYN / ACK distributions per time bin. These two figures used to read the
    # notebook-level variable `dos_attacks_detected_group`, which does not
    # exist yet when this function runs inside perform_attacks; the `df`
    # argument is used instead.
    plt.figure(figsize=(12, 6))
    color_syn = (0.5, 0, 0.5, 0.8)  # purple, 80% opacity
    color_ack = (1, 0.5, 0, 0.5)  # orange, 50% opacity
    sns.boxplot(data=df, x='HourTime', y='SYN_count_sum', color=color_syn, linecolor=color_syn,
                linewidth=3)
    sns.boxplot(data=df, x='HourTime', y='ACK_count_sum', color=color_ack, linecolor=color_ack,
                linewidth=3)
    plt.title("SYN vs ACK Flags Distribution During SYN Flood Attacks")
    plt.ylabel("SYN / ACK Frequency")
    plt.xticks(rotation=45)
    plt.grid(color='gray', linestyle='--', linewidth=0.5, alpha=0.7)
    plt.tight_layout()
    plt.show()

    # SYN vs ACK scatter
    plt.scatter(data=df, x='SYN_count_sum', y='ACK_count_sum', color="purple", alpha=0.3)
    plt.xlabel("SYN Frequency")
    plt.ylabel("ACK Frequency")
    plt.title("SYN and ACK Flags correlation During SYN Flood Attacks")
    plt.xticks(rotation=45)
    plt.grid(color='gray', linestyle='--', linewidth=0.5, alpha=0.7)
    plt.tight_layout()
    plt.show()



In [17]:
def ACK_Flood(df_of_attack, df):
    """Plot traffic and TCP-flag figures for the detected ACK flood attacks.

    Parameters
    ----------
    df_of_attack : pandas.DataFrame
        Detected attacks. Rows whose 'attack_type' equals 'ACK_Flood' are
        plotted; 'Dst IP', 'HourTime', 'Flow_Packets_s_avg',
        'Flow_Bytes_s_avg', 'CountRequests', 'ACK_count_sum',
        'SYN_count_sum' and 'ACK_SYN_Ratio' are read from them.
    df : pandas.DataFrame
        Frame used for the closing box plot and scatter plot
        ('HourTime', 'SYN_count_sum', 'ACK_count_sum').
        Side effect: when 'HourTime' is not a datetime column it is
        overwritten in place with '%H:%M' labels of 'Datetime' floored to
        30 minutes.

    Returns
    -------
    None
        Figures are shown with ``plt.show()``. When no 'ACK_Flood' rows exist
        a message is printed and nothing is drawn.
    """
    # Local name differs from the function name to avoid shadowing it.
    ack_flood_rows = df_of_attack[df_of_attack['attack_type'] == 'ACK_Flood']
    if not pd.api.types.is_datetime64_any_dtype(df['HourTime']):
        df['HourTime'] = df['Datetime'].dt.floor('30min').dt.strftime('%H:%M')
    if ack_flood_rows.empty:
        # The heatmap and distribution plots below cannot be built from an empty frame.
        print("No ACK_Flood records to plot.")
        return
    # Time first, destination as tie-breaker, so the order is deterministic.
    ack_flood_rows = ack_flood_rows.sort_values(by=['HourTime', 'Dst IP'])

    def _finish_time_figure(title, ylabel):
        """Apply the shared title / axis decoration and show the current figure."""
        plt.title(title)
        plt.xlabel("Time (Hour:Minute)")
        plt.ylabel(ylabel)
        plt.xticks(rotation=45)
        plt.show()

    # Flow packets per time bin
    plt.figure(figsize=(12, 6))
    sns.lineplot(data=ack_flood_rows, x='HourTime', y='Flow_Packets_s_avg', marker='o')
    _finish_time_figure("Flow Packets Per Hour During ACK Flood Attacks", "Flow Packets/s (Average)")

    # Flow bytes per time bin
    plt.figure(figsize=(12, 6))
    sns.lineplot(data=ack_flood_rows, x='HourTime', y='Flow_Bytes_s_avg', marker='o')
    _finish_time_figure("Flow Bytes Per Hour During ACK Flood Attacks", "Flow Byte/s (Average)")

    # Heatmap of summed request counts. The pivot has one row per HourTime and a
    # single 'CountRequests' column, so time runs along the y axis.
    heatmap_data = ack_flood_rows.pivot_table(index='HourTime', values='CountRequests', aggfunc='sum', fill_value=0)
    plt.figure(figsize=(12, 6))
    # ".0f" renders integers exactly like "d" but also accepts float sums.
    sns.heatmap(heatmap_data, cmap="YlGnBu", annot=True, fmt=".0f", linewidths=0.5)
    plt.title("Heatmap of Request Counts During ACK Flood Attacks")
    plt.xlabel("Request Count")
    plt.ylabel("Time (Hour:Minute)")
    plt.show()

    # Request count per time bin
    plt.figure(figsize=(12, 6))
    sns.barplot(data=ack_flood_rows, x='HourTime', y='CountRequests', errorbar=None)
    _finish_time_figure("ACK Flood Attack Count Over Time", "Count of Requests")

    # ACK flag count per time bin
    plt.figure(figsize=(12, 6))
    sns.barplot(data=ack_flood_rows, x='HourTime', y='ACK_count_sum', errorbar=None)
    _finish_time_figure("ACK Flood ACK Count Over Time", "Count of ACK Flags")

    # Histogram of the ACK/SYN ratio with its quartiles marked
    ratio = ack_flood_rows['ACK_SYN_Ratio']
    plt.figure(figsize=(8, 6))
    sns.histplot(ratio, bins=20, kde=True, color='blue')
    plt.axvline(ratio.quantile(0.75), color='red', linestyle='--', label="Upper Threshold")
    plt.axvline(ratio.quantile(0.25), color='green', linestyle='--', label="Lower Threshold")
    plt.title("Histogram of ACK_SYN_Ratio During ACK Flood Attacks")
    plt.xlabel("ACK_SYN_Ratio")
    plt.ylabel("Frequency")
    plt.legend()
    plt.show()

    # Distribution of ACK flag counts
    plt.figure(figsize=(8, 6))
    sns.violinplot(data=ack_flood_rows, y='ACK_count_sum')
    plt.title("ACK Flag Packet Count Distribution During ACK Flood Attacks")
    plt.ylabel("ACK Flag Packet Count")
    plt.show()

    # Distribution of SYN flag counts
    plt.figure(figsize=(8, 6))
    sns.violinplot(data=ack_flood_rows, y='SYN_count_sum')
    plt.title("SYN Packet Count Distribution During ACK Flood Attacks")
    plt.ylabel("SYN Flag Packet Count")
    plt.show()

    # SYN / ACK distributions per time bin. These two figures used to read the
    # notebook-level variable `dos_attacks_detected_group`, which does not
    # exist yet when this function runs inside perform_attacks; the `df`
    # argument is used instead.
    plt.figure(figsize=(10, 6))
    color_syn = (0.5, 0, 0.5, 0.5)  # purple, 50% opacity
    color_ack = (1, 1, 0, 0.5)  # yellow, 50% opacity
    sns.boxplot(data=df, x='HourTime', y='SYN_count_sum', color=color_syn, linecolor="orange",
                linewidth=3)
    sns.boxplot(data=df, x='HourTime', y='ACK_count_sum', color=color_ack, linecolor="purple",
                linewidth=3)
    plt.title("SYN vs ACK Flags Distribution During ACK Flood Attacks")
    plt.ylabel("SYN / ACK Frequency")
    plt.show()

    # SYN vs ACK scatter
    plt.scatter(data=df, x='SYN_count_sum', y='ACK_count_sum', color="purple", alpha=0.3)
    plt.xlabel("SYN Frequency")
    plt.ylabel("ACK Frequency")
    plt.title("SYN and ACK Flags correlation During ACK Flood Attacks")
    plt.show()

In [18]:
def detect_DoS_attacks(df, normal_data=None):
    """Group the traffic and classify every group as a potential DoS attack.

    Parameters
    ----------
    df : pandas.DataFrame
        Traffic to analyse. It is passed to ``group_data`` and its
        ``_filename`` attribute is forwarded to ``classify_attack``
        (presumably set when the CSV is loaded; verify against the loader).
    normal_data : pandas.DataFrame, optional
        Baseline traffic. When given and non-empty it is grouped the same way
        and handed to ``classify_attack`` for comparison.

    Returns
    -------
    tuple
        ``(dos_attacks, groupData, groupNormData)``: the dict filled in by
        ``classify_attack``, the grouped traffic, and the grouped baseline
        (``None`` when no usable baseline was supplied).
    """
    groupData = group_data(df)
    groupNormData = None
    # The default is None, so test for it before touching `.empty`.
    if normal_data is not None and not normal_data.empty:
        groupNormData = group_data(normal_data)

    # classify_attack records its findings in `dos_attacks`.
    dos_attacks = {}
    for index, row in groupData.iterrows():
        classify_attack(row, df._filename, dos_attacks, index, groupNormData)

    return dos_attacks, groupData, groupNormData

In [19]:
def plot_line_data(data, data2=None, x=None, y=None, hue=None, title="", xlabel="", ylabel="", rotation=45,
                   figsize=(12, 6), color='skyblue'):
    """Plot a Series as a value-count bar chart or an x/y DataFrame as a line.

    Parameters
    ----------
    data : pandas.Series or pandas.DataFrame
        A Series is drawn as a bar chart of ``value_counts()``. A DataFrame
        must contain columns named 'x' and 'y', which are drawn as a line.
    data2 : pandas.DataFrame, optional
        Second frame drawn with ``x``, ``y`` and ``hue`` underneath the
        primary line. Ignored unless it is a DataFrame.
    x, y, hue : str, optional
        Column names used for ``data2`` only.
    title, xlabel, ylabel : str
        Figure decoration.
    rotation : int
        Rotation of the x tick labels in degrees.
    figsize : tuple
        Figure size passed to ``plt.figure``.
    color : str
        Colour of the bars / primary line.

    Returns
    -------
    None
        The figure is shown; for unsupported input an error message is
        printed and nothing is drawn.
    """
    is_series = isinstance(data, pd.Series)
    is_xy_frame = isinstance(data, pd.DataFrame) and 'x' in data.columns and 'y' in data.columns
    if not (is_series or is_xy_frame):
        print("Error: Data format not supported or incorrect parameters.")
        return

    # Previously the success flag was never set, so supported input was
    # rejected too and `figsize` / `rotation` were ignored.
    plt.figure(figsize=figsize)
    if is_series:
        data.value_counts().plot(kind='bar', color=color)
    else:
        if isinstance(data2, pd.DataFrame):
            sns.lineplot(data=data2, x=x, y=y, hue=hue, marker='o')
        # The primary line is drawn whether or not a second frame is supplied.
        sns.lineplot(data=data, x='x', y='y', color=color, marker='o')

    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.xticks(rotation=rotation)
    plt.show()

In [20]:
def plot_bar_data(data, title, xlabel, ylabel, color='skyblue'):
    """Show `data` as a bar chart on a new 10x6 figure.

    `data` is plotted through its pandas `.plot` accessor; `title`, `xlabel`
    and `ylabel` decorate the figure and `color` sets the bar colour. The
    x tick labels are rotated by 45 degrees. Nothing is returned.
    """
    plt.figure(figsize=(10, 6))
    data.plot.bar(color=color)
    for set_text, text in ((plt.title, title), (plt.xlabel, xlabel), (plt.ylabel, ylabel)):
        set_text(text)
    plt.xticks(rotation=45)
    plt.show()

In [21]:
def perform_ports_protocols_graphs_fromGroupData(df, DosAttacks=None):
    """Draw bar charts of the port- and protocol-category totals.

    Parameters
    ----------
    df : pandas.DataFrame
        Grouped traffic. Must contain 'SrcPort_categorical',
        'Protocol_categorical' and the seven ``*_Count`` columns summed below.
    DosAttacks : pandas.DataFrame, optional
        Detected attacks. When it carries the same columns, a second pair of
        charts is drawn for it; otherwise it is silently skipped.

    Returns
    -------
    None
        Charts are shown with ``plt.show()``. When `df` lacks a required
        column a message is printed and nothing is drawn.
    """
    port_cols = ['Well_Known_Port_Count', 'Registered_Port_Count', 'Dynamic_Private_Port_Count']
    protocol_cols = ['Protocol_TCP_Count', 'Protocol_UDP_Count', 'Protocol_ICMP_Count', 'Other_Protocol_Count']
    # The guard now also covers the count columns that are actually summed,
    # so a partially populated frame is reported instead of raising KeyError.
    required_cols = ['SrcPort_categorical', 'Protocol_categorical'] + port_cols + protocol_cols

    def _has_required(frame):
        """Return True when every column used below exists in `frame`."""
        return all(col in frame.columns for col in required_cols)

    def _bar_figure(counts, colors, title, xlabel):
        """Show one 10x6 bar chart of the summed counts."""
        plt.figure(figsize=(10, 6))
        counts.plot(kind='bar', color=colors)
        plt.title(title)
        plt.xlabel(xlabel)
        plt.ylabel("Count")
        plt.xticks(rotation=45)
        plt.show()

    def _category_charts(frame, port_title, protocol_title):
        """Draw the port chart and the protocol chart for one frame."""
        _bar_figure(frame[port_cols].sum(), ['blue', 'orange', 'green'], port_title, "Port Category")
        _bar_figure(frame[protocol_cols].sum(), ['red', 'purple', 'cyan', 'pink'], protocol_title,
                    "Protocol Category")

    if not _has_required(df):
        print("Required columns are missing from DataFrame.")
        return

    _category_charts(df, "Distribution of Port Categories", "Distribution of Protocol Categories")

    if DosAttacks is not None and _has_required(DosAttacks):
        _category_charts(DosAttacks,
                         "Port Category Distribution in Detected Attacks",
                         "Protocol Category Distribution in Detected Attacks")

In [22]:
def attack_det(df_of_attack, str_type = None):
    """Select, describe and return the detected attacks of one type.

    Parameters
    ----------
    df_of_attack : pandas.DataFrame
        Detected attacks with 'HourTime', 'Datetime', 'Dst IP' and
        'attack_type' columns. Side effect: when 'HourTime' is not a datetime
        column it is overwritten in place with '%H:%M' labels of 'Datetime'
        floored to 30 minutes.
    str_type : str, optional
        Attack type to keep (compared with the 'attack_type' column). When it
        is not a string, all rows are kept.

    Returns
    -------
    pandas.DataFrame
        The selected rows, sorted by 'HourTime' and then 'Dst IP'. Each row
        is also printed through ``describe_df``.
    """
    if not pd.api.types.is_datetime64_any_dtype(df_of_attack['HourTime']):
        df_of_attack['HourTime'] = df_of_attack['Datetime'].dt.floor('30min').dt.strftime('%H:%M')
    # Time first, destination as tie-breaker, so the order is deterministic.
    df_of_attack_type = df_of_attack.sort_values(by=['HourTime', 'Dst IP'])

    # `str_type is str` compared the value with the type object and was never
    # true, so the filter was silently skipped; isinstance is the real test.
    if isinstance(str_type, str):
        df_of_attack_type = df_of_attack_type[df_of_attack_type['attack_type'] == str_type]

    print(f"{str_type} Detected:")
    # Describe every selected attack individually
    for index, attack in df_of_attack_type.iterrows():
        print(f"Attack ID: {index}")
        describe_df(attack)
    return df_of_attack_type

In [23]:
def perform_attacks(df, normal_data=None):
    """Run DoS detection on `df`, print summaries and plot each attack type.

    Parameters
    ----------
    df : pandas.DataFrame
        Traffic to analyse; forwarded to ``detect_DoS_attacks``.
    normal_data : pandas.DataFrame, optional
        Baseline traffic; forwarded to ``detect_DoS_attacks``.

    Returns
    -------
    tuple
        ``(detected_attacks_dict, detected_group, groupNormData)`` exactly as
        returned by ``detect_DoS_attacks``. The tuple is returned on every
        path, including when nothing was detected, so callers can always
        unpack three values.
    """
    detected_attacks_dict, detected_group, groupNormData = detect_DoS_attacks(df, normal_data)

    # Descriptive statistics of the grouped traffic
    print("Grouped Data Description:")
    describe_df(detected_group)

    if groupNormData is not None and not groupNormData.empty:
        print("Normal Data to Compare to:")
        print("Grouped Normal Data Description:")
        describe_df(groupNormData)

    # Count how many attacks of each type were detected
    attack_types = [attack['attack_type'] for attack in (detected_attacks_dict or {}).values()]
    n_SYN_Flood = attack_types.count('SYN_Flood')
    n_Reflection_Attack = attack_types.count('Reflection_Attack')
    n_ACK_Flood = attack_types.count('ACK_Flood')
    print(f"SYN_Flood: {n_SYN_Flood}\nReflection_Attack: {n_Reflection_Attack}\nACK_Flood: {n_ACK_Flood}")
    print()

    # The old `is None` check ran after the dict had already been iterated and
    # returned a bare None; an empty result is now caught here and the usual
    # tuple is returned.
    if not detected_attacks_dict:
        print("No DoS attack has been detected.")
        return detected_attacks_dict, detected_group, groupNormData

    # One row per detected attack, for statistics and plotting
    attacks_df = pd.DataFrame.from_dict(detected_attacks_dict, orient='index')
    describe_df(attacks_df)
    print(attacks_df.head())

    # Detailed description and figures for each attack type that occurred
    if n_SYN_Flood > 0:
        SYN_Flood(attack_det(attacks_df, "SYN_Flood"), detected_group)
    if n_Reflection_Attack > 0:
        Reflection_Attack(attack_det(attacks_df, "Reflection_Attack"), detected_group)
    if n_ACK_Flood > 0:
        ACK_Flood(attack_det(attacks_df, "ACK_Flood"), detected_group)
    if n_SYN_Flood == n_Reflection_Attack == n_ACK_Flood == 0:
        print("No DoS attacks have been recognized.")

    return detected_attacks_dict, detected_group, groupNormData

In [None]:
# Run the detection pipeline on df_tcpdump_friday, passing df_pvt_monday as the
# `normal_data` baseline. The three results are kept as notebook-level variables
# that the cells below read.
dos_attacks_dict, dos_attacks_detected_group, normal_data_group = perform_attacks(df_tcpdump_friday, df_pvt_monday)

In [None]:
# Outlier graphs for the grouped traffic, the grouped baseline and the detected attacks.
# NOTE(review): `graphs_for_outliers` is not defined in the visible part of this
# notebook and a recorded output further down reports a NameError for it; confirm
# it is defined in an earlier cell before running.
graphs_for_outliers(dos_attacks_detected_group, normal_data_group,pd.DataFrame.from_dict(dos_attacks_dict, orient='index'))

In [27]:
# Port / protocol category charts for the grouped traffic and for the detected
# attacks (the attack dict is converted to one row per attack).
# Requires the perform_attacks cell to have been run first; presumably the
# NameError recorded for this cell came from running it on a fresh kernel.
perform_ports_protocols_graphs_fromGroupData(dos_attacks_detected_group,
                                             pd.DataFrame.from_dict(dos_attacks_dict, orient='index'))

NameError: name 'dos_attacks_detected_group' is not defined

In [26]:
# Extra check: print the highest SYN_count_sum and the grouped row(s) that reach it.
# Requires dos_attacks_detected_group from the perform_attacks cell.
max_syn_count = dos_attacks_detected_group['SYN_count_sum'].max()
print(f"Max SYN Count: {max_syn_count}")
print(dos_attacks_detected_group[dos_attacks_detected_group['SYN_count_sum'] == max_syn_count])

NameError: name 'dos_attacks_detected_group' is not defined

In [1]:
def graph_for_outlierCol_seperated(df, outlier_dict, normData=None, outlierNormal_dict=None, attack=None):
    """Plot one column over time with its median and outlier bounds.

    A first figure shows the analysed data; a second figure shows the
    baseline data when a usable baseline is supplied.

    Parameters
    ----------
    df : pandas.DataFrame
        Analysed data with 'HourTime', 'Datetime', 'Dst IP' and the target
        column. Side effect: when 'HourTime' is not a datetime column it is
        overwritten in place with '%H:%M' labels of 'Datetime' floored to
        30 minutes.
    outlier_dict : dict
        Must provide 'column' (name of the column to plot), 'lower_bound'
        and 'upper_bound'.
    normData : pandas.DataFrame, optional
        Baseline data with the same columns. Skipped when None, empty, or
        lacking the target column. Its 'HourTime' is overwritten in place the
        same way, using 15-minute bins.
    outlierNormal_dict : dict, optional
        'lower_bound' / 'upper_bound' for the baseline. When None only the
        baseline median line is drawn.
    attack : optional
        Unused by this function.

    Returns
    -------
    None
    """
    column = outlier_dict['column']
    if column not in df.columns:
        return None

    col_mid = df[column].median()
    lower_bound = outlier_dict['lower_bound']
    upper_bound = outlier_dict['upper_bound']

    if not pd.api.types.is_datetime64_any_dtype(df['HourTime']):
        df['HourTime'] = df['Datetime'].dt.floor('30min').dt.strftime('%H:%M')
    # Time first, destination as tie-breaker, so the order is deterministic.
    df = df.sort_values(by=['HourTime', 'Dst IP'])

    # Figure 1: analysed data with horizontal threshold lines
    plt.figure(figsize=(12, 6))
    sns.lineplot(data=df, x='HourTime', y=column, marker='o', color='y', label='Analysed Data')
    plt.axhline(y=col_mid, color='blue', linestyle='dashed', linewidth=2, label='Median')
    plt.axhline(y=lower_bound, color='y', linestyle='dashed', linewidth=2, label='Lower Bound')
    plt.axhline(y=upper_bound, color='y', linestyle='dashed', linewidth=2, label='Upper Bound')
    plt.title(f"{column} Over Time with Horizontal Thresholds", fontsize=14,
              fontweight='bold')
    plt.xlabel("Time (Hour:Minute)", fontsize=12)
    plt.ylabel(f"{column} Values", fontsize=12)
    plt.xticks(df['HourTime'].unique(), rotation=45)
    plt.grid(color='gray', linestyle='--', linewidth=0.5, alpha=0.7)
    # Refresh the legend so the threshold lines added after the lineplot appear in it.
    plt.legend()
    plt.tight_layout()
    plt.show()

    # Figure 2 is only created when there is baseline data to draw; the
    # default normData=None used to raise on `.empty`, and an unusable
    # baseline left an empty figure open.
    if normData is None or normData.empty or column not in normData.columns:
        return None

    n_col_mid = normData[column].median()
    if not pd.api.types.is_datetime64_any_dtype(normData['HourTime']):
        # '15min' replaces the deprecated '15T' alias with the same meaning.
        # NOTE(review): the analysed data uses 30-minute bins; confirm the
        # 15-minute baseline binning is intentional.
        normData['HourTime'] = normData['Datetime'].dt.floor('15min').dt.strftime('%H:%M')
    normData = normData.sort_values(by=['HourTime', 'Dst IP'])

    plt.figure(figsize=(12, 6))
    sns.lineplot(data=normData, x='HourTime', y=column, marker='o', color='c', label='Normal Data')
    plt.axhline(y=n_col_mid, color='g', linestyle='dashed', linewidth=2, label='Normal Median')
    if outlierNormal_dict is not None:
        plt.axhline(y=outlierNormal_dict['lower_bound'], color='r', linestyle='dashed', linewidth=2,
                    label='Lower Normal Bound')
        plt.axhline(y=outlierNormal_dict['upper_bound'], color='r', linestyle='dashed', linewidth=2,
                    label='Upper Normal Bound')
    plt.title(f"{column} Over Time with Horizontal Thresholds:\nNormal Data vs Compared Data", fontsize=14,
              fontweight='bold')
    plt.xlabel("Time (Hour:Minute)", fontsize=12)
    plt.ylabel(f"{column} Values", fontsize=12)
    plt.xticks(rotation=45)
    plt.grid(color='gray', linestyle='--', linewidth=0.5, alpha=0.7)
    plt.legend()
    plt.tight_layout()
    plt.show()
    return None

In [2]:
# NOTE(review): the positional arguments do not line up with
# graph_for_outlierCol_seperated(df, outlier_dict, normData, ...): normal_data_group
# is received as `outlier_dict` (which the function indexes with 'column') and the
# attack_det(...) result is received as `normData`. Confirm the intended call.
graph_for_outlierCol_seperated(dos_attacks_detected_group, normal_data_group,
                   attack_det(pd.DataFrame.from_dict(dos_attacks_dict, orient='index')))


NameError: name 'graphs_for_outliers' is not defined

In [93]:
def boxplot_for_outlierCol(df, column, normData=None):
    """Draw per-time-bin box plots of `column`, optionally against a baseline.

    Parameters
    ----------
    df : pandas.DataFrame
        Analysed data with 'HourTime', 'Datetime', 'Dst IP' and `column`.
        Side effect: when 'HourTime' is not a datetime column it is
        overwritten in place with '%H:%M' labels of 'Datetime' floored to
        30 minutes.
    column : str
        Column to plot. When `df` does not contain it nothing is drawn.
    normData : pandas.DataFrame, optional
        Baseline data. When it is non-empty and contains `column`, a second
        figure overlays it on the analysed data. Its 'HourTime' is normalised
        in place the same way.

    Returns
    -------
    None
    """
    if column not in df.columns:
        return None

    if not pd.api.types.is_datetime64_any_dtype(df['HourTime']):
        df['HourTime'] = pd.to_datetime(df['Datetime']).dt.floor('30min').dt.strftime('%H:%M')
    # Time first, destination as tie-breaker, so the order is deterministic.
    df = df.sort_values(by=['HourTime', 'Dst IP'])

    # Figure 1: analysed data only
    plt.figure(figsize=(18, 10))
    sns.boxplot(data=df, x='HourTime', y=column, color="purple", linecolor="purple",linewidth=3, label = "Analysed Data")
    plt.title(f"{column} Distribution Over Time")
    plt.xlabel("Time (Hour:Minute)")
    plt.ylabel(f"{column} Values")
    plt.xticks(rotation=45)
    plt.grid(color='gray', linestyle='--', linewidth=0.5, alpha=0.7)
    plt.show()

    # Figure 2 is only created when there is a usable baseline; the default
    # normData=None used to raise on `.empty`, and an unusable baseline left
    # an empty figure open.
    if normData is None or normData.empty or column not in normData.columns:
        return None

    # Check the baseline's own HourTime (the old check looked at df's column).
    if not pd.api.types.is_datetime64_any_dtype(normData['HourTime']):
        normData['HourTime'] = pd.to_datetime(normData['Datetime']).dt.floor('30min').dt.strftime('%H:%M')
    normData = normData.sort_values(by=['HourTime', 'Dst IP'])

    plt.figure(figsize=(12, 6))
    sns.boxplot(data=df, x='HourTime', y=column, color="purple", linecolor="purple", linewidth=3, label = "Compared Data")
    sns.boxplot(data=normData, x='HourTime', y=column, color="orange", linecolor="orange", linewidth=3, label = "Normal Data")
    plt.title(f"{column} Distribution Over Time")
    plt.xlabel("Time (Hour:Minute)")
    plt.ylabel(f"{column} Values")
    plt.xticks(rotation=45)
    plt.grid(color='gray', linestyle='--', linewidth=0.5, alpha=0.7)
    plt.show()
    return None


In [74]:
def boxGraphs_for_outliers(df, normal_data=None):
    """Report and box-plot every column of `df` that has flagged outlier rows.

    A row counts as flagged for column ``col`` when its
    'Outlier_Normalized_{col}_U/L' or 'Outlier_{col}_U/L' entry is truthy.
    For the first flagged row that is confirmed, the z-score and IQR bound
    tables of `df` are printed. Without baseline data every flagged row is
    confirmed; with baseline data the row must also fall outside the bounds
    computed from `normal_data`. Columns with at least one flagged row get a
    box plot (drawn once); columns with no confirmed outlier get a
    "No Outlier values" message.

    Parameters
    ----------
    df : pandas.DataFrame
        Grouped data including the outlier flag columns.
    normal_data : pandas.DataFrame, optional
        Baseline data. None or an empty frame means "no baseline".

    Returns
    -------
    None
    """
    has_normal = normal_data is not None and not normal_data.empty
    # boxplot_for_outlierCol is always handed a DataFrame, even without a baseline.
    plot_baseline = normal_data if normal_data is not None else pd.DataFrame()

    for col in df.columns:
        bounds_table = None  # built lazily on the first flagged row of this column
        normal_zscore = None
        normal_iqr = None
        outlier_reported = False

        for _, row in df.iterrows():
            if not (row.get(f'Outlier_Normalized_{col}_U/L') or row.get(f'Outlier_{col}_U/L')):
                continue

            if bounds_table is None:
                # These depend only on the column, so compute them once per column.
                zscore_dict = detect_zscore_outliers_iqr(df, col)
                outlier_dict = detect_outliers_iqr(df, col)
                bounds_table = pd.concat(
                    [pd.DataFrame.from_dict(zscore_dict, orient='index', columns=['zscore_values']),
                     pd.DataFrame.from_dict(outlier_dict, orient='index', columns=['outlier_values'])],
                    axis=1)
                if has_normal:
                    normal_zscore = detect_zscore_outliers_iqr(normal_data, col)
                    normal_iqr = detect_outliers_iqr(normal_data, col)

            if has_normal:
                # Same comparisons, in the same short-circuit order, as before.
                confirmed = (row[col] > normal_zscore['upper_bound']
                             or row[f"Normalized_{col}"] > normal_iqr['upper_bound']
                             or row[col] < normal_zscore['lower_bound']
                             or row[f"Normalized_{col}"] < normal_iqr['lower_bound'])
            else:
                confirmed = True

            if confirmed:
                print(bounds_table)
                outlier_reported = True
                break

        if bounds_table is not None:
            # One plot per column with flagged rows (previously it could be
            # redrawn for every flagged row).
            boxplot_for_outlierCol(df, col, plot_baseline)
        if not outlier_reported:
            # The old condition was inverted: it printed this message exactly
            # when an outlier HAD been found.
            print(f"No Outlier values for {col} in this DF")


In [None]:
# Box plots for every column with flagged outliers, comparing the grouped traffic
# with the grouped baseline. Requires the perform_attacks cell to have been run first.
boxGraphs_for_outliers(dos_attacks_detected_group, normal_data_group)
