In [4]:
!pip install tqdm

Defaulting to user installation because normal site-packages is not writeable


In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from os import listdir
from os.path import isfile, join

# Define the scenario and the data path accordingly
scenario = 'deterministic'  # deterministic, random, or wild
if scenario == 'random':
    mypath = './Data/Randomized Automated Data'
elif scenario == 'deterministic':
    mypath = 'C:/Users/Lenovo/Documents/mini-project/UTMobileNet2021/Deterministic Automated Data'
elif scenario == 'wild':
    mypath = 'C:/Users/Lenovo/Documents/mini-project/UTMobileNet2021/Wild Test Data'
else:
    raise NameError('Dataset Not Supported')

# List all files in the specified directory
onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]

# Extract unique apps and actions
apps = np.unique([f.split('_')[0] for f in onlyfiles])
print(apps, len(apps), len(onlyfiles))
app_actions = np.unique(['_'.join(f.split('_')[:2]) for f in onlyfiles])
print(app_actions, len(app_actions))

# Select applications and organize files
sel_apps = apps
sel_app_files = {app: [] for app in sel_apps}

for fname in onlyfiles:
    app_name = fname.split('_')[0]
    if app_name in sel_apps:
        sel_app_files[app_name].append(fname)

# Optional: Convert the dictionary to a DataFrame for better manipulation later
# This part is optional, depending on how you plan to use sel_app_files later
sel_app_files_df = pd.DataFrame.from_dict(sel_app_files, orient='index').transpose()
print(sel_app_files_df)

['dropbox' 'facebook' 'gmail' 'google-drive' 'google-maps' 'hangout'
 'hulu' 'instagram' 'messenger' 'netflix' 'pinterest' 'reddit' 'spotify'
 'twitter' 'youtube'] 15 3438
['dropbox_download' 'dropbox_upload' 'facebook_scroll-newsfeed'
 'facebook_search-page' 'gmail_open-email' 'google-drive_download'
 'google-drive_upload' 'google-maps_browse' 'google-maps_directions'
 'google-maps_download-map' 'google-maps_explore' 'hangout_hangout'
 'hulu_scroll-home' 'hulu_watch-video' 'instagram_IgSearchBrowse'
 'messenger_send-message' 'netflix_browse-home' 'netflix_watch-video'
 'pinterest_tap-board' 'reddit_browse' 'reddit_post' 'spotify_play-music'
 'spotify_search-music' 'twitter_post-tweet' 'twitter_scroll-feed'
 'youtube_catSearch' 'youtube_play-video'] 27
                                               dropbox  \
0    dropbox_download_2019-03-16_10-50-30_4fd1c357.csv   
1    dropbox_download_2019-03-16_10-50-30_5bd0c615.csv   
2    dropbox_download_2019-03-16_11-12-14_5bd0c615.csv   
3    

In [2]:
import numpy as np
import pandas as pd

# Define flow columns
flow_columns = ['ip.src', 'srcport', 'ip.dst', 'dstport', 'protocol']

def get_protocol(row):
    if not pd.isnull(row['tcp.len']):
        return 'TCP'
    elif not pd.isnull(row['udp.length']):
        return 'UDP'
    else:
        return 'Unknown'
    
def get_src_port(row):
    if not pd.isnull(row['tcp.len']):
        return row['tcp.srcport']
    elif not pd.isnull(row['udp.length']):
        return row['udp.srcport']
    else:
        return 'Unknown'
    
def get_dst_port(row):
    if not pd.isnull(row['tcp.len']):
        return row['tcp.dstport']
    elif not pd.isnull(row['udp.length']):
        return row['udp.dstport']
    else:
        return 'Unknown'
    
columns = [
    'frame.number', 'frame.time', 'frame.len', 'frame.cap_len', 'ip.hdr_len',
    'ip.dsfield.ecn', 'ip.len', 'ip.frag_offset', 'ip.ttl', 'ip.proto',
    'ip.src', 'ip.dst', 'tcp.hdr_len', 'tcp.len', 'tcp.srcport',
    'tcp.dstport', 'tcp.flags.ns', 'tcp.flags.fin', 'tcp.window_size_value',
    'tcp.urgent_pointer', 'tcp.option_kind', 'tcp.option_len',
    'udp.srcport', 'udp.dstport', 'udp.length'
]

def compute_flow_features(df):
    flow_features = {}
    flow_features['total_num_pkts'] = len(df)
    pkt_size = df['ip.len'].astype(float)
    flow_features['total_num_bytes'] = pkt_size.sum()
    flow_features['min_pkt_size'] = pkt_size.min()
    flow_features['max_pkt_size'] = pkt_size.max()
    flow_features['mean_pkt_size'] = pkt_size.mean()
    flow_features['std_pkt_size'] = pkt_size.std()

    # df['frame.time'] = pd.to_datetime(df['frame.time'].str.replace(r' CDT$', '', regex=True))
    # df['frame.time'] = df['frame.time'].dt.tz_localize('America/Chicago')
    # iat = df['frame.time'].diff(1).dt.total_seconds().iloc[1:]

    df['frame.time'] = pd.to_datetime(df['frame.time'].str.replace(r' CDT$', '', regex=True), errors='coerce')
    df['frame.time'] = df['frame.time'].dt.tz_localize('America/Chicago', ambiguous='NaT', nonexistent='NaT')
    iat = df['frame.time'].diff().dt.total_seconds().iloc[1:]
    
    flow_features['min_iat'] = iat.min()
    flow_features['max_iat'] = iat.max()
    flow_features['mean_iat'] = iat.mean()
    flow_features['std_iat'] = iat.std()
    flow_features['dur'] = iat.sum()
    
    return flow_features

def process_df_by_flow(df):
    df['protocol'] = df.apply(get_protocol, axis=1)
    df['srcport'] = df.apply(get_src_port, axis=1)
    df['dstport'] = df.apply(get_dst_port, axis=1)
    
    df_flow = pd.DataFrame()
    ul_flows = {}
    dl_flows = {}

    for flow, flow_df in df.groupby(by=flow_columns): 
        if flow[0].split('.')[0] == '10': #if ip.src starts with 10 then
            ul_flows[flow] = compute_flow_features(flow_df)
        else:
            dl_flows[flow] = compute_flow_features(flow_df)

    for ul_flow, ul_flow_features in ul_flows.items():
        for dl_flow, dl_flow_features in dl_flows.items():
            if (ul_flow[0] == dl_flow[2] and # ip.add src of ul == ip.add dest of dl
                ul_flow[2] == dl_flow[0] and # ip.add dest of ul == ip.add src of dl
                ul_flow[1] == dl_flow[3] and # ip.port src of ul == ip.port dest of dl
                ul_flow[3] == dl_flow[1] and # ip.port dest of ul == ip.port src of dl
                ul_flow[4] == dl_flow[4]): # protocol is same for ul and dl
                
                ul_flow_features = {'ul_' + feature_name: feature for feature_name, feature in ul_flow_features.items()}
                dl_flow_features = {'dl_' + feature_name: feature for feature_name, feature in dl_flow_features.items()}
                bi_flow_features = {**ul_flow_features, **dl_flow_features}
                bi_flow_features['ip_A'] = ul_flow[0] #src
                bi_flow_features['port_A'] = ul_flow[1]
                bi_flow_features['ip_B'] = ul_flow[2] #dest
                bi_flow_features['port_B'] = ul_flow[3]
                bi_flow_features['protocol'] = ul_flow[4]

                df_flow = pd.concat([df_flow, pd.DataFrame([bi_flow_features])], ignore_index=True)

    return df_flow

def clean_up_duplicate(row):
    # row['ip.hdr_len'] = str(row['ip.hdr_len']).split(',')[-1]  # Keep the last one
    row['ip.hdr_len'] = float(str(row['ip.hdr_len']).split(',')[-1]) if str(row['ip.hdr_len']).split(',')[-1].isdigit() else np.nan
    row['ip.len'] = str(row['ip.len']).split(',')[0]  # Keep the first one
    row['ip.src'] = row['ip.src'].split(',')[0]  # Keep the first one
    row['ip.dst'] = row['ip.dst'].split(',')[0]  # Keep the first one
    return row

print("Done processing flows")

Done processing flows


In [3]:
df_all = pd.DataFrame()
for app in sel_apps:
    integrity = True
    df_app = pd.DataFrame()
    for fname in sel_app_files[app]:
        action = fname.split('_')[1]
        try:
            df = pd.read_csv(join(mypath, fname), usecols=columns, low_memory=False)
            df = df[df['ip.src'].notna()]
            df = df.apply(lambda row: clean_up_duplicate(row), axis=1)

            # Remove self loop pkts
            df = df[(df['ip.src'] != '127.0.0.1') & (df['ip.dst'] != '127.0.0.1')]

            df_flow = process_df_by_flow(df)
            df_flow['action'] = action
            df_app = pd.concat([df_app, df_flow], ignore_index=True)  # Use pd.concat here

        except Exception as e:
            integrity = False
            print(f'\n Error while processing {fname}. Error message: {str(e)} \n')

    df_app['app'] = app
    
    if integrity:
        df_all = pd.concat([df_all, df_app], ignore_index=True)  # Use pd.concat here

df_all.to_csv(f'./{scenario}_scenario_bi_flow_features.csv')
print('Finished processing {} scenario data.'.format(scenario))

  row['ip.len'] = str(row['ip.len']).split(',')[0]  # Keep the first one
  df['frame.time'] = pd.to_datetime(df['frame.time'].str.replace(r' CDT$', '', regex=True), errors='coerce')
  df['frame.time'] = pd.to_datetime(df['frame.time'].str.replace(r' CDT$', '', regex=True), errors='coerce')
  df['frame.time'] = pd.to_datetime(df['frame.time'].str.replace(r' CDT$', '', regex=True), errors='coerce')
  df['frame.time'] = pd.to_datetime(df['frame.time'].str.replace(r' CDT$', '', regex=True), errors='coerce')
  df['frame.time'] = pd.to_datetime(df['frame.time'].str.replace(r' CDT$', '', regex=True), errors='coerce')
  df['frame.time'] = pd.to_datetime(df['frame.time'].str.replace(r' CDT$', '', regex=True), errors='coerce')
  df['frame.time'] = pd.to_datetime(df['frame.time'].str.replace(r' CDT$', '', regex=True), errors='coerce')
  df['frame.time'] = pd.to_datetime(df['frame.time'].str.replace(r' CDT$', '', regex=True), errors='coerce')
  df['frame.time'] = pd.to_datetime(df['frame.time'].st

Finished processing deterministic scenario data.


In [21]:
processed_df=pd.read_csv('wild_scenario_bi_flow_features.csv')
processed_df.columns

Index(['Unnamed: 0', 'ul_total_num_pkts', 'ul_total_num_bytes',
       'ul_min_pkt_size', 'ul_max_pkt_size', 'ul_mean_pkt_size',
       'ul_std_pkt_size', 'ul_min_iat', 'ul_max_iat', 'ul_mean_iat',
       'ul_std_iat', 'ul_dur', 'dl_total_num_pkts', 'dl_total_num_bytes',
       'dl_min_pkt_size', 'dl_max_pkt_size', 'dl_mean_pkt_size',
       'dl_std_pkt_size', 'dl_min_iat', 'dl_max_iat', 'dl_mean_iat',
       'dl_std_iat', 'dl_dur', 'ip_A', 'port_A', 'ip_B', 'port_B', 'protocol',
       'action', 'app'],
      dtype='object')