#**Homework**

Complete the following tasks:

* Use a dataset of 21 Video Sessions
* Recognize the Video Server(s) IP and select video traffic (***if more than one Server is found, keep the dominant flow only***)
* Detect Video Client HTTP Requests (Uplink packets with size larger than or equal to 100 Bytes)
* Compute features to predict:
 1.   When the next UL Request is sent by the Video Client 
 2.   How large is the response of the Server to the next UL Request

**N.B.**: Below, you can find a list of useful functions for the tasks at hand (introduced during class).

In [199]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import plotly.graph_objects as go
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split 
from sklearn import metrics
from sklearn.model_selection import KFold
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from math import sqrt
#import tensorflow as tf
#from tensorflow import keras

import warnings
warnings.filterwarnings('ignore')

### Functions Ready-To-Use


In [106]:
def filter_traffic(data, domain):
    '''
    Select the packets exchanged with the servers resolved for a given domain.

    Server IPs are taken from the DNS rows of the capture whose "Info" text
    contains both ``domain`` and the word "response".

    :param data: pd dataframe of the capture. Must contain columns: "Protocol",
                 "Info", "Address", "Name", "Source", "Destination", "Length"
    :param domain: str, substring identifying the server domain in DNS
                   responses (e.g. 'googlevideo')
    :return: tuple (ips, server_names, uplink, downlink) where
             ips / server_names are the "Address" / "Name" values of the
             matching DNS responses, uplink holds the rows whose "Destination"
             is one of those IPs and downlink the rows whose "Source" is one of
             them (rows without a "Length" are dropped from both)
    '''

    # Look in DNS Responses for the requested domain
    dns_data = data[data['Protocol'] == 'DNS']
    is_response = dns_data['Info'].apply(
        # Non-string "Info" values (e.g. NaN) can never match
        lambda info: isinstance(info, str) and domain in info and 'response' in info
    ).astype(bool)
    dns = dns_data[is_response]
    ips = dns.Address.values
    server_names = dns.Name.values

    # Missing addresses are excluded so that they cannot match missing
    # Source/Destination values
    server_ips = dns.Address.dropna().unique()

    # Filtering on either "Source" or "Destination" IP, get the
    # rows of the dataset that contain at least one of the selected IPs
    downlink = data[data['Source'].isin(server_ips)].dropna(subset=['Length'])
    uplink = data[data['Destination'].isin(server_ips)].dropna(subset=['Length'])

    return ips, server_names, uplink, downlink

def find_dominant(uplink, downlink):
  '''
  Keep only the traffic of the flow that carries the largest downlink volume.

  A flow is identified by its (Source, Destination) pair in ``downlink``; the
  matching uplink packets are those travelling in the opposite direction
  between the same two endpoints. The per-flow volumes are printed.

  :param uplink: pd dataframe of client-to-server packets. Must contain
                 columns: "Source" and "Destination"
  :param downlink: pd dataframe of server-to-client packets. Must contain
                   columns: "Source", "Destination" and "Length"
  :return: tuple (dom_ul, dom_dl) with the uplink and downlink rows of the
           dominant flow
  :raises ValueError: if ``downlink`` is empty
  '''

  if len(downlink) == 0:
    raise ValueError('downlink is empty: no flow to select')

  # Cumulative DL volume of every flow, expressed in MB
  flows_DL = downlink.groupby(['Source','Destination'])['Length'].sum()/(10**6)
  print(flows_DL)

  # (Source, Destination) IPs of the dominant flow (first one on ties)
  server_ip, client_ip = flows_DL.idxmax()

  # Match both endpoints, so that packets exchanged by the same server with
  # a different client are not mixed into the dominant flow
  dom_dl = downlink[(downlink['Source']==server_ip) & (downlink['Destination']==client_ip)]
  dom_ul = uplink[(uplink['Source']==client_ip) & (uplink['Destination']==server_ip)]

  return dom_ul, dom_dl

def timebased_filter(data, length=None, min_time=None, max_time=None):
  '''
  Return the packets that are long enough and fall inside a time window.

  The input is copied and its index is reset before filtering, so the result
  carries the previous index as an extra column and keeps the new positional
  labels of the surviving rows.

  :param data: pd dataframe to be filtered. Must contain columns: "Length" and "Time"
  :param length: packets shorter than length [Bytes] are discarded (default 0)
  :param min_time: packets with timestamp smaller than min_time [s] are discarded (default 0)
  :param max_time: packets with timestamp larger than max_time [s] are discarded (default 1000)
  :return: pd dataframe with the rows satisfying all three conditions
  '''

  min_length = 0 if length is None else length
  window_start = 0 if min_time is None else min_time
  window_end = 1000 if max_time is None else max_time

  packets = data.copy().reset_index()

  long_enough = packets['Length'] >= min_length
  in_window = (packets['Time'] >= window_start) & (packets['Time'] <= window_end)

  return packets.loc[long_enough & in_window]

def find_next(array, value):
  '''
  Locate the first element that is not smaller than a reference value.

  :param array: np.array, array of floats
  :param value: float, reference value
  :return: position of the element of the array closest to "value" among
           those greater than or equal to it; 0 when no element qualifies
  '''
  distance = np.asarray(array) - value

  # Elements below the reference are pushed to +inf so they can never win
  candidates = np.where(distance >= 0, distance, np.inf)

  return candidates.argmin()

def normalize_dataset(training_set, test_set):
  '''
  Standardize both sets with the statistics of the training set.

  :param training_set: data providing the mean and standard deviation
                       (as returned by its ``mean()`` / ``std()`` methods)
  :param test_set: data rescaled with the training statistics
  :return: tuple (norm_train, norm_test)
  '''
  center = training_set.mean()
  scale = training_set.std()

  def standardize(samples):
    # Same shift and scale for both sets: the test set never contributes
    # to the statistics
    return (samples - center)/scale

  return standardize(training_set), standardize(test_set)

### Functions to be completed


In [200]:
def features_extraction(uplink, downlink):
  '''
  Build the per-request features and the prediction targets of one session.

  Every row of ``uplink`` is treated as one client request. The downlink
  packets between the first packet following a request and the last packet
  preceding the response to the next request form the download burst of that
  request.

  :param uplink: pd dataframe of client requests. Must contain columns: "Length" and "Time"
  :param downlink: pd dataframe of server packets. Must contain columns: "Length" and "Time"
  :return: tuple (dataset, groundtruth) of pd dataframes sharing the same index.
           dataset columns: Request_Size, Inter_RR_Time, DL_Time, DL_Vol,
           DL_Size, PB_Time. groundtruth columns: Next_Request_Time (time
           between the first response packet of a request and the following
           request) and Next_Response_Vol (DL_Vol of the following retained
           request). Both are empty when either input is empty.
  '''

  feature_columns = ['Request_Size','Inter_RR_Time','DL_Time','DL_Vol','DL_Size','PB_Time']
  target_columns = ['Next_Request_Time','Next_Response_Vol']

  # Without at least one request and one response packet nothing can be extracted
  if len(uplink) == 0 or len(downlink) == 0:
    return pd.DataFrame(columns=feature_columns), pd.DataFrame(columns=target_columns)

  dl_time = downlink.Time

  # ****************************************************************************
  # Feature 1: Client Request Size
  request_size = list(uplink.Length.values)

  # ****************************************************************************
  # Feature 2: Inter Request-Response Time
  # response_time[i] is the position of the first DL packet following request i
  response_time = [find_next(dl_time, t) for t in uplink.Time]
  rr_time = [dl_time.iloc[pos] - t for pos, t in zip(response_time, uplink.Time)]

  # ****************************************************************************
  # Feature 3-4-5: Download Time, Download Volume, Download Size (# Packets)
  dt = []
  dv = []
  ds = []

  for rt1, rt2 in zip(response_time[:-1], response_time[1:]):
    burst_start = dl_time.iloc[rt1]
    burst_end = dl_time.iloc[rt2-1]
    burst = timebased_filter(downlink, 0, burst_start, burst_end)

    dt.append(burst_end - burst_start)   # Download Time
    dv.append(burst.Length.sum())        # Download Volume
    ds.append(burst.shape[0])            # Download Size (# Packets)

  # The burst of the last request runs until the end of the capture. Its data
  # might be corrupted by an abrupt interruption of the capture process: in
  # that case the row is filled with NaN, so that every column keeps one
  # entry per request and the consistency check below discards the row.
  try:
    burst_start = dl_time.iloc[response_time[-1]]
    burst_end = dl_time.iloc[-1]
    burst = timebased_filter(downlink, 0, burst_start, burst_end)
    last_burst = (burst_end - burst_start, burst.Length.sum(), burst.shape[0])
  except (IndexError, KeyError, ValueError) as err:
    print('Last request skipped, burst could not be extracted: {}'.format(err))
    last_burst = (np.nan, np.nan, np.nan)

  dt.append(last_burst[0])
  dv.append(last_burst[1])
  ds.append(last_burst[2])

  # ****************************************************************************
  # Feature 6: Playback Time (timestamp of the request)
  dataset = pd.DataFrame({
    'Request_Size': request_size,
    'Inter_RR_Time': rr_time,
    'DL_Time': dt,
    'DL_Vol': dv,
    'DL_Size': ds,
    'PB_Time': list(uplink.Time.values),
  }, columns=feature_columns)

  # Check Features Consistency
  dataset = dataset[(dataset > 0).all(axis=1)]
  dataset = dataset[dataset['DL_Time']<20]

  # ****************************************************************************
  # Ground truth, one row per request (same labels as the unfiltered dataset)
  groundtruth = pd.DataFrame(index=pd.RangeIndex(len(uplink)), columns=target_columns, dtype=float)

  # GT 1: Next Request Time
  # Time between the first response packet of a request and the next request.
  # The last request has no successor: its target stays NaN and is dropped below.
  next_req_time = uplink.Time.to_numpy()[1:] - dl_time.iloc[response_time[:-1]].to_numpy()
  groundtruth['Next_Request_Time'] = np.append(next_req_time, np.nan)

  # GT 2: Next Response Volume
  # NOTE: the shift is applied to the already filtered dataset, so the target
  # is the volume of the next request that passed the consistency check.
  groundtruth['Next_Response_Vol'] = dataset['DL_Vol'].shift(periods=-1)

  # Check Ground Truth Consistency
  groundtruth = groundtruth.dropna()

  # Align Dataset and Groundtruth on the labels present in both
  common = dataset.index.intersection(groundtruth.index)
  dataset = dataset.loc[common,:]
  groundtruth = groundtruth.loc[common,:]

  return dataset, groundtruth

### Write your code here - Dataset & GroundTruth Generation



In [173]:
import glob

In [None]:
!unzip /content/Captures.zip
# glob returns paths in arbitrary, filesystem-dependent order: sort them so
# that the sessions are always processed (and concatenated) in the same order
filenames = sorted(glob.glob('/content/Captures/Capture_v2_*.csv'))


In [201]:
import pandas as pd
# Features and targets of every session, merged once all files are processed
dataset_list = []
groundtruth_list = []

# Thresholds handed to timebased_filter: time window [s] and minimum
# packet length [Bytes] for each direction
playback_start = 2
playback_end = 180
min_ul_size = 100
min_dl_size = 50

for filename in filenames:

    # Load one capture
    df = pd.read_csv(filename, sep=',', encoding='latin-1')

    # Keep the traffic exchanged with the googlevideo servers;
    # the resolved IPs and names are not needed here
    _, __, ul, dl = filter_traffic(df, 'googlevideo')

    # Restrict both directions to the dominant flow
    dom_ul, dom_dl = find_dominant(ul, dl)

    # Uplink: requests inside the time window that are large enough;
    # rows whose Protocol is TCP are then discarded
    dom_ul = timebased_filter(dom_ul, length=min_ul_size, min_time=playback_start, max_time=playback_end)
    dom_ul = dom_ul[dom_ul['Protocol'] != 'TCP']

    # Downlink: same time window, smaller length threshold
    dom_dl = timebased_filter(dom_dl, length=min_dl_size, min_time=playback_start, max_time=playback_end)

    # Features and ground truth of this session
    ds, gt = features_extraction(dom_ul, dom_dl)
    dataset_list.append(ds)
    groundtruth_list.append(gt)

# Stack the sessions, renumbering the rows from 0
dataset = pd.concat(dataset_list, ignore_index=True)
groundtruth = pd.concat(groundtruth_list, ignore_index=True)


Source          Destination
74.125.111.106  192.168.1.6     2.080363
91.81.217.141   192.168.1.6    32.572815
Name: Length, dtype: float64
Source           Destination
173.194.182.135  192.168.1.6    0.017631
74.125.99.168    192.168.1.6    0.199115
Name: Length, dtype: float64
Source           Destination
173.194.182.138  192.168.1.6    0.015734
173.194.187.136  192.168.1.6    1.495762
74.125.110.102   192.168.1.6    0.866399
74.125.160.202   192.168.1.6    2.283527
74.125.4.230     192.168.1.6    0.005583
74.125.99.106    192.168.1.6    1.023713
74.125.99.72     192.168.1.6    1.731761
91.81.217.140    192.168.1.6    6.745528
91.81.217.141    192.168.1.6    0.021178
Name: Length, dtype: float64
Source         Destination
74.125.99.169  192.168.1.6    0.602666
91.81.217.140  192.168.1.6    3.844219
91.81.217.141  192.168.1.6    0.012511
Name: Length, dtype: float64
Source          Destination
74.125.104.103  192.168.1.6     0.012088
74.125.111.106  192.168.1.6     0.015628
74.125.154.

In [202]:
dataset.head(10)

Unnamed: 0,Request_Size,Inter_RR_Time,DL_Time,DL_Vol,DL_Size,PB_Time
0,644.0,0.010329,0.192246,500058.0,333.0,4.152173
1,627.0,0.003737,0.49686,494034.0,329.0,6.173422
2,645.0,0.003889,0.049695,113040.0,76.0,8.249755
3,645.0,0.004516,2.248406,592749.0,396.0,10.257419
4,646.0,0.010285,0.985976,146733.0,99.0,13.26099
5,622.0,0.003852,0.204691,218132.0,146.0,14.348942
6,647.0,0.010206,0.191562,501604.0,334.0,16.269078
7,647.0,0.012687,0.025851,77958.0,53.0,18.279838
8,647.0,0.003656,0.039299,97964.0,66.0,20.957692
9,545.0,0.009862,0.191857,498576.0,332.0,21.845354


In [203]:
groundtruth.head(10)

Unnamed: 0,Next_Request_Time,Next_Response_Vol
0,2.01092,494034.0
1,2.072596,113040.0
2,2.003775,592749.0
3,2.999055,146733.0
4,1.077667,218132.0
5,1.916284,501604.0
6,2.000554,77958.0
7,2.665167,97964.0
8,0.884006,498576.0
9,1.781129,500074.0


**Regression:** I used **Random Forest**, **Support Vector Regression** and a simple **Multi Layer Perceptron** for this task. To conclude, the results obtained by RF are the closest to the Prof's result.
Note: I trained RF regressors with different numbers of estimators ([100, 150, 200, 250, 300]), but the results did not change significantly.

In [214]:
# Forest sizes to evaluate: 100, 150, ..., 300 trees
estimator_range = range(100, 301, 50)

# One entry per fold; every entry lists the RMSE obtained with each forest size
rf_time_rmse = []
rf_vol_rmse = []

kf = KFold(n_splits=10)

for train_idx, test_idx in kf.split(dataset):
    # Features of the current fold
    X_train = dataset.iloc[train_idx]
    X_test = dataset.iloc[test_idx]

    # Targets of the current fold
    Y_time_train = groundtruth['Next_Request_Time'].iloc[train_idx]
    Y_time_test = groundtruth['Next_Request_Time'].iloc[test_idx]
    Y_vol_train = groundtruth['Next_Response_Vol'].iloc[train_idx]
    Y_vol_test = groundtruth['Next_Response_Vol'].iloc[test_idx]

    # Standardize the features with the training statistics
    norm_train, norm_test = normalize_dataset(X_train, X_test)

    time_errors = []
    vol_errors = []
    for n in estimator_range:
        # One forest per target, same size and seed
        rf_time = RandomForestRegressor(n_estimators=n, random_state=3)
        rf_time.fit(norm_train, Y_time_train)
        rf_vol = RandomForestRegressor(n_estimators=n, random_state=3)
        rf_vol.fit(norm_train, Y_vol_train)

        pred_time = rf_time.predict(norm_test)
        pred_vol = rf_vol.predict(norm_test)

        # RMSE on the held-out fold
        time_mse = metrics.mean_squared_error(Y_time_test, pred_time)
        vol_mse = metrics.mean_squared_error(Y_vol_test, pred_vol)
        time_errors.append(sqrt(time_mse))
        vol_errors.append(sqrt(vol_mse))

    rf_time_rmse.append(time_errors)
    rf_vol_rmse.append(vol_errors)


In [215]:
# Summarize the RMSE values collected for the random forest sweep
rf_time_mean, rf_time_std = np.mean(rf_time_rmse), np.std(rf_time_rmse)
# Volume errors are divided by 1000 before being reported
rf_vol_mean, rf_vol_std = (np.mean(rf_vol_rmse))/1000, (np.std(rf_vol_rmse))/1000

print("--------------------Random Forest---------------\n")
print("Estimation of the arrival time of next HTTP Request:\n")
print("RMSE = {} [s] (std = {})\n".format(rf_time_mean, rf_time_std))
print("------------------------\n")
print("Prediction of the next burst size from the server:\n")
print("RMSE = {} [KB] (std = {})\n".format(rf_vol_mean, rf_vol_std))

--------------------Random Forest---------------

Estimation of the arrival time of next HTTP Request:

RMSE = 5.823775997199808 [s] (std = 2.214806995876364)

------------------------

Prediction of the next burst size from the server:

RMSE = 382.1455748785304 [KB] (std = 117.57161316473793)



In [204]:
# Per-fold RMSE of the two targets
rf_time_rmse = []
rf_vol_rmse = []

kf = KFold(n_splits=10)

for train_idx, test_idx in kf.split(dataset):
    # Features of the current fold
    X_train = dataset.iloc[train_idx]
    X_test = dataset.iloc[test_idx]

    # Targets of the current fold
    Y_time_train = groundtruth['Next_Request_Time'].iloc[train_idx]
    Y_time_test = groundtruth['Next_Request_Time'].iloc[test_idx]
    Y_vol_train = groundtruth['Next_Response_Vol'].iloc[train_idx]
    Y_vol_test = groundtruth['Next_Response_Vol'].iloc[test_idx]

    # Standardize the features with the training statistics
    norm_train, norm_test = normalize_dataset(X_train, X_test)

    # One random forest per target, default size
    rf_time = RandomForestRegressor(random_state=3)
    rf_time.fit(norm_train, Y_time_train)
    rf_vol = RandomForestRegressor(random_state=3)
    rf_vol.fit(norm_train, Y_vol_train)

    pred_time = rf_time.predict(norm_test)
    pred_vol = rf_vol.predict(norm_test)

    # RMSE on the held-out fold
    time_mse = metrics.mean_squared_error(Y_time_test, pred_time)
    vol_mse = metrics.mean_squared_error(Y_vol_test, pred_vol)
    rf_time_rmse.append(sqrt(time_mse))
    rf_vol_rmse.append(sqrt(vol_mse))

In [205]:
# Summarize the per-fold RMSE values of the random forest
rf_time_mean, rf_time_std = np.mean(rf_time_rmse), np.std(rf_time_rmse)
# Volume errors are divided by 1000 before being reported
rf_vol_mean, rf_vol_std = (np.mean(rf_vol_rmse))/1000, (np.std(rf_vol_rmse))/1000

print("--------------------Random Forest---------------\n")
print("Estimation of the arrival time of next HTTP Request:\n")
print("RMSE = {} [s] (std = {})\n".format(rf_time_mean, rf_time_std))
print("------------------------\n")
print("Prediction of the next burst size from the server:\n")
print("RMSE = {} [KB] (std = {})\n".format(rf_vol_mean, rf_vol_std))

--------------------Random Forest---------------

Estimation of the arrival time of next HTTP Request:

RMSE = 5.80829323358744 [s] (std = 2.240064135066082)

------------------------

Prediction of the next burst size from the server:

RMSE = 381.4835256059127 [KB] (std = 118.5245632030567)



In [218]:
# Per-fold RMSE of the two targets
svr_time_rmse = []
svr_vol_rmse = []

kf = KFold(n_splits=10)

for train_idx, test_idx in kf.split(dataset):
    # Features of the current fold
    X_train = dataset.iloc[train_idx]
    X_test = dataset.iloc[test_idx]

    # Targets of the current fold
    Y_time_train = groundtruth['Next_Request_Time'].iloc[train_idx]
    Y_time_test = groundtruth['Next_Request_Time'].iloc[test_idx]
    Y_vol_train = groundtruth['Next_Response_Vol'].iloc[train_idx]
    Y_vol_test = groundtruth['Next_Response_Vol'].iloc[test_idx]

    # Standardize the features with the training statistics
    norm_train, norm_test = normalize_dataset(X_train, X_test)

    # One RBF support vector regressor per target, same settings
    svr_time = SVR(kernel='rbf', C=1, gamma='scale')
    svr_time.fit(norm_train, Y_time_train)
    svr_vol = SVR(kernel='rbf', C=1, gamma='scale')
    svr_vol.fit(norm_train, Y_vol_train)

    pred_time = svr_time.predict(norm_test)
    pred_vol = svr_vol.predict(norm_test)

    # RMSE on the held-out fold
    time_mse = metrics.mean_squared_error(Y_time_test, pred_time)
    vol_mse = metrics.mean_squared_error(Y_vol_test, pred_vol)
    svr_time_rmse.append(sqrt(time_mse))
    svr_vol_rmse.append(sqrt(vol_mse))


In [217]:
# Summarize the per-fold RMSE values of the support vector regressors
svr_time_mean, svr_time_std = np.mean(svr_time_rmse), np.std(svr_time_rmse)
# Volume errors are divided by 1000 before being reported
svr_vol_mean, svr_vol_std = (np.mean(svr_vol_rmse))/1000, (np.std(svr_vol_rmse))/1000

print("--------------------Support Vector Regression---------------\n")
print("Estimation of the arrival time of next HTTP Request:\n")
print("RMSE = {} [s] (std = {})\n".format(svr_time_mean, svr_time_std))
print("------------------------\n")
print("Prediction of the next burst size from the server:\n")
print("RMSE = {} [KB] (std = {})\n".format(svr_vol_mean, svr_vol_std))

--------------------Support Vector Regression---------------

Estimation of the arrival time of next HTTP Request:

RMSE = 5.943733174287085 [s] (std = 2.9495968680565476)

------------------------

Prediction of the next burst size from the server:

RMSE = 467.007124359698 [KB] (std = 232.1408659420704)



In [219]:
# Per-fold RMSE of the two targets
mlp_time_rmse = []
mlp_vol_rmse = []

kf = KFold(n_splits=10)

for train_idx, test_idx in kf.split(dataset):
    # Features of the current fold
    X_train = dataset.iloc[train_idx]
    X_test = dataset.iloc[test_idx]

    # Targets of the current fold
    Y_time_train = groundtruth['Next_Request_Time'].iloc[train_idx]
    Y_time_test = groundtruth['Next_Request_Time'].iloc[test_idx]
    Y_vol_train = groundtruth['Next_Response_Vol'].iloc[train_idx]
    Y_vol_test = groundtruth['Next_Response_Vol'].iloc[test_idx]

    # Standardize the features with the training statistics
    norm_train, norm_test = normalize_dataset(X_train, X_test)

    # Two-hidden-layer perceptrons: tanh for the time target, relu for the volume target
    mlp_time = MLPRegressor(hidden_layer_sizes=(100, 50), activation='tanh', solver='adam', random_state=3)
    mlp_time.fit(norm_train, Y_time_train)
    mlp_vol = MLPRegressor(hidden_layer_sizes=(100, 50), activation='relu', solver='adam', random_state=3)
    mlp_vol.fit(norm_train, Y_vol_train)

    pred_time = mlp_time.predict(norm_test)
    pred_vol = mlp_vol.predict(norm_test)

    # RMSE on the held-out fold
    time_mse = metrics.mean_squared_error(Y_time_test, pred_time)
    vol_mse = metrics.mean_squared_error(Y_vol_test, pred_vol)
    mlp_time_rmse.append(sqrt(time_mse))
    mlp_vol_rmse.append(sqrt(vol_mse))


In [209]:
# Summarize the per-fold RMSE values of the multi layer perceptrons
mlp_time_mean, mlp_time_std = np.mean(mlp_time_rmse), np.std(mlp_time_rmse)
# Volume errors are divided by 1000 before being reported
mlp_vol_mean, mlp_vol_std = (np.mean(mlp_vol_rmse))/1000, (np.std(mlp_vol_rmse))/1000

print("--------------------Multi Layer Perceptron---------------\n")
print("Estimation of the arrival time of next HTTP Request:\n")
print("RMSE = {} [s] (std = {})\n".format(mlp_time_mean, mlp_time_std))
print("------------------------\n")
print("Prediction of the next burst size from the server:\n")
print("RMSE = {} [KB] (std = {})\n".format(mlp_vol_mean, mlp_vol_std))

--------------------Multi Layer Perceptron---------------

Estimation of the arrival time of next HTTP Request:

RMSE = 6.42558141754733 [s] (std = 2.1822942631936737)

------------------------

Prediction of the next burst size from the server:

RMSE = 652.5453455380982 [KB] (std = 320.07008298107087)

