#**Homework**

Complete the following tasks:

* Use a dataset of 21 Video Sessions
* Recognize the Video Server(s) IP and select video traffic (***if more than one Server is found, keep the dominant flow only***)
* Detect Video Client HTTP Requests (Uplink packets with size larger or equal to 100 Bytes)
* Compute features to predict:
 1.   When the next UL Request is sent by the Video Client 
 2.   How large is the response of the Server to the next UL Request

**N.B.**: Below, you can find a list of useful functions for the tasks at hand (introduced during class).

In [15]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import plotly.graph_objects as go
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split 
from sklearn import metrics
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

import warnings
warnings.filterwarnings('ignore')

### Functions Ready-To-Use

In [2]:
def filter_traffic(data, domain):
   
    # Look in DNS Responses for googlevideo domain
    dns_data = data[data['Protocol']=='DNS']
    dns = dns_data[dns_data['Info'].apply(lambda x: 'googlevideo' in x and 'response' in x)]
    ips = dns.Address.values 
    server_names = dns.Name.values
    
    # Filtering on either "Source" or "Destination" IP, get the 
    # rows of the dataset that contain at least one of the selected IPs
    downlink = data[data['Source'].apply(lambda x: x in ips)].dropna(subset=['Length']) 

    uplink = data[data['Destination'].apply(lambda x: x in ips)].dropna(subset=['Length'])
    
    return ips, server_names, uplink, downlink

def find_dominant(uplink, downlink):

  # Expressed in MB

  # Order flows by cumulative DL Volume
  flows_DL = downlink.groupby(['Source','Destination'])['Length'].sum()/(10**6)
  # print(flows_DL)
  
  # Get (Source,Destination) IPs of dominant flows
  dom_id = flows_DL[flows_DL==max(flows_DL)].index[0]
  # ('91.81.217.140', '192.168.1.6')
  # Edited this
  # Filter traffic selecting the dominant flow
  dom_dl = downlink[downlink['Source']==dom_id[0]]
  dom_ul = uplink[(uplink['Destination']==dom_id[0])]

  return dom_ul, dom_dl

def timebased_filter(data, length=None, min_time=None, max_time=None):
  '''
  :param data: pd dataframe to be filtered. Must contain columns: "Length" and "Time"
  :param length: all packets shorter than length [Bytes] will be discarded (default 0)
  :param min_time: all packets with timestamp smaller than min_time [s] will be discarded (default 0)
  :param max_time: all packets with timestamp larger than max_time [s] will be discarded (default 1000)
  '''

  if length is None:
    length=0
  if min_time is None:
    min_time = 0
  if max_time is None:
    max_time = 1000
  
  filtered_data = data.copy().reset_index()
  mask = (filtered_data['Length']>=length) & (filtered_data['Time']>=min_time) & (filtered_data['Time']<= max_time)
  filtered_data = filtered_data.loc[mask[mask ==True].index]

  return filtered_data

def find_next(array, value):
  '''
  :param array: np.array, array of floats
  :param value: float, reference value
  :return: position of the closest element of the array greater than "value"
  '''
  delta = np.asarray(array) - value
  idx = np.where(delta >= 0, delta, np.inf).argmin()

  return idx

def normalize_dataset(training_set, test_set):

  mean_train = training_set.mean()
  std_train = training_set.std()
  norm_train = (training_set - mean_train)/std_train
  norm_test = (test_set - mean_train)/std_train  

  return norm_train, norm_test

### Functions to be completed


In [138]:
def features_extraction(uplink, downlink):
  '''
  Complete this function to extract both features and groundtruth.

  NB: The features extraction process is the same as the one introduced during
  the lecture. 
  '''
  dataset = pd.DataFrame(columns=['Request_Size','Inter_RR_Time','DL_Time','DL_Vol','DL_Size','PB_Time'])
  # ****************************************************************************
  # Feature 1: Client Request Size
  dataset['Request_Size'] = list(uplink.Length.values)

  # ****************************************************************************
  # Feature 2: Inter Request-Response Time
  rr_time = []
  response_time = []
  for t in uplink.Time:
    response_time.append(find_next(downlink.Time, t)) #index of next DL packet timestamp 
    rr_time.append(downlink.Time.iloc[response_time[-1]] - t)

  dataset['Inter_RR_Time'] = rr_time
  # ****************************************************************************
  # Feature 3-4-5: Download Time, Download Volume, Download Size (# Packets) 
  dt = []
  dv = []
  ds = []

  for rt1, rt2 in zip(response_time[:-1], response_time[1:]):
    #Download Time
    dt.append(downlink.Time.iloc[rt2-1] - downlink.Time.iloc[rt1])

    temp = timebased_filter(downlink, 0, downlink.Time.iloc[rt1], downlink.Time.iloc[rt2-1])
    
    #Download Volume
    dv.append(temp.Length.sum())

    #Download Size (# Packets) 
    ds.append(temp.shape[0])

  # Last Iteration data might be corrupted due to drastic interruption of capture 
  # process. If it is so, an error would occur during the features extraction.
  # To avoid this, we skip last HTTP iteration data when an error is raised 
  # using the try-except logic below.
  try:
    # Consider also last HTTP iteration
    #Download Time
    dt.append(downlink.Time.iloc[-1] - downlink.Time.iloc[rt2])

    temp = timebased_filter(downlink, 0, downlink.Time.iloc[rt2], downlink.Time.iloc[-1])
    #Download Volume
    dv.append(temp.Length.sum())

    #Download Size (# Packets) 
    ds.append(temp.shape[0])
  except:
    print("exception")

  dataset['DL_Time'] = dt
  dataset['DL_Vol'] = dv
  dataset['DL_Size'] = ds
  

  # ****************************************************************************
  # Feature 5: Playback Time
  pbt = list(uplink.Time.values)
  dataset['PB_Time'] = pbt
  # ****************************************************************************
  
  
  # Check Features Consistency
  #dataset = dataset[(dataset > 0).all(1)]
  #dataset = dataset[dataset['DL_Time']<20]

  ###############################################################
  # TO BE COMPLETED

  ### EXTRACT GROUNDTRUTH HERE
  groundtruth = pd.DataFrame(columns=['Next_Request_Time','Next_Response_Vol'])
  # ****************************************************************************
  # GT 1: Next Request Time

  # Filtering uplink requests which are more than 100bytes
  uplink_reqs = uplink.copy().reset_index()
  next_req_list = [None]
  next_resp_vol = [None]
  # We start from the 2nd uplink
  for index, uplink_req in uplink_reqs.iloc[1:-1].iterrows():
    # Finding the inter-request time 
    sub_dl = downlink[downlink["Time"] < uplink_req["Time"]]
    # find the row with the closest value to x in the Time column
    closest_row = sub_dl.iloc[(sub_dl['Time'] - uplink_req["Time"]).abs().argsort()[:1]]
    if not len(closest_row):
      next_req_list.append(None)
      next_resp_vol.append(None)
      continue
    delta = uplink_req.Time - closest_row.Time
    next_req_list.append(delta.iloc[0])

    # Calculating the Volume transfered in this timespan
    sub_dl = downlink[
                (downlink["Time"] < uplink_reqs.iloc[index + 1].Time) &
                (downlink["Time"] > uplink_req.Time)
    ]
    next_resp_vol.append(sub_dl.Length.sum())
  

  next_req_list.append(None)
  next_resp_vol.append(None)

  #next_req_list.append(None)
  #next_resp_vol.append(None)
  
  # print(next_req_list[0])
  #groundtruth['Next_Request_Time'] = next_req_list
  dataset['Next_Request_Time'] = next_req_list 
  # ****************************************************************************
  # GT 2: Next Response Volume
  
  #groundtruth['Next_Response_Vol'] = next_resp_vol
  dataset['Next_Response_Vol'] = next_resp_vol
  ###############################################################


  # I have made a little different approach with the proposed way.
  # Check Features Consistency
  dataset = dataset[(dataset > 0).all(1)]
  dataset = dataset[dataset['DL_Time']<20]
  dataset = dataset[dataset["Next_Response_Vol"] > 0]
  dataset['Next_Response_Vol'] = dataset['Next_Response_Vol'].shift(-1)
  dataset.dropna(inplace=True)

  groundtruth = dataset.loc[:, ["Next_Request_Time", "Next_Response_Vol"]]
  dataset = dataset.loc[:, ["Request_Size", "Inter_RR_Time", "DL_Time", "DL_Vol", "DL_Size", "PB_Time"]]
  

  return dataset, groundtruth

### Write your code here



#### First approach

In This method we are using the provided functions to do the work. 

In [141]:
joint_dataset = pd.DataFrame()
joint_gt = pd.DataFrame()
for i in range(22):
  # Iterating through files
  filename = f"Capture_v2_{i}.csv"
  df = pd.read_csv(filename)
  ips, server_names, uplink, downlink = filter_traffic(df, "domain")
  dom_ul, dom_dl = find_dominant(uplink, downlink)
  # Filtering the uplink and downlinks
  # 100 bytes for uplink that comes from the paper
  # 50 bytes for downlink that comes from the class discussion
  dom_ul = timebased_filter(dom_ul, 100, 2, 180)
  dom_dl = timebased_filter(dom_dl, 50, 2, 180)
  dataset, groundtruth = features_extraction(dom_ul, dom_dl)
  joint_dataset = pd.concat([joint_dataset, dataset])
  joint_gt = pd.concat([joint_gt, groundtruth])
  print(f"i: {i}, dataset_shape: {dataset.shape[0]}")
  

i: 0, dataset_shape: 45
i: 1, dataset_shape: 40
i: 2, dataset_shape: 35
exception
i: 3, dataset_shape: 0
i: 4, dataset_shape: 23
i: 5, dataset_shape: 7
i: 6, dataset_shape: 5
i: 7, dataset_shape: 74
i: 8, dataset_shape: 16
i: 9, dataset_shape: 4
i: 10, dataset_shape: 3
i: 11, dataset_shape: 3
i: 12, dataset_shape: 7
i: 13, dataset_shape: 3
i: 14, dataset_shape: 10
i: 15, dataset_shape: 0
i: 16, dataset_shape: 10
i: 17, dataset_shape: 4
i: 18, dataset_shape: 6
i: 19, dataset_shape: 0
i: 20, dataset_shape: 5
i: 21, dataset_shape: 5


As we see there are some files that the dominant flow and ip wasn't identified properly. in order to tackle this issue we do something in the next part. 


In [142]:
dataset = joint_dataset
dataset

Unnamed: 0,Request_Size,Inter_RR_Time,DL_Time,DL_Vol,DL_Size,PB_Time
1,583.0,0.004833,0.005944,10602.0,10.0,16.448505
5,1034.0,0.017107,0.000008,1592.0,6.0,16.474616
7,654.0,0.009883,0.002476,23100.0,17.0,19.934595
11,562.0,0.002912,0.055167,70024.0,49.0,24.182383
13,565.0,0.003601,0.004896,69926.0,48.0,24.251007
...,...,...,...,...,...,...
1,583.0,0.004400,0.003151,11018.0,10.0,89.967206
7,594.0,0.004044,0.054542,141656.0,100.0,89.976745
9,602.0,0.004542,0.115750,71872.0,50.0,90.044470
11,604.0,0.011249,0.012611,435428.0,298.0,97.324990


In [143]:
groundtruth = joint_gt 
groundtruth

Unnamed: 0,Next_Request_Time,Next_Response_Vol
1,0.000999,1592.0
5,0.015334,23100.0
7,3.442864,70024.0
11,4.235429,69926.0
13,0.010545,145450.0
...,...,...
1,0.000181,141656.0
7,0.001988,71872.0
9,0.009139,435428.0
11,7.160228,513678.0


Since we are working with random forest and it is a tree based model, there is little difference in using normalization for our data. 

In [144]:
def random_forest_regressor(X_train, X_test, y_train, y_test):
  # Define the random forest regressor model
  rf = RandomForestRegressor()

  # Train the model on the training set
  rf.fit(X_train, y_train)

  # Use the trained model to make predictions on the test set
  y_pred = rf.predict(X_test)

  # Calculate the RMSE for this fold and add it to the list
  return np.sqrt(mean_squared_error(y_test, y_pred))


In [145]:
def kfold_test(dataset, groundtruth, n_folds):
  kf = KFold(n_splits=n_folds)

  # Initialize a list to store the RMSE scores for each fold
  rmse_scores = []

  # Iterate over the folds and train/test the model
  for train_index, test_index in kf.split(X):
    # Split the data into training and test sets for this fold
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Calculate the RMSE for this fold and add it to the list
    rmse_scores.append(random_forest_regressor(X_train, X_test, y_train, y_test))

  # Calculate the overall RMSE across all folds
  mean_rmse = np.mean(rmse_scores)

  return mean_rmse

In [146]:
# Extracting X and y
X = dataset.values
y = groundtruth["Next_Request_Time"].values.ravel()

mean_rmse = kfold_test(dataset, groundtruth, 5)

print('RMSE(in seconds):', mean_rmse)

RMSE(in seconds): 5.0966306038378715


In [147]:
# Load your data into X and y
X = dataset.values
y = groundtruth["Next_Response_Vol"].values.ravel()

mean_rmse = kfold_test(dataset, groundtruth, 5)

# Since it's in kB we divide it by 1000
print('RMSE(in kB):', mean_rmse / 1000)

RMSE(in kB): 388.672138803499


#### Second Approach 
With the results we got from the last part we know that we add all of the DNS responses into a list and try to solve the problem of caching dnses.

In [148]:
google_videos_ips = []
for i in range(22):
  # Iterating through files
  filename = f"Capture_v2_{i}.csv"
  data = pd.read_csv(filename)
  dns_data = data[data['Protocol']=='DNS']
  dns = dns_data[dns_data['Info'].apply(lambda x: 'googlevideo' in x and 'response' in x)]
  ips = dns.Address.values
  google_videos_ips += [ip for ip in ips]
google_videos_ips = list(set(google_videos_ips))


In [149]:
google_videos_ips = list(set(google_videos_ips))
google_videos_ips

['173.194.187.136',
 '74.125.99.108',
 '74.125.163.138',
 '173.194.160.219',
 '74.125.111.106',
 '74.125.162.40',
 '74.125.162.39',
 '74.125.4.230',
 '173.194.182.230',
 '74.125.153.7',
 '74.125.160.38',
 '91.81.217.141',
 '74.125.99.170',
 '173.194.188.136',
 '74.125.99.91',
 '91.81.217.140',
 '173.194.187.71',
 '74.125.99.137',
 '74.125.99.72',
 '173.194.188.72',
 '74.125.99.168',
 '74.125.99.106',
 '74.125.99.105',
 '74.125.160.202',
 '74.125.99.166',
 '74.125.111.102',
 '74.125.153.59',
 '74.125.111.105',
 '173.194.188.105',
 '74.125.105.10',
 '173.194.188.230',
 '209.85.226.38',
 '173.194.160.200',
 '74.125.99.169',
 '173.194.182.138',
 '74.125.154.138',
 '173.194.182.135',
 '74.125.104.103',
 '74.125.110.102',
 '172.217.132.137',
 '74.125.153.11',
 '74.125.153.24']

In [150]:
def filter_traffic_v2(data, ips):
    # Filtering on either "Source" or "Destination" IP, get the 
    # rows of the dataset that contain at least one of the selected IPs
    downlink = data[data['Source'].apply(lambda x: x in ips)].dropna(subset=['Length']) 

    uplink = data[data['Destination'].apply(lambda x: x in ips)].dropna(subset=['Length'])
    
    return uplink, downlink

In [151]:
joint_df = pd.DataFrame()
for i in range(22):
  # Iterating through files
  filename = f"Capture_v2_{i}.csv"
  df = pd.read_csv(filename)
  uplink, downlink = filter_traffic_v2(df, google_videos_ips)
  dom_ul, dom_dl = find_dominant(uplink, downlink)
  # Filtering the uplink and downlinks
  # 100 bytes for uplink that comes from the paper
  # 50 bytes for downlink that comes from the class discussion
  dom_ul = timebased_filter(dom_ul, 100, 2, 180)
  dom_dl = timebased_filter(dom_dl, 50, 2, 180)
  dataset, groundtruth = features_extraction(dom_ul, dom_dl)
  joint_dataset = pd.concat([joint_dataset, dataset])
  joint_gt = pd.concat([joint_gt, groundtruth])
  print(f"i: {i}, dataset_shape: {dataset.shape[0]}")

i: 0, dataset_shape: 45
i: 1, dataset_shape: 32
i: 2, dataset_shape: 35
i: 3, dataset_shape: 51
i: 4, dataset_shape: 55
i: 5, dataset_shape: 58
i: 6, dataset_shape: 82
i: 7, dataset_shape: 74
i: 8, dataset_shape: 23
i: 9, dataset_shape: 37
i: 10, dataset_shape: 40
i: 11, dataset_shape: 42
i: 12, dataset_shape: 30
i: 13, dataset_shape: 11
i: 14, dataset_shape: 11
i: 15, dataset_shape: 22
i: 16, dataset_shape: 10
i: 17, dataset_shape: 21
i: 18, dataset_shape: 30
i: 19, dataset_shape: 32
i: 20, dataset_shape: 30
i: 21, dataset_shape: 34


In [152]:
# Extracting X and y
X = dataset.values
y = groundtruth["Next_Request_Time"].values.ravel()

mean_rmse = kfold_test(dataset, groundtruth, 5)

print('RMSE(in seconds):', mean_rmse)

RMSE(in seconds): 3.3952324769473234


In [153]:
# Load your data into X and y
X = dataset.values
y = groundtruth["Next_Response_Vol"].values.ravel()

mean_rmse = kfold_test(dataset, groundtruth, 5)

# Since it's in kB we divide it by 1000
print('RMSE(in kB):', mean_rmse / 1000)

RMSE(in kB): 267.45013416469317


We clearly see that we are finding way better results this way! 

Hessam Hashemizadeh