# Part 3: Cell Fingerprinting via Network Traffic Analysis

In [1]:
import numpy as np
import os
import pandas as pd
from itertools import combinations

## Collecting Features

In [2]:
"""
The following functions compute a statistic from a pcap dump, that might be used as a feature for the classifer
"""

# Compute the number of incoming packets
def incoming_packets(dump_df, src_ip):
    return dump_df[dump_df["ip.dst"] == src_ip].count()[0]

# Compute the number of outgoing packets
def outgoing_packets(dump_df, src_ip):
    return dump_df[dump_df["ip.src"] == src_ip].count()[0]

# Compute the total number of packets
def total_number_of_packets(dump_df):
    return dump_df.count()[0]

# Ratio of incoming packets vs. outgoing packets
def in_out_ratio(dump_df, src_ip):
    in_count = float(incoming_packets(dump_df, src_ip))
    out_count = outgoing_packets(dump_df, src_ip)
    return in_count / (out_count if out_count else 1)

# Total size of incoming packets
def incoming_size(dump_df, src_ip):
    return dump_df[dump_df["ip.dst"] == src_ip]["frame.len"].sum()

# Total size of outgoing packets
def outgoing_size(dump_df, src_ip):
    return dump_df[dump_df["ip.src"] == src_ip]["frame.len"].sum()

# Total size of packets
def total_size(dump_df):
    return dump_df["frame.len"].sum()

# Ratio of total incoming packet size vs total outgoing packet size
def size_ratio(dump_df, src_ip):
    in_size = float(incoming_size(dump_df, src_ip))
    out_size = outgoing_size(dump_df, src_ip)
    return in_size / (out_size if out_size else 1)

# Relative time of last packet
def relative_end_time(dump_df):
    return float(dump_df.iloc[-1]["frame.time_relative"])

# Mean and standard deviation of outgoing packet orderings
def ordering_statistics(dump_df, src_ip):
    outgoing_frame_numbers = dump_df[dump_df["ip.src"] == src_ip]["frame.number"].values
    return {"OutgoingOrderingMean": outgoing_frame_numbers.mean(), "OutgoingOrderingSTD": outgoing_frame_numbers.std()}

# Percentages of incoming and outgoing packets in a given time interval
def timing_info(dump_df, src_ip, n_intervals):
    end_time = float(dump_df.iloc[-1]["frame.time_relative"])
    time_intervals = np.linspace(0, end_time, n_intervals+1, endpoint=True)[1:]
    timing_info = dump_df["frame.time_relative"]\
                    .apply(lambda ts: next(idx for idx, interval_limit in enumerate(time_intervals) if ts <= interval_limit))
    dump_df["frame.timing_interval"] = timing_info
    in_stream = []
    out_stream = []
    in_size = 0
    out_size = 0
    for i in range(n_intervals):
        frames_in_interval = dump_df[dump_df["frame.timing_interval"] == i]
        in_size += frames_in_interval[frames_in_interval["ip.dst"] == src_ip]["frame.len"].sum()
        out_size += frames_in_interval[frames_in_interval["ip.src"] == src_ip]["frame.len"].sum()
        in_stream.append(float(in_size))
        out_stream.append(float(out_size))
    in_stream_features = {f"InStream{i+1}": 100 * (s / in_size) for i, s in enumerate(in_stream)}
    out_stream_features = {f"OutStream{i+1}": 100 * (s / out_size) for i, s in enumerate(out_stream)}
    return dict(in_stream_features, **out_stream_features)

In [3]:
# Compute statistics for a pcap dump, that will be used as features
def get_features(dump_df):
    src_ip = dump_df.iloc[0]["ip.src"]
    features = {}
    
    features["Duration"] = relative_end_time(dump_df)
    features["TotalPackets"] = total_number_of_packets(dump_df)
    features["IncomingPackets"] = incoming_packets(dump_df, src_ip)
    features["OutgoingPackets"] = outgoing_packets(dump_df, src_ip)
    features["InOutRatio"] = in_out_ratio(dump_df, src_ip)
    features["TotalSize"] = total_size(dump_df)
    features["IncomingSize"] = incoming_size(dump_df, src_ip)
    features["OutgoingSize"] = outgoing_size(dump_df, src_ip)
    features["SizeRatio"] = size_ratio(dump_df, src_ip)
    features.update(ordering_statistics(dump_df, src_ip))
    features.update(timing_info(dump_df, src_ip, 20))
    
    return features

In [5]:
csv_dump_folder = "csv"

"""
cell_dumps = [[os.path.join(csv_dump_folder, f"cellID{cell_id+1}", f) for f in os.listdir(os.path.join(csv_dump_folder, f"cellID{cell_id+1}"))\
                        if f[:4] == "dump"]\
                for cell_id in range(100)]

min_dumps = min([len(dump_files) for dump_files in cell_dumps])
cell_dumps = [dump_files[:min_dumps] for dump_files in cell_dumps]
"""

#cell_dumps = [[os.path.join(csv_dump_folder, f"cellID{cell_id}", f"dump{i}.csv") for i in range(1,101)] for cell_id in range(1,11)]
cell_dumps = [[os.path.join(csv_dump_folder, f"cellID{cell_id}", f"dump{i}.csv") for i in range(1,21)] for cell_id in range(1,101)]

feature_set = []
labels = []
for cell_id, dumps in enumerate(cell_dumps):
    for dump_file in dumps:
        dump_df = pd.read_csv(dump_file, names=["frame.number", "frame.time_relative", "ip.src" ,"ip.dst", "frame.protocols", "frame.len"])
        if dump_df.count()[0]:
            features = get_features(dump_df)
            feature_set.append(features)
            labels.append(cell_id)
            
X = pd.DataFrame(feature_set)
y = np.array(labels)

In [6]:
X.head()

Unnamed: 0,Duration,TotalPackets,IncomingPackets,OutgoingPackets,InOutRatio,TotalSize,IncomingSize,OutgoingSize,SizeRatio,OutgoingOrderingMean,...,OutStream11,OutStream12,OutStream13,OutStream14,OutStream15,OutStream16,OutStream17,OutStream18,OutStream19,OutStream20
0,9.051183,221,175,46,3.804348,378195,348485,29710,11.729552,239.217391,...,58.29687,66.240323,68.226186,70.21205,72.197913,74.183777,80.141367,92.056547,94.04241,100.0
1,5.99478,174,136,38,3.578947,349012,324536,24476,13.259356,191.631579,...,61.431606,63.842131,75.894754,75.894754,78.305279,80.715803,83.126328,85.536852,87.947377,100.0
2,7.128145,250,202,48,4.208333,425902,396040,29862,13.26234,257.333333,...,52.581877,56.533387,64.436407,70.363673,72.339428,76.290938,84.193959,86.169714,94.072735,100.0
3,8.01034,208,166,42,3.952381,381302,354466,26836,13.2086,225.357143,...,53.830675,60.426293,69.22045,71.418989,75.816068,78.014607,80.213147,82.411686,91.205843,100.0
4,9.010757,214,175,39,4.487179,357960,331866,26094,12.718096,238.717949,...,50.548019,57.331187,59.592243,68.636468,73.158581,75.419637,79.941749,84.463861,88.985974,100.0


In [7]:
y[:20]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [8]:
np.unique(y, return_counts=True)

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
        34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
        51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
        68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
        85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99]),
 array([20, 20, 20, 20, 20, 19, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
        20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
        20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
        20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
        20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 19, 20, 20, 20, 20, 20, 20,
        20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 19, 20, 20, 20]))

## Hyperparameter Tuning

In [9]:
from sklearn import metrics, preprocessing, svm
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.neighbors import KNeighborsClassifier

In [10]:
all_columns = X.columns.tolist()
columns = all_columns[:10]
stream_ratios = all_columns[-40:]
print(columns)
print(stream_ratios)

['Duration', 'TotalPackets', 'IncomingPackets', 'OutgoingPackets', 'InOutRatio', 'TotalSize', 'IncomingSize', 'OutgoingSize', 'SizeRatio', 'OutgoingOrderingMean']
['InStream1', 'InStream2', 'InStream3', 'InStream4', 'InStream5', 'InStream6', 'InStream7', 'InStream8', 'InStream9', 'InStream10', 'InStream11', 'InStream12', 'InStream13', 'InStream14', 'InStream15', 'InStream16', 'InStream17', 'InStream18', 'InStream19', 'InStream20', 'OutStream1', 'OutStream2', 'OutStream3', 'OutStream4', 'OutStream5', 'OutStream6', 'OutStream7', 'OutStream8', 'OutStream9', 'OutStream10', 'OutStream11', 'OutStream12', 'OutStream13', 'OutStream14', 'OutStream15', 'OutStream16', 'OutStream17', 'OutStream18', 'OutStream19', 'OutStream20']


In [14]:
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1, stratify=y)
scaler = preprocessing.StandardScaler().fit(X_train_raw)
X_train_scaled = pd.DataFrame(scaler.transform(X_train_raw), columns=all_columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test_raw), columns=all_columns)

In [17]:
feature_scores = []

for k in range(1, 10):
    print(f"======== {k}-NN ========")
    
    #split dataset into train and test data
    features = stream_ratios
    X_train = X_train_scaled[features]
    X_test = X_test_scaled[features]
    # Create KNN classifier
    knn = KNeighborsClassifier(n_neighbors=k)
    # Fit the classifier to the data
    knn.fit(X_train,y_train)
    #check accuracy of our model on the test data
    feature_score = knn.score(X_test, y_test)
    feature_scores.append((feature_score, 0, True, []))
    
    for n_features in range(1, len(columns)+1):
        print(f"Classification using {n_features} features")
        for feature_t in combinations(columns, n_features):
            #split dataset into train and test data
            features = list(feature_t)
            X_train = X_train_scaled[features]
            X_test = X_test_scaled[features]
            # Create KNN classifier
            knn1 = KNeighborsClassifier(n_neighbors=k)
            # Fit the classifier to the data
            knn1.fit(X_train,y_train)
            #check accuracy of our model on the test data
            feature_score = knn1.score(X_test, y_test)
            feature_scores.append((feature_score, k, False, features))
            
            #split dataset into train and test data
            features_extended = features + stream_ratios
            X_train = X_train_scaled[features_extended]
            X_test = X_test_scaled[features_extended]
            # Create KNN classifier
            knn2 = KNeighborsClassifier(n_neighbors=k)
            # Fit the classifier to the data
            knn2.fit(X_train,y_train)
            #check accuracy of our model on the test data
            feature_score = knn2.score(X_test, y_test)
            feature_scores.append((feature_score, k, True, features))
    print('\n')   
    print("Current max:", max(feature_scores)[0], '\n')
    print('\n')

Classification using 1 features
Classification using 2 features
Classification using 3 features
Classification using 4 features
Classification using 5 features
Classification using 6 features
Classification using 7 features
Classification using 8 features
Classification using 9 features
Classification using 10 features


Current max: 0.945 



Classification using 1 features
Classification using 2 features
Classification using 3 features
Classification using 4 features
Classification using 5 features
Classification using 6 features
Classification using 7 features
Classification using 8 features
Classification using 9 features
Classification using 10 features


Current max: 0.945 



Classification using 1 features
Classification using 2 features
Classification using 3 features
Classification using 4 features
Classification using 5 features
Classification using 6 features
Classification using 7 features
Classification using 8 features
Classification using 9 features
Classification using

In [18]:
print(max(feature_scores))

(0.945, 1, True, ['TotalPackets', 'OutgoingPackets', 'TotalSize', 'IncomingSize', 'OutgoingSize', 'SizeRatio', 'OutgoingOrderingMean'])


## K-NN Classification

In [11]:
features_extended = ['TotalPackets', 'OutgoingPackets', 'TotalSize', 'IncomingSize', 'OutgoingSize', 'SizeRatio', 'OutgoingOrderingMean'] + stream_ratios
X_selected = X[features_extended]
scaler = preprocessing.StandardScaler().fit(X_selected)
X_scaled = pd.DataFrame(scaler.transform(X_selected), columns=features_extended)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=1, stratify=y)

In [12]:
# Create KNN classifier
knn = KNeighborsClassifier(n_neighbors=1)
# Fit the classifier to the data
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=1, p=2,
                     weights='uniform')

In [13]:
#check accuracy of our model on the test data
knn.score(X_test, y_test)

0.63

## SVM Classification

In [14]:
#Create a svm Classifier
clf = svm.SVC(kernel='linear') # Linear Kernel

#Train the model using the training sets
clf.fit(X_train, y_train)

#Predict the response for test dataset
y_pred = clf.predict(X_test)

In [15]:
# Model Accuracy: how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.72


In [16]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.60      0.75      0.67         4
           1       0.80      1.00      0.89         4
           2       0.67      0.50      0.57         4
           3       0.67      0.50      0.57         4
           4       0.67      1.00      0.80         4
           5       0.75      0.75      0.75         4
           6       0.40      0.50      0.44         4
           7       0.50      0.50      0.50         4
           8       0.50      1.00      0.67         4
           9       0.67      1.00      0.80         4
          10       0.50      0.50      0.50         4
          11       0.67      1.00      0.80         4
          12       0.50      0.75      0.60         4
          13       1.00      0.75      0.86         4
          14       1.00      0.75      0.86         4
          15       1.00      0.75      0.86         4
          16       1.00      1.00      1.00         4
          17       0.80    

In [17]:
clf_cv = svm.SVC(kernel='linear')

#train model with cv of 10
cv_scores = cross_val_score(clf_cv, X_scaled, y, cv=10)

#print each cv score (accuracy) and average them
print(cv_scores)
print("cv_scores mean:{}".format(np.mean(cv_scores)))

[0.645      0.71       0.775      0.755      0.685      0.805
 0.69       0.60301508 0.74371859 0.7638191 ]
cv_scores mean:0.7175552763819095
