In [None]:
# Headers
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sbn
import ast
import random
# Seed every random number generator the notebook relies on:
# - `random` is used directly (random.randint) when trimming samples per pair;
# - pandas' DataFrame.sample() falls back to NumPy's global generator when no
#   random_state is passed, so NumPy must be seeded for reproducible sampling.
SEED = 10
random.seed(SEED)
np.random.seed(SEED)

In [None]:
# Loads the raw dataset from "train.csv" (path relative to the notebook's
# working directory) into a DataFrame
dataset = pd.read_csv("train.csv")

In [None]:
# Displays summary statistics of the dataset (a table, not a plot);
# describe() returns a new DataFrame and leaves `dataset` untouched
dataset.describe()

In [None]:
# Drops unwanted columns in a single pass
dataset = dataset.drop(columns=['tr_attempts', 'tr_id'])

In [None]:
# Removes every observation (row) that has at least one missing value
dataset = dataset.dropna()

In [None]:
# Returns a random fraction of the de-duplicated dataset
def cut_dataset(dataset, factor=1):
    """Drop repeated rows, then draw a random sample of what remains.

    Args:
        dataset: DataFrame to sample from; it is not modified.
        factor: Fraction of the de-duplicated rows to keep, forwarded to
            ``DataFrame.sample(frac=...)``.

    Returns:
        A new DataFrame holding the sampled rows in shuffled order.
    """
    unique_rows = dataset.drop_duplicates(keep='first')
    sampled_rows = unique_rows.sample(frac=factor)
    return sampled_rows

In [None]:
# Balances the two target classes by down-sampling the larger one
def balance_dataset(dataset, target_name, factor=1.0):
    """Return a frame with the same number of rows for target 0 and target 1.

    Duplicate rows are removed first. The smaller class (class 0 on a tie) is
    sampled down to ``int(len(smaller) * factor)`` rows, and the other class is
    then sampled to that same size.

    Args:
        dataset: Input DataFrame; it is not modified.
        target_name: Name of the column holding the 0/1 class label.
        factor: Fraction of the smaller class to keep.

    Returns:
        A new DataFrame with the class-0 rows followed by the class-1 rows.
    """
    deduped = dataset.drop_duplicates(keep='first')
    negatives = deduped.loc[deduped[target_name] == 0]
    positives = deduped.loc[deduped[target_name] == 1]

    if len(negatives) <= len(positives):
        # Class 0 is the smaller (or equal) one: shrink it first, then match it.
        negatives = negatives.sample(n=int(len(negatives) * factor))
        positives = positives.sample(n=len(negatives))
    else:
        # Class 1 is the smaller one: shrink it first, then match it.
        positives = positives.sample(n=int(len(positives) * factor))
        negatives = negatives.sample(n=len(positives))

    return pd.concat([negatives, positives])

In [None]:
# Keeps only the rows of a random subset of (tr_src, tr_dst) pairs
def cut_pairs(dataset, factor=1.0):
    """Restrict the dataset to a random fraction of its source/destination pairs.

    Args:
        dataset: DataFrame with 'tr_src' and 'tr_dst' columns; not modified.
        factor: Fraction of the distinct pairs to keep, forwarded to
            ``DataFrame.sample(frac=...)``.

    Returns:
        A new DataFrame with every row whose pair was selected (inner merge,
        so the index is rebuilt).

    Side effects:
        Displays the number of selected pairs through the notebook's
        ``display`` helper.
    """
    pair_columns = ['tr_src', 'tr_dst']
    unique_pairs = dataset[pair_columns].drop_duplicates()
    kept_pairs = unique_pairs.sample(frac=factor)
    display(len(kept_pairs))
    return dataset.merge(kept_pairs, on=pair_columns, how='inner')

In [None]:
# Trims excess data pairwise
def trim_excess_data_pairwise(dataset, k=1):
    """Keep at most ``k`` time-consecutive samples per (tr_src, tr_dst) pair.

    For every distinct pair the samples are sorted by 'seconds_since_start'.
    Pairs with fewer than ``k`` samples are kept whole; otherwise a window of
    exactly ``k`` consecutive samples is taken at a random offset (drawn with
    the ``random`` module, so ``random.seed`` controls reproducibility).

    Args:
        dataset: DataFrame with 'tr_src', 'tr_dst' and 'seconds_since_start'
            columns; it is not modified.
        k: Maximum number of samples to keep per pair (must be >= 1).

    Returns:
        A new DataFrame with the kept rows, grouped pair by pair in order of
        each pair's first appearance. An empty input yields an empty frame
        with the same columns.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be a positive integer")

    windows = []
    # sort=False keeps the pairs in order of first appearance; collecting the
    # pieces and concatenating once avoids re-copying the result per pair.
    for _, pair_samples in dataset.groupby(['tr_src', 'tr_dst'], sort=False):
        pair_samples = pair_samples.sort_values(by='seconds_since_start')
        n = len(pair_samples)
        if n >= k:
            # Open question from the original author: should the window always
            # start at the first sample instead of at a random offset?
            offset = random.randint(0, n - k)
            # The slice end is exclusive, so offset + k yields exactly k rows.
            pair_samples = pair_samples.iloc[offset:offset + k]
        windows.append(pair_samples)

    if not windows:
        return dataset.iloc[0:0]
    return pd.concat(windows)

In [None]:
# Builds the working dataset. Alternative reduction strategies are kept here,
# commented out, for experimentation:
# balanced_dataset = balance_dataset(dataset, 'route_changed', factor=0.001)
# balanced_dataset = cut_dataset(dataset, factor=0.001)
# balanced_dataset = cut_pairs(dataset, 0.005)
# Active pipeline: trim the samples of each (tr_src, tr_dst) pair with k=1000,
# then equalise the number of rows of the two 'route_changed' classes.
balanced_dataset = trim_excess_data_pairwise(dataset, k=1000)
balanced_dataset = balance_dataset(balanced_dataset, 'route_changed', factor=1.0)

In [None]:
# Summary statistics of the trimmed and balanced dataset
balanced_dataset.describe()

In [None]:
# Parses the all_rtts column from string to list
def parse_rtts(stringzinha):
    """Parse the textual form of an RTT list into a Python sequence.

    Args:
        stringzinha: Cell value expected to hold a Python list literal,
            e.g. ``"[1.2, 3.4]"``.

    Returns:
        The parsed list (or tuple). An empty list is returned when the value
        cannot be parsed (malformed text, ``None``, NaN, ...) or when it parses
        to something that is not a list/tuple, so callers can always take
        ``len()`` of the result.
    """
    try:
        parsed = ast.literal_eval(stringzinha)
    # Only the errors literal_eval raises for bad input are treated as
    # "no data"; KeyboardInterrupt and SystemExit are no longer swallowed.
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        return []
    return parsed if isinstance(parsed, (list, tuple)) else []

# Parses every all_rtts cell once, then derives per-row statistics from it
rtt_lists = balanced_dataset["all_rtts"].apply(parse_rtts)

# mean_rtt: mean of the list (0.0 for an empty list)
# std_rtt:  sample standard deviation, ddof=1 (0.0 with fewer than two values)
# The raw all_rtts column is dropped once the statistics are in place.
balanced_dataset = (
    balanced_dataset
    .assign(
        mean_rtt=rtt_lists.apply(lambda rtts: 0.0 if len(rtts) == 0 else np.mean(rtts)),
        std_rtt=rtt_lists.apply(lambda rtts: 0.0 if len(rtts) <= 1 else np.std(rtts, ddof=1)),
    )
    .drop(columns=['all_rtts'])
)

In [None]:
# relative_dropped_probes = (probes sent - replies from the last hop) / probes sent.
# The two raw counters are dropped once the ratio has been computed.
balanced_dataset = (
    balanced_dataset
    .assign(
        relative_dropped_probes=lambda df: (
            (df['total_probes_sent'] - df['total_replies_last_hop']) / df['total_probes_sent']
        )
    )
    .drop(columns=['total_replies_last_hop', 'total_probes_sent'])
)

In [None]:
# Summary statistics after adding mean_rtt, std_rtt and relative_dropped_probes
balanced_dataset.describe()

In [None]:
# Contextualize the dataset with a sliding window
def contextualize_sliding_window(dataset, window_size=2):
    """Add squared-difference features against the preceding rows.

    For every lag ``i`` in ``1 .. window_size - 1`` two columns are appended:
    ``diff_sq_mean_{i}`` and ``diff_sq_std_{i}``, the squared difference between
    a row's 'mean_rtt' / 'std_rtt' and the value found ``i`` rows above it.
    Rows are compared purely by their position in ``dataset``.

    Args:
        dataset: DataFrame with 'mean_rtt' and 'std_rtt' columns; not modified.
        window_size: Number of rows in the window; lags 1 to window_size - 1
            are generated.

    Returns:
        A copy of the dataset with the new columns, without any row that holds
        a missing value (this removes the leading rows that have no
        predecessor at the largest lag).
    """
    windowed = dataset.copy()

    for lag in range(1, window_size):
        # diff(periods=lag) is value[row] - value[row - lag]
        windowed[f'diff_sq_mean_{lag}'] = windowed['mean_rtt'].diff(periods=lag).pow(2)
        windowed[f'diff_sq_std_{lag}'] = windowed['std_rtt'].diff(periods=lag).pow(2)

    return windowed.dropna()

In [None]:
# Adds the lag-1 squared-difference features (window_size=2 -> a single lag).
# NOTE(review): the window compares each row with the row positioned right
# above it, across the whole frame. balance_dataset() returns randomly sampled
# rows (class 0 followed by class 1), so neighbouring rows may belong to
# different (tr_src, tr_dst) pairs and points in time - confirm this is intended.
balanced_dataset = contextualize_sliding_window(balanced_dataset, window_size=2)

In [None]:
# Pearson correlation matrix, rendered as an annotated heatmap
cor_pearson = balanced_dataset.corr(method='pearson')
fig, ax = plt.subplots(figsize=(8, 8))
sbn.heatmap(cor_pearson, annot=True, cmap='coolwarm', ax=ax)
plt.show()

In [None]:
# Spearman (rank) correlation matrix, rendered as an annotated heatmap
cor_spearman = balanced_dataset.corr(method='spearman')
fig, ax = plt.subplots(figsize=(8, 8))
sbn.heatmap(cor_spearman, annot=True, cmap='coolwarm', ax=ax)
plt.show()

In [None]:
# Drops columns that should not be part of the final dataset
balanced_dataset = balanced_dataset.drop(
    columns=[
        'date_index',            # TODO: check later whether this should be kept
        'seconds_since_start',   # TODO: same question as date_index
        'relative_dropped_probes',
    ]
)

In [None]:
# Exports the treated dataset to "treated_train.csv";
# index=False leaves the DataFrame index out of the file
balanced_dataset.to_csv("treated_train.csv", index=False)