In [1]:
import os
import pandas as pd
from glob import glob
from tqdm import tqdm
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns

# Dataset folders
# video_size_2019-01-26T11_2019-01-27T11
dataset_list = ['video_sent_2019-01-26T11_2019-01-27T11']
# THIS BUFFER LIST IS NEEDED PAST 2021-06-12T11_2021-06-13T11
ssim_list = ['ssim_2019-01-26T11_2019-01-27T11']
buffer_list = ['client_buffer_2019-01-26T11_2019-01-27T11']

current_dir = os.getcwd()
base_path = current_dir + '/puffer_tests/'
print(base_path)
# base_path = './test_data'


def _load_csvs(names, directory):
    """Read ``<directory>/<name>.csv`` for every name and return the frames as a list.

    Every column is read as ``str``. Each frame gets a ``source_dataset`` column
    holding the name it was loaded from. A file that cannot be read is reported
    on stdout and skipped (best-effort), so the returned list may be shorter
    than ``names``.
    """
    frames = []
    for name in tqdm(names, desc="Datasets"):
        csv_path = os.path.join(directory, name + '.csv')  # one CSV per dataset name
        try:
            frame = pd.read_csv(csv_path, dtype=str)
        except Exception as e:
            print(f"Error reading {csv_path}: {e}")
            continue
        print(f"Loaded {csv_path} with shape {frame.shape}")
        # Tag with the name of the file actually loaded. The buffer loop used to
        # reuse the stale `dataset` variable from the video loop here.
        frame['source_dataset'] = name
        frames.append(frame)
    return frames


# Collect all merged CSVs
print("Loading CSV files...")
all_dfs = _load_csvs(dataset_list, base_path)
all_buffer_dfs = _load_csvs(buffer_list, base_path)
# # Combine all data

/Users/stevenjiang/Documents/GitHub/293N-YT-ABR-Prediction/puffer_tests/
Loading CSV files...


Datasets: 100%|██████████| 1/1 [00:00<00:00, 114.88it/s]


Loaded /Users/stevenjiang/Documents/GitHub/293N-YT-ABR-Prediction/puffer_tests/video_sent_2019-01-26T11_2019-01-27T11.csv with shape (49, 14)


Datasets: 100%|██████████| 1/1 [00:00<00:00, 571.82it/s]

Loaded /Users/stevenjiang/Documents/GitHub/293N-YT-ABR-Prediction/puffer_tests/client_buffer_2019-01-26T11_2019-01-27T11.csv with shape (49, 8)





In [2]:
# Debug aid: print the list of client-buffer DataFrames collected from buffer_list.
# NOTE(review): this prints every frame in the list in full; consider .head() for large inputs.
print(all_buffer_dfs)

[          time (ns GMT)                                    session_id index  \
0   1548500400094000000  kaXlFjXCNjAgx4nsvYsDetENDzv04n/e7R4HYutFWt0=     0   
1   1548500400345000000  kaXlFjXCNjAgx4nsvYsDetENDzv04n/e7R4HYutFWt0=     0   
2   1548500400602000000  kaXlFjXCNjAgx4nsvYsDetENDzv04n/e7R4HYutFWt0=     0   
3   1548500400848000000  kaXlFjXCNjAgx4nsvYsDetENDzv04n/e7R4HYutFWt0=     0   
4   1548500401105000000  kaXlFjXCNjAgx4nsvYsDetENDzv04n/e7R4HYutFWt0=     0   
5   1548500401349000000  kaXlFjXCNjAgx4nsvYsDetENDzv04n/e7R4HYutFWt0=     0   
6   1548500401595000000  kaXlFjXCNjAgx4nsvYsDetENDzv04n/e7R4HYutFWt0=     0   
7   1548500401841000000  kaXlFjXCNjAgx4nsvYsDetENDzv04n/e7R4HYutFWt0=     0   
8   1548500401993000000  j1E2fJfiOe5TU3Fj3lC1cgA4CsKjg2NJYBcxdVcdJfk=     0   
9   1548500402096000000  kaXlFjXCNjAgx4nsvYsDetENDzv04n/e7R4HYutFWt0=     0   
10  1548500402345000000  kaXlFjXCNjAgx4nsvYsDetENDzv04n/e7R4HYutFWt0=     0   
11  1548500402601000000  kaXlFjXCNjAgx4nsvYsDetENDz

In [28]:
# Stack every frame in all_dfs into one DataFrame with a fresh 0..n-1 row index.
print("Combining CSVs into one DataFrame...")
data = pd.concat(objs=all_dfs, axis=0, ignore_index=True)
print(f"Total rows loaded: {len(data)}")

Combining CSVs into one DataFrame...
Total rows loaded: 1235841


In [6]:
# Combine the per-file frames into one video frame (`data`) and one buffer frame (`buffer_data`).
print("Combining CSVs into one DataFrame...")
# The loader only prints read errors, so either list can be empty here. Fail with a
# clear message instead of pandas' generic "No objects to concatenate".
if not all_dfs or not all_buffer_dfs:
    raise ValueError(
        f"Nothing to combine: {len(all_dfs)} video frame(s) and "
        f"{len(all_buffer_dfs)} buffer frame(s) were loaded; check the load errors above."
    )
data = pd.concat(all_dfs, ignore_index=True)
buffer_data = pd.concat(all_buffer_dfs, ignore_index=True)
# Report both frames; the old message showed only the buffer row count.
print(f"Total rows loaded: {len(data)} (data), {len(buffer_data)} (buffer_data)")

Combining CSVs into one DataFrame...
Total rows loaded: 49


In [7]:
# Show the column labels of the video frame first, then of the buffer frame.
for frame in (data, buffer_data):
    print(frame.columns)
# The 'time (ns GMT)' column is still text at this point; numeric conversion and
# dropping of null time/session_id rows is done in a separate step.

Index(['time (ns GMT)', 'session_id', 'index', 'expt_id', 'channel',
       'video_ts', 'format', 'size', 'ssim_index', 'cwnd', 'in_flight',
       'min_rtt', 'rtt', 'delivery_rate', 'source_dataset'],
      dtype='object')
Index(['time (ns GMT)', 'session_id', 'index', 'expt_id', 'channel', 'event',
       'buffer', 'cum_rebuf', 'source_dataset'],
      dtype='object')


In [8]:
TIME_COL = 'time (ns GMT)'


def _clean_merge_keys(frame):
    """Return a copy of ``frame`` with usable merge keys.

    Rows whose time value is missing or not numeric, or whose ``session_id`` is
    null, are dropped. The surviving time values are then parsed again so the
    column ends up as exact ``int64``. Coercing first and dropping afterwards
    leaves a float64 column whenever any value was bad, which loses precision on
    nanosecond timestamps and makes ``merge_asof`` fail if the two frames end up
    with different key dtypes.
    """
    parsed = pd.to_numeric(frame[TIME_COL], errors='coerce')
    valid = parsed.notna() & frame['session_id'].notna()
    cleaned = frame.loc[valid].copy()
    cleaned[TIME_COL] = pd.to_numeric(cleaned[TIME_COL], errors='raise').astype('int64')
    return cleaned


# Ensure the time column is numeric and drop rows with null time / session_id
data = _clean_merge_keys(data)
buffer_data = _clean_merge_keys(buffer_data)

# Sort both dataframes (merge_asof requires the `on` key to be sorted)
data_sorted = data.sort_values([TIME_COL, 'session_id']).reset_index(drop=True)
buffer_sorted = buffer_data.sort_values([TIME_COL, 'session_id']).reset_index(drop=True)

# For each row of `data`, attach the latest buffer row of the same session that is
# at or before its timestamp, looking back at most 10 seconds.
merged = pd.merge_asof(
    data_sorted,
    buffer_sorted,
    on=TIME_COL,
    by='session_id',
    direction='backward',
    tolerance=10**9 * 10  # 10 seconds, expressed in nanoseconds
)


In [9]:
# Inspect the result of the as-of merge of the video rows with the buffer rows.
print(merged)

          time (ns GMT)                                    session_id index_x  \
0   1548500400788000000  kaXlFjXCNjAgx4nsvYsDetENDzv04n/e7R4HYutFWt0=       0   
1   1548500401993000000  j1E2fJfiOe5TU3Fj3lC1cgA4CsKjg2NJYBcxdVcdJfk=       0   
2   1548500402853000000  kaXlFjXCNjAgx4nsvYsDetENDzv04n/e7R4HYutFWt0=       0   
3   1548500402957000000  j1E2fJfiOe5TU3Fj3lC1cgA4CsKjg2NJYBcxdVcdJfk=       0   
4   1548500404848000000  kaXlFjXCNjAgx4nsvYsDetENDzv04n/e7R4HYutFWt0=       0   
5   1548500405033000000  j1E2fJfiOe5TU3Fj3lC1cgA4CsKjg2NJYBcxdVcdJfk=       0   
6   1548500406845000000  kaXlFjXCNjAgx4nsvYsDetENDzv04n/e7R4HYutFWt0=       0   
7   1548500406994000000  j1E2fJfiOe5TU3Fj3lC1cgA4CsKjg2NJYBcxdVcdJfk=       0   
8   1548500408853000000  kaXlFjXCNjAgx4nsvYsDetENDzv04n/e7R4HYutFWt0=       0   
9   1548500409065000000  j1E2fJfiOe5TU3Fj3lC1cgA4CsKjg2NJYBcxdVcdJfk=       0   
10  1548500410853000000  kaXlFjXCNjAgx4nsvYsDetENDzv04n/e7R4HYutFWt0=       0   
11  1548500411095000000  j1E

In [10]:
import json


def _strip_line_number(text):
    """Return ``text`` without a leading ``"<digits> "`` prefix, if it has one.

    Only an all-digit first token is treated as a line number. Previously
    anything before the first space was discarded, so a line holding plain JSON
    with spaces in it (e.g. ``{"cc": "cubic"}``) was cut short and could not be
    parsed.
    """
    parts = text.split(None, 1)
    if len(parts) == 2 and parts[0].isdigit():
        return parts[1]
    return text


# Parse the 'expt_settings' file: one JSON object per non-blank line, optionally
# preceded by a line number (e.g. "1 {json}"). Objects are kept in file order.
expt_settings = []
with open('expt_settings', 'r') as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        expt_settings.append(json.loads(_strip_line_number(line)))

print(expt_settings)

[{'cc': 'cubic', 'abr': 'linear_bba', 'git_commit': '15c299ee7c9f7b5641531ac9c73a6a8c41054532'}, {'cc': 'bbr', 'abr': 'linear_bba', 'git_commit': '15c299ee7c9f7b5641531ac9c73a6a8c41054532'}, {'log_dir': '/home/puffer/puffer/src/monitoring', 'ws_port': 9361, 'channels': ['abc', 'nbc', 'cbs', 'fox', 'pbs', 'cw'], 'media_dir': '../media', 'abr_configs': {'mpc': {'dis_buf_length': 40, 'ssim_diff_coeff': 0.3, 'past_chunk_count': 5, 'rebuffer_length_coeff': 10}, 'linear_bba': {'lower_reservoir': 0.2, 'upper_reservoir': 0.8}, 'mpc_search': {'ssim_diff_coeff': 0.3, 'past_chunk_count': 5, 'rebuffer_length_coeff': 10}}, 'abr_algorithm': 'mpc', 'enable_logging': False, 'channel_configs': {'cw': {'live': False, 'audio': ['128k'], 'video': {'1280x720': [23], '1920x1080': [23]}, 'present_delay_chunk': 20}, 'abc': {'live': False, 'audio': ['64k'], 'video': {'256x144': [13], '426x240': [13, 18], '854x480': [18, 23], '1280x720': [23]}}, 'cbs': {'live': False, 'audio': ['64k'], 'video': {'256x144': [13]

In [11]:
# From here on `data` refers to the as-of merged frame, so shared columns carry _x/_y suffixes.
# NOTE(review): this rebinds `data`; re-running the earlier cleaning/merge cell after this one
# would operate on the merged frame instead of the originally loaded one.
data = merged
print(data.columns)

Index(['time (ns GMT)', 'session_id', 'index_x', 'expt_id_x', 'channel_x',
       'video_ts', 'format', 'size', 'ssim_index', 'cwnd', 'in_flight',
       'min_rtt', 'rtt', 'delivery_rate', 'source_dataset_x', 'index_y',
       'expt_id_y', 'channel_y', 'event', 'buffer', 'cum_rebuf',
       'source_dataset_y'],
      dtype='object')


In [12]:
def split_dfs(data, column='expt_id'):
    """Split ``data`` into one sub-DataFrame per distinct value of ``column``.

    Args:
        data: DataFrame to split.
        column: Name of the column to group by. Defaults to ``'expt_id'``.

    Returns:
        dict mapping each distinct value of ``column`` to the rows that carry it.
        Rows where ``column`` is null are not included (pandas ``groupby`` default).
    """
    return {group: group_df for group, group_df in data.groupby(column)}

def split_dfs_merged(data):
    """Split a merged frame by ``'expt_id_x'``, the suffixed experiment-id column.

    Returns:
        dict mapping each distinct ``'expt_id_x'`` value to its rows.
    """
    return split_dfs(data, column='expt_id_x')

# Group the merged rows by experiment id; keys are the 'expt_id_x' values.
split_dataframes = split_dfs_merged(data)

# Create base output directory
base_dir = "ABR_Separated"

# Make sure the base directory exists
os.makedirs(base_dir, exist_ok=True)

# Dictionary to keep count of files saved per ABR
abr_counts = {}
# Write each experiment's rows to ABR_Separated/<abr>/<n>.csv, where <n> counts the
# experiments seen so far for that ABR.
for key, sub_df in split_dataframes.items():
    # Experiment ids are 1-based positions in expt_settings.
    settings_idx = int(key) - 1
    # Reject ids outside the list explicitly. An id of 0 would otherwise become
    # index -1 and silently pick the LAST settings entry.
    if not 0 <= settings_idx < len(expt_settings):
        raise IndexError(
            f"expt_id {key!r} has no matching entry in expt_settings "
            f"({len(expt_settings)} entries loaded)"
        )
    abr_name = expt_settings[settings_idx]["abr"]
    abr_dir = os.path.join(base_dir, abr_name)

    # Make sure the ABR-specific directory exists
    os.makedirs(abr_dir, exist_ok=True)

    # Determine filename
    count = abr_counts.get(abr_name, 0) + 1
    abr_counts[abr_name] = count
    filename = f"{count}.csv"

    # Full path to save
    file_path = os.path.join(abr_dir, filename)
    sub_df.to_csv(file_path, index=False)

    print(f"Saved: {file_path}")

Saved: ABR_Separated/linear_bba/1.csv
Saved: ABR_Separated/robust_mpc/1.csv
