In [None]:
import os
import pandas as pd
from glob import glob
from tqdm import tqdm
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns

# File stems (without the ".csv" extension) of the exports used by this notebook.
# video_size_2019-01-26T11_2019-01-27T11  (listed for reference; not in any list below)
dataset_list = ['video_sent_2019-01-26T11_2019-01-27T11']
# THIS BUFFER LIST IS NEEDED PAST 2021-06-12T11_2021-06-13T11
ssim_list = ['ssim_2019-01-26T11_2019-01-27T11']
buffer_list = ['client_buffer_2019-01-26T11_2019-01-27T11']

# The CSV files are looked up in "puffer_tests/" under the current working directory.
current_dir = os.getcwd()
base_path = f"{current_dir}/puffer_tests/"
print(base_path)
# Alternative location kept from the original notebook:
# base_path = './test_data'

# Load every CSV named in dataset_list into its own DataFrame and collect them
# in all_dfs. Every column is read as a string (dtype=str); the time column is
# converted to numbers in a later cell.
all_dfs = []
print("Loading CSV files...")
for dataset in tqdm(dataset_list, desc="Datasets"):
    # Pass directory and file name as separate components so the path is still
    # correct when base_path has no trailing separator (e.g. './test_data').
    file_path = os.path.join(base_path, dataset + '.csv')

    try:
        df = pd.read_csv(file_path, dtype=str)
        print(f"Loaded {file_path} with shape {df.shape}")
        df['source_dataset'] = dataset  # remember which file each row came from
        all_dfs.append(df)
    except Exception as e:
        # Best effort: report the failure and carry on with the remaining files.
        print(f"Error reading {file_path}: {e}")

# Load every CSV named in buffer_list into its own DataFrame and collect them
# in all_buffer_dfs. Every column is read as a string (dtype=str).
all_buffer_dfs = []
for client_buffer_dataset in tqdm(buffer_list, desc="Datasets"):
    # Pass directory and file name as separate components so the path is still
    # correct when base_path has no trailing separator (e.g. './test_data').
    file_path = os.path.join(base_path, client_buffer_dataset + '.csv')

    try:
        df = pd.read_csv(file_path, dtype=str)
        print(f"Loaded {file_path} with shape {df.shape}")
        # Tag rows with the buffer file they came from. The loop variable of
        # THIS loop must be used here; `dataset` is a leftover from the
        # video_sent loop and would label every row with the wrong file.
        df['source_dataset'] = client_buffer_dataset
        all_buffer_dfs.append(df)
    except Exception as e:
        # Best effort: report the failure and carry on with the remaining files.
        print(f"Error reading {file_path}: {e}")

c:\Users\richa\Desktop\CodingWorkspaces\293N-YT-ABR-Prediction/puffer_tests/
Loading CSV files...


Datasets: 100%|██████████| 1/1 [00:02<00:00,  2.24s/it]

Loaded c:\Users\richa\Desktop\CodingWorkspaces\293N-YT-ABR-Prediction/puffer_tests/video_sent_2019-01-26T11_2019-01-27T11.csv with shape (1235841, 14)



Datasets: 100%|██████████| 1/1 [00:06<00:00,  6.21s/it]

Loaded c:\Users\richa\Desktop\CodingWorkspaces\293N-YT-ABR-Prediction/puffer_tests/client_buffer_2019-01-26T11_2019-01-27T11.csv with shape (7277113, 8)





In [27]:
# Debug check: print the list of DataFrames loaded from buffer_list.
print(all_buffer_dfs)

[               time (ns GMT)                                    session_id  \
0        1548500400094000000  u9x7414eluleHfu/Yj/zN+kBoR0qhsLoTS3q8eY/cDw=   
1        1548500400345000000  u9x7414eluleHfu/Yj/zN+kBoR0qhsLoTS3q8eY/cDw=   
2        1548500400602000000  u9x7414eluleHfu/Yj/zN+kBoR0qhsLoTS3q8eY/cDw=   
3        1548500400848000000  u9x7414eluleHfu/Yj/zN+kBoR0qhsLoTS3q8eY/cDw=   
4        1548500401105000000  u9x7414eluleHfu/Yj/zN+kBoR0qhsLoTS3q8eY/cDw=   
...                      ...                                           ...   
7277108  1548566874154000000  4wmWwNYabo+MwE+CkLWjwA0KlIc2O/k+8x1Q7fG4FY4=   
7277109  1548567021764000000  xqm9Sp+bLsMACIc2SGWgn6XZ3SowRDBHTxDupWRVQ90=   
7277110  1548551314618000000  Yjg/HtE4DhkJ61GmeynBuIWVjsUe9fZDoXHsNTteUsY=   
7277111  1548560902805000000  bwq3AV0IGJOWoLtm9P0compYqM/qzEM2B0Eg61N1CZc=   
7277112  1548560902805000000  vL7GzThsAn5QGdnsJA4UwQNRoMPUXzDzHWndetUP4Xk=   

        index expt_id channel  event  buffer cum_rebuf  \
0   

In [28]:
# Stack the frames collected in all_dfs into one DataFrame with a fresh index.
print("Combining CSVs into one DataFrame...")
data = pd.concat(all_dfs, ignore_index=True)
print(f"Total rows loaded: {data.shape[0]}")

Combining CSVs into one DataFrame...
Total rows loaded: 1235841


In [29]:
# Stack the frames collected in all_buffer_dfs into one DataFrame with a fresh index.
print("Combining CSVs into one DataFrame...")
buffer_data = pd.concat(all_buffer_dfs, ignore_index=True)
print(f"Total rows loaded: {buffer_data.shape[0]}")

Combining CSVs into one DataFrame...
Total rows loaded: 7277113


In [30]:
# Show the column index of both frames, one per line.
print(data.columns, buffer_data.columns, sep="\n")
# An earlier draft of this cell converted 'time (ns GMT)' with errors='raise'
# and dropped rows with a null time or session_id; that conversion now lives in
# the following cell (using errors='coerce').

Index(['time (ns GMT)', 'session_id', 'index', 'expt_id', 'channel',
       'video_ts', 'format', 'size', 'ssim_index', 'cwnd', 'in_flight',
       'min_rtt', 'rtt', 'delivery_rate', 'source_dataset'],
      dtype='object')
Index(['time (ns GMT)', 'session_id', 'index', 'expt_id', 'channel', 'event',
       'buffer', 'cum_rebuf', 'source_dataset'],
      dtype='object')


In [None]:
# Attach to every video_sent row the most recent client_buffer row of the same
# session (at most 10 seconds older), using an as-of join on the timestamp.
TIME_COL = 'time (ns GMT)'


def _with_integer_time(frame):
    """Return the rows of `frame` that have a usable time and session_id.

    The time column of the result is int64. Invalid rows are filtered out
    BEFORE the conversion: `pd.to_numeric(..., errors='coerce')` on a column
    containing even one missing/invalid entry yields float64, and float64
    cannot represent nanosecond timestamps of this magnitude (> 2**53)
    exactly, which would silently shift the merge keys.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain the columns 'time (ns GMT)' and 'session_id'.

    Returns
    -------
    pandas.DataFrame
        A copy of the valid rows (original index labels kept).
    """
    # The coerced series is only used to find the valid rows.
    parsed = pd.to_numeric(frame[TIME_COL], errors='coerce')
    keep = parsed.notna() & frame['session_id'].notna()
    cleaned = frame.loc[keep].copy()
    # Every remaining value is known to be numeric, so this conversion is exact
    # for integer strings.
    cleaned[TIME_COL] = pd.to_numeric(cleaned[TIME_COL], errors='raise').astype('int64')
    return cleaned


# Drop rows with null/unparseable time or null session_id and make time numeric.
data = _with_integer_time(data)
buffer_data = _with_integer_time(buffer_data)

# merge_asof requires both sides to be sorted on the `on` key.
data_sorted = data.sort_values([TIME_COL, 'session_id']).reset_index(drop=True)
buffer_sorted = buffer_data.sort_values([TIME_COL, 'session_id']).reset_index(drop=True)

# Columns present in both frames (index, expt_id, channel, source_dataset) get
# the default _x (video_sent) / _y (client_buffer) suffixes.
merged = pd.merge_asof(
    data_sorted,
    buffer_sorted,
    on=TIME_COL,
    by='session_id',
    direction='backward',
    tolerance=10**9 * 10  # 10 seconds, expressed in nanoseconds
)


In [35]:
# Debug check: print the result of the as-of merge.
print(merged)

               time (ns GMT)                                    session_id  \
0        1548500400111000000  aWhUcIUav1X7UUjGHfMmBaGc7csK55eo+BctXUKCVFs=   
1        1548500400329000000  0FWBnkR6ClBRXGEZYDpHQ54JMC9j7UDiwm3Fl3HIPZw=   
2        1548500400332000000  c2Gkk+8HfE6AmtgjGRP+tJC4wPZq5U+KAXdJhcBBzKs=   
3        1548500400373000000  fciPebAXJYTgwuEwS8Pd9wRVEfd0w3ql1/Nwzhjg2Vs=   
4        1548500400444000000  cU1PVL6KeAMIvWujiTuJUjhvIsqhjYr24yUoNip0wl8=   
...                      ...                                           ...   
1235836  1548586799022000000  u6oPTyb9copR7Buqvetl8/KNXHUX+PSPQrixfmIdqhk=   
1235837  1548586799141000000  aWhUcIUav1X7UUjGHfMmBaGc7csK55eo+BctXUKCVFs=   
1235838  1548586799577000000  6kZS7bk9TpUTijH2AYl9W/UcFU+idsMXOnhuBfQAnBI=   
1235839  1548586799951000000  z0SP/ffaqUbDaMyxVAtPjlR7BksXU47V0XnbMPEg9IU=   
1235840  1548586799988000000  7+pBywI9ZapvhuLBGCl67y/fMKmxAHS5X/jwrEmgzrQ=   

        index_x expt_id_x channel_x     video_ts        format 

In [36]:
import json

# Parse the file 'expt_settings': one JSON value per non-blank line, optionally
# preceded by a space-separated prefix token (e.g. "1 {json}").
# NOTE(review): the prefix is discarded, and a later cell looks entries up by
# list position (expt_id - 1). That only works if the line order matches the
# experiment ids -- confirm against the file.
expt_settings = []
with open('expt_settings', 'r') as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        if line.startswith(('{', '[')):
            # No prefix present: the whole line is JSON. Splitting on the first
            # space here would cut a JSON document such as {"cc": "cubic"} in half.
            json_str = line
        else:
            # Drop the leading token; fall back to the whole line if there is
            # nothing after it.
            _prefix, _sep, remainder = line.partition(' ')
            json_str = remainder if remainder else line
        expt_settings.append(json.loads(json_str))

print(expt_settings)

[{'cc': 'cubic', 'abr': 'linear_bba', 'git_commit': '15c299ee7c9f7b5641531ac9c73a6a8c41054532'}, {'cc': 'bbr', 'abr': 'linear_bba', 'git_commit': '15c299ee7c9f7b5641531ac9c73a6a8c41054532'}, {'log_dir': '/home/puffer/puffer/src/monitoring', 'ws_port': 9361, 'channels': ['abc', 'nbc', 'cbs', 'fox', 'pbs', 'cw'], 'media_dir': '../media', 'abr_configs': {'mpc': {'dis_buf_length': 40, 'ssim_diff_coeff': 0.3, 'past_chunk_count': 5, 'rebuffer_length_coeff': 10}, 'linear_bba': {'lower_reservoir': 0.2, 'upper_reservoir': 0.8}, 'mpc_search': {'ssim_diff_coeff': 0.3, 'past_chunk_count': 5, 'rebuffer_length_coeff': 10}}, 'abr_algorithm': 'mpc', 'enable_logging': False, 'channel_configs': {'cw': {'live': False, 'audio': ['128k'], 'video': {'1280x720': [23], '1920x1080': [23]}, 'present_delay_chunk': 20}, 'abc': {'live': False, 'audio': ['64k'], 'video': {'256x144': [13], '426x240': [13, 18], '854x480': [18, 23], '1280x720': [23]}}, 'cbs': {'live': False, 'audio': ['64k'], 'video': {'256x144': [13]

In [38]:
# From this cell on, `data` refers to the merged frame produced by merge_asof.
data = merged
print(merged.columns)

Index(['time (ns GMT)', 'session_id', 'index_x', 'expt_id_x', 'channel_x',
       'video_ts', 'format', 'size', 'ssim_index', 'cwnd', 'in_flight',
       'min_rtt', 'rtt', 'delivery_rate', 'source_dataset_x', 'index_y',
       'expt_id_y', 'channel_y', 'event', 'buffer', 'cum_rebuf',
       'source_dataset_y'],
      dtype='object')


In [40]:
def split_dfs(data):
    """Split `data` into one sub-frame per distinct value of its 'expt_id' column.

    Returns a dict mapping each 'expt_id' value to the DataFrame holding the
    rows that carry that value.
    """
    per_experiment = {}
    for expt_value, rows in data.groupby('expt_id'):
        per_experiment[expt_value] = rows
    return per_experiment

def split_dfs_merged(data):
    """Split a merged frame into one sub-frame per distinct 'expt_id_x' value.

    Returns a dict mapping each 'expt_id_x' value to the DataFrame holding the
    rows that carry that value.
    """
    per_experiment = {}
    for expt_value, rows in data.groupby('expt_id_x'):
        per_experiment[expt_value] = rows
    return per_experiment

# Split the merged frame per experiment and write each part to
# ABR_Separated/<abr name>/<n>.csv, where n counts the files written for that
# ABR during this run (re-running the cell overwrites the same file names).
split_dataframes = split_dfs_merged(data)

# Create base output directory
base_dir = "ABR_Separated"
os.makedirs(base_dir, exist_ok=True)

# Number of files saved so far per ABR name
abr_counts = {}
for key, sub_df in split_dataframes.items():
    # expt_settings is indexed by position, assuming entry i describes expt_id i+1.
    position = int(key) - 1
    if not 0 <= position < len(expt_settings):
        # Without this check an expt_id <= 0 would wrap around to the END of
        # the list and silently pick the wrong experiment's settings.
        raise IndexError(
            f"expt_id {key!r} has no entry in expt_settings "
            f"({len(expt_settings)} entries loaded)"
        )
    settings = expt_settings[position]

    # At least one entry in the printed settings carries 'abr_algorithm'
    # rather than 'abr'; fall back to it when 'abr' is absent.
    abr_name = settings.get("abr", settings.get("abr_algorithm"))
    if abr_name is None:
        raise KeyError(
            f"expt_id {key!r}: settings contain neither 'abr' nor 'abr_algorithm'"
        )

    # Make sure the ABR-specific directory exists
    abr_dir = os.path.join(base_dir, abr_name)
    os.makedirs(abr_dir, exist_ok=True)

    # Determine filename: next running number for this ABR
    count = abr_counts.get(abr_name, 0) + 1
    abr_counts[abr_name] = count
    filename = f"{count}.csv"

    # Full path to save
    file_path = os.path.join(abr_dir, filename)
    sub_df.to_csv(file_path, index=False)

    print(f"Saved: {file_path}")

Saved: ABR_Separated\puffer_ttp\1.csv
Saved: ABR_Separated\puffer_ttp\2.csv
Saved: ABR_Separated\linear_bba\1.csv
Saved: ABR_Separated\linear_bba\2.csv
Saved: ABR_Separated\mpc\1.csv
Saved: ABR_Separated\mpc\2.csv
Saved: ABR_Separated\robust_mpc\1.csv
Saved: ABR_Separated\robust_mpc\2.csv
Saved: ABR_Separated\pensieve\1.csv
Saved: ABR_Separated\pensieve\2.csv
Saved: ABR_Separated\puffer_ttp\3.csv
Saved: ABR_Separated\puffer_ttp\4.csv
Saved: ABR_Separated\puffer_ttp\5.csv
Saved: ABR_Separated\puffer_ttp\6.csv
Saved: ABR_Separated\puffer_ttp\7.csv
Saved: ABR_Separated\linear_bba\3.csv
Saved: ABR_Separated\linear_bba\4.csv
Saved: ABR_Separated\mpc\3.csv
Saved: ABR_Separated\mpc\4.csv
Saved: ABR_Separated\robust_mpc\3.csv
Saved: ABR_Separated\robust_mpc\4.csv
Saved: ABR_Separated\pensieve\3.csv
Saved: ABR_Separated\pensieve\4.csv
Saved: ABR_Separated\puffer_ttp\8.csv
Saved: ABR_Separated\puffer_ttp\9.csv
Saved: ABR_Separated\puffer_ttp\10.csv
Saved: ABR_Separated\puffer_ttp\11.csv
Saved: A