In [1]:
import os
import pandas as pd
from glob import glob
from tqdm import tqdm
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns

# Dataset folders
dataset_list = ['video_sent_2025-05-19T11_2025-05-20T11']

current_dir = os.getcwd()
base_path = '' + current_dir + '/puffer_tests/'
print(base_path)
# base_path = './test_data'

# Quality labels to predict
quality_labels = ['144', '240', '360', '480', '720', '1080', '1440', '2160']

# Collect all merged CSVs
all_dfs = []
print("Loading CSV files...")
for dataset in tqdm(dataset_list, desc="Datasets"):
    file_path = os.path.join(base_path + dataset + '.csv')  # <-- match all CSVs
    
    try:
        df = pd.read_csv(file_path, dtype=str)
        print(f"Loaded {file_path} with shape {df.shape}")
        df['source_dataset'] = dataset  # Optional: keep track of source
        all_dfs.append(df)
    except Exception as e:
        print(f"Error reading {file_path}: {e}")
# # Combine all data

/Users/stevenjiang/Documents/GitHub/293N-YT-ABR-Prediction/puffer_tests/
Loading CSV files...


Datasets: 100%|██████████| 1/1 [00:07<00:00,  7.95s/it]

Loaded /Users/stevenjiang/Documents/GitHub/293N-YT-ABR-Prediction/puffer_tests/video_sent_2025-05-19T11_2025-05-20T11.csv with shape (4440577, 16)





In [2]:
print(all_dfs)

[               time (ns GMT)                                    session_id  \
0        1747652401422000000  B9FX9GRZgwn32Vqzw4tHgR2Xn+AnRPVIRTg9RvrxZC0=   
1        1747652402461000000  B9FX9GRZgwn32Vqzw4tHgR2Xn+AnRPVIRTg9RvrxZC0=   
2        1747652403339000000  B9FX9GRZgwn32Vqzw4tHgR2Xn+AnRPVIRTg9RvrxZC0=   
3        1747652404440000000  B9FX9GRZgwn32Vqzw4tHgR2Xn+AnRPVIRTg9RvrxZC0=   
4        1747652405154000000  B9FX9GRZgwn32Vqzw4tHgR2Xn+AnRPVIRTg9RvrxZC0=   
...                      ...                                           ...   
4440572  1747730064822000000  C+kmcRJUwenRw/58Sx+6hly75v3hcJxhZh8vcOM1B0o=   
4440573  1747730065759000000  C+kmcRJUwenRw/58Sx+6hly75v3hcJxhZh8vcOM1B0o=   
4440574  1747730066515000000  C+kmcRJUwenRw/58Sx+6hly75v3hcJxhZh8vcOM1B0o=   
4440575  1747730066964000000  C+kmcRJUwenRw/58Sx+6hly75v3hcJxhZh8vcOM1B0o=   
4440576  1747730068026000000  C+kmcRJUwenRw/58Sx+6hly75v3hcJxhZh8vcOM1B0o=   

        index expt_id channel     video_ts        format    si

In [3]:
# Combine all data
print("Combining CSVs into one DataFrame...")
data = pd.concat(all_dfs, ignore_index=True)
print(f"Total rows loaded: {len(data)}")

Combining CSVs into one DataFrame...
Total rows loaded: 4440577


In [4]:
import json

expt_settings = []
with open('expt_settings', 'r') as f:
    for line in f:
        # Skip line number if present
        line = line.strip()
        if not line:
            continue
        # Split off the line number if present (e.g., "1 {json}")
        parts = line.split(' ', 1)
        if len(parts) == 2:
            _, json_str = parts
        else:
            json_str = parts[0]
        obj = json.loads(json_str)
        expt_settings.append(obj)

print(expt_settings)

[{'cc': 'cubic', 'abr': 'linear_bba', 'git_commit': '15c299ee7c9f7b5641531ac9c73a6a8c41054532'}, {'cc': 'bbr', 'abr': 'linear_bba', 'git_commit': '15c299ee7c9f7b5641531ac9c73a6a8c41054532'}, {'log_dir': '/home/puffer/puffer/src/monitoring', 'ws_port': 9361, 'channels': ['abc', 'nbc', 'cbs', 'fox', 'pbs', 'cw'], 'media_dir': '../media', 'abr_configs': {'mpc': {'dis_buf_length': 40, 'ssim_diff_coeff': 0.3, 'past_chunk_count': 5, 'rebuffer_length_coeff': 10}, 'linear_bba': {'lower_reservoir': 0.2, 'upper_reservoir': 0.8}, 'mpc_search': {'ssim_diff_coeff': 0.3, 'past_chunk_count': 5, 'rebuffer_length_coeff': 10}}, 'abr_algorithm': 'mpc', 'enable_logging': False, 'channel_configs': {'cw': {'live': False, 'audio': ['128k'], 'video': {'1280x720': [23], '1920x1080': [23]}, 'present_delay_chunk': 20}, 'abc': {'live': False, 'audio': ['64k'], 'video': {'256x144': [13], '426x240': [13, 18], '854x480': [18, 23], '1280x720': [23]}}, 'cbs': {'live': False, 'audio': ['64k'], 'video': {'256x144': [13]

In [5]:
def split_dfs(data):
    # Split the dataframe based on unique values in the 'category' column
    return {group: group_df for group, group_df in data.groupby('expt_id')}


split_dataframes = split_dfs(data)

# Now split_dfs is a dictionary where the key is the category, and the value is the corresponding DataFrame
for key, sub_df in split_dataframes.items():
    print(f"Group {key}:{expt_settings[int(key)-1]}\n{sub_df}\n")

Group 2216:{'cc': 'bbr', 'abr': 'puffer_ttp', 'abr_name': 'puffer_ttp_20190202', 'abr_config': {'model_dir': '/home/puffer/puffer/models/puffer_ttp/bbr-20190202-1', 'rebuffer_length_coeff': 100}, 'git_commit': '91d93f4a01646bc17f1c8c61e5a9bd29269011f2'}
              time (ns GMT)                                    session_id  \
0       1747652401422000000  B9FX9GRZgwn32Vqzw4tHgR2Xn+AnRPVIRTg9RvrxZC0=   
1       1747652402461000000  B9FX9GRZgwn32Vqzw4tHgR2Xn+AnRPVIRTg9RvrxZC0=   
2       1747652403339000000  B9FX9GRZgwn32Vqzw4tHgR2Xn+AnRPVIRTg9RvrxZC0=   
3       1747652404440000000  B9FX9GRZgwn32Vqzw4tHgR2Xn+AnRPVIRTg9RvrxZC0=   
4       1747652405154000000  B9FX9GRZgwn32Vqzw4tHgR2Xn+AnRPVIRTg9RvrxZC0=   
...                     ...                                           ...   
566801  1747733970966000000  uuS7DIRtiniqhMNPT5YY8n72Q2NnoMP+2tajbu2XlsE=   
566802  1747733972457000000  uuS7DIRtiniqhMNPT5YY8n72Q2NnoMP+2tajbu2XlsE=   
566803  1747733977523000000  uuS7DIRtiniqhMNPT5YY8n72