In [21]:
import os
import pandas as pd
from glob import glob
from tqdm import tqdm
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns
import json

In [22]:
import os
import pandas as pd
import glob

# Root folder of the Puffer CSV exports, resolved against the directory the
# notebook kernel was started in.
current_dir = os.getcwd()
base_path = os.path.join(current_dir, 'puffer_tests')

# Function to load and concatenate all CSVs in a folder
def load_all_csvs(folder_path, source_label):
    """Read every ``*.csv`` directly inside ``folder_path`` into one DataFrame.

    Every column is read as a string (``dtype=str``) and a ``source_dataset``
    column holding ``source_label`` is added to each loaded frame. Files are
    processed in sorted path order so the row order of the result is stable
    across runs and platforms.

    Parameters
    ----------
    folder_path : str
        Directory to scan (non-recursively) for ``*.csv`` files.
    source_label : str
        Value written to the ``source_dataset`` column of every row.

    Returns
    -------
    pandas.DataFrame
        All successfully read files concatenated with a fresh 0..n-1 index,
        or an empty DataFrame when nothing could be loaded. Files that fail
        to load are reported on stdout and skipped.
    """
    # Import the module under a private alias. Other cells of this notebook
    # bind the global name `glob` to the *function* (`from glob import glob`),
    # so relying on the global `glob.glob` breaks depending on cell run order.
    import glob as _glob

    # Escape the folder part so characters such as '[' in the path are matched
    # literally; only the '*.csv' tail is treated as a pattern.
    pattern = os.path.join(_glob.escape(folder_path), '*.csv')
    # glob returns files in arbitrary (OS-dependent) order; sort for determinism.
    csv_files = sorted(_glob.glob(pattern))
    df_list = []

    for file_path in csv_files:
        try:
            df = pd.read_csv(file_path, dtype=str)
            df['source_dataset'] = source_label
            df_list.append(df)
            print(f"Loaded {file_path} with shape {df.shape}")
        except Exception as e:
            # Best effort: report the unreadable file and keep loading the rest.
            print(f"Error reading {file_path}: {e}")

    if not df_list:
        return pd.DataFrame()  # Empty DataFrame if nothing loaded
    return pd.concat(df_list, ignore_index=True)

# Each dataset lives in its own sub-folder of base_path
video_sent_path = os.path.join(base_path, 'video_sent')
client_buffer_path = os.path.join(base_path, 'client_buffer')

# One concatenated frame per dataset, each row tagged with the dataset name
data = load_all_csvs(video_sent_path, 'video_sent')
buffer_data = load_all_csvs(client_buffer_path, 'client_buffer')

Loaded c:\Users\richa\Desktop\CodingWorkspaces\bruh\293N-YT-ABR-Prediction\puffer_tests\video_sent\video_sent_2019-01-26T11_2019-01-27T11.csv with shape (49, 15)
Loaded c:\Users\richa\Desktop\CodingWorkspaces\bruh\293N-YT-ABR-Prediction\puffer_tests\video_sent\video_sent_2019-01-27T11_2019-01-28T11.csv with shape (1634093, 15)
Loaded c:\Users\richa\Desktop\CodingWorkspaces\bruh\293N-YT-ABR-Prediction\puffer_tests\video_sent\video_sent_2019-01-28T11_2019-01-29T11.csv with shape (936947, 15)
Loaded c:\Users\richa\Desktop\CodingWorkspaces\bruh\293N-YT-ABR-Prediction\puffer_tests\video_sent\video_sent_2019-01-29T11_2019-01-30T11.csv with shape (682724, 15)
Loaded c:\Users\richa\Desktop\CodingWorkspaces\bruh\293N-YT-ABR-Prediction\puffer_tests\video_sent\video_sent_2019-01-30T11_2019-01-31T11.csv with shape (761778, 15)
Loaded c:\Users\richa\Desktop\CodingWorkspaces\bruh\293N-YT-ABR-Prediction\puffer_tests\video_sent\video_sent_2019-01-31T11_2019-02-01T11.csv with shape (496127, 15)
Loaded 

In [23]:
# Name of the timestamp column shared by both datasets
TIME_COL = 'time (ns GMT)'


def _with_valid_keys(frame):
    """Return the rows of ``frame`` that have a numeric time and a session id.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain the ``'time (ns GMT)'`` and ``'session_id'`` columns.

    Returns
    -------
    pandas.DataFrame
        A copy of the rows whose time value parses as a number and whose
        ``session_id`` is not null, with the time column converted to a
        numeric dtype. The original index labels are kept.
    """
    # First pass only decides which rows are usable. With errors='coerce' a
    # single bad value turns the whole column into float64, which cannot
    # represent nanosecond timestamps of this magnitude exactly.
    parsed = pd.to_numeric(frame[TIME_COL], errors='coerce')
    keep = parsed.notna() & frame['session_id'].notna()
    cleaned = frame.loc[keep].copy()
    # Second pass sees only valid values, so integer timestamps stay int64 and
    # both frames end up with the same key dtype for merge_asof.
    cleaned[TIME_COL] = pd.to_numeric(cleaned[TIME_COL])
    return cleaned


# Ensure the time column is numeric and drop rows with null time / session_id
data = _with_valid_keys(data)
buffer_data = _with_valid_keys(buffer_data)

# Sort both dataframes properly
data_sorted = data.sort_values([TIME_COL, 'session_id']).reset_index(drop=True)
buffer_sorted = buffer_data.sort_values([TIME_COL, 'session_id']).reset_index(drop=True)

In [24]:
# As-of join: each row of data_sorted is paired with the most recent row of
# buffer_sorted that has the same session_id and a timestamp at or before it,
# provided that row is at most 10 seconds older. merge_asof requires both
# inputs to be sorted on the `on` key.
merged = pd.merge_asof(
    left=data_sorted,
    right=buffer_sorted,
    on='time (ns GMT)',
    by='session_id',
    tolerance=10 * 10**9,  # 10 seconds, in nanoseconds
    direction='backward',
)

print(merged)

                time (ns GMT)                                    session_id  \
0         1548500400788000000  kaXlFjXCNjAgx4nsvYsDetENDzv04n/e7R4HYutFWt0=   
1         1548500401993000000  j1E2fJfiOe5TU3Fj3lC1cgA4CsKjg2NJYBcxdVcdJfk=   
2         1548500402853000000  kaXlFjXCNjAgx4nsvYsDetENDzv04n/e7R4HYutFWt0=   
3         1548500402957000000  j1E2fJfiOe5TU3Fj3lC1cgA4CsKjg2NJYBcxdVcdJfk=   
4         1548500404848000000  kaXlFjXCNjAgx4nsvYsDetENDzv04n/e7R4HYutFWt0=   
...                       ...                                           ...   
11897054  1549796398277000000  XWTlvZkd85BqdqcXwyqVf6IY7F36iSIT5fyZbF8OKrc=   
11897055  1549796398504000000  E1Bxmt1OoguWfV4KRmIWTHtDxuD/gDhoa93gf1GczaE=   
11897056  1549796399405000000  WE7K4AsaLAlnlA/bvZt82D19ywRaGdzQJajYlkzsP4g=   
11897057  1549796399520000000  hGBfytiLEcfvK4VprTgO/26q3iLMIl9TzcPgXCixQlo=   
11897058  1549796399613000000  J7QeGfG3JGbFvuJTbUbYE1OXFHcVbPhvFrmnq1qiEok=   

         index_x expt_id_x channel_x     video_ts  

In [25]:
# Parse the experiment settings file: one JSON document per non-blank line,
# optionally preceded by a line number and a space (e.g. "1 {json}").
# The parsed objects are collected in file order.
expt_settings = []
with open('expt_settings', 'r') as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        if line.startswith(('{', '[')):
            # No prefix: the whole line is the JSON document. Splitting on the
            # first space here would cut it in half whenever it contains one.
            json_str = line
        else:
            # Drop everything up to the first space (the line-number prefix);
            # a line without any space is handed to the parser unchanged.
            json_str = line.split(' ', 1)[-1]
        expt_settings.append(json.loads(json_str))

In [26]:
# From this cell on, `data` refers to the merged frame rather than the raw
# video_sent rows; list its columns to see the _x / _y suffixed names.
data = merged
print(data.columns)

Index(['time (ns GMT)', 'session_id', 'index_x', 'expt_id_x', 'channel_x',
       'video_ts', 'format', 'size', 'ssim_index', 'cwnd', 'in_flight',
       'min_rtt', 'rtt', 'delivery_rate', 'source_dataset_x', 'index_y',
       'expt_id_y', 'channel_y', 'event', 'buffer', 'cum_rebuf',
       'source_dataset_y'],
      dtype='object')


In [27]:
def split_dfs(data):
    """Split ``data`` into one DataFrame per distinct ``expt_id`` value.

    Returns a dict mapping each ``expt_id`` value to the rows that carry it.
    """
    frames_by_expt = {}
    for expt_id, expt_rows in data.groupby('expt_id'):
        frames_by_expt[expt_id] = expt_rows
    return frames_by_expt

def split_dfs_merged(data):
    """Split a merged frame into one DataFrame per distinct ``expt_id_x`` value.

    Returns a dict mapping each ``expt_id_x`` value to the rows that carry it.
    """
    frames_by_expt = {}
    for expt_id, expt_rows in data.groupby('expt_id_x'):
        frames_by_expt[expt_id] = expt_rows
    return frames_by_expt

split_dataframes = split_dfs_merged(data)

# Root folder for the export; one sub-folder per ABR name is created below it.
base_dir = "ABR_Separated"
os.makedirs(base_dir, exist_ok=True)

# Number of files written so far for each ABR name
abr_counts = {}

# split_dataframes maps each expt_id_x value to the rows of that experiment
for key, sub_df in split_dataframes.items():
    # Look up the ABR name by position in the settings list.
    # NOTE(review): assumes entry i of expt_settings describes experiment
    # i + 1 — confirm against the settings file.
    abr_name = expt_settings[int(key) - 1]["abr"]

    abr_dir = os.path.join(base_dir, abr_name)
    os.makedirs(abr_dir, exist_ok=True)

    # Files are numbered 1, 2, 3, ... within each ABR folder
    abr_counts[abr_name] = abr_counts.get(abr_name, 0) + 1
    file_path = os.path.join(abr_dir, f"{abr_counts[abr_name]}.csv")
    sub_df.to_csv(file_path, index=False)

    print(f"Saved: {file_path}")

Saved: ABR_Separated\linear_bba\1.csv
Saved: ABR_Separated\robust_mpc\1.csv
Saved: ABR_Separated\puffer_ttp\1.csv
Saved: ABR_Separated\puffer_ttp\2.csv
Saved: ABR_Separated\puffer_ttp\3.csv
Saved: ABR_Separated\puffer_ttp\4.csv
Saved: ABR_Separated\linear_bba\2.csv
Saved: ABR_Separated\linear_bba\3.csv
Saved: ABR_Separated\mpc\1.csv
Saved: ABR_Separated\mpc\2.csv
Saved: ABR_Separated\robust_mpc\2.csv
Saved: ABR_Separated\robust_mpc\3.csv
Saved: ABR_Separated\pensieve\1.csv
Saved: ABR_Separated\pensieve\2.csv
Saved: ABR_Separated\puffer_ttp\5.csv
Saved: ABR_Separated\puffer_ttp\6.csv
Saved: ABR_Separated\puffer_ttp\7.csv
Saved: ABR_Separated\puffer_ttp\8.csv
Saved: ABR_Separated\puffer_ttp\9.csv
Saved: ABR_Separated\puffer_ttp\10.csv
Saved: ABR_Separated\linear_bba\4.csv
Saved: ABR_Separated\linear_bba\5.csv
Saved: ABR_Separated\mpc\3.csv
Saved: ABR_Separated\mpc\4.csv
Saved: ABR_Separated\robust_mpc\4.csv
Saved: ABR_Separated\robust_mpc\5.csv
Saved: ABR_Separated\pensieve\3.csv
Saved: 

In [28]:
from glob import glob
# ABR method whose exported CSVs are loaded, and the folder that holds them
METHOD = "pensieve"
base_path = f'./ABR_Separated/{METHOD}/'

# Collect all merged CSVs
all_dfs = []

print("Loading CSV files...")

# Match every CSV directly inside the selected method's folder. glob returns
# paths in arbitrary (OS-dependent) order, so sort them: the row order of the
# combined frame then stays the same from run to run, which matters for any
# later step that keeps only the first rows of a group.
path_pattern = os.path.join(base_path, '*.csv')
for file_path in sorted(glob(path_pattern)):
    try:
        df = pd.read_csv(file_path, dtype=str)
        all_dfs.append(df)
    except Exception as e:
        # Best effort: report the unreadable file and keep loading the rest.
        print(f"Error reading {file_path}: {e}")

# Make an empty load visible here instead of failing later when concatenating.
if not all_dfs:
    print(f"Warning: no CSV files loaded from {path_pattern}")

Loading CSV files...


In [29]:
# Stack the per-file frames into one frame with a fresh 0..n-1 index
print("Combining CSVs into one DataFrame...")
data = pd.concat(all_dfs, ignore_index=True)
print(f"Total rows loaded: {data.shape[0]}")

Combining CSVs into one DataFrame...
Total rows loaded: 2052559


In [30]:
# Number of rows per distinct value of the "format" column
format_counts = data["format"].value_counts()
print(format_counts)


1920x1080-22    1137948
1280x720-20      599289
640x360-24        78306
1280x720-26       49695
426x240-26        32229
1280x720-24       31270
1920x1080-24      23083
640x360-26        21544
1280x720-22       19935
854x480-24        18919
854x480-26        14833
854x480-22         6387
Name: format, dtype: int64


In [None]:
# Upper bound on the number of rows kept for any single "format" value
MAX_ROWS_PER_FORMAT = 25000

# Cap each "format" group at MAX_ROWS_PER_FORMAT rows by keeping its first
# rows in current row order (this is a truncation, not a random sample).
# head(n) already returns the whole group when it has n rows or fewer, so no
# separate length check is needed.
downsampled_data = (
    data.groupby("format", group_keys=False)
        .apply(lambda x: x.head(MAX_ROWS_PER_FORMAT))
)

# Save to CSV
downsampled_data.to_csv("downsampled_format_data.csv", index=False)

print("Saved downsampled dataset to downsampled_format_data.csv")

Saved downsampled dataset to downsampled_format_data.csv
