In [62]:
import os
import pandas as pd
from glob import glob
from tqdm import tqdm
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns

# Base path where all method folders are
METHOD = "mpc"
base_path = f'./ABR_Separated/{METHOD}/'

# Collect all merged CSVs
all_dfs = []

print("Loading CSV files...")

# Loop through each ABR method folder
path_pattern = os.path.join(base_path, '*.csv')  # Match all CSVs under each method folder
for file_path in glob(path_pattern):
    try:
        df = pd.read_csv(file_path, dtype=str)
        all_dfs.append(df)
        print(df.head())
    except Exception as e:
        print(f"Error reading {file_path}: {e}")

Loading CSV files...
         time (ns GMT)                                    session_id  \
0  1548500400373000000  fciPebAXJYTgwuEwS8Pd9wRVEfd0w3ql1/Nwzhjg2Vs=   
1  1548500401329000000  GzdpvodlEXYvK2J5jeT0ZAk1wVaDLd+wIr75A+RCCaY=   
2  1548500401728000000  QlFt5Mb2B+CQBvIjVPYTC0H94KAKcSSiDDYwd0Z2sHc=   
3  1548500402374000000  fciPebAXJYTgwuEwS8Pd9wRVEfd0w3ql1/Nwzhjg2Vs=   
4  1548500403329000000  GzdpvodlEXYvK2J5jeT0ZAk1wVaDLd+wIr75A+RCCaY=   

    video_ts_x      format_x   size_x ssim_index_x cwnd_x in_flight_x  \
0  38277619380   1280x720-20   692788     0.983757     61           0   
1  38254916700  1920x1080-22   221530     0.986023     47           0   
2  38278520280  1920x1080-22  1192372     0.984555     27           0   
3  38277799560   1280x720-20   807355     0.982235     65           0   
4  38255096880  1920x1080-22   448644     0.987116     48           0   

  min_rtt_x  rtt_x  ...   video_ts_y      format_y   size_y ssim_index_y  \
0     46798  47318  ...  382776

In [63]:
# Combine all data
print("Combining CSVs into one DataFrame...")
data = pd.concat(all_dfs, ignore_index=True)
print(f"Total rows loaded: {len(data)}")

Combining CSVs into one DataFrame...
Total rows loaded: 124482


In [64]:
# Drop columns
# All columns
# Index(['time (ns GMT)', 'session_id', 'video_ts_x', 'format_x', 'size_x',
    #    'ssim_index_x', 'cwnd_x', 'in_flight_x', 'min_rtt_x', 'rtt_x',
    #    'delivery_rate_x', 'index_y', 'expt_id_y', 'channel_y', 'event_x',
    #    'buffer_x', 'cum_rebuf_x', 'source_dataset_y', 'index_x', 'expt_id_x',
    #    'channel_x', 'event_y', 'buffer_y', 'cum_rebuf_y', 'source_dataset_x',
    #    'index_y.1', 'expt_id_y.1', 'channel_y.1', 'video_ts_y', 'format_y',
    #    'size_y', 'ssim_index_y', 'cwnd_y', 'in_flight_y', 'min_rtt_y', 'rtt_y',
    #    'delivery_rate_y', 'source_dataset_y.1'],
# Why i dropped: https://puffer.stanford.edu/data-description/
non_mpc_columns = [
    'time (ns GMT)', 'session_id', 'video_ts_x', 'ssim_index_x',
    'index_y', 'expt_id_y', 'channel_y', 'source_dataset_y', "index_x", "expt_id_x",
    "channel_x", "event_y", "buffer_y", "cum_rebuf_y", "source_dataset_x", 
    'index_y.1', 'expt_id_y.1', 'channel_y.1', 'video_ts_y', 'format_y',
    'size_y', 'ssim_index_y', 'cwnd_y', 'in_flight_y', 'min_rtt_y', 'rtt_y',
    'delivery_rate_y', 'source_dataset_y.1'
]

data = data.drop(columns=non_mpc_columns)
print(data.columns)


Index(['format_x', 'size_x', 'cwnd_x', 'in_flight_x', 'min_rtt_x', 'rtt_x',
       'delivery_rate_x', 'event_x', 'buffer_x', 'cum_rebuf_x'],
      dtype='object')


In [65]:
quality_labels = data["format_x"].unique()
print(quality_labels)

['1280x720-20' '1920x1080-22' '854x480-24' '1280x720-26' '1920x1080-24'
 '640x360-24' '1280x720-22' '1280x720-24' '854x480-26' '426x240-26'
 '854x480-22' '640x360-26']


In [66]:
# # Step 4: Map to integer class labels
# quality_to_index = {q: i for i, q in enumerate(quality_labels)}
# data['PlaybackQuality'] = data['format_x'].map(quality_to_index)

# # Step 5: Drop rows with missing PlaybackQuality
# data = data.dropna(subset=['PlaybackQuality'])
data["PlaybackQuality"] = data["format_x"]
data.drop(columns=['format_x'], inplace=True)

In [67]:
event_labels = data["event_x"].unique()
event_to_index = {q: i for i, q in enumerate(event_labels)}
data['event_x'] = data['event_x'].map(event_to_index)

print(data.head())

    size_x cwnd_x in_flight_x min_rtt_x  rtt_x delivery_rate_x  event_x  \
0   692788     61           0     46798  47318         1771511        0   
1   221530     47           0     40295  55131         1011604        0   
2  1192372     27           0     25770  29304         1274852        0   
3   807355     65           0     46798  47072         1828059        0   
4   448644     48           0     40295  50293         1123852        0   

  buffer_x cum_rebuf_x PlaybackQuality  
0   14.135       0.682     1280x720-20  
1   14.814       0.287    1920x1080-22  
2   14.591       0.115    1920x1080-22  
3    14.11       0.682     1280x720-20  
4   14.816       0.287    1920x1080-22  


In [68]:
data.head()

Unnamed: 0,size_x,cwnd_x,in_flight_x,min_rtt_x,rtt_x,delivery_rate_x,event_x,buffer_x,cum_rebuf_x,PlaybackQuality
0,692788,61,0,46798,47318,1771511,0,14.135,0.682,1280x720-20
1,221530,47,0,40295,55131,1011604,0,14.814,0.287,1920x1080-22
2,1192372,27,0,25770,29304,1274852,0,14.591,0.115,1920x1080-22
3,807355,65,0,46798,47072,1828059,0,14.11,0.682,1280x720-20
4,448644,48,0,40295,50293,1123852,0,14.816,0.287,1920x1080-22


In [69]:
# Wrap in tqdm for progress bar
for col in tqdm(data.columns, desc="Converting columns to numeric"):
    if col == 'PlaybackQuality':
        continue
    data[col] = pd.to_numeric(data[col], errors='ignore')

  data[col] = pd.to_numeric(data[col], errors='ignore')
Converting columns to numeric: 100%|██████████| 10/10 [00:00<00:00, 26.92it/s]


In [70]:
print(data.head())

    size_x  cwnd_x  in_flight_x  min_rtt_x  rtt_x  delivery_rate_x  event_x  \
0   692788      61            0      46798  47318          1771511        0   
1   221530      47            0      40295  55131          1011604        0   
2  1192372      27            0      25770  29304          1274852        0   
3   807355      65            0      46798  47072          1828059        0   
4   448644      48            0      40295  50293          1123852        0   

   buffer_x  cum_rebuf_x PlaybackQuality  
0    14.135        0.682     1280x720-20  
1    14.814        0.287    1920x1080-22  
2    14.591        0.115    1920x1080-22  
3    14.110        0.682     1280x720-20  
4    14.816        0.287    1920x1080-22  


In [71]:
# # Drop any remaining non-numeric columns
# # data = data.select_dtypes(include=['number', 'bool'])
# data = data.select_dtypes(include=['number'])
# print(data)

In [72]:
# Encode labels
le = LabelEncoder()
# data['PlaybackQuality'] = le.fit_transform(data['PlaybackQuality'])

# Separate features and labels
X = data.drop(columns=['PlaybackQuality'])
y = pd.Categorical(data['PlaybackQuality'])

In [73]:
print(y)
print(X)

['1280x720-20', '1920x1080-22', '1920x1080-22', '1280x720-20', '1920x1080-22', ..., '1920x1080-22', '1920x1080-22', '1280x720-20', '1920x1080-22', '1920x1080-22']
Length: 124482
Categories (12, object): ['1280x720-20', '1280x720-22', '1280x720-24', '1280x720-26', ..., '640x360-26', '854x480-22', '854x480-24', '854x480-26']
         size_x  cwnd_x  in_flight_x  min_rtt_x  rtt_x  delivery_rate_x  \
0        692788      61            0      46798  47318          1771511   
1        221530      47            0      40295  55131          1011604   
2       1192372      27            0      25770  29304          1274852   
3        807355      65            0      46798  47072          1828059   
4        448644      48            0      40295  50293          1123852   
...         ...     ...          ...        ...    ...              ...   
124477  1863283     498            0      37273  71829          5895927   
124478  1298872     104            0       6103  14452          6367284   


In [74]:

# Split
print("Splitting into training and test sets...")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train
print("Training Random Forest Classifier...")
clf = RandomForestClassifier(n_estimators=10, random_state=42)
clf.fit(X_train, y_train)

# Evaluate
print("Evaluating model...")
y_pred = clf.predict(X_test)

Splitting into training and test sets...
Training Random Forest Classifier...
Evaluating model...


In [80]:
print(y_pred)
print(y_test)

['1280x720-20' '1920x1080-22' '1280x720-20' ... '1920x1080-22'
 '1920x1080-22' '1920x1080-22']
['1280x720-20', '1920x1080-22', '1280x720-20', '1920x1080-22', '1280x720-22', ..., '1280x720-22', '1920x1080-22', '1920x1080-22', '1280x720-20', '1920x1080-22']
Length: 24897
Categories (12, object): ['1280x720-20', '1280x720-22', '1280x720-24', '1280x720-26', ..., '640x360-26', '854x480-22', '854x480-24', '854x480-26']


In [None]:
# # Convert the integer labels back to string labels
# y_test_str = y.categories[y_test]
# y_pred_str = y.categories[y_pred]
# # Classification report with string labels
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=y.categories))
# Confusion matrix
plt.figure(figsize=(10, 6))
sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt='d',
            xticklabels=le.classes_, yticklabels=le.classes_, cmap='Blues')
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.tight_layout()
plt.show()

Classification Report:


AttributeError: 'LabelEncoder' object has no attribute 'classes_'

In [None]:
importances = clf.feature_importances_
feat_names = X.columns

for name, imp in sorted(zip(feat_names, importances), key=lambda x: -x[1])[:20]:
    print(f"{name}: {imp:.4f}")

cum_rebuf_x: 0.2213
min_rtt_x: 0.1930
size_x: 0.1515
rtt_x: 0.1273
buffer_x: 0.1116
cwnd_x: 0.0938
delivery_rate_x: 0.0899
in_flight_x: 0.0103
event_x: 0.0013


In [None]:
from trustee import ClassificationTrustee

trustee = ClassificationTrustee(expert=clf)
trustee.fit(X_train, y_train, num_iter=50, num_stability_iter=10, samples_size=0.3, verbose=True)

dt, pruned_dt, agreement, reward = trustee.explain()
dt_y_pred = dt.predict(X_test)

from sklearn.metrics import classification_report
print("Fidelity to original model:")
print(classification_report(clf.predict(X_test), dt_y_pred))

print("Accuracy vs. true labels:")
print(classification_report(y_test, dt_y_pred))


Initializing training dataset using RandomForestClassifier(n_estimators=10, random_state=42) as expert model
Expert model score: 0.9817476539293873
Initializing Trustee outer-loop with 10 iterations
########## Outer-loop Iteration 0/10 ##########
Initializing Trustee inner-loop with 10 iterations
########## Inner-loop Iteration 0/50 ##########
Sampling 20912 points from training dataset with (69709, 69709) entries
Student model 0-0 trained with depth 35 and 2579 leaves:
Student model score: 0.28602946059451867
Student model 0-0 fidelity: 0.28602946059451867
########## Inner-loop Iteration 1/50 ##########
Sampling 20912 points from training dataset with (75983, 75983) entries
Student model 0-1 trained with depth 36 and 2496 leaves:
Student model score: 0.34710369306116023
Student model 0-1 fidelity: 0.34710369306116023
########## Inner-loop Iteration 2/50 ##########
Sampling 20912 points from training dataset with (82257, 82257) entries
Student model 0-2 trained with depth 31 and 2570 l



In [None]:
import os

# importing required libraries
# importing Scikit-learn library and datasets package
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier

from trustee.report.trust import TrustReport

OUTPUT_PATH = "out/"
REPORT_PATH = f"{OUTPUT_PATH}/report/trust_report.obj"

if os.path.exists(REPORT_PATH):
    print(f"Loading trust report from {REPORT_PATH}...")
    trust_report = TrustReport.load(REPORT_PATH)
    print("Done!")
else:
    # The trust report (can) fit and explain the classifier
    trust_report = TrustReport(
        clf,
        X=X,
        y=y,
        max_iter=5,
        num_pruning_iter=5,
        train_size=0.7,
        trustee_num_iter=10,
        trustee_num_stability_iter=5,
        trustee_sample_size=0.3,
        analyze_branches=True,
        analyze_stability=True,
        top_k=10,
        verbose=True,
        class_names=iris.target_names,
        feature_names=iris.feature_names,
        is_classify=True,
    )

print(trust_report)
trust_report.save(OUTPUT_PATH)

AttributeError: 'NoneType' object has no attribute 'shape'