Read in all data

In [1]:
import glob  # Importing the glob module to find all the files matching a pattern
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures

from utils import stratified_sample, annotate, even_train_split

# Pattern to match the data files: one metric_df.csv per data_files/user_* directory
file_pattern = "data_files/user_*/metric_df.csv"

# Dictionary mapping each user's directory name (e.g. "user_jason") to its DataFrame
all_datasets = {}

# glob returns matches in arbitrary order; sort so the load order (and therefore
# the iteration order of all_datasets) is the same on every run and platform.
for filepath in sorted(glob.glob(file_pattern)):
    # Take the name of the directory that contains the CSV. This works regardless of
    # the OS path separator and of how deep file_pattern is nested, unlike
    # splitting the string on '/' and picking a fixed position.
    user_name = Path(filepath).parent.name
    print(f"Processing {filepath} dataset...")

    # Read in data file as a pandas dataframe
    all_datasets[user_name] = pd.read_csv(filepath)

Processing data_files/user_jason/metric_df.csv dataset...
Processing data_files/user_lauren/metric_df.csv dataset...
Processing data_files/user_lizzie1/metric_df.csv dataset...
Processing data_files/user_lizzie2/metric_df.csv dataset...
Processing data_files/user_sarah1/metric_df.csv dataset...
Processing data_files/user_shreya/metric_df.csv dataset...
Processing data_files/user_sujaan/metric_df.csv dataset...
Processing data_files/user_xiao/metric_df.csv dataset...
Processing data_files/user_yutong/metric_df.csv dataset...


Modeling

In [None]:
from models import PolyRegression, GPRegression

# Set to True to print the per-latency optimal-scale tables for every fit.
# There is one set of tables per (user, n_train) pair, so the output gets long.
PRINT_OPTIMA = False


def best_scale_per_latency(frame, value_col):
    """Return the (latency, scale) rows where `value_col` is largest within each latency.

    Parameters
    ----------
    frame : pd.DataFrame
        Must contain 'latency', 'scale' and `value_col` columns.
    value_col : str
        Column to maximise within each latency group.

    Returns
    -------
    pd.DataFrame
        One row per distinct latency, with columns ['latency', 'scale'].
    """
    best_rows = frame.groupby('latency')[value_col].idxmax()
    return frame.loc[best_rows, ['latency', 'scale']]


# Result accumulators. One entry is appended per (user, n_train) fit, in iteration order.
optimal_match_rate = []
optimal_scale_errors = []

for user, data in all_datasets.items():

    # Prepare data
    X = data[['latency', 'scale']]
    # Composite score: throughput (weighted x10) minus the two error measures.
    data["performance"] = 10*data['throughput'] - data['avg_osd'] - data['avg_target_error']
    Y = data["performance"]

    # Dense inputs: every latency observed for this user crossed with a fine sweep
    # of scales (from 0.0 in steps of 0.025, up to about 1.2).
    # The grid depends only on `data`, so it is built once per user, and it has to
    # exist before the PolyRegression call below, which takes it as an argument.
    latency_range = np.array(data['latency'].unique())
    scale_range = np.arange(0.0, 1.225, 0.025)
    latency_grid, scale_grid = np.meshgrid(latency_range, scale_range)
    X_dense = np.c_[latency_grid.ravel(), scale_grid.ravel()]
    # Rounding removes float noise from np.arange.
    # NOTE(review): latencies are rounded too; if the measured latencies carry more than
    # 3 decimals, the merge on 'latency' further down will not match them -- confirm.
    X_dense = np.round(X_dense, 3)

    # Measured optimum per latency; independent of the training size.
    optimal_scale_ref = best_scale_per_latency(data, 'performance')

    n = len(data)
    n_train_values = range(2, n-2)
    for n_train in n_train_values:

        # Split into training/test sets
        X_train, X_test, Y_train, Y_test = even_train_split(data, n_train, y_metric="performance")

        # Apply model, returning predictions over original dataset and dense inputs
        Y_pred, Y_pred_test, Y_pred_dense = PolyRegression(X, X_test, X_dense, X_train, Y_train, degree = 2)

        # TODO: evaluate metrics on the test split (Y_test vs. Y_pred_test); not implemented yet.

        # Tabulate dense predictions alongside their inputs.
        dense_df = pd.DataFrame({
            'latency': X_dense[:, 0],
            'scale': X_dense[:, 1],
            'Y_pred_dense': np.asarray(Y_pred_dense).ravel(),
        })

        # Attach predictions to a per-fit copy so `data` (reused by the next split)
        # is not polluted with a column from a previous training size.
        # np.asarray(...).ravel() assigns positionally whatever container Y_pred is.
        predicted_df = data.assign(Y_pred=np.asarray(Y_pred).ravel())

        optimal_scale_dense = best_scale_per_latency(dense_df, 'Y_pred_dense')
        optimal_scale_pred = best_scale_per_latency(predicted_df, 'Y_pred')
        if PRINT_OPTIMA:
            print(optimal_scale_ref)
            print(optimal_scale_pred)
            print(optimal_scale_dense)

        # Merge the results on 'latency'
        merged_ref_pred = pd.merge(optimal_scale_ref, optimal_scale_pred,
                                   on='latency', suffixes=('_ref', '_pred'))

        merged_ref_dense = pd.merge(optimal_scale_ref, optimal_scale_dense,
                                    on='latency', suffixes=('_ref', '_dense'))

        # Count the latencies where the predicted optimal scale equals the measured one.
        matches = (merged_ref_pred['scale_ref'] == merged_ref_pred['scale_pred']).sum()
        # Mean absolute gap between the measured and the dense-grid optimal scale.
        scale_error = np.abs(merged_ref_dense['scale_ref'] - merged_ref_dense['scale_dense']).mean()

        # Normalise by the number of latencies actually compared instead of a fixed 4,
        # so the rate stays a fraction in [0, 1] for users with a different number of levels.
        optimal_match_rate.append(matches / len(merged_ref_pred))
        optimal_scale_errors.append(scale_error)

