# Imports

In [2]:
import pandas as pd
import glob
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_absolute_error, f1_score, mean_squared_error
from sklearn.metrics import classification_report

# Load data

In [3]:
datapath = "../data/prediction_test/bg_distributions/separated_dists"
files_m1_ex1 = sorted(glob.glob(f"{datapath}/perf_data_program_*_opencl.csv"))

In [4]:
all_data = []

In [5]:
for f1 in files_m1_ex1:
    # file2 will be exactly file1 with another machine name
    f2 = f1.replace("opencl", "gogh")
    
    df_m1 = pd.read_csv(f1)
    df_m2 = pd.read_csv(f2)
    
    # Merge csv's in one df
    df = df_m1.merge(df_m2, on=["program", "run"], suffixes=("_m1", "_m2"))
    all_data.append(df)
    
# Merge all df's
df_all = pd.concat(all_data, ignore_index=True)

In [6]:
df_all.head()

Unnamed: 0,program,run,cpu-cycles_m1,instructions_m1,cache-references_m1,cache-misses_m1,duration_time_m1,cpu-cycles_m2,instructions_m2,cache-references_m2,cache-misses_m2,duration_time_m2
0,program_10,1,5446775489,8648349229,46163,8136,1550263802,1334096,1009851,94132,25379,734626
1,program_10,2,5322950843,8648416958,40260,6775,1510210810,1322786,1021682,94398,26493,707144
2,program_10,3,5373986632,8648357574,39263,7103,1509293975,1294091,1012946,91863,24226,701494
3,program_10,4,5199240656,8648223138,38553,9610,1462210298,1306165,1019565,93667,24605,731299
4,program_10,5,5263733578,8648291000,38819,7208,1476810922,1313125,1015511,93751,24595,710681


# Models

## Model 1 (sklearn LinearRegression + multiple targets)

In [24]:
# Features
X = df_all[["cpu-cycles_m1", "instructions_m1", "cache-references_m1", "cache-misses_m1", "duration_time_m1"]]

# Targets
Y = df_all[["cpu-cycles_m2", "instructions_m2", "cache-references_m2", "cache-misses_m2", "duration_time_m2"]]

In [25]:
# Normalization
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [26]:
# Split train and test -> TODO: Select test by hand to better measurement
X_train, X_test, Y_train, Y_test = train_test_split(X_scaled, Y, test_size=0.2, random_state=42)


In [27]:
# Multi-output linear regression
model = LinearRegression()
model.fit(X_train, Y_train)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [28]:
# Prediction
Y_pred = model.predict(X_test)

In [29]:
# Output
for i, col in enumerate(Y.columns):
    r2 = r2_score(Y_test.iloc[:, i], Y_pred[:, i])
    mse = mean_squared_error(Y_test.iloc[:, i], Y_pred[:, i])
    
    print(f"{col}: R²={r2:.3f}, MSE={mse:.3f}")

cpu-cycles_m2: R²=-0.010, MSE=667663750.909
instructions_m2: R²=-0.038, MSE=52485455.986
cache-references_m2: R²=-0.001, MSE=4463077.556
cache-misses_m2: R²=0.006, MSE=1621465.161
duration_time_m2: R²=-0.045, MSE=599845131.114


## Model 2 (OLS + One to One)

In [30]:
X = df_all['cpu-cycles_m1']
Y = df_all['cpu-cycles_m2']

In [31]:
# Reshape
X_reshaped = X.values.reshape(-1, 1)

# Separate train and test
X_train, X_test, y_train, y_test = train_test_split(
    X_reshaped, Y, test_size=0.2, random_state=42
)

model = LinearRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

In [32]:
r2 = r2_score(Y_test, Y_pred)
mse = mean_squared_error(Y_test, Y_pred)

In [33]:
print(r2)

-0.01773398300861222


In [34]:
print(mse)

265215776.14542943


In [1]:
import statsmodels.api as sm

In [17]:
pairs = [
    ('cpu-cycles_m1', 'cpu-cycles_m2'),
    ('cache-misses_m1', 'cache-misses_m2'),
    ('instructions_m1', 'instructions_m2'),
    ('cache-references_m1', 'cache-references_m2'),
    ('duration_time_m1', 'duration_time_m2')
]
for xcol, ycol in pairs:
    print(f"REGRESSION: {xcol} -> {ycol}")
    Y = df_all[ycol]
    X = df_all[xcol]
    X = sm.add_constant(X)
    model = sm.OLS(Y, X)
    results = model.fit()

    # Results
    print(results.summary())
    print("\n" * 2)

REGRESSION: cpu-cycles_m1 -> cpu-cycles_m2
                            OLS Regression Results                            
Dep. Variable:          cpu-cycles_m2   R-squared:                       0.001
Model:                            OLS   Adj. R-squared:                  0.000
Method:                 Least Squares   F-statistic:                     1.437
Date:                Sun, 14 Sep 2025   Prob (F-statistic):              0.231
Time:                        23:52:57   Log-Likelihood:                -11697.
No. Observations:                1000   AIC:                         2.340e+04
Df Residuals:                     998   BIC:                         2.341e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
con