# Generate Data

In [41]:
# Re-run everything after code execution environment reset
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

np.random.seed(42)

# Dataset sizes and noise level; every shape below is derived from these.
n_train = 150
n_eval = 75
noise_std = 10  # std. dev. of the Gaussian noise added to the training targets
feature_columns = ['feature_1', 'feature_2', 'feature_3']


def target_function(X):
    """Return the noise-free target for each row of ``X``.

    The target is ``sin(x0) + exp(-x1**2) + 0.5 * x2`` where ``x0``, ``x1``
    and ``x2`` are columns 0, 1 and 2 of ``X``. Defined once so the training
    and evaluation targets cannot drift apart.
    """
    return (
        np.sin(X[:, 0]) +
        np.exp(-X[:, 1]**2) +
        0.5 * X[:, 2]
    )


# Draw all feature rows at once (normal with mean 0 and std. dev. 10).
X_all = np.random.normal(0, 10, (n_train + n_eval, 3))

# Separate training and evaluation data. The evaluation size is passed as an
# absolute row count so the split always matches n_train / n_eval, instead of
# relying on a float fraction (1/3) happening to round to the right size.
X_train, X_eval = train_test_split(X_all, test_size=n_eval, random_state=42)
assert len(X_train) == n_train and len(X_eval) == n_eval, "unexpected split sizes"

# Training targets: nonlinear signal plus Gaussian noise.
y_train = target_function(X_train) + np.random.normal(0, noise_std, n_train)

train_df = pd.DataFrame(X_train, columns=feature_columns)
train_df['target'] = y_train
train_df.to_csv("train_task1.csv", index=False)

# Evaluation targets are the noise-free signal.
y_eval = target_function(X_eval)

# Evaluation features, keyed by a 1-based id; this file carries no targets.
eval_ids = np.arange(1, n_eval + 1)
eval_df = pd.DataFrame(X_eval, columns=feature_columns)
eval_df.insert(0, 'id', eval_ids)
eval_df.to_csv("eval_task1.csv", index=False)

# Save ground-truth targets for evaluation, keyed by the same ids.
true_eval_df = pd.DataFrame({
    'id': eval_ids,
    'eval_target': y_eval
})
true_eval_df.to_csv("eval_true_targets_task1.csv", index=False)

# Evaluate Predictions

In [42]:
import pandas as pd
from sklearn.metrics import mean_squared_error

# Load predictions from the students and the ground truth.
# NOTE: the name `eval_df` is kept for compatibility; in this cell it holds the
# ground-truth targets (columns 'id' and 'eval_target').
pred_df = pd.read_csv("predictions_task1.csv")  # submitted file
eval_df = pd.read_csv("eval_true_targets_task1.csv")  # private ground truth

# Validate the submission before scoring. A plain inner merge would silently
# drop ground-truth rows that have no prediction and duplicate rows for
# repeated ids, so the MSE would be computed on the wrong set of rows.
missing_columns = {'id', 'prediction'} - set(pred_df.columns)
if missing_columns:
    raise ValueError(
        f"predictions_task1.csv is missing required column(s): {sorted(missing_columns)}"
    )

duplicate_ids = pred_df.loc[pred_df['id'].duplicated(), 'id'].unique()
if len(duplicate_ids) > 0:
    raise ValueError(
        f"predictions_task1.csv contains duplicate ids: {sorted(duplicate_ids.tolist())}"
    )

unanswered_ids = eval_df.loc[~eval_df['id'].isin(pred_df['id']), 'id']
if not unanswered_ids.empty:
    raise ValueError(
        f"predictions_task1.csv has no prediction for {len(unanswered_ids)} id(s): "
        f"{unanswered_ids.tolist()}"
    )

# Merge both dataframes on the 'id' column to align rows. Ids that appear only
# in the submission are dropped by the inner join; `validate` guards against
# duplicate ids on either side.
merged_df = pd.merge(pred_df, eval_df, on='id', validate='one_to_one')

if merged_df['prediction'].isna().any():
    raise ValueError("predictions_task1.csv contains missing prediction values")

# Compute Mean Squared Error
mse = mean_squared_error(merged_df['eval_target'], merged_df['prediction'])
print(f"Mean Squared Error: {mse:.4f}")

Mean Squared Error: 1.9821
