In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import mean_squared_error
import time

In [2]:
data = pd.read_csv("dfdataWeek11.csv")
X = data.drop('outcome', axis=1)
y = data['outcome']

# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Results storage
results = []

# Different dataset sizes
sizes = [1000, 10000, 100000]

# Different configurations
configs = [
    {"name": "1 hidden layer 4 nodes", "hidden_layer_sizes": (4,)},
    {"name": "2 hidden layers of 4 nodes each", "hidden_layer_sizes": (4, 4)}
]

In [3]:
for size in sizes:
    # Sample the data
    if size > len(X_scaled):
        # If requested size is larger, use sampling with replacement
        indices = np.random.choice(len(X_scaled), size, replace=True)
    else:
        indices = np.random.choice(len(X_scaled), size, replace=False)

    X_sample = X_scaled[indices]
    y_sample = y.iloc[indices]

    # Split into train/validation
    X_train, X_val, y_train, y_val = train_test_split(
        X_sample, y_sample, test_size=0.2, random_state=42
    )

    for config in configs:
        # Train model
        start_time = time.time()

        model = MLPClassifier(
            hidden_layer_sizes=config["hidden_layer_sizes"],
            max_iter=1000,
            random_state=42
        )

        model.fit(X_train, y_train)

        # Predict
        y_train_pred = model.predict(X_train)
        y_val_pred = model.predict(X_val)

        # Calculate errors
        train_error = mean_squared_error(y_train, y_train_pred)
        val_error = mean_squared_error(y_val, y_val_pred)

        execution_time = time.time() - start_time

        # Store results
        results.append({
            'Data size': size,
            'Configuration': config["name"],
            'Training error': train_error,
            'Validation error': val_error,
            'Time of execution': execution_time
        })

# Create results dataframe
results_df = pd.DataFrame(results)
print(results_df)

   Data size                    Configuration  Training error  \
0       1000           1 hidden layer 4 nodes        0.005000   
1       1000  2 hidden layers of 4 nodes each        0.002500   
2      10000           1 hidden layer 4 nodes        0.001000   
3      10000  2 hidden layers of 4 nodes each        0.000625   
4     100000           1 hidden layer 4 nodes        0.001112   
5     100000  2 hidden layers of 4 nodes each        0.000850   

   Validation error  Time of execution  
0           0.03000           3.798759  
1           0.01000           3.177382  
2           0.00200           4.109497  
3           0.00150           5.327000  
4           0.00075           9.052965  
5           0.00070          10.062551  
