In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.decomposition import TruncatedSVD
from sklearn.random_projection import GaussianRandomProjection
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import time

data_path = "data_processed.csv"
df = pd.read_csv(data_path)

# Preparing the dataset.
# Target: the sum of the three sub-metering columns.
# Features: every other column of the CSV (the three summed columns are
# dropped so they are not available as inputs).
X = df.drop(["Sub_metering_1", "Sub_metering_2", "Sub_metering_3"], axis=1)
y = df["Sub_metering_1"] + df["Sub_metering_2"] + df["Sub_metering_3"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Models and scalers.
lr = LinearRegression()
# Seed the forest (same seed as the split) so its metrics are reproducible
# across runs; unseeded, every refit bootstraps different samples and the
# reported scores drift between otherwise identical experiments.
rf = RandomForestRegressor(n_estimators=10, random_state=42)
minmax_scaler = MinMaxScaler()
standard_scaler = StandardScaler()

# Scaling the datasets. Each scaler is fitted on the training split only and
# then applied unchanged to the test split.
X_train_minmax = minmax_scaler.fit_transform(X_train)
X_test_minmax = minmax_scaler.transform(X_test)
X_train_standard = standard_scaler.fit_transform(X_train)
X_test_standard = standard_scaler.transform(X_test)

# Timing training and prediction for different models and scalers.
# Both lists are reused by the later benchmark sections of the notebook.
models = [lr, rf]
scalers = [minmax_scaler, standard_scaler]

for model in models:
    for scaler in scalers:
        print(f"Model: {model.__class__.__name__}, Scaler: {scaler.__class__.__name__}")

        # Apply the (already fitted) scaler before starting the clocks so the
        # reported times cover only the model's own fit/predict work and are
        # not inflated by preprocessing.
        X_train_scaled = scaler.transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        start_train = time.time()
        model.fit(X_train_scaled, y_train)
        end_train = time.time()

        start_pred = time.time()
        y_pred = model.predict(X_test_scaled)
        end_pred = time.time()

        print(f"Training Time: {end_train - start_train:.4f} seconds")
        print(f"Prediction Time: {end_pred - start_pred:.4f} seconds")
        print(f"MSE: {mean_squared_error(y_test, y_pred)}")
        print(f"MAE: {mean_absolute_error(y_test, y_pred)}")
        print(f"R2: {r2_score(y_test, y_pred)}")
        print("\n")

# Baseline run: every model is fitted and scored on the raw, unscaled features.
print("\n")
print("Without Scaling Full Data")
for model in models:
    model_name = model.__class__.__name__
    print(f"Model: {model_name}")

    # Wall-clock duration of fitting on the full training split.
    tick = time.time()
    model.fit(X_train, y_train)
    train_seconds = time.time() - tick

    # Wall-clock duration of predicting the held-out split.
    tick = time.time()
    y_pred = model.predict(X_test)
    predict_seconds = time.time() - tick

    print(f"Training Time: {train_seconds:.4f} seconds")
    print(f"Prediction Time: {predict_seconds:.4f} seconds")
    # Report the three regression metrics in a fixed order.
    for metric_name, metric_fn in (
        ("MSE", mean_squared_error),
        ("MAE", mean_absolute_error),
        ("R2", r2_score),
    ):
        print(f"{metric_name}: {metric_fn(y_test, y_pred)}")
    print("\n")

# Dimensionality reduction: Truncated SVD.
# The default solver is randomized, so the decomposition is seeded to keep the
# reduced features (and therefore the metrics below) reproducible across runs.
svd = TruncatedSVD(n_components=5, random_state=42)
# Fit the decomposition on the training split only, then project the test split.
X_train_svd = svd.fit_transform(X_train)
X_test_svd = svd.transform(X_test)

print("\n")
print("With SVD")
for model in models:
    print(f"Model: {model.__class__.__name__} with SVD")
    start_train = time.time()
    model.fit(X_train_svd, y_train)
    end_train = time.time()

    start_pred = time.time()
    y_pred = model.predict(X_test_svd)
    end_pred = time.time()

    print(f"Training Time: {end_train - start_train:.4f} seconds")
    print(f"Prediction Time: {end_pred - start_pred:.4f} seconds")
    print(f"MSE: {mean_squared_error(y_test, y_pred)}")
    print(f"MAE: {mean_absolute_error(y_test, y_pred)}")
    print(f"R2: {r2_score(y_test, y_pred)}")
    print("\n")

# Dimensionality reduction: Gaussian Random Projection.
# The projection matrix is drawn at random; seeding it makes the projected
# features (and the metrics below) repeatable from one run to the next.
random_projection = GaussianRandomProjection(n_components=5, random_state=42)
# Draw the projection from the training split, then reuse it on the test split.
X_train_jl = random_projection.fit_transform(X_train)
X_test_jl = random_projection.transform(X_test)

print("\n")
print("With JL Lemma")
for model in models:
    print(f"Model: {model.__class__.__name__} with GaussianRandomProjection")
    start_train = time.time()
    model.fit(X_train_jl, y_train)
    end_train = time.time()

    start_pred = time.time()
    y_pred = model.predict(X_test_jl)
    end_pred = time.time()

    print(f"Training Time: {end_train - start_train:.4f} seconds")
    print(f"Prediction Time: {end_pred - start_pred:.4f} seconds")
    print(f"MSE: {mean_squared_error(y_test, y_pred)}")
    print(f"MAE: {mean_absolute_error(y_test, y_pred)}")
    print(f"R2: {r2_score(y_test, y_pred)}")
    print("\n")

# Subset of data: train on 15% of the rows and evaluate on the remaining 85%.
# NOTE: this deliberately rebinds X_test / y_test to the 85% hold-out; every
# later section of the notebook that reads X_test / y_test uses this split.
X_train_subset, X_test, y_train_subset, y_test = train_test_split(X, y, test_size=0.85, random_state=42)

print("\n")
print("Subset of Data")
for model in models:
    for scaler in scalers:
        print(f"Model: {model.__class__.__name__}, Scaler: {scaler.__class__.__name__} with Subset")

        # The shared scalers were fitted on the earlier 80% training split,
        # which necessarily overlaps this 85% test split, so reusing them
        # would leak test-set statistics into the scaling. Fit a fresh scaler
        # of the same class (default settings, like the shared ones) on the
        # training subset only, and leave the shared instance untouched.
        subset_scaler = scaler.__class__()
        X_train_subset_scaled = subset_scaler.fit_transform(X_train_subset)
        X_test_scaled = subset_scaler.transform(X_test)

        # Scaling happens above, outside the timed regions, so the reported
        # times measure only the model's fit/predict work.
        start_train = time.time()
        model.fit(X_train_subset_scaled, y_train_subset)
        end_train = time.time()

        start_pred = time.time()
        y_pred = model.predict(X_test_scaled)
        end_pred = time.time()

        print(f"Training Time: {end_train - start_train:.4f} seconds")
        print(f"Prediction Time: {end_pred - start_pred:.4f} seconds")
        print(f"MSE: {mean_squared_error(y_test, y_pred)}")
        print(f"MAE: {mean_absolute_error(y_test, y_pred)}")
        print(f"R2: {r2_score(y_test, y_pred)}")
        print("\n")

# Subset baseline: fit each model on the unscaled training subset and score it
# on the unscaled hold-out currently bound to X_test / y_test.
print("\n")
print("Without Scaling on Subset")
for model in models:
    model_name = model.__class__.__name__
    print(f"Model: {model_name} with Subset")

    # Wall-clock duration of fitting on the training subset.
    fit_began = time.time()
    model.fit(X_train_subset, y_train_subset)
    fit_seconds = time.time() - fit_began

    # Wall-clock duration of predicting the hold-out rows.
    predict_began = time.time()
    y_pred = model.predict(X_test)
    predict_seconds = time.time() - predict_began

    print(f"Training Time: {fit_seconds:.4f} seconds")
    print(f"Prediction Time: {predict_seconds:.4f} seconds")
    # Report the three regression metrics in a fixed order.
    for metric_name, metric_fn in (
        ("MSE", mean_squared_error),
        ("MAE", mean_absolute_error),
        ("R2", r2_score),
    ):
        print(f"{metric_name}: {metric_fn(y_test, y_pred)}")
    print("\n")


Model: LinearRegression, Scaler: MinMaxScaler
Training Time: 0.5157 seconds
Prediction Time: 0.0297 seconds
MSE: 28.506560062833056
MAE: 3.8157741815288513
R2: 0.608606233286727


Model: LinearRegression, Scaler: StandardScaler
Training Time: 0.5806 seconds
Prediction Time: 0.0482 seconds
MSE: 28.506560062833067
MAE: 3.815774181528852
R2: 0.608606233286727


Model: RandomForestRegressor, Scaler: MinMaxScaler
Training Time: 100.4726 seconds
Prediction Time: 2.8696 seconds
MSE: 3.3870995125648498
MAE: 0.8344701592985116
R2: 0.953495278506652


Model: RandomForestRegressor, Scaler: StandardScaler
Training Time: 104.6572 seconds
Prediction Time: 3.2690 seconds
MSE: 3.380112253832181
MAE: 0.8360453719796662
R2: 0.9535912132496849




Without Scaling Full Data
Model: LinearRegression
Training Time: 0.4315 seconds
Prediction Time: 0.0234 seconds
MSE: 28.5065600628331
MAE: 3.815774181528856
R2: 0.6086062332867265


Model: RandomForestRegressor
Training Time: 103.5325 seconds
Prediction Time: 2

In [3]:
# Dimensionality reduction: Gaussian Random Projection on subset of data.
# Relies on X_train_subset / y_train_subset and on the X_test / y_test left in
# scope by the subset split made in the previous cell.
# The projection matrix is random, so it is seeded to make this cell's metrics
# reproducible when the cell is re-run.
random_projection = GaussianRandomProjection(n_components=5, random_state=42)
# Draw the projection from the training subset, then reuse it on the hold-out.
X_train_subset_jl = random_projection.fit_transform(X_train_subset)
X_test_jl = random_projection.transform(X_test)

for model in models:
    print(f"Model: {model.__class__.__name__} with GaussianRandomProjection on Subset")
    start_train = time.time()
    model.fit(X_train_subset_jl, y_train_subset)
    end_train = time.time()

    start_pred = time.time()
    y_pred = model.predict(X_test_jl)
    end_pred = time.time()

    print(f"Training Time: {end_train - start_train:.4f} seconds")
    print(f"Prediction Time: {end_pred - start_pred:.4f} seconds")
    print(f"MSE: {mean_squared_error(y_test, y_pred)}")
    print(f"MAE: {mean_absolute_error(y_test, y_pred)}")
    print(f"R2: {r2_score(y_test, y_pred)}")
    print("\n")


Model: LinearRegression with GaussianRandomProjection on Subset
Training Time: 0.0654 seconds
Prediction Time: 0.0220 seconds
MSE: 33.74026755290015
MAE: 4.2814859686379805
R2: 0.5363350006437868


Model: RandomForestRegressor with GaussianRandomProjection on Subset
Training Time: 30.4056 seconds
Prediction Time: 7.5581 seconds
MSE: 21.566802721088433
MAE: 2.6351582976347285
R2: 0.7036250067042069




In [4]:
# Dimensionality reduction: Truncated SVD on subset of data.
# Relies on X_train_subset / y_train_subset and on the X_test / y_test left in
# scope by the subset split made in an earlier cell.
# The default solver is randomized, so the decomposition is seeded to make
# this cell's metrics reproducible when the cell is re-run.
svd = TruncatedSVD(n_components=5, random_state=42)
# Fit the decomposition on the training subset, then project the hold-out.
X_train_subset_svd = svd.fit_transform(X_train_subset)
X_test_svd = svd.transform(X_test)

for model in models:
    print(f"Model: {model.__class__.__name__} with TruncatedSVD on Subset")
    start_train = time.time()
    model.fit(X_train_subset_svd, y_train_subset)
    end_train = time.time()

    start_pred = time.time()
    y_pred = model.predict(X_test_svd)
    end_pred = time.time()

    print(f"Training Time: {end_train - start_train:.4f} seconds")
    print(f"Prediction Time: {end_pred - start_pred:.4f} seconds")
    print(f"MSE: {mean_squared_error(y_test, y_pred)}")
    print(f"MAE: {mean_absolute_error(y_test, y_pred)}")
    print(f"R2: {r2_score(y_test, y_pred)}")
    print("\n")


Model: LinearRegression with TruncatedSVD on Subset
Training Time: 0.0400 seconds
Prediction Time: 0.0160 seconds
MSE: 38.110678966209306
MAE: 4.808858432023668
R2: 0.47627599838598755


Model: RandomForestRegressor with TruncatedSVD on Subset
Training Time: 32.0007 seconds
Prediction Time: 8.4972 seconds
MSE: 15.216249032970623
MAE: 2.120463509938419
R2: 0.7908954904695215


