## Performing basic exploratory data analysis

In [3]:
import pandas as pd

df = pd.read_csv("../data/hydrogen_prices.csv")
print(df.head())
print("shape: ", df.shape)
print("columns: ", df.columns)
print(df.info())
print(df.describe())


         date  energy_cost  gov_policy_score  demand_index  hydrogen_price
0  2016-01-03      54.9671                 9        1.0435         43.1864
1  2016-01-10      48.6174                 1        1.3892         54.6180
2  2016-01-17      56.4769                10        1.1383         43.6238
3  2016-01-24      65.2303                 7        1.2572         58.0046
4  2016-01-31      47.6585                10        1.1610         38.6626
shape:  (500, 5)
columns:  Index(['date', 'energy_cost', 'gov_policy_score', 'demand_index',
       'hydrogen_price'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   date              500 non-null    object 
 1   energy_cost       500 non-null    float64
 2   gov_policy_score  500 non-null    int64  
 3   demand_index      500 non-null    float64
 4   hydrogen_price    

## Data preprocessing

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Load and preprocess data
def load_data(path):
    df = pd.read_csv(path)
    df.dropna(inplace=True)
    df.drop_duplicates(inplace=True)
    X = df.drop(columns=["date", "hydrogen_price"])
    y = df["hydrogen_price"]
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    return X_scaled, y, scaler

# Train and evaluate multiple models
def evaluate_models(X_train, y_train, X_test, y_test):
    models = {
        "Linear Regression": LinearRegression(),
        "Decision Tree": DecisionTreeRegressor(random_state=42),
        "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42)
    }

    results = []

    for name, model in models.items():
        model.fit(X_train, y_train)
        preds = model.predict(X_test)
        mae = mean_absolute_error(y_test, preds)
        rmse = mean_squared_error(y_test, preds)
        r2 = r2_score(y_test, preds)

        results.append({
            "Model": name,
            "MAE": mae,
            "RMSE": rmse,
            "R2 Score": r2
        })
    return pd.DataFrame(results).sort_values(by="RMSE")


In [5]:
X, y, scaler = load_data("../data/hydrogen_prices.csv")

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

results_df = evaluate_models(X_train, y_train, X_test, y_test)
print("\nModel Comparison Results:\n")
print(results_df.to_string(index=False))


Model Comparison Results:

            Model      MAE      RMSE  R2 Score
Linear Regression 1.564806  4.086353  0.912696
    Random Forest 1.925579  5.931477  0.873275
    Decision Tree 2.413364 11.561975  0.752980


### Model Analysis

- Linear Regression performed the best overall with the lowest MAE and RMSE, and the highest R² score. This indicates that a simple linear model is effective at capturing the relationships in the data.
- Random Forest offered reasonable performance, but slightly underperformed compared to linear regression. It may be overfitting or not benefiting as much from the dataset size or feature simplicity.
- Decision Tree performed the worst, showing signs of overfitting (low bias, high variance), which is reflected in the high RMSE and low R2 score.

#### Conclusion
Linear Regression was chosen for deployment due to its strong performance, simplicity, and interpretability.
