In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor, AdaBoostRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from lightgbm import LGBMRegressor
from sklearn.preprocessing import StandardScaler

from tqdm import tqdm
from joblib import Parallel, delayed

### Paths Setup

In [2]:
# Set file paths according to your directory structure.
# Both paths are relative, so they are resolved against the kernel's working directory.

train_data_file_path = "../data/processed/train_dataset_m2.csv"  # CSV holding the training split
test_data_file_path = "../data/processed/test_dataset_m2.csv"  # CSV holding the test split

### Load Training and Test Data

In [16]:
# Load the training split. Fail fast when the CSV is missing: merely printing a
# message would let later cells run against an undefined (or stale) `train_data`.
if not os.path.isfile(train_data_file_path):
    raise FileNotFoundError(f"Training data file not found: {train_data_file_path}")

train_data = pd.read_csv(train_data_file_path)
print(f"Input data of shape {train_data.shape}, loaded from: {train_data_file_path}")

Input data of shape (835, 1702), loaded from: ../data/processed/train_dataset_m2.csv


In [17]:
# Load the test split. Fail fast when the CSV is missing: merely printing a
# message would let later cells run against an undefined (or stale) `test_data`.
if not os.path.isfile(test_data_file_path):
    raise FileNotFoundError(f"Test data file not found: {test_data_file_path}")

test_data = pd.read_csv(test_data_file_path)
print(f"Input data of shape {test_data.shape}, loaded from: {test_data_file_path}")

Input data of shape (188, 1702), loaded from: ../data/processed/test_dataset_m2.csv


In [18]:
# Keep the sample label and the ground-truth target for the per-sample results table.
# .copy() makes this an independent frame, so adding or renaming columns on it
# later does not trigger pandas' SettingWithCopyWarning.
results_df = test_data[['composition', 'formation_energy_per_atom']].copy()
results_df.shape

(188, 2)

In [19]:
# Remove the 'composition' column from both splits before building the feature matrices
train_data = train_data.drop(columns='composition')
test_data = test_data.drop(columns='composition')

(train_data.shape, test_data.shape)

((835, 1701), (188, 1701))

In [20]:
target_column = 'formation_energy_per_atom'

# Separate the regression target from the remaining (feature) columns in each split
y_train = train_data[target_column]
X_train = train_data.drop(columns=[target_column])

y_test = test_data[target_column]
X_test = test_data.drop(columns=[target_column])

tuple(part.shape for part in (X_train, y_train, X_test, y_test))

((835, 1700), (835,), (188, 1700), (188,))

In [21]:
# Standardize the features; the scaling statistics are learned from the
# training split only and then applied unchanged to the test split.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [22]:
# Baseline model with library-default hyperparameters.
# A fixed seed makes the fit (and the RMSE reported below) reproducible across runs.
gbr = GradientBoostingRegressor(random_state=42)

In [23]:
# Train the baseline gradient-boosting model on the standardized training features
gbr.fit(X_train_scaled, y_train)

In [24]:
# Predict the target for the standardized test features
y_pred = gbr.predict(X_test_scaled)

In [25]:
# Test-set RMSE of the baseline model. The root is taken explicitly because the
# `squared=False` argument was deprecated in scikit-learn 1.4 and removed in 1.6.
np.sqrt(mean_squared_error(y_test, y_pred))

0.18059590056703434

In [26]:
# Attach the test-set predictions. assign() returns a new frame, so no column is
# set on a frame derived from `test_data` (avoids SettingWithCopyWarning), and
# re-running the cell simply overwrites the same column.
results_df = results_df.assign(formation_energy_predicted=y_pred)

In [28]:
# Give the actual/predicted columns their report-facing headers in a single pass
results_df.rename(
    columns={
        'formation_energy_per_atom': 'formation_energy (actuall)',
        'formation_energy_predicted': 'formation_energy (predicted)',
    },
    inplace=True,
)


In [29]:
# Preview the first three rows of the per-sample results table
results_df.head(3)

Unnamed: 0,composition,formation_energy (actuall),formation_energy (predicted)
0,Pr2Se3,-2.168908,-2.213773
1,Ru2Ge3,-0.352755,-0.320662
2,Mg5B3O9F,-3.111898,-2.889308


In [30]:
# Save the per-sample actual vs. predicted table. The output directory is created
# first so the cell also works on a fresh checkout where it does not exist yet.
overall_results_path = '../outputs/deliverable_12_nov_23/m2_overall_results_on_formation_energy.csv'
os.makedirs(os.path.dirname(overall_results_path), exist_ok=True)
results_df.to_csv(overall_results_path, index=False)

### Models Comparison

In [9]:
# Candidate regressors, each paired with the label used in the results table.
# All hyperparameters are library defaults; only the seed is pinned on the
# estimators that use randomness, so the comparison can be reproduced.
RANDOM_STATE = 42

models = [
    ('Random Forest', RandomForestRegressor(random_state=RANDOM_STATE)),
    ('Gradient Boosting', GradientBoostingRegressor(random_state=RANDOM_STATE)),
    ('Linear Regression', LinearRegression()),
    ('Lasso Regression', Lasso()),
    ('Ridge Regression', Ridge()),
    ('Extra Trees', ExtraTreesRegressor(random_state=RANDOM_STATE)),
    ('LGBM Regressor', LGBMRegressor(random_state=RANDOM_STATE)),
    ('AdaBoost Regressor', AdaBoostRegressor(random_state=RANDOM_STATE))
]

In [10]:
def train_evaluate_model(name, model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and score it on the test split.

    Parameters
    ----------
    name : str
        Label for the model; returned unchanged as the first element of the row.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``.
    X_train, y_train
        Training features and targets, passed to ``model.fit``.
    X_test, y_test
        Test features (passed to ``model.predict``) and the matching true targets.

    Returns
    -------
    list
        ``[name, rmse, mae, r2, mad_mean, mad_median]`` where ``mad_mean`` is the
        mean of the absolute residuals and ``mad_median`` is the median absolute
        deviation of the residuals around their median.
    """
    model.fit(X_train, y_train)  # Train the model
    predictions = model.predict(X_test)  # Make predictions

    # RMSE: take the root explicitly. The `squared=False` argument was deprecated
    # in scikit-learn 1.4 and removed in 1.6, so relying on it breaks on newer versions.
    rmse = np.sqrt(mean_squared_error(y_test, predictions))
    mae = mean_absolute_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)

    residuals = y_test - predictions

    # Mean of absolute residuals. For an unweighted single-output target this is
    # the same quantity as `mae`; it is kept so the returned row layout is unchanged.
    mad_mean = np.mean(np.abs(residuals))

    # Median absolute deviation of the residuals around their own median
    mad_median = np.median(np.abs(residuals - np.median(residuals)))

    return [name, rmse, mae, r2, mad_mean, mad_median]

# Fit and score every candidate model in parallel worker processes.
# NOTE: tqdm wraps the task generator that joblib consumes, so the bar advances
# as jobs are handed out, not as they finish.
evaluation_jobs = (
    delayed(train_evaluate_model)(name, model, X_train_scaled, X_test_scaled, y_train, y_test)
    for name, model in tqdm(models)
)
results = Parallel(n_jobs=-1, backend='loky')(evaluation_jobs)

# Collect the per-model metric rows into a single table
results_df = pd.DataFrame(
    results,
    columns=['Model', 'RMSE', 'MAE', 'R^2 Score', 'MAD (Mean)', 'MAD (Median)'],
)


100%|█████████████████████████████████████████████| 8/8 [00:01<00:00,  5.48it/s]


In [11]:
# Rank the models by test-set RMSE, lowest (best) first
sorted_results_df = results_df.sort_values('RMSE')
sorted_results_df

Unnamed: 0,Model,RMSE,MAE,R^2 Score,MAD (Mean),MAD (Median)
1,Gradient Boosting,0.181282,0.121027,0.957929,0.121027,0.076989
6,LGBM Regressor,0.18594,0.123236,0.95574,0.123236,0.068759
5,Extra Trees,0.192667,0.1248,0.952479,0.1248,0.072815
4,Ridge Regression,0.231345,0.14996,0.931485,0.14996,0.10565
0,Random Forest,0.243662,0.153291,0.923995,0.153291,0.083528
7,AdaBoost Regressor,0.279696,0.209272,0.899852,0.209272,0.154867
3,Lasso Regression,0.918785,0.761377,-0.080678,0.761377,0.368158
2,Linear Regression,1.034181,0.525409,-0.369184,0.525409,0.259709


In [12]:
# Save the ranked model comparison. The output directory is created first so the
# cell also works on a fresh checkout where it does not exist yet.
comparison_results_path = '../outputs/models_results_on_formation_energy_per_atom.csv'
os.makedirs(os.path.dirname(comparison_results_path), exist_ok=True)
sorted_results_df.to_csv(comparison_results_path, index=False)