### Imports

In [3]:
import os
import pickle
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score



### Paths Setup

In [None]:
# Set file paths according to your directory structure

# CSV file that the data-loading cell reads into a DataFrame.
processed_data_file_path = "../data/processed/exp_materials_with_jarvis_and_magpie.csv"

# If you want to use pretrained model
# When True, the model cell unpickles `checkpoints` instead of training.
use_pretrained_model = True
checkpoints = "../models/random_forest_on_exp_data.pkl"

# If you want to train your own model
# Only consulted when use_pretrained_model is False: the freshly fitted model
# is pickled to `model_name` when save_model is True.
save_model = True 
model_name = "random_forest.pkl" 

### Load Processed Data

In [None]:
# Load the processed dataset into `comp_df`.
# Fail fast when the CSV is missing: every later cell needs `comp_df`, so only
# printing a warning here would defer the failure to a confusing NameError.
if not os.path.isfile(processed_data_file_path):
    raise FileNotFoundError(
        f"Processed data file not found: {processed_data_file_path}"
    )

comp_df = pd.read_csv(processed_data_file_path)
print(f"Input data of shape {comp_df.shape}, loaded from: {processed_data_file_path}")

### Remove Non-numerical Columns

In [None]:
# Names of all columns whose dtype is neither int64 nor float64.
non_numeric_columns = list(
    comp_df.select_dtypes(exclude=['int64', 'float64']).columns
)
# Bare expression so the notebook displays the list.
non_numeric_columns

In [None]:
# Build the modelling frame by discarding the non-numeric columns found above.
final_df = comp_df.drop(columns=non_numeric_columns)
# Bare expression so the notebook displays the resulting (rows, columns).
final_df.shape

In [None]:
# Replace the index with a fresh 0..n-1 range, discarding the old one.
final_df = final_df.reset_index(drop=True)

### Data Splitting

In [None]:
# Separate the predictors from the regression target ('band_gap').
features = final_df.drop(columns=['band_gap'])
target_label = final_df['band_gap']

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target_label,
    test_size=0.2,
    random_state=42,
)

# Report the dimensions of each split.
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

### Model

In [None]:
# Obtain the regressor: either restore a pickled checkpoint or fit a new
# random forest on the training split.
if use_pretrained_model:
    # SECURITY: pickle.load can execute arbitrary code while unpickling; only
    # load checkpoint files that come from a source you trust.
    # NOTE(review): the checkpoint presumably was fitted on the same feature
    # columns as X_train/X_test; if it was trained on rows that now fall in
    # X_test, the evaluation below will be optimistic -- confirm.
    with open(checkpoints, 'rb') as file:
        rf_regressor = pickle.load(file)
else:
    rf_regressor = RandomForestRegressor(random_state=42)
    rf_regressor.fit(X_train, y_train)

    if save_model:
        # Persist the freshly trained model. `model_name` is used as given,
        # so a bare file name is written relative to the working directory.
        with open(model_name, 'wb') as file:
            pickle.dump(rf_regressor, file)

### Prediction

In [None]:
# Predict the target for the held-out test features.
y_pred = rf_regressor.predict(X_test)

### Evaluation

In [None]:
# Root mean squared error, computed as the square root of the MSE.
# The `squared=False` shortcut is avoided on purpose: that keyword was
# deprecated in scikit-learn 1.4 and removed in 1.6, whereas taking the root
# explicitly works on every version.
rmse = mean_squared_error(y_test, y_pred) ** 0.5
print("Root Mean Squared Error (RMSE):", rmse)

# Coefficient of determination (R^2) on the test split.
r2 = r2_score(y_test, y_pred)
print("R2 Score(R2):", r2)