# In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pymatgen.ext.matproj import MPRester
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# ~~~~~~~~ Initializing Materials Project API key ~~~~~~~~
# The API key is a credential, so it is read from the MP_API_KEY environment
# variable instead of being committed in the source file.
mp_api_key = os.environ.get("MP_API_KEY")
if not mp_api_key:
    raise RuntimeError(
        "Set the MP_API_KEY environment variable to your Materials Project API key."
    )
mpr = MPRester(mp_api_key)

# ~~~~~~~~ Search for materials (Gallium) ~~~~~~~~
# Query the summary endpoint with elements=["Ga"]; the result is a sequence
# of documents, one per matching material.
materialUsed = mpr.materials.summary.search(elements=["Ga"])

# ~~~~~~~~ Checks to see if the materials were selected ~~~~~~~~
# Pipeline overview: build a DataFrame from the search results, clean it,
# train a random forest that predicts 'Band Gap', then report metrics and
# draw two diagnostic plots.
if not materialUsed:
    print("No materials found for Gallium.")
else:
    print(f"Number of materials found for Gallium: {len(materialUsed)}")

    # ~~~~~~~~ Convert into a Dataframe for us to use ~~~~~~~~
    # Attributes a document does not provide are stored as None (not the
    # string 'N/A') so they turn into NaN and are handled by the cleaning
    # steps below.
    dataFrame = pd.DataFrame([{
        'Material ID': material.material_id,
        'Density': getattr(material, 'density', None),
        'Band Gap': getattr(material, 'band_gap', None),
        'Switching Time': getattr(material, 'switching_time', None),
        'Leakage Current': getattr(material, 'leakage_current', None),
        'Power Consumption': getattr(material, 'power_consumption', None),
    } for material in materialUsed])

    # ~~~~~~~~ Known values for Gallium (hypothetical values) ~~~~~~~~
    known_switching_time = 2.5  # ms
    known_leakage_current = 0.3  # µA
    known_power_consumption = 15  # W

    # Coerce every numeric column, including Density and Band Gap, so that
    # any non-numeric entry becomes NaN instead of surviving as an object.
    numeric_columns = [
        'Density',
        'Band Gap',
        'Switching Time',
        'Leakage Current',
        'Power Consumption',
    ]
    for column in numeric_columns:
        dataFrame[column] = pd.to_numeric(dataFrame[column], errors='coerce')

    # ~~~~~~~~ Filling in missing values with known values for Gallium ~~~~~~~~
    # A per-column fill on the frame itself avoids chained in-place
    # assignment, which does not update the frame under pandas Copy-on-Write.
    # NOTE(review): presumably the summary documents do not expose these three
    # fields, in which case every row receives the same constant and the
    # features carry no information for the model - confirm.
    dataFrame = dataFrame.fillna(value={
        'Switching Time': known_switching_time,
        'Leakage Current': known_leakage_current,
        'Power Consumption': known_power_consumption,
    })

    # ~~~~~~~~ Remove Missing parameters ~~~~~~~~
    # .copy() gives an independent frame so the column assignments below do
    # not trigger SettingWithCopyWarning.
    dataFrame = dataFrame.dropna(subset=['Density', 'Band Gap']).copy()

    # ~~~~~~~~ Confirm that we have the data for Gallium ~~~~~~~~
    if dataFrame.empty:
        print("No data available for Gallium after cleaning.")
    elif len(dataFrame) < 2:
        # A train/test split needs at least one row on each side.
        print("Not enough data for Gallium to train and evaluate a model.")
    else:
        print("Data found for Gallium. Proceeding with predictions.")
        print(dataFrame.head())

        # ~~~~~~~~ Feature Engineering - Applying log transformation to the features ~~~~~~~~
        # log1p computes log(1 + x), which avoids log(0) issues.
        dataFrame['log_Density'] = np.log1p(dataFrame['Density'])
        dataFrame['log_Leakage Current'] = np.log1p(dataFrame['Leakage Current'])
        dataFrame['log_Power Consumption'] = np.log1p(dataFrame['Power Consumption'])

        # ~~~~~~~~ Preparing for ML ~~~~~~~~
        # Single source of truth for the feature columns (also used to label
        # the feature-importance plot).
        feature_names = ['log_Density', 'Switching Time', 'log_Leakage Current', 'log_Power Consumption']
        X = dataFrame[feature_names]
        y = dataFrame['Band Gap']

        # ~~~~~~~~ Scaling the Data ~~~~~~~~
        # Split first, then fit the scaler on the training rows only, so no
        # test-set statistics leak into training.
        X_train_raw, X_test_raw, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train_raw)
        X_test = scaler.transform(X_test_raw)
        # Kept for any later code that expects the fully scaled feature matrix.
        X_scaled = scaler.transform(X)

        # ~~~~~~~~ Random Forest Regressor ~~~~~~~~
        RFRmodel = RandomForestRegressor(n_estimators=150, max_depth=10, min_samples_split=5, random_state=42)
        RFRmodel.fit(X_train, y_train)
        y_pred = RFRmodel.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)

        # ~~~~~~~~ Performance Scoring ~~~~~~~~
        print(f"Mean Squared Error (MSE): {mse}")
        print(f"R-squared (R²) Score: {r2}")

        # ~~~~~~~~ Actual vs Predicted Graph ~~~~~~~~
        # The dashed red diagonal marks where predicted == actual.
        plt.figure(figsize=(10, 6))
        plt.scatter(y_test, y_pred, color='blue', alpha=0.7, label='Predicted vs Actual')
        plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], color='red', linestyle='--', label='Ideal Fit')
        plt.xlabel("Actual Band Gap")
        plt.ylabel("Predicted Band Gap")
        plt.title("Scatter Plot: Actual vs Predicted Band Gap")
        plt.legend()
        plt.grid(True, linestyle='--', alpha=0.7)
        plt.show()

        # ~~~~~~~~ Feature Importance Graph ~~~~~~~~
        feature_importance = RFRmodel.feature_importances_

        plt.figure(figsize=(10, 6))
        plt.barh(feature_names, feature_importance, color='skyblue')
        plt.xlabel("Feature Importance")
        plt.title("Feature Importance in Band Gap Prediction")
        plt.show()
