In [6]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import shap

# Absolute path of the Excel workbook to analyse
file_path = r'F:\python\shading\combined_data.xlsx'

# Abort early with an explicit message when the workbook is missing
input_exists = os.path.exists(file_path)
if not input_exists:
    raise FileNotFoundError(f"File not found: {file_path}")

# Workbook name without directory or extension; reused as a prefix for outputs
input_basename = os.path.basename(file_path)
excel_file_name, _input_extension = os.path.splitext(input_basename)
print(f"File name: {excel_file_name}")

# Load the workbook into a DataFrame (pandas' default sheet selection)
data = pd.read_excel(file_path)

# Workbook header -> abbreviation used throughout the rest of the analysis.
# NOTE(review): 'angel' is presumably a misspelling of 'angle' in the workbook
# header; the key must match the header exactly, so it is kept as is.
column_aliases = {
    'direction': 'WFO',
    'num': 'SDN',
    'width': 'SDD',
    'angel': 'SDA',
    'open': 'OWR',
    'eui': 'EUI',
    'da': 'sDA',
    'udi': 'UDI',
    'tcp': 'TCP',
}
data = data.rename(columns=column_aliases)

# Derive the 'WWR' column from 'height': height * 7.4 / 8.4 / 3,
# evaluated left to right
data['WWR'] = data['height'].mul(7.4).div(8.4).div(3)

# Folder that receives every figure and spreadsheet written by this script
output_folder = r'F:\python\shading\drawings\influencing\test'

# Create the folder (and any missing parents); an existing folder is fine
os.makedirs(output_folder, exist_ok=True)

# Use Times New Roman for all text in the figures
plt.rcParams.update({"font.family": "Times New Roman"})

# Model inputs (design variables) and the outputs to be predicted (targets)
design_variables = ['WFO', 'SDN', 'SDD', 'SDA', 'OWR', 'WWR']
target_variables = ['UDI', 'EUI', 'TCP', 'sDA']

# Standardise the design variables; the fitted scaler is kept so the values
# can be mapped back to their original scale later on
X = data[design_variables]
scaler = StandardScaler().fit(X)
X_scaled = pd.DataFrame(scaler.transform(X), columns=design_variables)

# Accumulator for the per-target R^2 scores, one list per output column
r2_scores = {column: [] for column in ('Target Variable', 'R^2 Score')}

def _outline_axes(ax):
    """Set every spine of *ax* to a black edge colour and a line width of 1.5."""
    for spine in ax.spines.values():
        spine.set_edgecolor('black')
        spine.set_linewidth(1.5)


def _save_shap_summary(shap_values, features, *, feature_names, plot_type, title, out_path):
    """Draw one SHAP summary plot and save it to *out_path* at 300 dpi.

    shap_values   -- SHAP values as returned by ``explainer.shap_values``
    features      -- DataFrame of feature values shown alongside the SHAP values
    feature_names -- labels passed to ``shap.summary_plot``
    plot_type     -- ``plot_type`` argument of ``shap.summary_plot`` ("dot" or "bar")
    title         -- figure title
    out_path      -- destination file of the saved figure
    """
    shap.summary_plot(shap_values, features, feature_names=feature_names, plot_type=plot_type, show=False)
    plt.title(title)
    plt.tight_layout()

    ax = plt.gca()
    # Dashed vertical guides at the major x ticks. Axes.grid follows the tick
    # locator; drawing plt.axvline at each ax.get_xticks() value instead can
    # widen the x-limits, because that list may contain ticks outside the
    # current view and axvline re-autoscales x for out-of-range positions.
    ax.grid(True, axis='x', color='grey', linestyle='--', linewidth=0.8)
    _outline_axes(ax)

    plt.savefig(out_path, dpi=300)
    plt.close()


# Train, evaluate and explain one Random Forest model per target variable
for target in target_variables:
    print(f"\nTraining model for target variable: {target}")

    # Extract target variable
    y = data[target]

    # Split into train and test sets (80 % / 20 %, fixed seed for repeatability)
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

    # Train a Random Forest model
    model = RandomForestRegressor(random_state=42)
    model.fit(X_train, y_train)

    # Predict and evaluate the model on the held-out test set
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"MSE for {target}: {mse}")
    print(f"R^2 score for {target}: {r2}")

    # Save R^2 score
    r2_scores['Target Variable'].append(target)
    r2_scores['R^2 Score'].append(r2)

    # Plot feature importance
    feature_importance = model.feature_importances_
    plt.figure(figsize=(10, 8))
    plt.barh(design_variables, feature_importance)
    plt.xlabel("Feature Importance")
    plt.title(f"Feature Importance in Random Forest Model for {target}")
    plt.tight_layout()
    _outline_axes(plt.gca())

    # Same "<workbook>_<target>_..." prefix and dpi as every other figure, so
    # runs on different workbooks do not overwrite each other's output
    plt.savefig(os.path.join(output_folder, f"{excel_file_name}_{target}_feature_importance.png"), dpi=300)
    plt.close()

    # SHAP value analysis on the test set
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X_test)

    # Inverse transform to original feature scale so plot axes show real values.
    # The resulting frame has a fresh 0..n-1 index; rows stay in X_test order.
    X_test_original = pd.DataFrame(scaler.inverse_transform(X_test), columns=design_variables)

    # SHAP dot and bar summary plots
    for plot_type, label in (("dot", "Dot"), ("bar", "Bar")):
        _save_shap_summary(
            shap_values,
            X_test_original,
            feature_names=design_variables,
            plot_type=plot_type,
            title=f"SHAP {label} Summary Plot for {target}",
            out_path=os.path.join(output_folder, f"{excel_file_name}_{target}_shap_{plot_type}_summary.png"),
        )

    # Colour the dependence plots by the actual target values. A plain array
    # is used so the pairing with X_test_original is positional, not by index.
    target_values = y_test.to_numpy()

    # SHAP dependence plots for each feature
    for column, feature in enumerate(design_variables):
        plt.figure()
        shap_values_for_feature = shap_values[:, column]

        scatter = plt.scatter(
            X_test_original[feature],
            shap_values_for_feature,
            c=target_values,
            cmap="coolwarm",
            s=20,
            edgecolor='k',
            alpha=0.7
        )

        cbar = plt.colorbar(scatter)
        cbar.set_label(f"{target}")

        plt.xlabel(feature)
        plt.ylabel(f"SHAP value for {feature}")
        plt.title(f"SHAP Dependence Plot for {feature} - Target: {target}")
        plt.tight_layout()
        # Applied after tight_layout so these margins take precedence
        plt.subplots_adjust(top=0.85, right=0.9)

        _outline_axes(plt.gca())

        plt.savefig(os.path.join(output_folder, f"{excel_file_name}_{target}_shap_dependence_{feature}.png"), dpi=300)
        plt.close()

# Write the collected R^2 scores (one row per target) to an Excel workbook
# in the output folder
r2_df = pd.DataFrame(r2_scores)
r2_excel_path = os.path.join(output_folder, f"{excel_file_name}_R2_scores.xlsx")
r2_df.to_excel(r2_excel_path, index=False)
print("R^2 scores have been saved to Excel.")


File name: combined_data

Training model for target variable: UDI
MSE for UDI: 0.5889122057211379
R^2 score for UDI: 0.9889520211272163

Training model for target variable: EUI
MSE for EUI: 0.8315416515814252
R^2 score for EUI: 0.9845818637498736

Training model for target variable: TCP
MSE for TCP: 0.2584192138534426
R^2 score for TCP: 0.9832425967249184

Training model for target variable: sDA
MSE for sDA: 10.577216364335321
R^2 score for sDA: 0.9842594972706269
R^2 scores have been saved to Excel.
