In [1]:
import pandas as pd
import csv
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
import os
from sklearn.linear_model import RidgeCV
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import StandardScaler
from matplotlib.backends.backend_pdf import PdfPages
import numpy as np
from mpl_toolkits.mplot3d import Axes3D
import scipy.stats as stats
from sklearn.metrics import mean_absolute_error, mean_squared_error


In [2]:
# load data

# `prefex` is the dataset directory holding the input CSV files that the
# regression step reads. Exactly one assignment below should be active;
# switch datasets by moving the comment marker.
# The trailing '/' is required: downstream code builds file paths by plain
# string concatenation and derives the figure title from prefex.split('/').
#prefex = '/Users/rjing/Desktop/Machine_Learning_Nonpoint_Source_Pollution/data/water_runoff_sample/Total_Phosphorus/'
#prefex = '/Users/rjing/Desktop/Machine_Learning_Nonpoint_Source_Pollution/data/water_runoff_sample/Total_Nitrogen/'
#prefex = '/Users/rjing/Desktop/Machine_Learning_Nonpoint_Source_Pollution/data/water_runoff_sample/Nitrate_Nitrogen/'
#prefex = '/Users/rjing/Desktop/Machine_Learning_Nonpoint_Source_Pollution/data/water_runoff_sample/Dissolved_Phosphorus/'
#prefex = '/Users/rjing/Desktop/Machine_Learning_Nonpoint_Source_Pollution/data/water_runoff_sample/Particulate_Phosphorus/'
prefex = '/Users/rjing/Desktop/Machine_Learning_Nonpoint_Source_Pollution/data/water_runoff_sample/Ammoniacal_Nitrogen/'
#prefex = '/Users/rjing/Desktop/Machine_Learning_Nonpoint_Source_Pollution/data/water_leaching_sample_30cm/'

# os.listdir() returns entries in arbitrary order, so sort the names to keep
# the figure rows and the results table in the same order on every run.
file_list = sorted(f for f in os.listdir(prefex) if f.endswith(".csv"))
print(file_list)
    

['Water_sample_runoff_citrus_Particulate_Phosphorus.csv', 'Water_sample_runoff_corn_Particulate_Phosphorus.csv', 'Water_sample_runoff_rice_Particulate_Phosphorus.csv', 'Water_sample_runoff_vegetable_Particulate_Phosphorus.csv']


In [3]:
def _fit_and_score(X, y):
    """Fit an OLS model on an 80/20 train/test split and score it.

    Returns a tuple ``(model, y_train, metrics)`` where ``model`` is the fitted
    statsmodels results object, ``y_train`` the training targets, and
    ``metrics`` a dict with the keys 'R-squared', 'MAE', 'MSE' and 'RMSE'
    (all rounded to 3 decimals). R-squared is the value reported by the
    training fit; MAE, MSE and RMSE are computed on the held-out test rows.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    model = sm.OLS(y_train, X_train).fit()
    y_pred = model.predict(X_test)

    # Keep the unrounded MSE around: taking the square root of an already
    # rounded MSE loses precision (an MSE of 0.0004 would report RMSE 0.0).
    mse = mean_squared_error(y_test, y_pred)
    metrics = {
        'R-squared': round(model.rsquared, 3),
        'MAE': round(mean_absolute_error(y_test, y_pred), 3),
        'MSE': round(mse, 3),
        'RMSE': round(np.sqrt(mse), 3),
    }
    return model, y_train, metrics


def _plot_fit(ax, model, y_train, column, crop_label, metrics, point_color):
    """Draw the actual-vs-fitted scatter and the y = x reference line on ``ax``."""
    fitted = model.fittedvalues
    sns.scatterplot(x=fitted, y=y_train, ax=ax, label="Actual Data",
                    s=160, edgecolor='black', facecolor=point_color)
    # Fitted values plotted against themselves give the 1:1 line.
    sns.lineplot(x=fitted, y=fitted, color='red',
                 ax=ax, label="Trend Line", linewidth=3)

    ax.set_title(
        f"{column} - {crop_label}\n"
        f"(R²={metrics['R-squared']}, MAE={metrics['MAE']}, "
        f"MSE={metrics['MSE']}, RMSE={metrics['RMSE']})",
        fontsize=20,
    )
    ax.set_xlabel("Predicted Values", fontsize=18)
    ax.set_ylabel(f"Actual {column}", fontsize=18)
    ax.tick_params(axis='both', labelsize=16)
    ax.legend(fontsize=12)
    ax.grid()


def Multiple_Linear_Regression_Combined(files, analysistype, prefex, point_color='#FFA500'):
    """Run OLS regressions for several CSV files and save one combined figure.

    For every file and every dependent column present in it, an OLS model is
    fitted on an 80/20 train/test split. Each fit is drawn in one cell of a
    grid (one row per file, one column per dependent variable). The grid is
    written to a single-page PDF and the metrics to a CSV, both inside
    ``<prefex>Multiple_Linear_Regression_Results/``; the folder is created
    when it does not exist.

    Note that the plotted points and the R-squared value describe the
    training rows, whereas MAE, MSE and RMSE describe the test rows.

    Parameters
    ----------
    files : sequence of str
        CSV file names located directly under ``prefex``. The fourth
        '_'-separated token of the file stem is used as the panel label
        (e.g. 'citrus' in 'Water_sample_runoff_citrus_Particulate_Phosphorus.csv');
        shorter names fall back to the whole stem.
    analysistype : str
        Label shown in the figure title. Its first two space-separated words
        are joined with '_' to tag the output file names.
    prefex : str
        Directory containing the CSV files. It must end with '/' because
        paths are built by string concatenation and the figure title is
        taken from the third-from-last '/'-separated component.
    point_color : str, optional
        Face colour of the scatter markers. Defaults to '#FFA500'.

    Returns
    -------
    list of list
        One row per fitted model:
        ``[file, dependent variable, R-squared, MAE, MSE, RMSE]``.

    Raises
    ------
    ValueError
        If ``files`` is empty.
    """
    # presumably fertiliser treatment codes - TODO confirm against the data dictionary
    dependent_columns = ['NPK', 'PK', 'NK', 'CK', 'OF']
    num_files = len(files)
    num_vars = len(dependent_columns)

    if num_files == 0:
        # Fail fast with a clear message instead of an obscure matplotlib error.
        raise ValueError("files must contain at least one CSV file name")

    # Resolve the output locations up front so that a path problem shows up
    # before any model has been fitted.
    out_dir = prefex + "Multiple_Linear_Regression_Results/"
    os.makedirs(out_dir, exist_ok=True)
    type_tag = "_".join(analysistype.split(' ')[:2])
    pdf_filename = out_dir + "combined_regression_plots" + "_" + type_tag + ".pdf"
    csv_filename = out_dir + "combined_regression_results" + "_" + type_tag + ".csv"

    # First three '_' tokens of the third-from-last path component, e.g.
    # '.../water_runoff_sample/Total_Nitrogen/' -> 'water runoff sample'.
    # Slicing keeps this from raising on shorter paths or names.
    path_parts = prefex.split('/')
    sample_dir = path_parts[-3] if len(path_parts) >= 3 else ''
    sample_label = " ".join(sample_dir.split('_')[:3])

    results_list = []

    # squeeze=False always yields a 2-D axes array, so the single-file case
    # needs no special indexing.
    fig, axes = plt.subplots(nrows=num_files, ncols=num_vars,
                             figsize=(8 * num_vars, 8 * num_files),
                             squeeze=False)
    try:
        for file_idx, file_name in enumerate(files):
            data = pd.read_csv(prefex + file_name, encoding='latin1')

            # Predictors: every column from position 6 onward, minus the
            # second of those, plus an intercept term.
            X = data[data.columns[6:]]
            X = X.drop(X.columns[1], axis=1)
            X = sm.add_constant(X)

            stem = file_name.split('.')[0]
            stem_tokens = stem.split('_')
            crop_label = stem_tokens[3] if len(stem_tokens) > 3 else stem

            for var_idx, column in enumerate(dependent_columns):
                ax = axes[file_idx, var_idx]
                if column not in data:
                    # Hide the panel rather than leaving an empty frame.
                    ax.set_axis_off()
                    continue

                # Drop rows without a response, then keep only the matching
                # predictor rows.
                y = data[column].dropna()
                X_filtered, y_filtered = X.align(y, join="inner", axis=0)

                model, y_train, metrics = _fit_and_score(X_filtered, y_filtered)
                results_list.append({
                    'File': file_name,
                    'Dependent Variable': column,
                    **metrics,
                })
                _plot_fit(ax, model, y_train, column, crop_label, metrics, point_color)

        # Lay out the panels first; the title is then placed above them.
        fig.tight_layout()
        fig.suptitle("Scatter Plots & Trend Lines for " + " " +
                     sample_label + " " +
                     '(' + analysistype + ')',
                     fontsize=28, y=1.02)

        # Save to a single PDF file
        with PdfPages(pdf_filename) as pdf:
            pdf.savefig(fig, bbox_inches='tight', dpi=400)
    finally:
        # Release the figure even when reading, fitting or saving fails.
        plt.close(fig)

    # Save results to CSV
    results_df = pd.DataFrame(results_list)
    results_df.to_csv(csv_filename, index=False)

    return results_df.to_numpy().tolist()


def main():
    """Run the combined regression once for every CSV found in ``prefex``.

    Reads the module-level ``file_list`` and ``prefex``. The analysis label is
    built from the last two '_'-separated tokens of the last file's stem, e.g.
    'Water_sample_runoff_rice_Particulate_Phosphorus.csv' gives
    'Particulate Phosphorus'. Nothing is run when ``file_list`` is empty.
    """
    # Every listed file is used; no filtering is applied here.
    selected_files = list(file_list)

    # Call the pipeline a single time with the full list. Calling it inside
    # the loop over files refits every earlier file on each iteration and
    # keeps overwriting the same PDF/CSV outputs.
    if selected_files:
        name_tokens = selected_files[-1].split('.')[0].split('_')
        analysis_type = name_tokens[-2] + " " + name_tokens[-1]
        Multiple_Linear_Regression_Combined(selected_files, analysis_type, prefex)

    print("Job done!")


# Entry point: run the analysis when this cell or script is executed directly
# rather than imported.
if __name__ == "__main__":
    main()

Job done!
