In [1]:
import pandas as pd
import csv
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
import os
from sklearn.linear_model import RidgeCV
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import StandardScaler
from matplotlib.backends.backend_pdf import PdfPages
import numpy as np
from mpl_toolkits.mplot3d import Axes3D
import scipy.stats as stats
from sklearn.metrics import mean_absolute_error, mean_squared_error


In [2]:
# load data

#prefex = '/Users/rjing/Desktop/Machine_Learning_Nonpoint_Source_Pollution/data/water_runoff_sample/Total_Phosphorus/'
#prefex = '/Users/rjing/Desktop/Machine_Learning_Nonpoint_Source_Pollution/data/water_runoff_sample/Total_Nitrogen/'
#prefex = '/Users/rjing/Desktop/Machine_Learning_Nonpoint_Source_Pollution/data/water_runoff_sample/Nitrate_Nitrogen/'
#prefex = '/Users/rjing/Desktop/Machine_Learning_Nonpoint_Source_Pollution/data/water_runoff_sample/Dissolved_Phosphorus/'
#prefex = '/Users/rjing/Desktop/Machine_Learning_Nonpoint_Source_Pollution/data/water_runoff_sample/Particulate_Phosphorus/'
prefex = '/Users/rjing/Desktop/Machine_Learning_Nonpoint_Source_Pollution/data/water_runoff_sample/Ammoniacal_Nitrogen/'
#prefex = '/Users/rjing/Desktop/Machine_Learning_Nonpoint_Source_Pollution/data/water_leaching_sample_30cm/'

file_list = [f for f in os.listdir(prefex) if f.endswith(".csv")]
print(file_list)
    

['Water_sample_runoff_citrus_Ammoniacal_Nitrogen.csv', 'Water_sample_runoff_corn_Ammoniacal_Nitrogen.csv', 'Water_sample_runoff_rice_Ammoniacal_Nitrogen.csv', 'Water_sample_runoff_vegetable_Ammoniacal_Nitrogen.csv']


In [3]:
def QQ_Plot_Combined(files, analysistype, prefex):
    """Generates Residual and QQ plots for multiple regression models and saves them in a single figure."""
    
    dependent_columns = ['NPK', 'PK', 'NK', 'CK', 'OF']
    num_files = len(files)
    num_vars = len(dependent_columns)

    # Create a large figure (2 rows: Residual Plot & QQ Plot, columns = dependent variables)
    fig, axes = plt.subplots(nrows=2 * num_files, ncols=num_vars, figsize=(6 * num_vars, 6 * 2 * num_files))

    for file_idx, file_name in enumerate(files):
        # Load dataset
        data = pd.read_csv(prefex + file_name, encoding='latin1')

        # Define independent variables (X)
        X = data.iloc[:, 6:].drop(data.columns[7], axis=1)  # Drop second column (as per previous code)
        X = sm.add_constant(X)

        for var_idx, column in enumerate(dependent_columns):
            if column not in data:
                continue  # Skip if column is missing

            y = data[column].dropna()
            X_filtered, y_filtered = X.align(y, join="inner", axis=0)

            # Fit OLS model
            model = sm.OLS(y_filtered, X_filtered).fit()

            # Select axes for plotting
            resid_ax = axes[2 * file_idx, var_idx] if num_files > 1 else axes[0, var_idx]
            qq_ax = axes[2 * file_idx + 1, var_idx] if num_files > 1 else axes[1, var_idx]

            # **1️⃣ Residual Plot (Detects heteroscedasticity)**
            sns.residplot(x=model.fittedvalues, y=model.resid, lowess=True, ax=resid_ax, color="blue", scatter_kws={'alpha': 0.6, 's': 100, 'edgecolor': 'black'})  # Set border color to black

            resid_ax.axhline(y=0, color="red", linestyle="--", linewidth=2)
            resid_ax.set_title(f"Residual Plot ({column} - {file_name.split('.')[0].split('_')[3]})", fontsize=20)
            resid_ax.set_xlabel("Fitted Values", fontsize=16)
            resid_ax.set_ylabel("Residuals", fontsize=16)
            resid_ax.tick_params(axis='both', labelsize=14)

            # **2️⃣ QQ Plot (Normality of residuals)**
            res = stats.probplot(model.resid, dist="norm", plot=qq_ax)
            # Set marker color to green
            qq_ax.get_lines()[0].set_markerfacecolor('green')  # Set color to green
            qq_ax.get_lines()[0].set_markeredgecolor('black')
            qq_ax.get_lines()[0].set_markersize(12)  # Adjust marker size here (8 is just an example)

            qq_ax.set_title(f"QQ Plot ({column} - {file_name.split('.')[0].split('_')[3]})", fontsize=20)
            qq_ax.set_xlabel("Theoretical quantiles", fontsize=16)
            qq_ax.set_ylabel("Ordered Values", fontsize=16)
            qq_ax.tick_params(axis='both', labelsize=14)

    # Adjust layout
    plt.tight_layout()
    plt.suptitle(f"Residual & QQ Plots for {prefex.split('/')[-3]} ({analysistype})", fontsize=26, y=1.02)

    # Save to PDF
    pdf_filename = prefex + "Multiple_Linear_Regression_Results/" + "combined_resid_qq_plots" + "_" + analysistype.split(' ')[0] + "_" + analysistype.split(' ')[1] +".pdf"
    with PdfPages(pdf_filename) as pdf:
        pdf.savefig(fig, bbox_inches='tight', dpi=400)
        plt.close(fig)

    print(f"Residual & QQ plots saved to: {pdf_filename}")



def main():
    selected_files = []
    # Filter files that match the "Total_Nitrogen" condition
    for file in file_list:
        selected_files.append(file)
        # Run regression for all selected files in one call
        if selected_files:
             QQ_Plot_Combined(selected_files, file.split('.')[0].split('_')[-2] + " " + file.split('.')[0].split('_')[-1], prefex)

    print("Job done!")


if __name__=="__main__":
    main()

Residual & QQ plots saved to: /Users/rjing/Desktop/Machine_Learning_Nonpoint_Source_Pollution/data/water_runoff_sample/Ammoniacal_Nitrogen/Multiple_Linear_Regression_Results/combined_resid_qq_plots_Ammoniacal_Nitrogen.pdf
Residual & QQ plots saved to: /Users/rjing/Desktop/Machine_Learning_Nonpoint_Source_Pollution/data/water_runoff_sample/Ammoniacal_Nitrogen/Multiple_Linear_Regression_Results/combined_resid_qq_plots_Ammoniacal_Nitrogen.pdf
Residual & QQ plots saved to: /Users/rjing/Desktop/Machine_Learning_Nonpoint_Source_Pollution/data/water_runoff_sample/Ammoniacal_Nitrogen/Multiple_Linear_Regression_Results/combined_resid_qq_plots_Ammoniacal_Nitrogen.pdf
Residual & QQ plots saved to: /Users/rjing/Desktop/Machine_Learning_Nonpoint_Source_Pollution/data/water_runoff_sample/Ammoniacal_Nitrogen/Multiple_Linear_Regression_Results/combined_resid_qq_plots_Ammoniacal_Nitrogen.pdf
Job done!
