<a href="https://colab.research.google.com/github/handochan/Data-Analysis/blob/main/ParameterScoring_iniVer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from statsmodels.formula.api import ols
from statsmodels.api import OLS, add_constant
from statsmodels.tools.tools import add_constant
from scipy.stats import pearsonr
from scipy.stats import linregress
from sklearn.utils import resample

# Helper function for mean handling missing values
def meanXNA(series):
    """Return the mean of *series* after discarding its missing values."""
    non_missing = series.dropna()
    return non_missing.mean()

# Error grade function
def error_grade(ind_name):
    """
    Build the one-row placeholder result used when no grade can be computed.

    Parameters:
        ind_name: Value stored in the "X" column.

    Returns:
        pd.DataFrame: A single row with every p-value set to 1, every slope
        and the grade set to 0, and an empty "Class".
    """
    column_names = [
        "X", "pval", "slope", "pval.L", "slope.L",
        "pval.WL", "slope.WL", "Class", "Grade",
    ]
    row_values = [ind_name, 1, 0, 1, 0, 1, 0, "", 0]
    # Each value is wrapped in a list so the frame has exactly one row.
    return pd.DataFrame({name: [value] for name, value in zip(column_names, row_values)})


def correlation(x, y):
    """
    Perform a simple linear regression of y on x and return key metrics.

    Pairs in which either value is missing are dropped before fitting.

    Parameters:
        x (array-like): Independent variable.
        y (array-like): Dependent variable.

    Returns:
        dict: Keys "slope", "intercept", "r", "r_squared", "p_value" and
        "degrees_of_freedom" (number of usable pairs minus 2).

    Raises:
        ValueError: If fewer than two usable pairs remain.
    """
    # Force a float dtype: a plain np.array() of values containing None is an
    # object array, on which np.isnan raises TypeError. With dtype=float,
    # None becomes NaN and is filtered out below.
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)

    # Keep only the pairs where both values are present
    valid = ~np.isnan(x) & ~np.isnan(y)
    x = x[valid]
    y = y[valid]

    # Check if there is enough data for regression
    n = len(x)
    if n < 2:
        raise ValueError("Not enough data points for regression.")

    # Perform linear regression
    slope, intercept, r_value, p_value, std_err = linregress(x, y)

    return {
        "slope": slope,
        "intercept": intercept,
        "r": r_value,
        "r_squared": r_value ** 2,
        "p_value": p_value,
        "degrees_of_freedom": n - 2,  # simple linear regression: two fitted parameters
    }


# CalcGrade function
def calc_grade(df_ind_resp, ind_name, lot_col, wf_col, response_col, factor_col=None):
    """
    Grade the relationship between an indicator and a response.

    Three regressions are fitted with correlation(): lot means against lot
    means, within-lot deviations against within-lot deviations, and the raw
    (optionally factor-centred) indicator against the response. The grade is
    derived from the lot-level and within-lot p-values, slopes and r-squared.

    Parameters:
        df_ind_resp (pd.DataFrame): Input DataFrame (not modified).
        ind_name (str): Name of the independent variable column.
        lot_col (str): Column name for the lot identifier.
        wf_col (str): Accepted for signature compatibility; not used here.
        response_col (str): Name of the response variable column.
        factor_col (str, optional): When given, indicator and response are
            centred on their per-factor means and lots are split per factor
            level before grading.

    Returns:
        dict: Scalar entries "X", "pval", "slope", "pval.L", "slope.L",
        "pval.WL", "slope.WL", "Class" and "Grade". When no grade can be
        computed, the placeholder values from error_grade() are returned in
        the same dict form, so callers always get one record type.
    """
    def _error_record():
        # error_grade() builds a one-row DataFrame; convert it so the error
        # paths return the same type (dict of scalars) as the success path.
        return error_grade(ind_name).to_dict("records")[0]

    df_tmp_f = df_ind_resp.copy()

    # Adjust for factors if provided
    if factor_col is not None:
        df_fac = df_tmp_f.groupby(factor_col).agg({
            response_col: meanXNA,
            ind_name: meanXNA
        }).reset_index().rename(columns={response_col: "facResp", ind_name: "facInd"})

        df_tmp_f = df_tmp_f.merge(df_fac, on=factor_col)
        df_tmp_f[ind_name] -= df_tmp_f["facInd"]
        df_tmp_f[response_col] -= df_tmp_f["facResp"]
        # Cast the lot to text so non-string lot ids can be concatenated, but
        # keep missing lots missing so they are still excluded by the groupby.
        lot_text = df_tmp_f[lot_col].astype(str).where(df_tmp_f[lot_col].notna())
        df_tmp_f[lot_col] = lot_text + " " + df_tmp_f[factor_col].astype(str)

    # A regression needs at least two distinct indicator values
    if df_tmp_f[ind_name].nunique() < 2:
        return _error_record()

    # Aggregate by lot
    df_lot = df_tmp_f.groupby(lot_col).agg({
        response_col: meanXNA,
        ind_name: meanXNA
    }).reset_index().rename(columns={response_col: "lotResp", ind_name: "lotInd"})

    # Within-lot deviations from the lot means
    df_tmp_f = df_tmp_f.merge(df_lot, on=lot_col)
    df_tmp_f["wLotInd"] = df_tmp_f[ind_name] - df_tmp_f["lotInd"]
    df_tmp_f["wLotResp"] = df_tmp_f[response_col] - df_tmp_f["lotResp"]

    # Perform linear regressions
    try:
        lot_results = correlation(df_lot["lotInd"], df_lot["lotResp"])
        wlot_results = correlation(df_tmp_f["wLotInd"], df_tmp_f["wLotResp"])
        overall_results = correlation(df_tmp_f[ind_name], df_tmp_f[response_col])
    except ValueError:
        return _error_record()

    # Extract key metrics
    slope_lot, pval_lot, rsq_lot = lot_results["slope"], lot_results["p_value"], lot_results["r_squared"]
    slope_wlot, pval_wlot, rsq_wlot = wlot_results["slope"], wlot_results["p_value"], wlot_results["r_squared"]
    slope_w, pval_w = overall_results["slope"], overall_results["p_value"]

    # Share of the indicator spread that comes from within-lot variation
    sigma_lot = np.sqrt(df_lot["lotInd"].var())
    sigma_wlot = np.sqrt(df_tmp_f["wLotInd"].var())
    range_ratio = 2 * sigma_wlot / (sigma_lot + 2 * sigma_wlot)

    if np.isnan(slope_wlot * slope_lot * pval_lot * pval_wlot):
        return _error_record()

    if slope_wlot * slope_lot > 0 and pval_lot < 0.2 and pval_wlot < 0.2:
        # Both levels agree in sign and are at least weakly significant:
        # blend the two scaled p-values, weighted by the clamped range ratio.
        apval_lot = min(1, pval_lot / 0.2)
        apval_wlot = min(1, pval_wlot / 0.2)
        arange_ratio = min(2 / 3, max(1 / 3, range_ratio))
        sig_grade = 1 - (apval_lot * (1 - arange_ratio) + apval_wlot * arange_ratio)
    else:
        # Otherwise only grade when one level clearly dominates the spread.
        sig_grade = 0
        apval_lot = min(1, pval_lot / 0.1)
        apval_wlot = min(1, pval_wlot / 0.1)
        if range_ratio < 0.2:
            sig_grade = min(1, (1 - apval_lot) * 2 * rsq_lot)
        elif range_ratio > 0.8:
            sig_grade = min(1, (1 - apval_wlot) * 5 * rsq_wlot)

    return {
        "X": ind_name,
        "pval": pval_w,
        "slope": slope_w,
        "pval.L": pval_lot,
        "slope.L": slope_lot,
        "pval.WL": pval_wlot,
        "slope.WL": slope_wlot,
        "Class": "",
        "Grade": sig_grade
    }



def CheckCorrGrade(df_ind, ind_name, lot_col, wf_col, response_col, factors=None, cutoff_p_val=0.05):
    """
    Grade one indicator against the response and classify the result.

    A base grade is computed with calc_grade(); the grade is then recomputed
    once per factor. A factor that cuts the grade below half of the base
    grade marks the indicator "Confounded"; one that raises it above
    2 * (0.1 + base grade) marks it "Disguised". The final grade starts from
    the best grade and is scaled by SlopeMultiplier() for the "UniqueModule"
    and "TimePeriod" factors.

    Parameters:
        df_ind (pd.DataFrame): Data holding the lot, wafer, response,
            indicator and factor columns.
        ind_name (str): Indicator column to grade.
        lot_col (str): Lot identifier column.
        wf_col (str): Wafer identifier column (only selected, not analysed here).
        response_col (str): Response column.
        factors (list, optional): Factor columns to test. Defaults to
            ["UniqueModule", "TimePeriod"].
        cutoff_p_val (float): Accepted for signature compatibility; not used here.

    Returns:
        dict: The calc_grade() record extended with "Class", "Note",
        "base_grade", "adj_grade" and "best_grade", with "Grade" replaced by
        the final grade. The same dict form is returned when no rows have a
        value for the indicator.
    """
    # None sentinel instead of a mutable list default
    factors = ["UniqueModule", "TimePeriod"] if factors is None else list(factors)

    def _as_record(stats):
        # Grading helpers may hand back a one-row DataFrame for error cases;
        # the logic below needs plain scalars.
        if isinstance(stats, pd.DataFrame):
            return stats.to_dict("records")[0]
        return stats

    # Keep only rows where the indicator is present, and only the needed columns
    df_ind_resp = df_ind[df_ind[ind_name].notna()][[lot_col, wf_col, response_col, ind_name] + factors]

    if df_ind_resp.empty:
        base_stats = _as_record(error_grade(ind_name))
        base_stats["Class"] = ""
        base_stats["Note"] = ""
        base_stats["base_grade"] = 0
        base_stats["adj_grade"] = 0
        base_stats["best_grade"] = 0
        return base_stats

    base_stats = _as_record(calc_grade(df_ind_resp, ind_name, lot_col, wf_col, response_col, None))
    base_grade = base_stats["Grade"]
    adj_grade = base_grade
    best_grade = base_grade

    # Determine classification
    if base_grade > 0.5:
        grade_class = "Strong"
    elif base_grade > 0:
        grade_class = "Marginal"
    else:
        grade_class = ""

    note = ""

    # Re-grade with each factor removed and compare against the base grade
    for factor_col in factors:
        fact_stats = _as_record(calc_grade(df_ind_resp, ind_name, lot_col, wf_col, response_col, factor_col))
        fact_grade = fact_stats["Grade"]
        adj_grade = min(adj_grade, fact_grade)
        best_grade = max(best_grade, fact_grade)

        if fact_grade < 0.5 * base_grade:
            if grade_class:
                grade_class += "/"
            grade_class += "Confounded"
            note += f" Confounded by {factor_col}"

        if fact_grade > 2 * (0.1 + base_grade):
            if grade_class:
                grade_class += "/"
            grade_class += "Disguised"
            note += f" Disguised by {factor_col}"

    # Update base stats
    base_stats["Class"] = grade_class
    base_stats["Note"] = note
    base_stats["base_grade"] = base_grade
    base_stats["adj_grade"] = adj_grade
    base_stats["best_grade"] = best_grade

    # Compute final grade
    final_grade = best_grade
    if final_grade > 0:
        final_grade = (0.5 + final_grade * 0.5) ** 2

        df_tmp = df_ind_resp.copy()
        # SlopeMultiplier reads a column literally named "Lot"
        if "Lot" not in df_tmp.columns:
            df_tmp["Lot"] = df_tmp[lot_col]

        # Normalize cumulatively by "UniqueModule", then by "TimePeriod"
        for factor in ("UniqueModule", "TimePeriod"):
            if factor not in df_tmp.columns:
                continue
            # Remove helper columns left by a previous normalization; otherwise
            # the merge inside NormalizeByFactor suffixes them and its
            # "facInd"/"facResp" lookups fail.
            df_tmp = df_tmp.drop(columns=["facResp", "facInd"], errors="ignore")
            df_tmp = NormalizeByFactor(df_tmp, response_col, ind_name, factor)
            final_grade *= SlopeMultiplier(df_tmp, response_col, ind_name, factor, None)

    base_stats["Grade"] = final_grade

    return base_stats


# NormalizeByFactor function
def NormalizeByFactor(df_tmp, resp_col, ind_name, factor):
    """
    Centre the response and indicator columns on their per-factor means.

    Parameters:
        df_tmp (pd.DataFrame): Input data (not modified).
        resp_col (str): Response column to centre.
        ind_name (str): Indicator column to centre.
        factor (str): Grouping column.

    Returns:
        pd.DataFrame: A new frame in which resp_col and ind_name have their
        group means subtracted. The group means used are kept in the helper
        columns "facResp" and "facInd".
    """
    # Drop helper columns from an earlier normalization. If they were kept,
    # the merge below would rename both copies with _x/_y suffixes and the
    # "facInd"/"facResp" lookups would raise KeyError on the second call.
    df_tmp = df_tmp.drop(columns=["facResp", "facInd"], errors="ignore")

    # Aggregate mean values
    df_fac = df_tmp.groupby(factor).agg(
        facResp=(resp_col, meanXNA),
        facInd=(ind_name, meanXNA)
    ).reset_index()

    # Adjust data types of the merge key
    if pd.api.types.is_numeric_dtype(df_tmp[factor]):
        df_fac[factor] = pd.to_numeric(df_fac[factor])
    else:
        df_fac[factor] = df_fac[factor].astype(str)

    # Merge and normalize
    df_tmp = pd.merge(df_tmp, df_fac, how="left", on=factor)
    df_tmp[ind_name] = df_tmp[ind_name] - df_tmp["facInd"]
    df_tmp[resp_col] = df_tmp[resp_col] - df_tmp["facResp"]

    return df_tmp

# SlopeMultiplier function
def SlopeMultiplier(df_tmp, resp_col, ind_name, factor, pres=None):
    """
    Score how consistently the overall slope is reproduced within each factor level.

    The overall slope of resp_col on ind_name is compared with up to three
    slopes per factor level: the level's own slope, its within-lot slope and
    its lot-mean slope. A slope counts as consistent when it has the same sign
    as the overall slope and more than 5% of its magnitude.

    Parameters:
        df_tmp (pd.DataFrame): Data with a "Lot" column plus resp_col,
            ind_name and factor (not modified).
        resp_col (str): Response column.
        ind_name (str): Indicator column.
        factor (str): Column whose levels are checked one by one.
        pres (optional): Not used in this implementation.

    Returns:
        float: consistent slope count divided by the slope budget (2 per level
        with at least two rows), or 0.75 when no level could be evaluated.
    """
    df_tmp_f = df_tmp.copy()
    # Lots are made unique per factor level
    df_tmp_f["Lot"] = df_tmp_f["Lot"].astype(str) + "_" + df_tmp_f[factor].astype(str)

    # Aggregate mean values by "Lot"
    df_lot = df_tmp_f.groupby("Lot").agg(
        lotResp=(resp_col, meanXNA),
        lotInd=(ind_name, meanXNA)
    ).reset_index()

    # Merge aggregated data and derive within-lot deviations
    df_tmp_f = pd.merge(df_tmp_f, df_lot, how="left", on="Lot")
    df_tmp_f["wLotInd"] = df_tmp_f[ind_name] - df_tmp_f["lotInd"]
    df_tmp_f["wLotResp"] = df_tmp_f[resp_col] - df_tmp_f["lotResp"]

    # Overall slope; a failed fit disables the per-level checks
    try:
        slp = correlation(df_tmp_f[ind_name], df_tmp_f[resp_col])['slope']
    except Exception:
        slp = None

    def _is_consistent(level_slope):
        # Same sign as the overall slope and not negligibly small
        return slp * level_slope > 0 and abs(level_slope) > 0.05 * abs(slp)

    lvs = df_tmp_f[factor].unique()
    num_slp = 0
    num_cons_slp = 0

    if slp is not None and len(lvs) > 1:
        for lvl in lvs:
            df_tmp_l = df_tmp_f[df_tmp_f[factor] == lvl]
            if len(df_tmp_l) < 2:
                continue
            # NOTE(review): the budget is 2 per level but up to three checks
            # below can each add 1, so the ratio can exceed 1 -- confirm intent.
            num_slp += 2

            # Best-effort checks: a failed fit abandons the remaining checks
            # for this level.

            # Check individual slopes
            try:
                model_w = correlation(df_tmp_l[ind_name], df_tmp_l[resp_col])
                if _is_consistent(model_w['slope']):
                    num_cons_slp += 1
            except Exception:
                continue

            # Check within-lot slopes
            if len(df_tmp_l["wLotInd"].unique()) >= 2:
                try:
                    model_wl = correlation(df_tmp_l["wLotInd"], df_tmp_l["wLotResp"])
                    if _is_consistent(model_wl['slope']):
                        num_cons_slp += 1
                except Exception:
                    continue

            # Lot-level slopes
            df_tmp_l_lot = df_tmp_l[["Lot", "lotInd", "lotResp"]].drop_duplicates()
            if len(df_tmp_l_lot) >= 2 and len(df_tmp_l_lot["lotInd"].unique()) >= 2:
                try:
                    model_l = OLS(df_tmp_l_lot["lotResp"], add_constant(df_tmp_l_lot["lotInd"])).fit()
                    # Look the slope up by label: params is indexed by column
                    # name, and integer keys on such a Series are deprecated
                    # positional access.
                    lot_slope = model_l.params["lotInd"]
                    if _is_consistent(lot_slope):
                        num_cons_slp += 1
                except Exception:
                    continue

    # Return slope multiplier
    if num_slp > 0:
        return num_cons_slp / num_slp
    return 0.75




def compute_neighbors(df_wafer, ind_name, response_col, ind_names, r_mag_limit=0.95):
    """
    Find indicators correlated with a given indicator.

    Parameters:
        df_wafer (pd.DataFrame): Input data (not modified).
        ind_name (str): Main indicator name.
        response_col (str): Response variable name.
        ind_names (list): Candidate indicator names; ind_name itself and
            names missing from df_wafer are skipped.
        r_mag_limit (float): Accepted for signature compatibility; the
            selection below uses a fixed magnitude cut-off of 0.8.

    Returns:
        pd.DataFrame or None: One row per retained neighbor with columns
        "MainInd", "CorrelatedInd", "R.Val" (Pearson r against the main
        indicator), "R.Mag" (|r|) and "pval" (p-value of the simple linear
        regression of the response on the neighbor), sorted by descending
        "R.Mag". Neighbors are retained when |r| > 0.8 or when they are among
        the six strongest. None when the main indicator has fewer than two
        distinct values or no neighbor can be evaluated.
    """
    # Work on a copy: numeric coercion below must not leak into the caller's frame
    df_wafer = df_wafer.copy()
    df_wafer[ind_name] = pd.to_numeric(df_wafer[ind_name], errors='coerce')
    df_wafer = df_wafer.dropna(subset=[ind_name])

    if df_wafer[ind_name].nunique() < 2:
        return None

    # Correlate every other indicator with the main one
    cor_vals = []
    cor_inds = []
    for cor_ind in ind_names:
        if cor_ind == ind_name or cor_ind not in df_wafer.columns:
            continue
        df_wafer[cor_ind] = pd.to_numeric(df_wafer[cor_ind], errors='coerce')
        valid_idx = df_wafer[cor_ind].notna()
        if valid_idx.sum() < 2:  # pearsonr needs at least two paired points
            continue
        try:
            cor_val, _ = pearsonr(df_wafer.loc[valid_idx, cor_ind], df_wafer.loc[valid_idx, ind_name])
        except ValueError:
            continue
        if np.isnan(cor_val):  # undefined correlation, e.g. constant input
            continue
        cor_vals.append(cor_val)
        cor_inds.append(cor_ind)

    if not cor_inds:
        return None

    cor_df = pd.DataFrame({
        'MainInd': ind_name,
        'CorrelatedInd': cor_inds,
        'R.Val': cor_vals
    })
    cor_df['R.Mag'] = cor_df['R.Val'].abs()
    # Renumber after sorting so the index is the rank; otherwise "index < 6"
    # would pick the first six candidates in input order, not the strongest.
    cor_df = cor_df.sort_values(by='R.Mag', ascending=False).reset_index(drop=True)
    cor_df = cor_df[(cor_df['R.Mag'] > 0.8) | (cor_df.index < 6)]

    if cor_df.empty:
        return None

    # p-value of the response regressed on each neighbor. linregress tests
    # the slope of the same simple regression a "response ~ neighbor" formula
    # model would fit, and works for any column name.
    p_values = []
    for correlated_ind in cor_df['CorrelatedInd']:
        pair = pd.DataFrame({
            'resp': pd.to_numeric(df_wafer[response_col], errors='coerce'),
            'x': df_wafer[correlated_ind],
        }).dropna()
        p_value = np.nan
        if len(pair) >= 3:  # with two points the slope test has no residual df
            try:
                p_value = linregress(pair['x'], pair['resp']).pvalue
            except ValueError:
                p_value = np.nan
        p_values.append(p_value)

    cor_df['pval'] = p_values
    return cor_df



from statsmodels.formula.api import ols

def calc_neighbors_by_influence(df, df_grades, ind_names, resp_col, rsq_red_spec=0.50, pres=None):
    """
    Flag indicators whose effect on the response is explained by a higher-graded one.

    Indicators are visited in descending grade order. For indicator i and
    each better-graded indicator j that is still its own "Nebr.infl", the
    response is regressed on j and the residuals are regressed on i. If the
    r-squared of i drops by more than rsq_red_spec relative to regressing the
    response on i directly, j is recorded as the influencing neighbor of i.

    Parameters:
        df (pd.DataFrame): Input data containing resp_col, the indicator
            columns and a "UniqueModule" column (not modified).
        df_grades (pd.DataFrame): Grades with columns "X", "Grade",
            "Nebr.infl" and "Nebr.inflRed".
        ind_names (list): Indicator names; its length sets how many of the
            top-graded rows are compared.
        resp_col (str): Response column name.
        rsq_red_spec (float): R-squared reduction threshold.
        pres (optional): Not used in this implementation.

    Returns:
        pd.DataFrame: df_grades sorted by descending "Grade" with a fresh
        index and "Nebr.infl" / "Nebr.inflRed" updated where a neighbor
        was found.
    """
    # Sort grades by 'Grade' in descending order
    df_grades = df_grades.sort_values(by='Grade', ascending=False).reset_index(drop=True)

    # Return immediately if there are fewer than 2 indicators
    if len(ind_names) < 2:
        return df_grades

    # Normalize df for all indicators by factor
    df_norm = df.copy()
    for ind in ind_names:
        df_norm = NormalizeByFactor(df_norm, resp_col, ind, "UniqueModule")
        # Remove the helper columns before the next pass; leftovers would be
        # suffixed by the merge in NormalizeByFactor and break its lookups.
        df_norm = df_norm.drop(columns=["facResp", "facInd"], errors="ignore")

    # Never index past the rows that actually exist in df_grades
    n_ranked = min(len(ind_names), len(df_grades))

    for i in range(1, n_ranked):
        ind_i = df_grades.loc[i, 'X']
        for j in range(i):
            ind_j = df_grades.loc[j, 'X']

            # Filter rows where neither indicator is NaN
            valid_rows = df_norm[df_norm[ind_i].notna() & df_norm[ind_j].notna()]
            if len(valid_rows) < 5:
                continue

            # Only indicators that are not themselves explained by another
            # one may act as the influencing neighbor
            if df_grades.loc[j, 'Nebr.infl'] != ind_j:
                continue

            if valid_rows[ind_i].nunique() == 1 or valid_rows[ind_j].nunique() == 1:
                continue

            # Fit on a small frame with fixed column names so the formulas do
            # not depend on the indicator names being valid identifiers.
            pair = pd.DataFrame({
                "resp": valid_rows[resp_col],
                "x_base": valid_rows[ind_i],
                "x_nebr": valid_rows[ind_j],
            })

            # Fit base model
            try:
                rsq_base = ols("resp ~ x_base", data=pair).fit().rsquared
            except Exception:
                continue

            # Fit neighbor model and keep what it leaves unexplained
            try:
                pair["resp_resid"] = ols("resp ~ x_nebr", data=pair).fit().resid
            except Exception:
                continue

            # Without a positive base r-squared the reduction is undefined
            if not rsq_base > 0:
                continue

            # Fit residual model
            try:
                rsq_adj_base = ols("resp_resid ~ x_base", data=pair).fit().rsquared
                rsq_red = 1 - (rsq_adj_base / rsq_base)
            except Exception:
                rsq_red = None

            # Update grades if reduction exceeds threshold
            if rsq_red is not None and rsq_red > rsq_red_spec:
                df_grades.loc[i, 'Nebr.infl'] = ind_j
                df_grades.loc[i, 'Nebr.inflRed'] = rsq_red
                break

    return df_grades



from scipy.stats import pearsonr

def compute_neighbors_just_one(df_wafer, ind_name, response_col, ind_names, r_mag_limit=0.95):
    """
    Return the first candidate whose correlation magnitude exceeds the threshold.

    Parameters:
        df_wafer (pd.DataFrame): Input DataFrame (not modified).
        ind_name (str): Name of the main indicator.
        response_col (str): Accepted for signature compatibility; not used here.
        ind_names (list): Candidate indicator names, checked in order.
        r_mag_limit (float): Threshold for the absolute Pearson correlation.

    Returns:
        str: Name of the first candidate with |r| > r_mag_limit, or ind_name
        itself if there is none.
    """
    # Coerce into separate Series so the caller's frame is left untouched
    main = pd.to_numeric(df_wafer[ind_name], errors="coerce")
    has_main = main.notna()
    df_wafer = df_wafer[has_main]
    main = main[has_main]

    # Without at least two distinct values no correlation can be computed
    if main.nunique() < 2:
        return ind_name

    for cor_ind in ind_names:
        if cor_ind == ind_name or cor_ind not in df_wafer.columns:
            continue

        other = pd.to_numeric(df_wafer[cor_ind], errors="coerce")
        paired = other.notna()
        if paired.sum() < 2:  # Require at least two valid points
            continue

        try:
            cor_val, _ = pearsonr(other[paired], main[paired])
        except ValueError:  # Handle cases where correlation can't be computed
            continue

        # Compare the magnitude so strong negative correlations count too
        if not np.isnan(cor_val) and abs(cor_val) > r_mag_limit:
            return cor_ind

    # Return the original indicator name if no neighbor exceeds the threshold
    return ind_name

def _mapped_column(mapping_table, pft_constant, pft, col_type):
    """Return the first mapped column name of the given type for a PFT, or None."""
    matches = mapping_table.loc[
        (mapping_table[pft_constant] == pft) & (mapping_table["Type"] == col_type),
        "columnName",
    ]
    return None if matches.empty else matches.iloc[0]


def _apply_stat_weights(cor_res, stats, stat_weights):
    """Scale "Grade" once per row by the weight of the first statistic keyword found in "X"."""
    already_weighted = pd.Series(False, index=cor_res.index)
    for stat, weight in zip(stats, stat_weights):
        mask = cor_res["X"].str.contains(stat, regex=False, na=False) & ~already_weighted
        cor_res.loc[mask, "Grade"] *= weight
        already_weighted |= mask
    return cor_res


def parameter_correlation(table, mapping_table, resp_col_index, split_col_index, all_inds, num_periods, cutoff_pval, pft_constant="PFTConstant", is_indicator_data=False, need_group_summary=False):
    """
    Perform parameter correlation analysis based on indicators and response variables.

    For every distinct value in mapping_table[pft_constant], the indicators
    mapped to it are graded against the response with CheckCorrGrade(), the
    grades are weighted by the statistic keyword in the indicator name, and
    correlated / influencing neighbors are attached.

    Parameters:
        table (pd.DataFrame): Main data table. NOTE: placeholder columns (when
            is_indicator_data is False) and the split helper columns are
            added to this frame in place.
        mapping_table (pd.DataFrame): Mapping table with at least the columns
            pft_constant, "columnName", "blockID" and "Type".
        resp_col_index (int): Positional index of the response column in table.
        split_col_index (int or None): Positional index of the split column (if any).
        all_inds (list): Indicator column names to consider.
        num_periods (int): Number of time periods to split the data into.
        cutoff_pval (float): p-value cutoff, passed through to CheckCorrGrade().
        pft_constant (str): Column name for the PFT constant in the mapping table.
        is_indicator_data (bool): When False, missing "Chamber", "dxcontextid",
            "fdccontextid" and "pequipment" columns are created as zero-filled
            placeholders.
        need_group_summary (bool): Accepted for signature compatibility; no
            group summary is computed by this implementation.

    Returns:
        dict: "CorTable" (grades), "Nebrs" (correlated neighbors), "PFTTable"
        (per-PFT summary) and "GroupSummaryTable" (always None). Each table is
        None when nothing was produced for it.
    """
    import warnings  # local import: only used for the best-effort diagnostics below

    stats = ["MeanT", "Min", "Max", "StdDev", "Area", "Duration", "Median", "Range"]
    stat_weights = [1, 0.9, 0.9, 0.5, 0.7, 1, 0.7, 0.8]
    output_cols = [
        "X", "Grade", "Nebr.infl", "Nebr.inflRed", "Class", "Note", "Grade.UnWeighted",
        "pval", "pval.L", "pval.WL", "slope",
        "slope.L", "slope.WL", "Nebr", "Y", pft_constant
    ]
    lot_col = "Lot"
    wf_col = "Wafer"
    r_mag_limit = 0.95

    pfts = mapping_table[pft_constant].dropna().drop_duplicates()

    # Add zero-filled placeholder columns for non-indicator data
    if not is_indicator_data:
        for pft in pfts:
            block_id = mapping_table.loc[mapping_table[pft_constant] == pft, "blockID"].iloc[0]
            for col_type in ["Chamber", "dxcontextid", "fdccontextid", "pequipment"]:
                if _mapped_column(mapping_table, pft_constant, pft, col_type) is not None:
                    continue
                col_name = f"{pft}_{col_type}"
                table[col_name] = 0
                mapping_table = pd.concat([mapping_table, pd.DataFrame({
                    "columnName": [col_name],
                    pft_constant: [pft],
                    "blockID": [block_id],
                    "Type": [col_type]
                })], ignore_index=True)

    response_col = table.columns[resp_col_index]
    split_col = None if split_col_index is None else table.columns[split_col_index]
    cutoff_pval = float(cutoff_pval)
    num_periods = max(1, int(num_periods))  # at least one period is needed to build bins

    # Prepare the data if split column exists
    # NOTE(review): these helper columns are not read again in this function,
    # and "oldLot"/"oldWafer" are copies of the split column itself -- confirm
    # the intended source columns.
    if split_col is not None:
        table["oldLot"] = table[split_col]
        table["oldWafer"] = table[split_col]
        table["lotCol"] = table[split_col] + "_" + table["oldLot"]
        table["wfCol"] = table[split_col] + "_" + table["oldWafer"]

    cor_tables = []
    neighbor_tables = []
    pft_tables = []

    for pft in pfts:
        if not pft:
            continue

        pft_cols = mapping_table.loc[mapping_table[pft_constant] == pft, "columnName"]
        pft_cols = pft_cols[pft_cols.isin(table.columns)]
        if len(pft_cols) == 0:
            continue

        # CheckCorrGrade selects the lot and wafer columns, so carry them
        # along when the table has them; dict.fromkeys removes repeated names.
        wanted = [response_col] + pft_cols.tolist() + [c for c in (lot_col, wf_col) if c in table.columns]
        df_ind = table[list(dict.fromkeys(wanted))].dropna(subset=[response_col]).copy()
        if df_ind.empty:
            continue

        pft_col_set = set(pft_cols)
        ind_names = [col for col in all_inds if col in pft_col_set]
        if not ind_names:
            continue

        # Time column
        time_col = _mapped_column(mapping_table, pft_constant, pft, "start_time")
        if time_col is None or time_col not in df_ind.columns:
            continue

        df_ind["timeCol1"] = pd.to_datetime(df_ind[time_col], errors="coerce")
        df_ind = df_ind.dropna(subset=["timeCol1"])
        if df_ind.empty:
            continue

        # Split the time range into num_periods quantile bins
        df_ind = df_ind.sort_values(by="timeCol1")
        time_ns = df_ind["timeCol1"].values.astype(np.int64)
        edges = np.quantile(time_ns, q=np.linspace(0, 1, num_periods + 1)[1:])
        # duplicates="drop": repeated quantiles (few distinct timestamps)
        # would otherwise make pd.cut raise on non-unique bin edges.
        df_ind["TimePeriod"] = pd.cut(
            time_ns, bins=[-np.inf] + edges.tolist(), labels=False, duplicates="drop"
        )

        # Add UniqueModule column
        chamber_col = _mapped_column(mapping_table, pft_constant, pft, "Chamber")
        eqp_col = _mapped_column(mapping_table, pft_constant, pft, "pequipment")
        if (chamber_col is None or eqp_col is None
                or chamber_col not in df_ind.columns or eqp_col not in df_ind.columns):
            warnings.warn(f"Skipping {pft}: no usable Chamber/pequipment column", RuntimeWarning)
            continue
        df_ind["UniqueModule"] = df_ind[eqp_col].astype(str) + ":" + df_ind[chamber_col].astype(str)

        # Perform correlation analysis for each indicator
        records = []
        for ind_name in ind_names:
            if df_ind[ind_name].isna().all():
                continue
            try:
                graded = CheckCorrGrade(
                    df_ind, ind_name, lot_col, wf_col, response_col=response_col,
                    factors=["UniqueModule", "TimePeriod"], cutoff_p_val=cutoff_pval
                )
            except Exception as exc:
                # Best effort: one failing indicator must not stop the run,
                # but the failure is reported instead of being dropped.
                warnings.warn(f"Grading failed for {ind_name} ({pft}): {exc!r}", RuntimeWarning)
                continue
            if graded is None:
                continue
            if isinstance(graded, pd.DataFrame):
                records.extend(graded.to_dict("records"))
            else:
                records.append(graded)

        if not records:
            continue

        cor_res = pd.DataFrame(records)
        cor_res["Grade"] = cor_res["Grade"].astype(float)

        # Keep the raw grade, then weight exactly once
        cor_res["Grade.UnWeighted"] = cor_res["Grade"]
        cor_res = _apply_stat_weights(cor_res, stats, stat_weights)

        cor_res = cor_res.sort_values(by="Grade", ascending=False)
        cor_res["Nebr"] = cor_res["X"]
        cor_res[pft_constant] = pft
        cor_res["Y"] = response_col

        # Correlated neighbors of every classified indicator
        sig_names = cor_res.loc[cor_res["Class"] != "", "X"].tolist()
        neighbor_frames = []
        for k, ind_name in enumerate(sig_names):
            # Pass a copy so the per-PFT frame is never altered by the helper
            cor_indx = compute_neighbors(df_ind.copy(), ind_name, response_col, ind_names, r_mag_limit)
            if cor_indx is not None:
                cor_indx[pft_constant] = pft
                cor_indx["Y"] = response_col
                neighbor_frames.append(cor_indx)

            # Link the indicator to a better-graded one it is nearly collinear with
            if k > 0:
                nebr = compute_neighbors_just_one(df_ind, ind_name, response_col, sig_names[:k], r_mag_limit)
                cor_res.loc[cor_res["X"] == ind_name, "Nebr"] = nebr

        # Calculate neighbors by influence
        cor_res["Nebr.infl"] = cor_res["X"]
        cor_res["Nebr.inflRed"] = None
        sig_inds = cor_res.loc[cor_res["Grade"] > 0, "X"].tolist()
        try:
            cor_res = calc_neighbors_by_influence(df_ind, cor_res, sig_inds, response_col, rsq_red_spec=0.5, pres=None)
        except Exception as exc:
            warnings.warn(f"Neighbor influence failed for {pft}: {exc!r}", RuntimeWarning)
            cor_res = cor_res.reset_index(drop=True)

        # reindex (not plain selection) so a missing optional column becomes
        # NaN instead of aborting the whole run
        cor_res = cor_res.reindex(columns=output_cols)

        pft_tables.append(pd.DataFrame([{
            pft_constant: pft,
            "SignificantParameters": (cor_res["Class"] != "").sum(),
            "TotalParameters": len(cor_res),
            "TaggedParameters": 0,
            "Comments": "",
            "numWafers": 0,
            "uniqueWafers": 0
        }]))
        cor_tables.append(cor_res)
        neighbor_tables.extend(neighbor_frames)

    return {
        "CorTable": pd.concat(cor_tables, ignore_index=True) if cor_tables else None,
        "Nebrs": pd.concat(neighbor_tables, ignore_index=True) if neighbor_tables else None,
        "PFTTable": pd.concat(pft_tables, ignore_index=True) if pft_tables else None,
        "GroupSummaryTable": None
    }
