In [3]:
import pandas as pd
from scipy.stats import chi2_contingency, f_oneway

# Read the source CSV (the file is decoded as cp949)
file_path = 'C:/Users/user/Desktop/진짜임 이게 찐.csv'  # enter the CSV file path
data = pd.read_csv(file_path, encoding='cp949')

# Columns that are analysed as categorical variables
categorical_columns = ['토양깊이유형', '토성코드', '토양형코드', '토양유효수분량']

# Cast all categorical columns to object dtype in one call
data = data.astype({name: 'object' for name in categorical_columns})

# Production columns: values that cannot be parsed as numbers, and missing
# values, are replaced by 0
production_columns = ['밤 (kg)', '복분자딸기 (kg)', '오갈피 (kg)', '마 (kg)', '도라지 (kg)', '더덕 (kg)', '생표고 (kg)']
for col in production_columns:
    data[col] = data[col].pipe(pd.to_numeric, errors='coerce').fillna(0)

# Function to perform Chi-Square and ANOVA analysis
def perform_analysis_for_production(data, production_col, categorical_cols):
    """
    Run a Chi-Square test and a one-way ANOVA of one production column
    against each categorical column, ignoring rows where production is 0.

    For every categorical column:
      * Chi-Square: ``chi2_contingency`` on the cross-tabulation of the
        categorical column against the production column.
      * ANOVA: ``f_oneway`` on the production values grouped by the
        categorical column.

    A test that cannot be computed yields NaN instead of raising, so one
    degenerate column does not abort the analysis of the others:
      * no usable rows (e.g. every production value is 0) -> both NaN
      * fewer than two category levels left -> ANOVA p-value is NaN

    Parameters:
        data (DataFrame): The input dataset. It is not modified.
        production_col (str): The production column to analyze.
        categorical_cols (list): The list of categorical columns to analyze.

    Returns:
        chi2_results_df (DataFrame): Chi-Square p-values, indexed by
            categorical column, single column 'p-value'.
        anova_results_df (DataFrame): ANOVA p-values, same layout.
    """
    nan = float('nan')

    # Boolean indexing already returns a new frame and nothing below writes
    # to it, so no explicit copy is needed.
    nonzero = data[data[production_col] != 0]

    chi2_results = {}
    anova_results = {}
    for cat_col in categorical_cols:
        # One array of production values per category level.
        samples = [
            group[production_col].values
            for _, group in nonzero.groupby(cat_col)
        ]

        if not samples:
            # No rows left for this column: neither test is defined.
            chi2_results[cat_col] = {'p-value': nan}
            anova_results[cat_col] = {'p-value': nan}
            continue

        # NOTE(review): every distinct production value becomes its own
        # column of the contingency table. If production is a continuous
        # quantity the table will be very sparse -- confirm this is intended.
        table = pd.crosstab(nonzero[cat_col], nonzero[production_col])
        chi2_p = chi2_contingency(table)[1] if table.size > 0 else nan
        chi2_results[cat_col] = {'p-value': chi2_p}

        # f_oneway requires at least two samples.
        anova_p = f_oneway(*samples)[1] if len(samples) >= 2 else nan
        anova_results[cat_col] = {'p-value': anova_p}

    # Convert results into DataFrames
    chi2_results_df = pd.DataFrame.from_dict(chi2_results, orient='index', columns=['p-value'])
    anova_results_df = pd.DataFrame.from_dict(anova_results, orient='index', columns=['p-value'])

    return chi2_results_df, anova_results_df

# Run both tests for every production column and remember which ones
# produced at least one significant p-value
all_results = {}
significant_results_cleaned = {}

for production_col in production_columns:
    print(f"Performing analysis for {production_col}...\n")

    chi2_results, anova_results = perform_analysis_for_production(
        data, production_col, categorical_columns
    )
    all_results[production_col] = {
        'Chi-Square': chi2_results,
        'ANOVA': anova_results,
    }

    # Keep only the rows whose p-value is below 0.05
    chi2_significant = chi2_results.loc[chi2_results['p-value'] < 0.05]
    anova_significant = anova_results.loc[anova_results['p-value'] < 0.05]

    # Nothing significant for this production column: leave it out
    if chi2_significant.empty and anova_significant.empty:
        continue

    significant_results_cleaned[production_col] = {
        'Chi-Square': chi2_significant,
        'ANOVA': anova_significant,
    }

# Full report: every p-value for every production column
print("\n==== All Results ====")
for production_col, results in all_results.items():
    print(f"\n{production_col} - Chi-Square Results:", results['Chi-Square'], sep="\n")
    print(f"\n{production_col} - ANOVA Results:", results['ANOVA'], sep="\n")

# Short report: only the tables that contain significant rows
print("\n==== Significant Results (p-value < 0.05) ====")
for production_col, results in significant_results_cleaned.items():
    print(f"\n{production_col}:")
    if not results['Chi-Square'].empty:
        print("\nChi-Square Significant Results:", results['Chi-Square'], sep="\n")
    if not results['ANOVA'].empty:
        print("\nANOVA Significant Results:", results['ANOVA'], sep="\n")


Performing analysis for 밤 (kg)...

Performing analysis for 복분자딸기 (kg)...

Performing analysis for 오갈피 (kg)...

Performing analysis for 마 (kg)...

Performing analysis for 도라지 (kg)...

Performing analysis for 더덕 (kg)...

Performing analysis for 생표고 (kg)...


==== All Results ====

밤 (kg) - Chi-Square Results:
          p-value
토양깊이유형   0.453127
토성코드     0.453127
토양형코드    0.435978
토양유효수분량  0.448365

밤 (kg) - ANOVA Results:
          p-value
토양깊이유형   0.878330
토성코드     0.170395
토양형코드    0.351903
토양유효수분량  0.532277

복분자딸기 (kg) - Chi-Square Results:
          p-value
토양깊이유형   0.425951
토성코드     0.552794
토양형코드    0.984651
토양유효수분량  0.520064

복분자딸기 (kg) - ANOVA Results:
          p-value
토양깊이유형   0.973020
토성코드     0.740754
토양형코드    0.554722
토양유효수분량  0.000004

오갈피 (kg) - Chi-Square Results:
          p-value
토양깊이유형   0.962914
토성코드     0.376942
토양형코드    0.121788
토양유효수분량  0.929333

오갈피 (kg) - ANOVA Results:
          p-value
토양깊이유형   0.811761
토성코드     0.436857
토양형코드    0.953100
토양유효수분량  0.096380

마 (

In [10]:
import pandas as pd
from scipy.stats import chi2_contingency, f_oneway, spearmanr

# Read the source CSV (the file is decoded as cp949)
file_path = 'C:/Users/user/Desktop/진짜임 이게 찐.csv'  # enter the CSV file path
data = pd.read_csv(file_path, encoding='cp949')

# Columns that are analysed as categorical variables
categorical_columns = ['토양깊이유형', '토성코드', '토양형코드', '토양유효수분량']

# Cast all categorical columns to object dtype in one call
data = data.astype({name: 'object' for name in categorical_columns})

# Production columns: values that cannot be parsed as numbers, and missing
# values, are replaced by 0
production_columns = ['밤 (kg)', '복분자딸기 (kg)', '오갈피 (kg)', '마 (kg)', '도라지 (kg)', '더덕 (kg)', '생표고 (kg)']
for col in production_columns:
    data[col] = data[col].pipe(pd.to_numeric, errors='coerce').fillna(0)

# Function to perform Chi-Square and ANOVA analysis
def perform_analysis_for_production(data, production_col, categorical_cols):
    """
    Run a Chi-Square test and a one-way ANOVA of one production column
    against each categorical column, ignoring rows where production is 0.

    For every categorical column:
      * Chi-Square: ``chi2_contingency`` on the cross-tabulation of the
        categorical column against the production column.
      * ANOVA: ``f_oneway`` on the production values grouped by the
        categorical column.

    A test that cannot be computed yields NaN instead of raising, so one
    degenerate column does not abort the analysis of the others:
      * no usable rows (e.g. every production value is 0) -> both NaN
      * fewer than two category levels left -> ANOVA p-value is NaN

    Parameters:
        data (DataFrame): The input dataset. It is not modified.
        production_col (str): The production column to analyze.
        categorical_cols (list): The list of categorical columns to analyze.

    Returns:
        chi2_results_df (DataFrame): Chi-Square p-values, indexed by
            categorical column, single column 'p-value'.
        anova_results_df (DataFrame): ANOVA p-values, same layout.
    """
    nan = float('nan')

    # Boolean indexing already returns a new frame and nothing below writes
    # to it, so no explicit copy is needed.
    nonzero = data[data[production_col] != 0]

    chi2_results = {}
    anova_results = {}
    for cat_col in categorical_cols:
        # One array of production values per category level.
        samples = [
            group[production_col].values
            for _, group in nonzero.groupby(cat_col)
        ]

        if not samples:
            # No rows left for this column: neither test is defined.
            chi2_results[cat_col] = {'p-value': nan}
            anova_results[cat_col] = {'p-value': nan}
            continue

        # NOTE(review): every distinct production value becomes its own
        # column of the contingency table. If production is a continuous
        # quantity the table will be very sparse -- confirm this is intended.
        table = pd.crosstab(nonzero[cat_col], nonzero[production_col])
        chi2_p = chi2_contingency(table)[1] if table.size > 0 else nan
        chi2_results[cat_col] = {'p-value': chi2_p}

        # f_oneway requires at least two samples.
        anova_p = f_oneway(*samples)[1] if len(samples) >= 2 else nan
        anova_results[cat_col] = {'p-value': anova_p}

    # Convert results into DataFrames
    chi2_results_df = pd.DataFrame.from_dict(chi2_results, orient='index', columns=['p-value'])
    anova_results_df = pd.DataFrame.from_dict(anova_results, orient='index', columns=['p-value'])

    return chi2_results_df, anova_results_df

# Function to calculate Spearman correlation
def calculate_spearman_correlation(data, production_col, categorical_cols):
    """
    Compute the Spearman rank correlation between one production column and
    each categorical column, ignoring rows where production is 0.

    Each categorical column is turned into integer codes with
    ``pd.factorize`` before being passed to ``spearmanr``. The test is run
    once per column and both the coefficient and the p-value are taken from
    that single result.

    Parameters:
        data (DataFrame): The input dataset. It is not modified.
        production_col (str): The production column to analyze.
        categorical_cols (list): The list of categorical columns to analyze.

    Returns:
        spearman_results_df (DataFrame): One row per categorical column with
            the columns 'Spearman Correlation' and 'p-value'.
    """
    # Filter the data to exclude rows where production is 0
    nonzero = data[data[production_col] != 0]
    production = nonzero[production_col]

    spearman_results = {}
    for col in categorical_cols:
        # NOTE(review): factorize numbers the levels in order of first
        # appearance, so the codes (and therefore the sign and size of the
        # coefficient) depend on row order rather than on any natural
        # ordering of the categories -- confirm this is acceptable.
        # NOTE(review): missing values receive the code -1 and take part in
        # the ranking -- confirm this is intended.
        codes = pd.factorize(nonzero[col])[0]

        result = spearmanr(codes, production)
        spearman_results[col] = {
            'Spearman Correlation': result[0],
            'p-value': result[1],
        }

    # Convert results into a DataFrame
    spearman_results_df = pd.DataFrame.from_dict(spearman_results, orient='index')

    return spearman_results_df

# Iterate over all production columns and calculate analyses

# Single source of truth for the significance cut-off: it drives the three
# filters below and the header of the "significant results" report, so the
# printed threshold can never drift from the one actually applied.
SIGNIFICANCE_LEVEL = 0.1

all_results = {}
significant_results_cleaned = {}

for production_col in production_columns:
    print(f"Performing analysis for {production_col}...\n")

    # Perform Chi-Square and ANOVA
    chi2_results, anova_results = perform_analysis_for_production(data, production_col, categorical_columns)
    # Perform Spearman correlation
    spearman_results = calculate_spearman_correlation(data, production_col, categorical_columns)

    all_results[production_col] = {
        'Chi-Square': chi2_results,
        'ANOVA': anova_results,
        'Spearman': spearman_results
    }

    # Filter significant results (p-value < SIGNIFICANCE_LEVEL)
    chi2_significant = chi2_results[chi2_results['p-value'] < SIGNIFICANCE_LEVEL]
    anova_significant = anova_results[anova_results['p-value'] < SIGNIFICANCE_LEVEL]
    spearman_significant = spearman_results[spearman_results['p-value'] < SIGNIFICANCE_LEVEL]

    # Record the production column only if at least one test has a
    # significant row
    if not chi2_significant.empty or not anova_significant.empty or not spearman_significant.empty:
        significant_results_cleaned[production_col] = {
            'Chi-Square': chi2_significant,
            'ANOVA': anova_significant,
            'Spearman': spearman_significant
        }

# Display all results
print("\n==== All Results ====")
for production_col, results in all_results.items():
    print(f"\n{production_col} - Chi-Square Results:")
    print(results['Chi-Square'])
    print(f"\n{production_col} - ANOVA Results:")
    print(results['ANOVA'])
    print(f"\n{production_col} - Spearman Correlation Results:")
    print(results['Spearman'])

# Display significant results only if they exist
print(f"\n==== Significant Results (p-value < {SIGNIFICANCE_LEVEL}) ====")
for production_col, results in significant_results_cleaned.items():
    print(f"\n{production_col}:")
    if not results['Chi-Square'].empty:
        print("\nChi-Square Significant Results:")
        print(results['Chi-Square'])
    if not results['ANOVA'].empty:
        print("\nANOVA Significant Results:")
        print(results['ANOVA'])
    if not results['Spearman'].empty:
        print("\nSpearman Correlation Significant Results:")
        print(results['Spearman'])


Performing analysis for 밤 (kg)...

Performing analysis for 복분자딸기 (kg)...

Performing analysis for 오갈피 (kg)...

Performing analysis for 마 (kg)...

Performing analysis for 도라지 (kg)...

Performing analysis for 더덕 (kg)...

Performing analysis for 생표고 (kg)...


==== All Results ====

밤 (kg) - Chi-Square Results:
          p-value
토양깊이유형   0.453127
토성코드     0.453127
토양형코드    0.435978
토양유효수분량  0.448365

밤 (kg) - ANOVA Results:
          p-value
토양깊이유형   0.878330
토성코드     0.170395
토양형코드    0.351903
토양유효수분량  0.532277

밤 (kg) - Spearman Correlation Results:
         Spearman Correlation   p-value
토양깊이유형              -0.019042  0.809935
토성코드                -0.158590  0.043835
토양형코드                0.047614  0.547390
토양유효수분량              0.000273  0.997252

복분자딸기 (kg) - Chi-Square Results:
          p-value
토양깊이유형   0.425951
토성코드     0.552794
토양형코드    0.984651
토양유효수분량  0.520064

복분자딸기 (kg) - ANOVA Results:
          p-value
토양깊이유형   0.973020
토성코드     0.740754
토양형코드    0.554722
토양유효수분량  0.000004

복분