In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

import statsmodels.api as sm
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [2]:
target_classes = ["Calanoid_1", "Cyclopoid_1"]

In [3]:
geometric_features = ['Area..ABD.', 'Area..Filled.', 'Diameter..ABD.', 'Diameter..ESD.', 'Diameter..FD.',
                      'Length','Width', 'Perimeter', 'Volume..ABD.', 'Volume..ESD.', 'Geodesic.Length', 
                      'Geodesic.Thickness']

shape_features = ['Aspect.Ratio', 'Circle.Fit', 'Circularity', 'Circularity..Hu.', 'Compactness', 
                  'Convex.Perimeter', 'Convexity', 'Fiber.Curl', 'Fiber.Straightness', 
                  'Geodesic.Aspect.Ratio', 'Roughness', 'Elongation', 'Symmetry']

optical_features = ['Edge.Gradient', 'Intensity','Sigma.Intensity', 'Sum.Intensity', 'Transparency']

environmental_features = ['gdd2', 'WaterT', 'avgdepth', 'MinDepth', 'MaxDepth', 'CLOUD_PC', 'PRECIP', 
                          'distshore', 'Exposure', 'XANGLE', 'XWAVEHT']

sampling_features = ['SITE', 'Loc', 'LAT0', 'LAT1', 'LON0', 'LON1']

biological_features = ['WhitefishDen', 'UnknwCoregonine', 'CiscoDen']

sum_features = geometric_features + shape_features + optical_features + environmental_features + sampling_features + biological_features

sum_features_drop = [feature for feature in sum_features if feature not in ['SITE', 'Loc']]

In [4]:
file_path = "HURON_Predictor_Selection_Dataset.csv"
df = pd.read_csv(file_path)

## OLR

In [5]:
print("Total number of rows in the dataset:")
print(df.shape[0])

print("Missing values before cleaning:")
print(df.isna().sum())

Total number of rows in the dataset:
51429
Missing values before cleaning:
file_name                    0
Image.File                   0
Class                        0
Area..ABD.                   0
Area..Filled.                0
Diameter..ABD.               0
Diameter..ESD.               0
Diameter..FD.                0
Length                       0
Width                        0
Perimeter                    0
Volume..ABD.                 0
Volume..ESD.                 0
Geodesic.Length              0
Geodesic.Thickness           0
Aspect.Ratio                 0
Circle.Fit                   0
Circularity                  0
Circularity..Hu.             0
Compactness                  0
Convex.Perimeter             0
Convexity                    0
Fiber.Curl                   0
Fiber.Straightness           0
Geodesic.Aspect.Ratio        0
Intensity                    0
Roughness                    0
Elongation                   0
Symmetry                     0
Edge.Gradient             

In [6]:
rows_with_missing = df[df.isna().any(axis=1)]
print(f"Total # of rows with missing values: {len(rows_with_missing)}")

Total # of rows with missing values: 11968


In [6]:
df_ols = df.copy()

for col in ['SITE', 'Loc']:
    le = LabelEncoder()
    df_ols[col] = le.fit_transform(df_ols[col].astype(str))  

columns_to_check = ['distshore', 'Exposure', 'WhitefishDen', 'UnknwCoregonine', 'CiscoDen']
df_ols = df_ols.dropna(subset=columns_to_check)

selected_features = sum_features  
df_ols = df_ols[selected_features + ['Class']].copy()

df_ols['Class'] = df_ols['Class'].map({'Calanoid_1': 0, 'Cyclopoid_1': 1})

df_ols = df_ols.apply(pd.to_numeric, errors='coerce')

X = sm.add_constant(df_ols[selected_features]) 
y = df_ols['Class']

model = sm.OLS(y, X).fit()

summary_df = pd.DataFrame({
    'Variable': model.pvalues.index,
    'P-Value': model.pvalues.values,
    'Coefficient': model.params.values
})

print("\n OLS Regression Results:")
print(summary_df)

significant_vars = summary_df[summary_df['P-Value'] < 0.001]['Variable'].tolist()

if 'const' in significant_vars:
    significant_vars.remove('const')

print("\n Significant Variables (p-value < 0.001):", significant_vars)
print(f"Total Significant Variables: {len(significant_vars)}")



 OLS Regression Results:
                 Variable        P-Value   Coefficient
0                   const   1.299074e-01  3.718719e+01
1              Area..ABD.   1.374582e-01  1.480683e-06
2           Area..Filled.   5.646443e-12  6.394620e-06
3          Diameter..ABD.   2.487219e-04  4.083079e-03
4          Diameter..ESD.   9.159749e-01 -2.016792e-04
5           Diameter..FD.   1.115242e-05 -4.838105e-03
6                  Length   6.388202e-20 -5.295426e-04
7                   Width   5.256937e-56  5.050251e-04
8               Perimeter   3.589479e-01  9.530262e+00
9            Volume..ABD.  1.057555e-121 -5.558226e-09
10           Volume..ESD.   4.790181e-28  1.346118e-10
11        Geodesic.Length   3.589359e-01 -1.906100e+01
12     Geodesic.Thickness   3.587315e-01 -1.906911e+01
13           Aspect.Ratio  1.402704e-165 -4.785817e-01
14             Circle.Fit   3.959687e-02 -5.773860e-02
15            Circularity   6.941606e-24  2.778654e+00
16       Circularity..Hu.  2.964178e-12

## logistic

In [7]:
df_logit = df.copy()

columns_to_check = ['distshore', 'Exposure', 'WhitefishDen', 'UnknwCoregonine', 'CiscoDen']
df_logit = df_logit.dropna(subset=columns_to_check)

selected_features = sum_features_drop 
df_logit = df_logit[selected_features + ['Class']].copy()

df_logit['Class'] = df_logit['Class'].map({'Calanoid_1': 0, 'Cyclopoid_1': 1})

df_logit = df_logit.apply(pd.to_numeric, errors='coerce')

scaler = StandardScaler()
df_logit[selected_features] = scaler.fit_transform(df_logit[selected_features])

X = sm.add_constant(df_logit[selected_features]) 
y = df_logit['Class']

logit_model = sm.Logit(y, X).fit()

summary_df = pd.DataFrame({
    'Variable': logit_model.pvalues.index,
    'P-Value': logit_model.pvalues.values,
    'Coefficient': logit_model.params.values
})

print("\n Logistic Regression Results:")
print(summary_df)

# (p-value < 0.001）
significant_vars = summary_df[summary_df['P-Value'] < 0.001]['Variable'].tolist()

if 'const' in significant_vars:
    significant_vars.remove('const')

print("\n Significant Variables (p-value < 0.001):", significant_vars)
print(f"Total Significant Variables: {len(significant_vars)}")
columns_to_check = ['distshore', 'Exposure', 'WhitefishDen', 'UnknwCoregonine', 'CiscoDen']

         Current function value: 0.119571
         Iterations: 35

 Logistic Regression Results:
                 Variable        P-Value  Coefficient
0                   const   0.000000e+00    -6.075083
1              Area..ABD.   2.299928e-17   -48.618605
2           Area..Filled.   7.504119e-12    44.994187
3          Diameter..ABD.   9.039100e-22    72.084328
4          Diameter..ESD.   1.208122e-01   -22.689318
5           Diameter..FD.   2.486153e-13   -58.594099
6                  Length   1.155990e-01     1.711669
7                   Width   8.620319e-52     3.779505
8               Perimeter   9.999743e-01   -13.047619
9            Volume..ABD.   8.223156e-01    -0.370392
10           Volume..ESD.   2.663941e-03     2.748991
11        Geodesic.Length   9.999964e-01     1.867008
12     Geodesic.Thickness   9.997341e-01    -6.634801
13           Aspect.Ratio  4.687038e-125    -2.414126
14             Circle.Fit   5.220706e-03     0.196420
15            Circularity   1.002446e-1

