In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

import statsmodels.api as sm
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [2]:
target_classes = ["Calanoid_1", "Cyclopoid_1"]

In [3]:
geometric_features = ['Area..ABD.', 'Area..Filled.', 'Diameter..ABD.', 'Diameter..ESD.', 'Diameter..FD.',
                      'Length','Width', 'Perimeter', 'Volume..ABD.', 'Volume..ESD.', 'Geodesic.Length', 
                      'Geodesic.Thickness']

shape_features = ['Aspect.Ratio', 'Circle.Fit', 'Circularity', 'Circularity..Hu.', 'Compactness', 
                  'Convex.Perimeter', 'Convexity', 'Fiber.Curl', 'Fiber.Straightness', 
                  'Geodesic.Aspect.Ratio', 'Roughness', 'Elongation', 'Symmetry']

optical_features = ['Edge.Gradient', 'Intensity','Sigma.Intensity', 'Sum.Intensity', 'Transparency']

environmental_features = ['gdd2', 'WaterT', 'avgdepth', 'MinDepth', 'MaxDepth', 'CLOUD_PC', 'PRECIP', 
                          'distshore', 'Exposure', 'XANGLE', 'XWAVEHT']

sampling_features = ['SITE', 'Loc', 'LAT0', 'LAT1', 'LON0', 'LON1']

biological_features = ['WhitefishDen', 'UnknwCoregonine', 'CiscoDen']

sum_features = geometric_features + shape_features + optical_features + environmental_features + sampling_features + biological_features

sum_features_drop = [feature for feature in sum_features if feature not in ['SITE', 'Loc']]

In [4]:
file_path = "SIMC_Predictor_Selection_Dataset.csv"
df = pd.read_csv(file_path)

## OLR

In [5]:
print("Total number of rows in the dataset:")
print(df.shape[0])

print("Missing values before cleaning:")
print(df.isna().sum())

Total number of rows in the dataset:
390530
Missing values before cleaning:
file_name                     0
Image.File                    0
Class                         0
Area..ABD.                    0
Area..Filled.                 0
Diameter..ABD.                0
Diameter..ESD.                0
Diameter..FD.                 0
Length                        0
Width                         0
Perimeter                     0
Volume..ABD.                  0
Volume..ESD.                  0
Geodesic.Length               0
Geodesic.Thickness            0
Aspect.Ratio                  0
Circle.Fit                    0
Circularity                   0
Circularity..Hu.              0
Compactness                   0
Convex.Perimeter              0
Convexity                     0
Fiber.Curl                    0
Fiber.Straightness            0
Geodesic.Aspect.Ratio         0
Intensity                     0
Roughness                     0
Elongation                    0
Symmetry                    

In [8]:
filtered_df = df.drop(columns=['SmeltDen'], errors='ignore')  
rows_with_missing = df[filtered_df.isna().any(axis=1)]
print(f"Total # of rows with missing values: {len(rows_with_missing)}")

Total # of rows with missing values: 56620


In [17]:
df_ols = df.copy()

for col in ['SITE', 'Loc']:
    le = LabelEncoder()
    df_ols[col] = le.fit_transform(df_ols[col].astype(str))  

columns_to_check = ['distshore', 'Exposure', 'WhitefishDen', 'UnknwCoregonine', 'CiscoDen']
df_ols = df_ols.dropna(subset=columns_to_check)

selected_features = sum_features  
df_ols = df_ols[selected_features + ['Class']].copy()

df_ols['Class'] = df_ols['Class'].map({'Calanoid_1': 0, 'Cyclopoid_1': 1})

df_ols = df_ols.apply(pd.to_numeric, errors='coerce')

X = sm.add_constant(df_ols[selected_features]) 
y = df_ols['Class']

model = sm.OLS(y, X).fit()

summary_df = pd.DataFrame({
    'Variable': model.pvalues.index,
    'P-Value': model.pvalues.values,
    'Coefficient': model.params.values
})

print("\n OLS Regression Results:")
print(summary_df)

significant_vars = summary_df[summary_df['P-Value'] < 0.001]['Variable'].tolist()

if 'const' in significant_vars:
    significant_vars.remove('const')

print("\n Significant Variables (p-value < 0.001):", significant_vars)
print(f"Total Significant Variables: {len(significant_vars)}")



 OLS Regression Results:
                 Variable        P-Value   Coefficient
0                   const   6.715162e-83  1.219659e+02
1              Area..ABD.  4.830084e-245 -4.253469e-05
2           Area..Filled.   0.000000e+00  4.723524e-05
3          Diameter..ABD.   0.000000e+00  6.302844e-02
4          Diameter..ESD.   3.319862e-40 -1.200078e-02
5           Diameter..FD.   0.000000e+00 -4.920837e-02
6                  Length  3.452565e-302  1.300382e-03
7                   Width   0.000000e+00  1.395899e-03
8               Perimeter   2.570882e-01  7.434256e+00
9            Volume..ABD.   2.011114e-27 -4.974134e-09
10           Volume..ESD.   0.000000e+00  1.018124e-09
11        Geodesic.Length   2.570645e-01 -1.486926e+01
12     Geodesic.Thickness   2.563755e-01 -1.489081e+01
13           Aspect.Ratio   0.000000e+00 -9.365297e-01
14             Circle.Fit  5.158420e-246  4.805245e-01
15            Circularity   0.000000e+00  8.845465e+00
16       Circularity..Hu.   0.000000e+0

## logistic

In [9]:
df_logit = df.copy()

columns_to_check = ['distshore', 'Exposure', 'WhitefishDen', 'UnknwCoregonine', 'CiscoDen']
df_logit = df_logit.dropna(subset=columns_to_check)

selected_features = sum_features_drop 
df_logit = df_logit[selected_features + ['Class']].copy()

df_logit['Class'] = df_logit['Class'].map({'Calanoid_1': 0, 'Cyclopoid_1': 1})

df_logit = df_logit.apply(pd.to_numeric, errors='coerce')

scaler = StandardScaler()
df_logit[selected_features] = scaler.fit_transform(df_logit[selected_features])

X = sm.add_constant(df_logit[selected_features]) 
y = df_logit['Class']

logit_model = sm.Logit(y, X).fit()

summary_df = pd.DataFrame({
    'Variable': logit_model.pvalues.index,
    'P-Value': logit_model.pvalues.values,
    'Coefficient': logit_model.params.values
})

print("\n Logistic Regression Results:")
print(summary_df)

# (p-value < 0.001）
significant_vars = summary_df[summary_df['P-Value'] < 0.001]['Variable'].tolist()

if 'const' in significant_vars:
    significant_vars.remove('const')

print("\n Significant Variables (p-value < 0.001):", significant_vars)
print(f"Total Significant Variables: {len(significant_vars)}")
columns_to_check = ['distshore', 'Exposure', 'WhitefishDen', 'UnknwCoregonine', 'CiscoDen']

  return 1/(1+np.exp(-X))


         Current function value: 0.366169
         Iterations: 35

 Logistic Regression Results:
                 Variable        P-Value    Coefficient
0                   const   0.000000e+00      -1.852552
1              Area..ABD.  1.017143e-122     -49.885186
2           Area..Filled.  6.881694e-136      53.844753
3          Diameter..ABD.   0.000000e+00     125.420018
4          Diameter..ESD.   6.603391e-92     -58.462121
5           Diameter..FD.   0.000000e+00    -127.868933
6                  Length   0.000000e+00      11.513561
7                   Width   0.000000e+00       3.663825
8               Perimeter   9.999829e-01       2.866852
9            Volume..ABD.   1.786725e-06      -1.126609
10           Volume..ESD.   3.757613e-06       0.803518
11        Geodesic.Length   9.999714e-01       4.874243
12     Geodesic.Thickness   9.997696e-01      -2.125587
13           Aspect.Ratio   0.000000e+00      -3.032815
14             Circle.Fit   1.525238e-57       0.312231
15     

