Week 4

In [14]:
import pandas as pd
import numpy as np

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt

# Statistical modeling
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score


In [16]:
# Read the CSV from the working directory into a DataFrame, then preview
# its first five rows as the cell output.
df = pd.read_csv(filepath_or_buffer="fetal_health.csv")
df.head(n=5)


Unnamed: 0,baseline value,accelerations,fetal_movement,uterine_contractions,light_decelerations,severe_decelerations,prolongued_decelerations,abnormal_short_term_variability,mean_value_of_short_term_variability,percentage_of_time_with_abnormal_long_term_variability,...,histogram_min,histogram_max,histogram_number_of_peaks,histogram_number_of_zeroes,histogram_mode,histogram_mean,histogram_median,histogram_variance,histogram_tendency,fetal_health
0,120.0,0.0,0.0,0.0,0.0,0.0,0.0,73.0,0.5,43.0,...,62.0,126.0,2.0,0.0,120.0,137.0,121.0,73.0,1.0,2.0
1,132.0,0.006,0.0,0.006,0.003,0.0,0.0,17.0,2.1,0.0,...,68.0,198.0,6.0,1.0,141.0,136.0,140.0,12.0,0.0,1.0
2,133.0,0.003,0.0,0.008,0.003,0.0,0.0,16.0,2.1,0.0,...,68.0,198.0,5.0,1.0,141.0,135.0,138.0,13.0,0.0,1.0
3,134.0,0.003,0.0,0.008,0.003,0.0,0.0,16.0,2.4,0.0,...,53.0,170.0,11.0,0.0,137.0,134.0,137.0,13.0,1.0,1.0
4,132.0,0.007,0.0,0.008,0.0,0.0,0.0,16.0,2.4,0.0,...,53.0,170.0,9.0,0.0,137.0,136.0,138.0,11.0,1.0,1.0


In [17]:
# Name of the response column; every other column is treated as a predictor.
target = "fetal_health"

y = df[target]
X = df.drop(target, axis="columns")


In [18]:
def recommend_model(target_values, max_class_levels=20):
    """Return a one-line modelling recommendation for a target column.

    The number of distinct values alone cannot tell a continuous target from a
    multiclass one (a continuous column has many distinct values too), so the
    dtype and the values themselves are inspected as well.

    Parameters
    ----------
    target_values : pandas.Series
        The response column.
    max_class_levels : int, default 20
        Heuristic ceiling on the number of distinct values a numeric column
        may have and still be treated as class labels.

    Returns
    -------
    str
        A "<task> → <model>" description.
    """
    n_levels = target_values.nunique()

    # Fewer than two distinct values means there is nothing to predict.
    if n_levels < 2:
        return "Constant Target → No model can be fitted"
    if n_levels == 2:
        return "Binary Classification → Logistic Regression"

    observed = target_values.dropna()
    # Whole-number floats (1.0, 2.0, 3.0) are class codes; any fractional
    # value marks the column as a measurement.
    has_fractions = (
        pd.api.types.is_float_dtype(observed)
        and bool((observed % 1 != 0).any())
    )
    # Integer-valued columns with very many levels are counts/measurements,
    # not labels.
    many_numeric_levels = (
        pd.api.types.is_numeric_dtype(observed) and n_levels > max_class_levels
    )

    if has_fractions or many_numeric_levels:
        return "Continuous Target → Linear Regression"
    return "Multiclass Classification → Multinomial Logistic Regression"


print(recommend_model(y))


Multiclass Classification → Multinomial Logistic Regression


In [19]:
# Score each predictor against the target with f_classif and keep the
# ten highest-scoring columns.
selector = SelectKBest(f_classif, k=10)
selector.fit(X, y)
X_selected = selector.transform(X)

# get_support() is a boolean mask over X's columns marking the retained ones.
selected_features = X.columns[selector.get_support()]
print("Selected Features:", selected_features, sep="\n")


Selected Features:
Index(['baseline value', 'accelerations', 'prolongued_decelerations',
       'abnormal_short_term_variability',
       'mean_value_of_short_term_variability',
       'percentage_of_time_with_abnormal_long_term_variability',
       'histogram_mode', 'histogram_mean', 'histogram_median',
       'histogram_variance'],
      dtype='object')


In [20]:
# State the null and alternative hypotheses and echo them, one per line.
hypotheses = dict(
    H0_1="There is no significant relationship between selected CTG features and fetal health.",
    H1_1="At least one CTG feature significantly affects fetal health classification.",
)

print(
    "\n".join(
        f"{label}: {statement}" for label, statement in hypotheses.items()
    )
)


H0_1: There is no significant relationship between selected CTG features and fetal health.
H1_1: At least one CTG feature significantly affects fetal health classification.


In [21]:
# Response and design matrix for statsmodels: the selected predictors with a
# constant column added.
y_sm = df[target]
X_sm = sm.add_constant(df.loc[:, selected_features])

# Fit the multinomial logit and print its summary table.
logit_model = sm.MNLogit(endog=y_sm, exog=X_sm)
result = logit_model.fit()

print(result.summary())


Optimization terminated successfully.
         Current function value: 0.246172
         Iterations 15
                          MNLogit Regression Results                          
Dep. Variable:           fetal_health   No. Observations:                 2126
Model:                        MNLogit   Df Residuals:                     2104
Method:                           MLE   Df Model:                           20
Date:                Mon, 02 Feb 2026   Pseudo R-squ.:                  0.6354
Time:                        22:06:31   Log-Likelihood:                -523.36
converged:                       True   LL-Null:                       -1435.6
Covariance Type:            nonrobust   LLR p-value:                     0.000
                                        fetal_health=2       coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------------------------------------------
const              

In [23]:
# Recap of the modelling choices, built from ordered (label, value) pairs and
# displayed as the cell output.
summary = dict(
    [
        ("Model Selected", "Multinomial Logistic Regression"),
        ("Reason", "Target variable is categorical with more than two classes"),
        ("Feature Selection Method", "ANOVA F-test"),
        ("Hypothesis Testing", "p-values from logistic regression coefficients"),
        ("Evaluation Metric", "Accuracy, Precision, Recall, F1-score"),
    ]
)

summary


{'Model Selected': 'Multinomial Logistic Regression',
 'Reason': 'Target variable is categorical with more than two classes',
 'Feature Selection Method': 'ANOVA F-test',
 'Hypothesis Testing': 'p-values from logistic regression coefficients',
 'Evaluation Metric': 'Accuracy, Precision, Recall, F1-score'}