**AI Usage**

*Tool:* ChatGPT (GPT-5)

*Purpose:* Debugged errors; explained AdaBoost and Soft Voting; suggested formatting solutions

*Usage:* Modified code for AdaBoost and Soft Voting based on solutions for errors; adopted code for getting the weights of each algorithm and how to implement it; modified code to change the fetal health column to Normal, Suspect, and Pathological

*Location:* Documented here and further comments in fetal_health.py

## **Import Libraries**

In [25]:
import pandas as pd                  # Pandas
import numpy as np                   # Numpy
from matplotlib import pyplot as plt # Matplotlib
import seaborn as sns                # Seaborn

# Package for data partitioning
from sklearn.model_selection import train_test_split

# Random Forest Classifier Libraries
import sklearn
from sklearn.ensemble import RandomForestClassifier

# Decision Tree Classifier Libraries
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree # Package to visualize Decision Tree

# Soft voting Classifier Libraries
from sklearn.svm import SVC                         
from sklearn.ensemble import VotingClassifier
# Package for generating F1 Score
from sklearn.metrics import f1_score


# Adaboost Classifier Libraries
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
# Package to record time
import time

# Package for generating confusion matrix
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Package for generating classification report
from sklearn.metrics import classification_report

# Module to save and load Python objects to and from files
import pickle 

%matplotlib inline

# Display inline plots as vector-based (svg)
%config InlineBackend.figure_formats = ['svg']

In [26]:
# Load the fetal health dataset from a CSV file in the working directory
fetal_df = pd.read_csv('fetal_health.csv')
# Preview the first five rows
fetal_df.head()

Unnamed: 0,baseline value,accelerations,fetal_movement,uterine_contractions,light_decelerations,severe_decelerations,prolongued_decelerations,abnormal_short_term_variability,mean_value_of_short_term_variability,percentage_of_time_with_abnormal_long_term_variability,...,histogram_min,histogram_max,histogram_number_of_peaks,histogram_number_of_zeroes,histogram_mode,histogram_mean,histogram_median,histogram_variance,histogram_tendency,fetal_health
0,120.0,0.0,0.0,0.0,0.0,0.0,0.0,73.0,0.5,43.0,...,62.0,126.0,2.0,0.0,120.0,137.0,121.0,73.0,1.0,2.0
1,132.0,0.006,0.0,0.006,0.003,0.0,0.0,17.0,2.1,0.0,...,68.0,198.0,6.0,1.0,141.0,136.0,140.0,12.0,0.0,1.0
2,133.0,0.003,0.0,0.008,0.003,0.0,0.0,16.0,2.1,0.0,...,68.0,198.0,5.0,1.0,141.0,135.0,138.0,13.0,0.0,1.0
3,134.0,0.003,0.0,0.008,0.003,0.0,0.0,16.0,2.4,0.0,...,53.0,170.0,11.0,0.0,137.0,134.0,137.0,13.0,1.0,1.0
4,132.0,0.007,0.0,0.008,0.0,0.0,0.0,16.0,2.4,0.0,...,53.0,170.0,9.0,0.0,137.0,136.0,138.0,11.0,1.0,1.0


In [27]:
# Dropping null values
# Re-binding the name (instead of inplace = True) keeps this cell idempotent and
# avoids mutating the frame object that an earlier cell already displayed.
fetal_df = fetal_df.dropna()
fetal_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2126 entries, 0 to 2125
Data columns (total 22 columns):
 #   Column                                                  Non-Null Count  Dtype  
---  ------                                                  --------------  -----  
 0   baseline value                                          2126 non-null   float64
 1   accelerations                                           2126 non-null   float64
 2   fetal_movement                                          2126 non-null   float64
 3   uterine_contractions                                    2126 non-null   float64
 4   light_decelerations                                     2126 non-null   float64
 5   severe_decelerations                                    2126 non-null   float64
 6   prolongued_decelerations                                2126 non-null   float64
 7   abnormal_short_term_variability                         2126 non-null   float64
 8   mean_value_of_short_term_variability  

In [28]:
# Translate the numeric fetal_health codes into readable class names
health_labels = {1: 'Normal', 2: 'Suspect', 3: 'Pathological'}
fetal_df['fetal_health'] = fetal_df['fetal_health'].replace(health_labels)

# Show the relabelled frame as the cell output
fetal_df

Unnamed: 0,baseline value,accelerations,fetal_movement,uterine_contractions,light_decelerations,severe_decelerations,prolongued_decelerations,abnormal_short_term_variability,mean_value_of_short_term_variability,percentage_of_time_with_abnormal_long_term_variability,...,histogram_min,histogram_max,histogram_number_of_peaks,histogram_number_of_zeroes,histogram_mode,histogram_mean,histogram_median,histogram_variance,histogram_tendency,fetal_health
0,120.0,0.000,0.000,0.000,0.000,0.0,0.0,73.0,0.5,43.0,...,62.0,126.0,2.0,0.0,120.0,137.0,121.0,73.0,1.0,Suspect
1,132.0,0.006,0.000,0.006,0.003,0.0,0.0,17.0,2.1,0.0,...,68.0,198.0,6.0,1.0,141.0,136.0,140.0,12.0,0.0,Normal
2,133.0,0.003,0.000,0.008,0.003,0.0,0.0,16.0,2.1,0.0,...,68.0,198.0,5.0,1.0,141.0,135.0,138.0,13.0,0.0,Normal
3,134.0,0.003,0.000,0.008,0.003,0.0,0.0,16.0,2.4,0.0,...,53.0,170.0,11.0,0.0,137.0,134.0,137.0,13.0,1.0,Normal
4,132.0,0.007,0.000,0.008,0.000,0.0,0.0,16.0,2.4,0.0,...,53.0,170.0,9.0,0.0,137.0,136.0,138.0,11.0,1.0,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2121,140.0,0.000,0.000,0.007,0.000,0.0,0.0,79.0,0.2,25.0,...,137.0,177.0,4.0,0.0,153.0,150.0,152.0,2.0,0.0,Suspect
2122,140.0,0.001,0.000,0.007,0.000,0.0,0.0,78.0,0.4,22.0,...,103.0,169.0,6.0,0.0,152.0,148.0,151.0,3.0,1.0,Suspect
2123,140.0,0.001,0.000,0.007,0.000,0.0,0.0,79.0,0.4,20.0,...,103.0,170.0,5.0,0.0,153.0,148.0,152.0,4.0,1.0,Suspect
2124,140.0,0.001,0.000,0.006,0.000,0.0,0.0,78.0,0.4,27.0,...,103.0,169.0,6.0,0.0,152.0,147.0,151.0,4.0,1.0,Suspect


In [29]:
# Distribution of Fetal Health column
# normalize = True reports each class as a proportion of all rows instead of a raw count
fetal_df['fetal_health'].value_counts(normalize = True)

fetal_health
Normal          0.778457
Suspect         0.138758
Pathological    0.082785
Name: proportion, dtype: float64

**Selecting Input and Output Features**

In [30]:
# Inputs: every column except the fetal_health label
features = fetal_df.drop('fetal_health', axis = 1)

# Target: the fetal_health label column
output = fetal_df['fetal_health']

**Data Partitioning**

In [31]:
# Hold out 20% of the rows as a test set; random_state fixes the shuffle so the split is reproducible
# NOTE(review): no stratify argument is passed, so class proportions may differ between
# the train and test splits — confirm this is intended given the imbalanced classes
train_X, test_X, train_y, test_y = train_test_split(features, output, test_size = 0.2, random_state = 1) 

## **Random Forest**

In [32]:
# Build a seeded random forest and train it on the training split in one step
clf_rf = RandomForestClassifier(random_state = 0).fit(train_X, train_y)

# Show the fitted estimator as the cell output
clf_rf

In [33]:
# Class labels in the order stored on the fitted forest
rf_labels = clf_rf.classes_

# Predict the rows the forest was trained on
y_pred_train = clf_rf.predict(train_X)

# Confusion matrix of true vs. predicted training labels
cm = confusion_matrix(train_y, y_pred_train, labels = rf_labels)
disp = ConfusionMatrixDisplay(cm, display_labels = rf_labels)

# Create the 5 x 5 figure, then set the font size used when the matrix is drawn
fig, ax = plt.subplots(figsize = (5, 5))
plt.rcParams['font.size'] = 12

# Draw the matrix (trailing semicolon hides the return value)
disp.plot(ax = ax, cmap = 'Greens');

In [34]:
# Class labels in the order stored on the fitted forest
rf_labels = clf_rf.classes_

# Predict the held-out test rows
y_pred = clf_rf.predict(test_X)

# Confusion matrix of true vs. predicted test labels
cm = confusion_matrix(test_y, y_pred, labels = rf_labels)
disp = ConfusionMatrixDisplay(cm, display_labels = rf_labels)

# Create the 5 x 5 figure, then set the font size used when the matrix is drawn
fig, ax = plt.subplots(figsize = (5, 5))
plt.rcParams['font.size'] = 12

# Draw the matrix and write the figure to disk as SVG
disp.plot(ax = ax, cmap = 'Greens')
plt.savefig("rf_confusion_mat.svg", bbox_inches = 'tight');

In [35]:
# Classification report for the random forest's test predictions, as a dict
report_rf = classification_report(test_y, y_pred, output_dict = True)

# Tabulate the report and render it in the notebook
report_rf_df = pd.DataFrame(data = report_rf)
display(report_rf_df)

# Persist the table next to the notebook as CSV
report_rf_df.to_csv(path_or_buf = 'rf_class_report.csv')

Unnamed: 0,Normal,Pathological,Suspect,accuracy,macro avg,weighted avg
precision,0.93913,0.870968,0.88,0.92723,0.896699,0.924572
recall,0.993865,0.84375,0.647059,0.92723,0.828225,0.92723
f1-score,0.965723,0.857143,0.745763,0.92723,0.856209,0.922456
support,326.0,32.0,68.0,0.92723,426.0,426.0


In [36]:
# Storing importance values from the trained model
importance_rf = clf_rf.feature_importances_

# Storing feature importance as a dataframe (one row per input column)
feature_imp = pd.DataFrame(list(zip(train_X.columns, importance_rf)),
               columns = ['Feature', 'Importance'])

# Rank the features from most to least important
feature_imp = feature_imp.sort_values('Importance', ascending = False).reset_index(drop = True)

# Horizontal bar plot
plt.figure(figsize = (10, 5))
plt.barh(feature_imp['Feature'], feature_imp['Importance'], color = ['black', 'green'])

plt.xlabel("Importance")
plt.ylabel("Input Feature")
# The target being predicted in this notebook is fetal health
plt.title('Which features are the most important for fetal health prediction?')
plt.tight_layout()
plt.savefig("rf_feature_imp.svg");

**Save Pickle**

In [37]:
# Pickle file: saving the trained random forest model
# The file is opened for binary writing (wb); the with-block guarantees it is
# closed even if pickle.dump raises.
with open('random_forest_fetal.pickle', 'wb') as rf_pickle:
    # Write the random forest model to the file
    pickle.dump(clf_rf, rf_pickle)

## **Decision Tree**

In [38]:
# Build a seeded decision tree and train it on the training split in one step
clf_dt = DecisionTreeClassifier(random_state = 0).fit(train_X, train_y)

# Show the fitted estimator as the cell output
clf_dt

In [39]:
# Class labels in the order stored on the fitted tree
dt_labels = clf_dt.classes_

# Predict the rows the tree was trained on
y_pred_train = clf_dt.predict(train_X)

# Confusion matrix of true vs. predicted training labels
cm = confusion_matrix(train_y, y_pred_train, labels = dt_labels)
disp = ConfusionMatrixDisplay(cm, display_labels = dt_labels)

# Create the 5 x 5 figure, then set the font size used when the matrix is drawn
fig, ax = plt.subplots(figsize = (5, 5))
plt.rcParams['font.size'] = 12

# Draw the matrix (trailing semicolon hides the return value)
disp.plot(ax = ax, cmap = 'Blues');

In [40]:
# Class labels in the order stored on the fitted tree
dt_labels = clf_dt.classes_

# Predict the held-out test rows
y_pred = clf_dt.predict(test_X)

# Confusion matrix of true vs. predicted test labels
cm = confusion_matrix(test_y, y_pred, labels = dt_labels)
disp = ConfusionMatrixDisplay(cm, display_labels = dt_labels)

# Create the 5 x 5 figure, then set the font size used when the matrix is drawn
fig, ax = plt.subplots(figsize = (5, 5))
plt.rcParams['font.size'] = 12

# Draw the matrix and write the figure to disk as SVG
disp.plot(ax = ax, cmap = 'Blues')
plt.savefig("dt_confusion_mat.svg", bbox_inches = 'tight');

In [41]:
# Classification report for the decision tree's test predictions, as a dict
report_dt = classification_report(test_y, y_pred, output_dict = True)

# Tabulate the report and render it in the notebook
report_dt_df = pd.DataFrame(data = report_dt)
display(report_dt_df)

# Persist the table next to the notebook as CSV
report_dt_df.to_csv(path_or_buf = 'dt_class_report.csv')

Unnamed: 0,Normal,Pathological,Suspect,accuracy,macro avg,weighted avg
precision,0.93994,0.882353,0.779661,0.913146,0.867318,0.91003
recall,0.960123,0.9375,0.676471,0.913146,0.858031,0.913146
f1-score,0.949924,0.909091,0.724409,0.913146,0.861141,0.910859
support,326.0,32.0,68.0,0.913146,426.0,426.0


In [42]:
# Storing importance values from the trained model
importance_dt = clf_dt.feature_importances_

# Storing feature importance as a dataframe (one row per input column)
feature_imp = pd.DataFrame(list(zip(train_X.columns, importance_dt)),
               columns = ['Feature', 'Importance'])

# Rank the features from most to least important
feature_imp = feature_imp.sort_values('Importance', ascending = False).reset_index(drop = True)

# Horizontal bar plot
plt.figure(figsize = (10, 5))
plt.barh(feature_imp['Feature'], feature_imp['Importance'], color = ['black', 'blue'])

plt.xlabel("Importance")
plt.ylabel("Input Feature")
# The target being predicted in this notebook is fetal health
plt.title('Which features are the most important for fetal health prediction?')
plt.tight_layout()
plt.savefig("dt_feature_imp.svg");

In [43]:
# Pickle file: saving the trained decision tree model
# The file is opened for binary writing (wb); the with-block guarantees it is
# closed even if pickle.dump raises.
with open('decision_tree_fetal.pickle', 'wb') as dt_pickle:
    # Write the decision tree model to the file
    pickle.dump(clf_dt, dt_pickle)

## **AdaBoost**

In [44]:
# Build a seeded AdaBoost classifier (SAMME algorithm) and train it on the training split
clf_ada = AdaBoostClassifier(random_state = 0, algorithm = 'SAMME').fit(train_X, train_y)

# Show the fitted estimator as the cell output
clf_ada



In [45]:
# Class labels in the order stored on the fitted AdaBoost model
ada_labels = clf_ada.classes_

# Predict the rows the model was trained on
y_pred_train = clf_ada.predict(train_X)

# Confusion matrix of true vs. predicted training labels
cm = confusion_matrix(train_y, y_pred_train, labels = ada_labels)
disp = ConfusionMatrixDisplay(cm, display_labels = ada_labels)

# Create the 5 x 5 figure, then set the font size used when the matrix is drawn
fig, ax = plt.subplots(figsize = (5, 5))
plt.rcParams['font.size'] = 12

# Draw the matrix (trailing semicolon hides the return value)
disp.plot(ax = ax, cmap = 'Reds');

In [46]:
# Class labels in the order stored on the fitted AdaBoost model
ada_labels = clf_ada.classes_

# Predict the held-out test rows
y_pred = clf_ada.predict(test_X)

# Confusion matrix of true vs. predicted test labels
cm = confusion_matrix(test_y, y_pred, labels = ada_labels)
disp = ConfusionMatrixDisplay(cm, display_labels = ada_labels)

# Create the 5 x 5 figure, then set the font size used when the matrix is drawn
fig, ax = plt.subplots(figsize = (5, 5))
plt.rcParams['font.size'] = 12

# Draw the matrix and write the figure to disk as SVG
disp.plot(ax = ax, cmap = 'Reds')
plt.savefig("ada_confusion_mat.svg", bbox_inches = 'tight');

In [47]:
# Classification report for AdaBoost's test predictions, as a dict
report_ada = classification_report(test_y, y_pred, output_dict = True)

# Tabulate the report and render it in the notebook
report_ada_df = pd.DataFrame(data = report_ada)
display(report_ada_df)

# Persist the table next to the notebook as CSV
report_ada_df.to_csv(path_or_buf = 'ada_class_report.csv')

Unnamed: 0,Normal,Pathological,Suspect,accuracy,macro avg,weighted avg
precision,0.909621,0.958333,0.694915,0.882629,0.85429,0.879008
recall,0.957055,0.71875,0.602941,0.882629,0.759582,0.882629
f1-score,0.932735,0.821429,0.645669,0.882629,0.799944,0.878552
support,326.0,32.0,68.0,0.882629,426.0,426.0


In [48]:
# Storing importance values from the trained model
importance_ada = clf_ada.feature_importances_

# Storing feature importance as a dataframe (one row per input column)
feature_imp = pd.DataFrame(list(zip(train_X.columns, importance_ada)),
               columns = ['Feature', 'Importance'])

# Rank the features from most to least important
feature_imp = feature_imp.sort_values('Importance', ascending = False).reset_index(drop = True)

# Horizontal bar plot
plt.figure(figsize = (10, 5))
plt.barh(feature_imp['Feature'], feature_imp['Importance'], color = ['black', 'red'])

plt.xlabel("Importance")
plt.ylabel("Input Feature")
# The target being predicted in this notebook is fetal health
plt.title('Which features are the most important for fetal health prediction?')
plt.tight_layout()
plt.savefig("ada_feature_imp.svg");

**Save Pickle**

In [49]:
# Pickle file: saving the trained AdaBoost model
# The file is opened for binary writing (wb); the with-block guarantees it is
# closed even if pickle.dump raises.
with open('AdaBoost_fetal.pickle', 'wb') as ada_pickle:
    # Write the AdaBoost model to the file
    pickle.dump(clf_ada, ada_pickle)

## **Soft Voting**

In [50]:
# Defining prediction model
# Unweighted soft-voting ensemble built from the random forest, decision tree and
# AdaBoost classifiers. This cell only constructs the ensemble; it is fitted in the
# evaluation loop that follows.
clf_sv_iniital = VotingClassifier(estimators = [('rf', clf_rf), ('dt', clf_dt), ('ada', clf_ada)],
                            voting = 'soft', n_jobs = -1)



In [51]:
# Evaluate prediction performance on test data using F1 Score
# Each model is fitted on the training split and scored with macro-averaged F1 on the
# test split. The scores are collected in loop order:
#   weights[0] = decision tree, weights[1] = random forest,
#   weights[2] = AdaBoost,      weights[3] = unweighted soft voting
# Any later code that consumes `weights` must index it in this order.
# NOTE(review): the weights are derived from test-set scores, so test metrics of a model
# that uses these weights are not independent of the test data — consider a validation split.
weights = []
for clf in (clf_dt, clf_rf, clf_ada, clf_sv_iniital):
    clf.fit(train_X, train_y)
    y_pred = clf.predict(test_X)
    f1 = f1_score(test_y, y_pred, average = 'macro')
    weights.append(f1)
    print(clf.__class__.__name__, f1)

#normalizing weights
# Dividing by the total makes the four weights sum to 1
weights = np.array(weights) / np.sum(weights)
print(weights)

DecisionTreeClassifier 0.8611414951252213
RandomForestClassifier 0.8562094569318798
AdaBoostClassifier 0.7999444295920409




VotingClassifier 0.8611414951252213
[0.25489347 0.25343361 0.23677945 0.25489347]


In [52]:
# Weighted soft-voting ensemble.
# `weights` was filled in the order decision tree, random forest, AdaBoost, unweighted
# soft voting, and VotingClassifier pairs weights with estimators by position. The
# estimators are therefore listed in that same order so each model receives the weight
# computed from its own F1 score.
clf_sv = VotingClassifier(estimators = [('dt', clf_dt), ('rf', clf_rf), ('ada', clf_ada), ('sv', clf_sv_iniital)],
                            voting = 'soft', n_jobs = -1, weights = weights)

# Fitting model on training data
clf_sv.fit(train_X, train_y)


In [53]:
# Predictions on training set
y_pred_train = clf_sv.predict(train_X)

# Now generate confusion matrix
# The labels come from the soft-voting model itself (the model whose predictions are
# being evaluated), matching the test-set confusion matrix for this model.
cm = confusion_matrix(train_y, y_pred_train, labels = clf_sv.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix = cm, display_labels = clf_sv.classes_)

# Specify figure size
fig, ax = plt.subplots(figsize = (5, 5))
plt.rcParams.update({'font.size': 12})

# Display Confusion Matrix
disp.plot(cmap = 'PuOr', ax = ax);

In [54]:
# Class labels in the order stored on the fitted soft-voting model
sv_labels = clf_sv.classes_

# Predict the held-out test rows
y_pred = clf_sv.predict(test_X)

# Confusion matrix of true vs. predicted test labels
cm = confusion_matrix(test_y, y_pred, labels = sv_labels)
disp = ConfusionMatrixDisplay(cm, display_labels = sv_labels)

# Create the 5 x 5 figure, then set the font size used when the matrix is drawn
fig, ax = plt.subplots(figsize = (5, 5))
plt.rcParams['font.size'] = 12

# Draw the matrix and write the figure to disk as SVG
disp.plot(ax = ax, cmap = 'PuOr')
plt.savefig("sv_confusion_mat.svg", bbox_inches = 'tight');

In [55]:
# Classification report for the soft-voting model's test predictions, as a dict
report_sv = classification_report(test_y, y_pred, output_dict = True)

# Tabulate the report and render it in the notebook
report_sv_df = pd.DataFrame(data = report_sv)
display(report_sv_df)

# Persist the table next to the notebook as CSV
report_sv_df.to_csv(path_or_buf = 'sv_class_report.csv')

Unnamed: 0,Normal,Pathological,Suspect,accuracy,macro avg,weighted avg
precision,0.93994,0.882353,0.779661,0.913146,0.867318,0.91003
recall,0.960123,0.9375,0.676471,0.913146,0.858031,0.913146
f1-score,0.949924,0.909091,0.724409,0.913146,0.861141,0.910859
support,326.0,32.0,68.0,0.913146,426.0,426.0


In [56]:
# Storing importance values from the trained model
# Used ChatGPT to figure out normalization of the ada, rf, and dt feature importances
def normal_importance(clf):
    return clf / np.sum(clf)

clf_rf_normal = normal_importance(importance_rf)
clf_dt_normal = normal_importance(importance_dt)
clf_ada_normal = normal_importance(importance_ada)

importance_sv = weights[0] * clf_dt_normal + weights[1] * clf_rf_normal + weights[2] * clf_ada_normal

# Storing feature importance as a dataframe
feature_imp = pd.DataFrame(list(zip(train_X.columns, importance_sv)),
               columns = ['Feature', 'Importance'])

feature_imp = feature_imp.sort_values('Importance', ascending = False).reset_index(drop = True)

# Bar plot
plt.figure(figsize = (10, 5))
plt.barh(feature_imp['Feature'], feature_imp['Importance'], color = ['purple', 'orange'])

plt.xlabel("Importance")
plt.ylabel("Input Feature")
plt.title('Which features are the most important for species prediction?') 
plt.tight_layout()
plt.savefig("sv_feature_imp.svg");

In [57]:
# Pickle file: saving the trained soft-voting model
# The file is opened for binary writing (wb); the with-block guarantees it is
# closed even if pickle.dump raises.
with open('soft_voting_fetal.pickle', 'wb') as sv_pickle:
    # Write the soft-voting model to the file
    pickle.dump(clf_sv, sv_pickle)