In [1]:
!pip install -U scikit-learn



In [2]:
import pandas as pd                  # Pandas
import numpy as np                   # Numpy
from matplotlib import pyplot as plt # Matplotlib
import seaborn as sns                # Seaborn

# Package to implement Decision Tree Model
import sklearn
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
## new imports from tutorials linked in canvas

# Import MAPIE to calculate prediction intervals
from mapie.regression import MapieRegressor

# To calculate coverage score
from mapie.metrics import regression_coverage_score

# Package for data partitioning
from sklearn.model_selection import train_test_split

# Package to visualize Decision Tree
from sklearn import tree

# Package for generating confusion matrix
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Package for generating classification report
from sklearn.metrics import classification_report

# Module to save and load Python objects to and from files
import pickle 

%matplotlib inline

# Display inline plots as vector-based (svg)
%config InlineBackend.figure_formats = ['svg']

In [3]:
# Read the fetal-health records from disk and preview the first five rows
df = pd.read_csv(filepath_or_buffer = 'fetal_health.csv')
df.head(n = 5)

Unnamed: 0,baseline value,accelerations,fetal_movement,uterine_contractions,light_decelerations,severe_decelerations,prolongued_decelerations,abnormal_short_term_variability,mean_value_of_short_term_variability,percentage_of_time_with_abnormal_long_term_variability,...,histogram_min,histogram_max,histogram_number_of_peaks,histogram_number_of_zeroes,histogram_mode,histogram_mean,histogram_median,histogram_variance,histogram_tendency,fetal_health
0,120.0,0.0,0.0,0.0,0.0,0.0,0.0,73.0,0.5,43.0,...,62.0,126.0,2.0,0.0,120.0,137.0,121.0,73.0,1.0,2.0
1,132.0,0.006,0.0,0.006,0.003,0.0,0.0,17.0,2.1,0.0,...,68.0,198.0,6.0,1.0,141.0,136.0,140.0,12.0,0.0,1.0
2,133.0,0.003,0.0,0.008,0.003,0.0,0.0,16.0,2.1,0.0,...,68.0,198.0,5.0,1.0,141.0,135.0,138.0,13.0,0.0,1.0
3,134.0,0.003,0.0,0.008,0.003,0.0,0.0,16.0,2.4,0.0,...,53.0,170.0,11.0,0.0,137.0,134.0,137.0,13.0,1.0,1.0
4,132.0,0.007,0.0,0.008,0.0,0.0,0.0,16.0,2.4,0.0,...,53.0,170.0,9.0,0.0,137.0,136.0,138.0,11.0,1.0,1.0


In [4]:
# Discard rows with missing values. Reassigning (instead of inplace=True)
# keeps the step explicit and chainable.
df = df.dropna()

# Summarise column dtypes and non-null counts after cleaning
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2126 entries, 0 to 2125
Data columns (total 22 columns):
 #   Column                                                  Non-Null Count  Dtype  
---  ------                                                  --------------  -----  
 0   baseline value                                          2126 non-null   float64
 1   accelerations                                           2126 non-null   float64
 2   fetal_movement                                          2126 non-null   float64
 3   uterine_contractions                                    2126 non-null   float64
 4   light_decelerations                                     2126 non-null   float64
 5   severe_decelerations                                    2126 non-null   float64
 6   prolongued_decelerations                                2126 non-null   float64
 7   abnormal_short_term_variability                         2126 non-null   float64
 8   mean_value_of_short_term_variability  

In [5]:
# Share of rows belonging to each fetal_health class
df.loc[:, 'fetal_health'].value_counts(normalize = True)

fetal_health
1.0    0.778457
2.0    0.138758
3.0    0.082785
Name: proportion, dtype: float64

In [6]:
# Separate the predictor columns from the target column
features = df.drop('fetal_health', axis = 1)
output = df['fetal_health']

In [7]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
# NOTE(review): the target classes are imbalanced — consider stratify=output.
train_X, test_X, train_y, test_y = train_test_split(
    features,
    output,
    test_size = 0.2,
    random_state = 1,
)

In [9]:
# Build a seeded decision tree and train it on the training split
clf = DecisionTreeClassifier(random_state=0).fit(train_X, train_y)

# Show the fitted estimator as the cell output
clf

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,0
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [10]:
# Decision tree: confusion matrix on the training split
y_pred_train = clf.predict(train_X)
cm = confusion_matrix(train_y, y_pred_train, labels = clf.classes_)

# Create the 5x5 canvas, then set the global font size (same order as before)
fig, ax = plt.subplots(figsize = (5, 5))
plt.rcParams['font.size'] = 12

# Draw the matrix onto the prepared axes
disp = ConfusionMatrixDisplay(cm, display_labels = clf.classes_)
disp.plot(ax = ax, cmap = 'PuBu');

In [11]:
# Decision tree: confusion matrix on the held-out test split
y_pred = clf.predict(test_X)
cm = confusion_matrix(test_y, y_pred, labels = clf.classes_)

# Create the 5x5 canvas, then set the global font size (same order as before)
fig, ax = plt.subplots(figsize = (5, 5))
plt.rcParams['font.size'] = 12

# Draw the matrix onto the prepared axes
disp = ConfusionMatrixDisplay(cm, display_labels = clf.classes_)
disp.plot(ax = ax, cmap = 'PuRd')

# Write the figure to disk as SVG
fig.savefig("confusion_mat.svg", bbox_inches = 'tight');

In [13]:
# Per-class precision / recall / F1 / support for the test-set predictions
report = classification_report(test_y, y_pred, output_dict = True)
report_df = pd.DataFrame(report)

# Save the report as a CSV File
report_df.to_csv('class_report.csv')

# Draw the report before saving it. The inline backend closes figures when a
# cell finishes, so a bare plt.savefig here would write a blank canvas.
fig, ax = plt.subplots(figsize = (8, 3))
sns.heatmap(report_df.drop(index = 'support'), annot = True, fmt = '.2f',
            cmap = 'PuRd', vmin = 0, vmax = 1, cbar = False, ax = ax)
ax.set_title('Decision Tree classification report (test set)')
fig.savefig("class_report.svg", bbox_inches = 'tight')

# Show the table as the cell output (it was previously hidden mid-cell)
report_df

In [14]:
# Storing importance values from the trained model
importance = clf.feature_importances_

# Storing feature importance as a dataframe, most important feature first
feature_imp = pd.DataFrame(list(zip(train_X.columns, importance)),
               columns = ['Feature', 'Importance'])

feature_imp = feature_imp.sort_values('Importance', ascending = False).reset_index(drop = True)

# Bar plot
plt.figure(figsize = (10, 5))
plt.barh(feature_imp['Feature'], feature_imp['Importance'], color = ['purple', 'pink'])

plt.xlabel("Importance")
plt.ylabel("Input Feature")
# The target here is fetal_health; the old title said "species prediction"
plt.title('Which features are the most important for fetal health prediction?')
plt.tight_layout()
plt.savefig("feature_imp.svg");

In [16]:
# Pickle file: saving the trained DT model
# The context manager closes the file even if pickle.dump raises
with open('fetal_dt.pickle', 'wb') as dt_pickle:
    # Write DT model to the file
    pickle.dump(clf, dt_pickle)

In [25]:
## RANDOM FOREST
# Build a seeded random forest and train it on the training split
random_forest_clf = RandomForestClassifier(random_state = 0).fit(train_X, train_y)

# Show the fitted estimator as the cell output
random_forest_clf


0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [26]:
# Random forest: confusion matrix on the training split
y_pred_train = random_forest_clf.predict(train_X)
cm = confusion_matrix(train_y, y_pred_train, labels = random_forest_clf.classes_)

# Create the 5x5 canvas, then set the global font size (same order as before)
fig, ax = plt.subplots(figsize = (5, 5))
plt.rcParams['font.size'] = 12

# Draw the matrix onto the prepared axes
disp = ConfusionMatrixDisplay(cm, display_labels = random_forest_clf.classes_)
disp.plot(ax = ax, cmap = 'PuBu')

<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x13ef44770>

In [27]:
## CONFUSION MATRIX RANDOM FOREST

# Random forest: confusion matrix on the held-out test split
y_pred = random_forest_clf.predict(test_X)
cm = confusion_matrix(test_y, y_pred, labels = random_forest_clf.classes_)

# Create the 5x5 canvas, then set the global font size (same order as before)
fig, ax = plt.subplots(figsize = (5, 5))
plt.rcParams['font.size'] = 12

# Draw the matrix and write it to disk as SVG
disp = ConfusionMatrixDisplay(cm, display_labels = random_forest_clf.classes_)
disp.plot(ax = ax, cmap = 'PuRd')
fig.savefig("rf_confusion_mat.svg", bbox_inches = 'tight');



In [28]:
# Per-class precision / recall / F1 / support for the test-set predictions
rf_report = classification_report(test_y, y_pred, output_dict = True)
rf_report_df = pd.DataFrame(rf_report)

# Save the report as a CSV file
rf_report_df.to_csv('rf_class_report.csv')

# Draw the report before saving it. The inline backend closes figures when a
# cell finishes, so a bare plt.savefig here would write a blank canvas.
fig, ax = plt.subplots(figsize = (8, 3))
sns.heatmap(rf_report_df.drop(index = 'support'), annot = True, fmt = '.2f',
            cmap = 'PuRd', vmin = 0, vmax = 1, cbar = False, ax = ax)
ax.set_title('Random Forest classification report (test set)')
fig.savefig("rf_class_report.svg", bbox_inches = 'tight')

# Show the table as the cell output (it was previously hidden mid-cell)
rf_report_df

In [29]:
# Importance of each input feature according to the fitted random forest
rf_importance = random_forest_clf.feature_importances_

# Pair feature names with their importance, most important feature first
rf_feature_imp = pd.DataFrame(list(zip(train_X.columns, rf_importance)),
               columns = ['Feature', 'Importance'])
rf_feature_imp = rf_feature_imp.sort_values('Importance', ascending = False).reset_index(drop = True)

# Horizontal bar plot of the ranked importances
plt.figure(figsize = (10, 5))
plt.barh(rf_feature_imp['Feature'], rf_feature_imp['Importance'], color = ['purple', 'pink'])
plt.xlabel("Importance")
plt.ylabel("Input Feature")
# The target here is fetal_health; the old title said "species prediction"
plt.title('Which features are the most important for fetal health prediction?')
plt.tight_layout()
plt.savefig("rf_feature_imp.svg");

In [30]:
## SAVING RF PICKLE
# The context manager closes the file even if pickle.dump raises
with open('rf_fetal.pickle', 'wb') as rf_pickle:
    pickle.dump(random_forest_clf, rf_pickle)


In [8]:
# Build a seeded AdaBoost classifier and train it on the training split
ada_clf = AdaBoostClassifier(random_state = 0).fit(train_X, train_y)

# Show the fitted estimator as the cell output
ada_clf

0,1,2
,estimator,
,n_estimators,50
,learning_rate,1.0
,algorithm,'deprecated'
,random_state,0


In [9]:
# AdaBoost: confusion matrix on the training split
y_pred_train = ada_clf.predict(train_X)
cm = confusion_matrix(train_y, y_pred_train, labels = ada_clf.classes_)

# Create the 5x5 canvas, then set the global font size (same order as before)
fig, ax = plt.subplots(figsize = (5, 5))
plt.rcParams['font.size'] = 12

# Draw the matrix onto the prepared axes
disp = ConfusionMatrixDisplay(cm, display_labels = ada_clf.classes_)
disp.plot(ax = ax, cmap = 'PuBu')


<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x13e4ef4d0>

In [10]:
# AdaBoost: confusion matrix on the held-out test split
y_pred = ada_clf.predict(test_X)
cm = confusion_matrix(test_y, y_pred, labels = ada_clf.classes_)

# Create the 5x5 canvas, then set the global font size (same order as before)
fig, ax = plt.subplots(figsize = (5, 5))
plt.rcParams['font.size'] = 12

# Draw the matrix and write it to disk as SVG
disp = ConfusionMatrixDisplay(cm, display_labels = ada_clf.classes_)
disp.plot(ax = ax, cmap = 'PuRd')
fig.savefig("ada_confusion_mat.svg", bbox_inches = 'tight');


In [11]:
# Per-class precision / recall / F1 / support for the test-set predictions
ada_report = classification_report(test_y, y_pred, output_dict = True)
ada_report_df = pd.DataFrame(ada_report)

# Save the report as a CSV file
ada_report_df.to_csv('ada_class_report.csv')

# Draw the report before saving it. The inline backend closes figures when a
# cell finishes, so a bare plt.savefig here would write a blank canvas.
fig, ax = plt.subplots(figsize = (8, 3))
sns.heatmap(ada_report_df.drop(index = 'support'), annot = True, fmt = '.2f',
            cmap = 'PuRd', vmin = 0, vmax = 1, cbar = False, ax = ax)
ax.set_title('AdaBoost classification report (test set)')
fig.savefig("ada_class_report.svg", bbox_inches = 'tight')

# Show the table as the cell output (it was previously hidden mid-cell)
ada_report_df

In [12]:
# Importance of each input feature according to the fitted AdaBoost model
ada_importance = ada_clf.feature_importances_

# Pair feature names with their importance, most important feature first
ada_feature_imp = pd.DataFrame(
    list(zip(train_X.columns, ada_importance)),
    columns = ['Feature', 'Importance'],
)
ada_feature_imp = ada_feature_imp.sort_values('Importance', ascending = False)
ada_feature_imp = ada_feature_imp.reset_index(drop = True)

# Horizontal bar plot of the ranked importances, saved as SVG
plt.figure(figsize = (10, 5))
plt.barh(
    ada_feature_imp['Feature'],
    ada_feature_imp['Importance'],
    color = ['purple', 'pink'],
)
plt.xlabel("Importance")
plt.ylabel("Input Feature")
plt.title('Which features are the most important for prediction?')
plt.tight_layout()
plt.savefig("ada_feature_imp.svg");

In [13]:
# Persist the trained AdaBoost model.
# The context manager closes the file even if pickle.dump raises.
with open('ada_fetal.pickle', 'wb') as ada_pickle:
    pickle.dump(ada_clf, ada_pickle)

In [14]:
from sklearn.metrics import f1_score

# Fresh, unfitted base learners for the ensemble, all seeded identically
rf, dt, ada = (
    RandomForestClassifier(random_state=0),
    DecisionTreeClassifier(random_state=0),
    AdaBoostClassifier(random_state=0),
)

In [15]:
# Fit every base learner on the training split and record its macro-F1.
# NOTE(review): the scores are computed on the test split and later used as
# voting weights — consider a separate validation split to avoid leakage.
models = dict(RandomForest=rf, DecisionTree=dt, AdaBoost=ada)
f1_scores = {}

for name, model in models.items():
    y_pred = model.fit(train_X, train_y).predict(test_X)
    f1_scores[name] = f1_score(test_y, y_pred, average='macro')

# One row per model for display
f1_scores_df = pd.DataFrame(
    [(name, score) for name, score in f1_scores.items()],
    columns=['Model', 'F1_Macro'],
)
f1_scores_df

Unnamed: 0,Model,F1_Macro
0,RandomForest,0.856209
1,DecisionTree,0.861141
2,AdaBoost,0.799944


In [16]:
# Scale the macro-F1 scores so they sum to 1 and can serve as voting weights
f1_values = np.array([*f1_scores.values()])
normalized_weights = f1_values / f1_values.sum()

# Pair each model name with its weight for inspection
weights_dict = {name: weight for name, weight in zip(f1_scores, normalized_weights)}
weights_dict

{'RandomForest': np.float64(0.340130706620117),
 'DecisionTree': np.float64(0.34208996743205655),
 'AdaBoost': np.float64(0.3177793259478265)}

In [17]:
# Soft-voting ensemble over the three base learners, weighted by normalized F1
base_estimators = [('rf', rf), ('dt', dt), ('ada', ada)]
voting_clf = VotingClassifier(
    estimators=base_estimators,
    voting='soft',
    weights=normalized_weights,
)

# Train the ensemble; the fitted estimator is shown as the cell output
voting_clf.fit(train_X, train_y)

0,1,2
,estimators,"[('rf', ...), ('dt', ...), ...]"
,voting,'soft'
,weights,"array([0.3401..., 0.31777933])"
,n_jobs,
,flatten_transform,True
,verbose,False

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,0
,max_leaf_nodes,
,min_impurity_decrease,0.0

0,1,2
,estimator,
,n_estimators,50
,learning_rate,1.0
,algorithm,'deprecated'
,random_state,0


In [18]:
# Soft-voting ensemble: confusion matrix on the training split
y_pred_train = voting_clf.predict(train_X)
cm = confusion_matrix(train_y, y_pred_train, labels=voting_clf.classes_)

# Create the 5x5 canvas, then set the global font size (same order as before)
fig, ax = plt.subplots(figsize=(5, 5))
plt.rcParams['font.size'] = 12

# Draw the matrix onto the prepared axes
disp = ConfusionMatrixDisplay(cm, display_labels=voting_clf.classes_)
disp.plot(ax=ax, cmap='PuBu')

<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x13ed83d90>

In [19]:
# Soft-voting ensemble: confusion matrix on the held-out test split
y_pred = voting_clf.predict(test_X)
cm = confusion_matrix(test_y, y_pred, labels=voting_clf.classes_)

# Create the 5x5 canvas, then set the global font size (same order as before)
fig, ax = plt.subplots(figsize=(5, 5))
plt.rcParams['font.size'] = 12

# Draw the matrix and write it to disk as SVG
disp = ConfusionMatrixDisplay(cm, display_labels=voting_clf.classes_)
disp.plot(ax=ax, cmap='PuRd')
fig.savefig("soft_vote_confusion_mat.svg", bbox_inches='tight');

In [31]:
# Per-class precision / recall / F1 / support for the test-set predictions
voting_report = classification_report(test_y, y_pred, output_dict=True)
voting_report_df = pd.DataFrame(voting_report)

# Save the report as a CSV file
voting_report_df.to_csv('soft_vote_class_report.csv')

# Draw the report before saving it. The inline backend closes figures when a
# cell finishes, so a bare plt.savefig here would write a blank canvas.
fig, ax = plt.subplots(figsize=(8, 3))
sns.heatmap(voting_report_df.drop(index='support'), annot=True, fmt='.2f',
            cmap='PuRd', vmin=0, vmax=1, cbar=False, ax=ax)
ax.set_title('Soft Voting classification report (test set)')
fig.savefig("soft_vote_class_report.svg", bbox_inches = 'tight')

# Show the table as the cell output (it was previously hidden mid-cell)
voting_report_df

In [21]:
# Feature importances of the three fitted base learners, in rf / dt / ada order
rf_imp, dt_imp, ada_imp = (
    fitted.feature_importances_ for fitted in (rf, dt, ada)
)

In [22]:
# One row per model, one column per feature
all_importances = np.vstack((rf_imp, dt_imp, ada_imp))

In [23]:
# Collapse the per-model importances into one weighted average per feature
weighted_importance = np.average(all_importances, weights=normalized_weights, axis=0)

# Rank features from most to least important
feature_imp_df = pd.DataFrame(
    {'Feature': train_X.columns, 'Weighted_Importance': weighted_importance}
)
feature_imp_df = feature_imp_df.sort_values('Weighted_Importance', ascending=False)
feature_imp_df = feature_imp_df.reset_index(drop=True)

# Horizontal bar plot of the ranked importances, saved as SVG
plt.figure(figsize=(10, 5))
plt.barh(
    feature_imp_df['Feature'],
    feature_imp_df['Weighted_Importance'],
    color=['purple', 'pink'],
)
plt.xlabel("Weighted Importance")
plt.ylabel("Input Feature")
plt.title("Aggregated Feature Importance (Soft Voting Classifier)")
plt.tight_layout()
plt.savefig("soft_vote_feature_imp.svg");

In [24]:
# Persist the trained soft-voting ensemble.
# The context manager closes the file even if pickle.dump raises.
with open('soft_voting_fetal.pickle', 'wb') as voting_pickle:
    pickle.dump(voting_clf, voting_pickle)