In [146]:
# Mount Google Drive under /content/drive (Colab only).
# NOTE(review): the read_excel calls in this notebook use '/content/CTG.xls', which is
# not under the mount point — confirm whether this mount is still required.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix


In [None]:
# Load the 'Data' sheet; header=1 uses the second spreadsheet row as column names.
df = pd.read_excel('/content/CTG.xls', sheet_name='Data', header = 1)
# Show the raw frame before any cleaning.
df

In [160]:
# Re-read the raw sheet so this cleaning cell starts from the unmodified data.
df = pd.read_excel('/content/CTG.xls', sheet_name='Data', header = 1)
# Keep columns 0-8, 10-30 and 45 by position (31 columns); column 9 and columns 31-44 are discarded.
df = df.iloc[:, list(range(0, 9)) + list(range(10, 31)) + [45]]
# Drop the rows labelled 2126, 2127 and 2128.
# NOTE(review): presumably trailing non-data rows of the sheet — confirm against the file.
df = df.drop([2126, 2127, 2128])

# Assign new column names
# NOTE(review): the 31 names are applied purely by position; verify that the order of the
# '.1' names matches the order of the repeated columns in the sheet.
df.columns = ['b', 'e', 'AC', 'FM', 'UC', 'DL', 'DS', 'DP', 'DR', 'LB', 'UC.1', 'AC.1', 'FM.1', 'DL.1', 'DS.1', 'DP.1', 'ASTV','MSTV', 'ALTV', 'MLTV', 'Width', 'Min', 'Max', 'Nmax', 'Nzeros', 'Mode', 'Mean', 'Median', 'Variance', 'Tendency', 'NSP']

# Handle potential duplicate column names like UC AC and other
# by suffixing repeats with _1, _2, ... The names assigned above are all distinct,
# so this loop currently changes nothing.
cols = pd.Series(df.columns)
for dup in cols[cols.duplicated()].unique():
    cols[cols[cols == dup].index.values.tolist()] = [dup + '_' + str(i) if i != 0 else dup for i in range(sum(cols == dup))]
df.columns = cols

# Remove fully identical rows, then display the cleaned frame.
df = df.drop_duplicates()
df

Unnamed: 0,b,e,AC,FM,UC,DL,DS,DP,DR,LB,...,Min,Max,Nmax,Nzeros,Mode,Mean,Median,Variance,Tendency,NSP
0,240.0,357.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,120.0,...,62.0,126.0,2.0,0.0,120.0,137.0,121.0,73.0,1.0,2.0
1,5.0,632.0,4.0,0.0,4.0,2.0,0.0,0.0,0.0,132.0,...,68.0,198.0,6.0,1.0,141.0,136.0,140.0,12.0,0.0,1.0
2,177.0,779.0,2.0,0.0,5.0,2.0,0.0,0.0,0.0,133.0,...,68.0,198.0,5.0,1.0,141.0,135.0,138.0,13.0,0.0,1.0
3,411.0,1192.0,2.0,0.0,6.0,2.0,0.0,0.0,0.0,134.0,...,53.0,170.0,11.0,0.0,137.0,134.0,137.0,13.0,1.0,1.0
4,533.0,1147.0,4.0,0.0,5.0,0.0,0.0,0.0,0.0,132.0,...,53.0,170.0,9.0,0.0,137.0,136.0,138.0,11.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2121,2059.0,2867.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,140.0,...,137.0,177.0,4.0,0.0,153.0,150.0,152.0,2.0,0.0,2.0
2122,1576.0,2867.0,1.0,0.0,9.0,0.0,0.0,0.0,0.0,140.0,...,103.0,169.0,6.0,0.0,152.0,148.0,151.0,3.0,1.0,2.0
2123,1576.0,2596.0,1.0,0.0,7.0,0.0,0.0,0.0,0.0,140.0,...,103.0,170.0,5.0,0.0,153.0,148.0,152.0,4.0,1.0,2.0
2124,1576.0,3049.0,1.0,0.0,9.0,0.0,0.0,0.0,0.0,140.0,...,103.0,169.0,6.0,0.0,152.0,147.0,151.0,4.0,1.0,2.0


**Saving the cleaned data as an Excel file**

In [162]:

# Write the cleaned frame to cleaned_data.xlsx (relative path), omitting the index column.
df.to_excel("cleaned_data.xlsx", index=False)


In [None]:
# Print the number of rows in each NSP class (1, 2, 3), one count per line.
for nsp_level in (1, 2, 3):
  print(len(df[df['NSP'] == nsp_level]))

In [None]:
chart_label = ['b', 'e', 'AC', 'FM', 'UC', 'DL', 'DS', 'DP', 'DR', 'LB', 'UC.1', 'AC.1', 'FM.1', 'DL.1', 'DS.1', 'DP.1', 'ASTV','MSTV', 'ALTV', 'MLTV', 'Width', 'Min', 'Max', 'Nmax', 'Nzeros', 'Mode', 'Mean', 'Median', 'Variance', 'Tendency', 'NSP']
# (NSP class, alpha, colour) for each of the three overlaid histograms.
class_styles = [(1, 0.6, 'blue'), (2, 0.7, 'purple'), (3, 0.5, 'red')]
# Only the first 30 labels are plotted; the last one ('NSP') is the grouping variable.
for feature in chart_label[:30]:
  # Plot histograms
  for nsp_level, opacity, colour in class_styles:
    plt.hist(df[df['NSP'] == nsp_level][feature], bins=40, alpha=opacity, label='NSP = ' + str(nsp_level), color=colour)

  plt.title(feature + ' Distribution: NSP 1 vs NSP 2 vs NSP 3')
  plt.xlabel(feature + ' Value')
  plt.ylabel('Frequency')
  plt.legend()
  plt.show()

In [None]:
chart_label = ['b', 'e', 'AC', 'FM', 'UC', 'DL', 'DS', 'DP', 'DR', 'LB', 'UC.1', 'AC.1', 'FM.1', 'DL.1', 'DS.1', 'DP.1', 'ASTV','MSTV', 'ALTV', 'MLTV', 'Width', 'Min', 'Max', 'Nmax', 'Nzeros', 'Mode', 'Mean', 'Median', 'Variance', 'Tendency', 'NSP']
# Only the first 30 labels are plotted; the last one ('NSP') is the grouping variable.
for feature in chart_label[:30]:
  # Cut the feature into 15 equal-width bins and count rows per (bin, NSP) pair.
  feature_bins = pd.cut(df[feature], bins=15)
  # observed=False is stated explicitly so empty bins stay in the table as zero rows
  # regardless of the pandas default for categorical groupers.
  heatmap_data = df.groupby([feature_bins, "NSP"], observed=False).size().unstack(fill_value=0)
  plt.figure(figsize=(10, 6))
  sns.heatmap(heatmap_data, annot=True, fmt="d", cmap="YlOrRd")
  plt.title(feature + " distribution across NSP levels")
  plt.xlabel("NSP")
  # A separating space was missing, producing labels such as "LBbins".
  plt.ylabel(feature + " bins")
  plt.tight_layout()
  plt.show()


# **Charts after balancing NSP classes 1, 2 and 3**

In [None]:
# resample is used by the upsampling cell that follows.
from sklearn.utils import resample
# Class counts of NSP before balancing.
print(df['NSP'].value_counts())

In [None]:
# Split the frame into one sub-frame per NSP class.
df_nsp1, df_nsp2, df_nsp3 = (df[df['NSP'] == level] for level in (1, 2, 3))

# Upsample classes 2 and 3 (sampling with replacement) to the size of class 1.
target_size = len(df_nsp1)
df_nsp2_upsampled = resample(df_nsp2, replace=True, n_samples=target_size, random_state=42)
df_nsp3_upsampled = resample(df_nsp3, replace=True, n_samples=target_size, random_state=42)

# Stack class 1 with the two upsampled classes.
df_balanced = pd.concat([df_nsp1, df_nsp2_upsampled, df_nsp3_upsampled])


In [None]:
# Inspect the balanced frame.
df_balanced

**Histogram**

In [None]:
chart_label = ['b', 'e', 'AC', 'FM', 'UC', 'DL', 'DS', 'DP', 'DR', 'LB', 'UC.1', 'AC.1', 'FM.1', 'DL.1', 'DS.1', 'DP.1', 'ASTV','MSTV', 'ALTV', 'MLTV', 'Width', 'Min', 'Max', 'Nmax', 'Nzeros', 'Mode', 'Mean', 'Median', 'Variance', 'Tendency', 'NSP']
# (NSP class, alpha, colour) for each of the three overlaid histograms.
class_styles = [(1, 0.6, 'blue'), (2, 0.7, 'purple'), (3, 0.5, 'red')]
# Only the first 30 labels are plotted; the last one ('NSP') is the grouping variable.
for feature in chart_label[:30]:
  # Plot histograms
  for nsp_level, opacity, colour in class_styles:
    plt.hist(df_balanced[df_balanced['NSP'] == nsp_level][feature], bins=40, alpha=opacity, label='NSP = ' + str(nsp_level), color=colour)

  plt.title(feature + ' Distribution: NSP 1 vs NSP 2 vs NSP 3')
  plt.xlabel(feature + ' Value')
  plt.ylabel('Frequency')
  plt.legend()
  plt.show()

**Heat Map**

In [None]:
chart_label = ['b', 'e', 'AC', 'FM', 'UC', 'DL', 'DS', 'DP', 'DR', 'LB', 'UC.1', 'AC.1', 'FM.1', 'DL.1', 'DS.1', 'DP.1', 'ASTV','MSTV', 'ALTV', 'MLTV', 'Width', 'Min', 'Max', 'Nmax', 'Nzeros', 'Mode', 'Mean', 'Median', 'Variance', 'Tendency', 'NSP']
# Only the first 30 labels are plotted; the last one ('NSP') is the grouping variable.
for feature in chart_label[:30]:
  # Cut the feature into 15 equal-width bins and count rows per (bin, NSP) pair.
  feature_bins = pd.cut(df_balanced[feature], bins=15)
  # observed=False is stated explicitly so empty bins stay in the table as zero rows
  # regardless of the pandas default for categorical groupers.
  heatmap_data = df_balanced.groupby([feature_bins, "NSP"], observed=False).size().unstack(fill_value=0)
  plt.figure(figsize=(10, 6))
  sns.heatmap(heatmap_data, annot=True, fmt="d", cmap="YlOrRd")
  # Separating spaces were missing in both labels (e.g. "LBdistribution ...", "LBbins").
  plt.title(feature + " distribution across NSP levels")
  plt.xlabel("NSP")
  plt.ylabel(feature + " bins")
  plt.tight_layout()
  plt.show()



# **Splitting, scaling, and normalizing**





In [None]:
#splitting the data into train and test
# Shuffle with a fixed seed so the split, and every metric computed from it, is
# reproducible across kernel restarts.
shuffled_df = df.sample(frac = 1, random_state = 42)
train_size = int(0.7 * len(shuffled_df))
# Positional slicing gives the same 70/30 cut as np.split without routing a DataFrame
# through NumPy's array-splitting code (which warns on recent pandas versions).
train = shuffled_df.iloc[:train_size]
test = shuffled_df.iloc[train_size:]

# Class counts in the training part.
print(len(train[train['NSP']== 1]))
print(len(train[train['NSP']== 2]))
print(len(train[train['NSP']== 3]))

In [None]:
def scale_dataset(dataframe, oversample = False, scaler = None, random_state = None):
  """Standardize the feature columns of ``dataframe`` and optionally oversample.

  The last column is treated as the label and every other column as a feature.
  Rows whose label cannot be parsed as a number are dropped before scaling.

  Args:
    dataframe: DataFrame whose last column is the label.
    oversample: when True, RandomOverSampler is applied to the scaled data.
    scaler: optional, already fitted scaler. When given, it is only used to
      ``transform`` the features, so a held-out split can be scaled with the
      statistics learned on the training split. When None (default) a new
      StandardScaler is fitted on ``dataframe`` itself.
    random_state: seed forwarded to RandomOverSampler. None (default) leaves
      the oversampling unseeded.

  Returns:
    Tuple ``(data, X, y)``: the scaled (and possibly oversampled) features
    ``X``, the numeric labels ``y``, and ``data`` = ``X`` with ``y`` appended
    as the last column.

  Raises:
    ValueError: if no row has a numeric label, which usually means the label
      is not the last column of ``dataframe``.
  """
  X = dataframe[dataframe.columns[:-1]].values
  # Convert y to numeric, coercing errors to NaN
  y = pd.to_numeric(dataframe[dataframe.columns[-1]].values, errors='coerce')

  # Drop rows where y is NaN
  valid_rows = ~np.isnan(y)
  X = X[valid_rows]
  y = y[valid_rows]

  # Fail with an explicit message instead of an opaque scaler error on empty input.
  if len(y) == 0:
    raise ValueError(
      "scale_dataset: no rows with a numeric label in the last column "
      f"({dataframe.columns[-1]!r}); make sure the label is the last column"
    )

  if scaler is None:
    X = StandardScaler().fit_transform(X)
  else:
    X = scaler.transform(X)

  if oversample:
    X, y = RandomOverSampler(random_state=random_state).fit_resample(X, y)

  # Built once, after the optional oversampling, so it always matches X and y.
  data = np.hstack((X, np.reshape(y, (-1,1))))

  return data, X, y

In [None]:
# Scale both splits; only the training split is oversampled.
# NOTE(review): `train` and `test` are rebound from DataFrames to the arrays returned by
# scale_dataset, so this cell cannot be re-run without first re-running the split cell.
train, X_train, y_train = scale_dataset(train, oversample = True)
# No validation split is created for this 70/30 split, so this call stays disabled:
# valid, X_valid, y_valid = scale_dataset(valid, oversample = False)
test, X_test, y_test = scale_dataset(test, oversample = False)


# **Analysis of selected features**

In [None]:
# Display the cleaned frame.
df

In [None]:
# Summary statistics from describe(), transposed so each column of df becomes a row.
df_stats = df.describe().T
df_stats

In [None]:
#distribution of the NSP

# One printed row count per NSP class, in the order 1, 2, 3.
for nsp_level in (1, 2, 3):
  print(len(df[df['NSP'] == nsp_level]))

In [None]:
# Mean and median of AC and DS within each NSP class.
comparison_table = df.groupby('NSP')[['AC', 'DS']].agg(['mean', 'median'])
# Flatten the (feature, statistic) column pairs into 'feature.statistic' names.
flat_names = []
for feature, statistic in comparison_table.columns:
  flat_names.append(feature + '.' + statistic)
comparison_table.columns = flat_names
comparison_table



**Categorizing LB (Low / Normal / High)**


In [None]:
def categorize_lb(lb):
    """Map an LB value to 'Low' (< 110), 'Normal' (110 up to but excluding 160) or 'High'.

    Any value that falls in neither of the first two ranges is reported as 'High'.
    """
    if lb < 110:
        return 'Low'
    if 110 <= lb < 160:
        return 'Normal'
    return 'High'

# Add the LB category as a new column on df.
# NOTE(review): this appends 'LB_category' after 'NSP', so 'NSP' is no longer the last
# column of df for any cell that runs after this one.
df['LB_category'] = df['LB'].apply(categorize_lb)

# Row counts per (LB category, NSP class); combinations with no rows become 0.
comparison_table = df.groupby(['LB_category', 'NSP']).size().unstack(fill_value=0)
comparison_table

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(14, 6), sharey=True)

# One stacked histogram + KDE per panel: MSTV on the left, MLTV on the right.
for ax, column, display_name in zip(axes, ['MSTV', 'MLTV'], ['mSTV', 'mLTV']):
  sns.histplot(data=df, x=column, hue='NSP', multiple='stack', bins=30, ax=ax, kde=True)
  ax.set_title('Distribution of ' + display_name + ' by NSP')
  ax.set_xlabel(display_name)

# Only the left panel gets the explicit y-axis label.
axes[0].set_ylabel('Frequency')

plt.tight_layout()
plt.show()



In [None]:
fig, axes = plt.subplots(1, 2, figsize=(14, 6), sharey=True)

# One stacked histogram + KDE per panel: ASTV on the left, ALTV on the right.
for ax, column, display_name in zip(axes, ['ASTV', 'ALTV'], ['aSTV', 'aLTV']):
  sns.histplot(data=df, x=column, hue='NSP', multiple='stack', bins=30, ax=ax, kde=True)
  ax.set_title('Distribution of ' + display_name + ' by NSP')
  ax.set_xlabel(display_name)

# Only the left panel gets the explicit y-axis label.
axes[0].set_ylabel('Frequency')

plt.tight_layout()
plt.show()

Testing whether each feature is significantly associated with the NSP level (chi-square test of independence)

In [None]:
from scipy.stats import chi2_contingency


In [None]:
# Bin every numeric column except NSP into three equal-width ranges so it can be
# cross-tabulated against NSP; non-numeric columns are kept as they are.
df_binned = df.copy()
for col in df.columns:
    if pd.api.types.is_numeric_dtype(df[col]) and col != 'NSP':
        df_binned[col] = pd.cut(df[col], bins=3, labels=['Low', 'Medium', 'High'])

results = []

for col in df_binned.columns:
    if col == 'NSP':
        continue
    contingency = pd.crosstab(df_binned[col], df_binned['NSP'])
    # The statistic is named chi2_stat rather than chi2 so that it cannot overwrite
    # sklearn's `chi2` scoring function (imported by a later cell) when this cell is
    # re-run after that import.
    chi2_stat, p_value, dof, expected = chi2_contingency(contingency)
    results.append({
        'Feature': col,
        'Chi2': round(chi2_stat, 3),
        'p-value': round(p_value, 4),
        # Significance is judged at the 0.1 level.
        'Significant': p_value < 0.1
    })

# Convert to DataFrame for display
chi_square_results = pd.DataFrame(results)
print(chi_square_results.sort_values('p-value'))

In [None]:
# Pairwise correlations between the numeric columns of df.
corr_matrix = df.corr(numeric_only=True)
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap='coolwarm', square=True, linewidths=0.5, ax=ax)

ax.set_title('Feature Correlation Heatmap')
fig.tight_layout()
plt.show()


In [None]:
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import MinMaxScaler

# Pick the feature columns by name rather than with a positional slice ([:-2]):
# the slice is only correct when 'LB_category' has already been appended after 'NSP'.
# errors='ignore' makes the cell work whether or not that column exists.
feature_names = df.columns.drop(['NSP', 'LB_category'], errors='ignore')
X = df[feature_names].values

# Rescale every feature to [0, 1] before scoring with chi2.
X_scaled = MinMaxScaler().fit_transform(X)
y = df['NSP']


# Keep the 10 features with the highest chi2 score against NSP.
selector = SelectKBest(score_func=chi2, k=10)
X_selected = selector.fit_transform(X_scaled, y)

selected_features = feature_names[selector.get_support()]
print("Selected features:", selected_features.tolist())

In [None]:
# Per-feature chi2 scores from the fitted selector, highest score first.
scores = selector.scores_
chi2_df = (
    pd.DataFrame({'Feature': feature_names, 'Chi2 Score': scores})
    .sort_values(by='Chi2 Score', ascending=False)
)
print(chi2_df)

# **Modeling prediction**
**(With and without feature selection)**

In [None]:
# Features used for the "with feature selection" models: every feature column except 'DS' and 'DR'.
selected_features = ['b', 'e', 'AC', 'FM', 'UC', 'DL', 'DP', 'LB', 'UC.1', 'AC.1', 'FM.1', 'DL.1', 'DS.1', 'DP.1', 'ASTV','MSTV', 'ALTV', 'MLTV', 'Width', 'Min', 'Max', 'Nmax', 'Nzeros', 'Mode', 'Mean', 'Median', 'Variance', 'Tendency']

# Keep the label as the last column next to the selected features. The earlier
# label-less copy was overwritten immediately and has been removed.
df_feature = df[selected_features + ['NSP']].copy()
df_feature

**Splitting the selected features**

In [None]:
# 70/30 split of the selected-feature frame, stratified on NSP, with a fixed seed.
ftrain, ftest = train_test_split(df_feature, test_size=0.3, stratify=df['NSP'], random_state=42)
# Scale both parts; only the training part is oversampled. ftrain/ftest are rebound to the
# stacked arrays returned by scale_dataset.
ftrain, Xf_train, yf_train = scale_dataset(ftrain, oversample = True)
ftest, Xf_test, yf_test = scale_dataset(ftest, oversample = False)

**K NEAREST NEIGHBOURS**

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

In [None]:
# 5-nearest-neighbours classifier trained on the full feature set.
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train, y_train)

In [None]:
# Predict NSP for the held-out test features.
y_pred = knn_model.predict(X_test)

In [None]:
print(classification_report(y_test, y_pred))

# Collapse the classes into a binary problem: NSP 2 or 3 -> 1 (positive), anything else -> 0.
y_true_bin = np.isin(y_test, [2,3]).astype(int)
y_pred_bin = np.isin(y_pred, [2,3]).astype(int)
# Flattening the binary confusion matrix yields tn, fp, fn, tp in that order.
tn, fp, fn, tp = confusion_matrix(y_true_bin, y_pred_bin).ravel()
print("False Negatives (FN):", fn)
# False negatives as a share of all test samples.
print(fn/len(y_test))

In [None]:
# Same KNN configuration, trained and evaluated on the selected-feature split.
knn_model = KNeighborsClassifier(n_neighbors=5).fit(Xf_train, yf_train)

yf_pred = knn_model.predict(Xf_test)
print(classification_report(yf_test, yf_pred))

# Collapse the classes into a binary problem: NSP 2 or 3 -> 1 (positive), anything else -> 0.
y_true_bin = np.isin(yf_test, [2,3]).astype(int)
y_pred_bin = np.isin(yf_pred, [2,3]).astype(int)
# Flattening the binary confusion matrix yields tn, fp, fn, tp in that order.
tn, fp, fn, tp = confusion_matrix(y_true_bin, y_pred_bin).ravel()
print("False Negatives (FN):", fn)
# False negatives as a share of all test samples.
print(fn/len(yf_test))

**Logistic Regression**

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic regression with default settings, trained on the full feature set.
lg_model = LogisticRegression().fit(X_train, y_train)

In [None]:
y_pred = lg_model.predict(X_test)
print(classification_report(y_test, y_pred))

# Collapse the classes into a binary problem: NSP 2 or 3 -> 1 (positive), anything else -> 0.
y_true_bin = np.isin(y_test, [2,3]).astype(int)
y_pred_bin = np.isin(y_pred, [2,3]).astype(int)
# Flattening the binary confusion matrix yields tn, fp, fn, tp in that order.
tn, fp, fn, tp = confusion_matrix(y_true_bin, y_pred_bin).ravel()
print("False Negatives (FN):", fn)
# False negatives as a share of all test samples.
print(fn/len(y_test))

In [None]:
# Logistic regression with default settings on the selected-feature split.
lg_model = LogisticRegression().fit(Xf_train, yf_train)
yf_pred = lg_model.predict(Xf_test)
print(classification_report(yf_test, yf_pred))

# Collapse the classes into a binary problem: NSP 2 or 3 -> 1 (positive), anything else -> 0.
y_true_bin = np.isin(yf_test, [2,3]).astype(int)
y_pred_bin = np.isin(yf_pred, [2,3]).astype(int)
# Flattening the binary confusion matrix yields tn, fp, fn, tp in that order.
tn, fp, fn, tp = confusion_matrix(y_true_bin, y_pred_bin).ravel()
print("False Negatives (FN):", fn)
# False negatives as a share of all test samples.
print(fn/len(yf_test))



**Support Vector Machine**

In [None]:
from sklearn.svm import SVC

In [None]:
# Support vector classifier with default settings, trained on the full feature set.
svm_model = SVC().fit(X_train, y_train)

In [None]:
y_pred = svm_model.predict(X_test)
print(classification_report(y_test, y_pred))

# Collapse the classes into a binary problem: NSP 2 or 3 -> 1 (positive), anything else -> 0.
y_true_bin = np.isin(y_test, [2,3]).astype(int)
y_pred_bin = np.isin(y_pred, [2,3]).astype(int)
# Flattening the binary confusion matrix yields tn, fp, fn, tp in that order.
tn, fp, fn, tp = confusion_matrix(y_true_bin, y_pred_bin).ravel()
print("False Negatives (FN):", fn)
# False negatives as a share of all test samples.
print(fn/len(y_test))

In [None]:
# Support vector classifier on the selected-feature split. This cell previously
# instantiated LogisticRegression, so the "SVM with feature selection" numbers were
# a repeat of the logistic-regression results.
svm_model = SVC()
svm_model = svm_model.fit(Xf_train, yf_train)
yf_pred = svm_model.predict(Xf_test)
print(classification_report(yf_test, yf_pred))

# Collapse the classes into a binary problem: NSP 2 or 3 -> 1 (positive), anything else -> 0.
y_true_bin = np.where(np.isin(yf_test, [2,3]), 1, 0)
y_pred_bin = np.where(np.isin(yf_pred, [2,3]), 1, 0)
# Flattening the binary confusion matrix yields tn, fp, fn, tp in that order.
tn, fp, fn, tp = confusion_matrix(y_true_bin, y_pred_bin).ravel()
print("False Negatives (FN):", fn)
# False negatives as a share of all test samples.
print(fn/len(yf_test))

**Random Forest**

In [None]:
from sklearn.ensemble import RandomForestClassifier

# A fixed random_state makes the forest, and therefore the reported metrics,
# reproducible between runs.
rf_model = RandomForestClassifier(random_state=42)
rf_model = rf_model.fit(X_train, y_train)

In [None]:
y_pred = rf_model.predict(X_test)
print(classification_report(y_test, y_pred))

# Collapse the classes into a binary problem: NSP 2 or 3 -> 1 (positive), anything else -> 0.
y_true_bin = np.isin(y_test, [2,3]).astype(int)
y_pred_bin = np.isin(y_pred, [2,3]).astype(int)
# Flattening the binary confusion matrix yields tn, fp, fn, tp in that order.
tn, fp, fn, tp = confusion_matrix(y_true_bin, y_pred_bin).ravel()
print("False Negatives (FN):", fn)
# False negatives as a share of all test samples.
print(fn/len(y_test))

In [None]:
# Random forest on the selected-feature split; the fixed random_state makes the
# forest, and therefore the reported metrics, reproducible between runs.
rf_model = RandomForestClassifier(random_state=42)
rf_model = rf_model.fit(Xf_train, yf_train)
yf_pred = rf_model.predict(Xf_test)
print(classification_report(yf_test, yf_pred))

# Collapse the classes into a binary problem: NSP 2 or 3 -> 1 (positive), anything else -> 0.
y_true_bin = np.where(np.isin(yf_test, [2,3]), 1, 0)
y_pred_bin = np.where(np.isin(yf_pred, [2,3]), 1, 0)
# Flattening the binary confusion matrix yields tn, fp, fn, tp in that order.
tn, fp, fn, tp = confusion_matrix(y_true_bin, y_pred_bin).ravel()
print("False Negatives (FN):", fn)
# False negatives as a share of all test samples.
print(fn/len(yf_test))

**Neural Nets**

In [None]:
#splitting the data into 3 types for training, validation, and testing
# scale_dataset treats the LAST column as the label. The LB analysis cell appends
# 'LB_category' after 'NSP', which would turn NSP into a feature and a text column into
# the label, so that column is removed here (errors='ignore' covers the case where the
# LB cell has not been run).
model_df = df.drop(columns=['LB_category'], errors='ignore')

# Shuffle with a fixed seed so the 60/20/20 split is reproducible, then cut by position.
shuffled_df = model_df.sample(frac = 1, random_state = 42)
train_end = int(0.6 * len(shuffled_df))
valid_end = int(0.8 * len(shuffled_df))
train = shuffled_df.iloc[:train_end]
valid = shuffled_df.iloc[train_end:valid_end]
test = shuffled_df.iloc[valid_end:]

# Class counts in the training part.
print(len(train[train['NSP']== 1]))
print(len(train[train['NSP']== 2]))
print(len(train[train['NSP']== 3]))

# Scale each part; only the training part is oversampled.
train, X_train, y_train = scale_dataset(train, oversample = True)
valid, X_valid, y_valid = scale_dataset(valid, oversample = False)
test, X_test, y_test = scale_dataset(test, oversample = False)

In [None]:
def plot_history(history):
  """Plot training vs. validation loss and accuracy per epoch, side by side.

  Args:
    history: object whose ``history`` dict has the keys 'loss', 'val_loss',
      'accuracy' and 'val_accuracy' (as returned by ``train_model``).
  """
  fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))
  ax1.plot(history.history['loss'], label='loss')
  ax1.plot(history.history['val_loss'], label='val_loss')
  ax1.set_xlabel('Epoch')
  ax1.set_ylabel('sparse_categorical_crossentropy')
  ax1.grid(True)
  # Without a legend the two labelled curves cannot be told apart.
  ax1.legend()

  ax2.plot(history.history['accuracy'], label='accuracy')
  ax2.plot(history.history['val_accuracy'], label='val_accuracy')
  ax2.set_xlabel('Epoch')
  ax2.set_ylabel('Accuracy')
  ax2.grid(True)
  ax2.legend()

  plt.show()

In [None]:
def train_model(X_train, y_train, num_nodes, dropout_prob, lr, batch_size, epochs, validation_data=None):
  """Build, compile and fit a two-hidden-layer softmax classifier for 3 classes.

  Args:
    X_train: 2-D feature array; its second dimension sets the input width.
    y_train: labels with values 1..3; they are shifted to 0..2 for the loss.
    num_nodes: number of units in each of the two hidden Dense layers.
    dropout_prob: dropout rate applied after each hidden layer.
    lr: learning rate passed to the Adam optimizer.
    batch_size: mini-batch size used by ``fit``.
    epochs: number of training epochs.
    validation_data: optional ``(X_val, y_val_zero_based)`` tuple. When given
      it is used for validation as-is; otherwise 20% of the (shuffled)
      training data is held out.

  Returns:
    Tuple ``(nn_model, history)`` with the fitted model and its fit history.
  """
  nn_model = tf.keras.Sequential([
    tf.keras.layers.Dense(num_nodes, activation='relu', input_shape=(X_train.shape[1],)),
    tf.keras.layers.Dropout(dropout_prob),
    tf.keras.layers.Dense(num_nodes, activation='relu'),
    tf.keras.layers.Dropout(dropout_prob),
    tf.keras.layers.Dense(3, activation='softmax') # one output per NSP class
  ])

  nn_model.compile(optimizer=tf.keras.optimizers.Adam(lr), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

  features = np.asarray(X_train)
  # Shift labels from 1..3 to 0..2 to match the three output units.
  labels = np.asarray(y_train) - 1

  if validation_data is not None:
    fit_kwargs = {'validation_data': validation_data}
  else:
    # Keras takes the validation_split fraction from the END of the arrays, before any
    # shuffling. NOTE(review): for oversampled input the tail presumably holds the
    # duplicated minority rows (confirm against the oversampler), so the rows are
    # permuted with a fixed seed to make the held-out 20% a random sample.
    order = np.random.default_rng(0).permutation(len(features))
    features, labels = features[order], labels[order]
    fit_kwargs = {'validation_split': 0.2}

  history = nn_model.fit(features, labels, epochs=epochs, batch_size=batch_size, verbose = 0, **fit_kwargs)

  return nn_model, history

In [None]:
# Exhaustive search over the hyper-parameter grid; the model with the lowest loss on the
# validation arrays is kept in least_loss_model.
least_val_loss = float('inf')
least_loss_model = None
epochs = 100
node_options = [16, 32, 64]
dropout_options = [0, 0.2]
lr_options = [0.1, 0.005, 0.001]
batch_options = [32, 64, 128]
for num_nodes in node_options:
  for dropout_prob in dropout_options:
    for lr in lr_options:
      for batch_size in batch_options:
        print(f'num_nodes: {num_nodes}, dropout_prob: {dropout_prob}, lr: {lr}, batch_size: {batch_size}')
        model, history = train_model(X_train, y_train, num_nodes, dropout_prob, lr, batch_size, epochs)
        plot_history(history)
        # The first value returned by evaluate is the loss; labels are shifted to 0..2
        # to match what the model was trained on.
        val_loss = model.evaluate(X_valid, y_valid-1)[0]
        if val_loss < least_val_loss:
          least_val_loss, least_loss_model = val_loss, model

In [None]:
# Show the model that achieved the lowest validation loss in the grid search.
least_loss_model

In [None]:
# Save the best model to nn_model.h5 (relative path).
least_loss_model.save("nn_model.h5")

In [None]:
# Take the most probable output unit per row and shift 0..2 back to NSP labels 1..3.
class_probabilities = least_loss_model.predict(X_test)
y_pred = class_probabilities.argmax(axis=1) + 1


In [None]:
print(classification_report(y_test, y_pred))

# Collapse the classes into a binary problem: NSP 2 or 3 -> 1 (positive), anything else -> 0.
y_true_bin = np.isin(y_test, [2,3]).astype(int)
y_pred_bin = np.isin(y_pred, [2,3]).astype(int)
# Flattening the binary confusion matrix yields tn, fp, fn, tp in that order.
tn, fp, fn, tp = confusion_matrix(y_true_bin, y_pred_bin).ravel()
print("False Negatives (FN):", fn)
# False negatives as a share of all test samples.
print(fn/len(y_test))