In [2]:
import pandas as pd
import requests
import zipfile
import io
from scipy.io import arff
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
import warnings
import os
warnings.filterwarnings("ignore")
from sklearn.metrics import accuracy_score
import xgboost as xgb

In [3]:
f_zip = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00212/vertebral_column_data.zip'
r = requests.get(f_zip, stream=True)
Vertebral_zip = zipfile.ZipFile(io.BytesIO(r.content))
Vertebral_zip.extractall()
data = arff.loadarff('column_2C_weka.arff')
df = pd.DataFrame(data[0])
class_mapper = {b'Abnormal':1,b'Normal':0}
df['class']=df['class'].replace(class_mapper)
cols = df.columns.tolist()
cols = cols[-1:] + cols[:-1]
df = df[cols]
train, test_and_validate = train_test_split(df, test_size=0.2, random_state=42, stratify=df['class'])
test, validate = train_test_split(test_and_validate, test_size=0.5, random_state=42, stratify=test_and_validate['class'])
model = XGBClassifier(objective='binary:logistic', eval_metric='auc', num_round=42)
print(model.fit(train.drop(['class'], axis = 1).values, train['class'].values))
print("Training Completed")

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='auc', feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=None,
              num_parallel_tree=None, num_round=42, ...)
Training Completed


In [4]:
test.shape

(31, 7)

In [5]:
test.head(5)

Unnamed: 0,class,pelvic_incidence,pelvic_tilt,lumbar_lordosis_angle,sacral_slope,pelvic_radius,degree_spondylolisthesis
136,1,88.024499,39.844669,81.774473,48.17983,116.601538,56.766083
230,0,65.611802,23.137919,62.582179,42.473883,124.128001,-4.083298
134,1,52.204693,17.212673,78.094969,34.99202,136.972517,54.939134
130,1,50.066786,9.12034,32.168463,40.946446,99.712453,26.766697
47,1,41.352504,16.577364,30.706191,24.775141,113.266675,-4.497958


In [6]:
row = test.iloc[0:1,1:]
row.head()

Unnamed: 0,pelvic_incidence,pelvic_tilt,lumbar_lordosis_angle,sacral_slope,pelvic_radius,degree_spondylolisthesis
136,88.024499,39.844669,81.774473,48.17983,116.601538,56.766083


In [7]:
model.predict_proba(row)

array([[0.00177544, 0.99822456]], dtype=float32)

In [8]:
test.head(5)

Unnamed: 0,class,pelvic_incidence,pelvic_tilt,lumbar_lordosis_angle,sacral_slope,pelvic_radius,degree_spondylolisthesis
136,1,88.024499,39.844669,81.774473,48.17983,116.601538,56.766083
230,0,65.611802,23.137919,62.582179,42.473883,124.128001,-4.083298
134,1,52.204693,17.212673,78.094969,34.99202,136.972517,54.939134
130,1,50.066786,9.12034,32.168463,40.946446,99.712453,26.766697
47,1,41.352504,16.577364,30.706191,24.775141,113.266675,-4.497958


In [9]:
def predict_for_rows(row_indices):
    rows = test.iloc[row_indices, 1:]  # Select rows for prediction
    actual_classes = test.iloc[row_indices, 0]  # Actual classes
    predicted_classes = model.predict(rows)
    for i, index in enumerate(row_indices):
        print(f"Row {index}: Predicted class = {predicted_classes[i]}, Actual class = {actual_classes.iloc[i]}")

# Example usage:
predict_for_rows([1, 3, 5])  # Predict and compare for 2nd, 4th, and 6th rows in the test set


Row 1: Predicted class = 1, Actual class = 0
Row 3: Predicted class = 1, Actual class = 1
Row 5: Predicted class = 1, Actual class = 1


In [11]:
second_row = test.iloc[1:2, 1:]  # Changed from 0:1 to 1:2 to fetch the second row

# Predict probabilities for the second row
prediction_proba_second_row = model.predict_proba(second_row)
print("Prediction probabilities for the second row:", prediction_proba_second_row)

# Actual class for the second row
actual_class_second_row = test.iloc[1, 0]  # This gets the actual class for the second row
print("Actual class for the second row:", actual_class_second_row)

predicted_class_second_row = model.predict(second_row)
print("Predicted class for the second row:", predicted_class_second_row[0])  # Model predicts in batches, hence [0]

Prediction probabilities for the second row: [[0.33137828 0.6686217 ]]
Actual class for the second row: 0
Predicted class for the second row: 1


In [12]:
batch_X = test.iloc[:,1:];
batch_X.head()

Unnamed: 0,pelvic_incidence,pelvic_tilt,lumbar_lordosis_angle,sacral_slope,pelvic_radius,degree_spondylolisthesis
136,88.024499,39.844669,81.774473,48.17983,116.601538,56.766083
230,65.611802,23.137919,62.582179,42.473883,124.128001,-4.083298
134,52.204693,17.212673,78.094969,34.99202,136.972517,54.939134
130,50.066786,9.12034,32.168463,40.946446,99.712453,26.766697
47,41.352504,16.577364,30.706191,24.775141,113.266675,-4.497958


In [13]:
predicted_probabilities = model.predict_proba(batch_X)

In [14]:
target_predicted = pd.DataFrame(predicted_probabilities[:, 1], columns=['class'])
target_predicted.head(5)

Unnamed: 0,class
0,0.998225
1,0.668622
2,0.995486
3,0.998336
4,0.961274


In [15]:
def binary_convert(x):
    threshold = 0.65
    if x > threshold:
        return 1
    else:
        return 0

target_predicted['binary'] = target_predicted['class'].apply(binary_convert)

print(target_predicted.head(10))
test.head(10)

      class  binary
0  0.998225       1
1  0.668622       1
2  0.995486       1
3  0.998336       1
4  0.961274       1
5  0.999004       1
6  0.997197       1
7  0.991417       1
8  0.997661       1
9  0.659416       1


Unnamed: 0,class,pelvic_incidence,pelvic_tilt,lumbar_lordosis_angle,sacral_slope,pelvic_radius,degree_spondylolisthesis
136,1,88.024499,39.844669,81.774473,48.17983,116.601538,56.766083
230,0,65.611802,23.137919,62.582179,42.473883,124.128001,-4.083298
134,1,52.204693,17.212673,78.094969,34.99202,136.972517,54.939134
130,1,50.066786,9.12034,32.168463,40.946446,99.712453,26.766697
47,1,41.352504,16.577364,30.706191,24.775141,113.266675,-4.497958
135,1,77.121344,30.349874,77.481083,46.77147,110.611148,82.093607
100,1,84.585607,30.361685,65.479486,54.223922,108.010218,25.118478
89,1,71.186811,23.896201,43.696665,47.29061,119.864938,27.283985
297,0,45.575482,18.759135,33.774143,26.816347,116.797007,3.13191
4,1,49.712859,9.652075,28.317406,40.060784,108.168725,7.918501


In [16]:
import pandas as pd
from sklearn.metrics import accuracy_score


# Modify the binary_convert function to accept a threshold argument
def binary_convert(x, threshold=0.65):
    if x > threshold:
        return 1
    else:
        return 0

# Function to experiment with different thresholds
def experiment_with_thresholds(data, thresholds):
    for threshold in thresholds:
        data['binary'] = data['class'].apply(binary_convert, threshold=threshold)
        print(f"Threshold: {threshold}\n", data.head(10))
       
thresholds = [0.5, 0.65, 0.75, 0.85]

# Running the experiment
experiment_with_thresholds(target_predicted, thresholds)


Threshold: 0.5
       class  binary
0  0.998225       1
1  0.668622       1
2  0.995486       1
3  0.998336       1
4  0.961274       1
5  0.999004       1
6  0.997197       1
7  0.991417       1
8  0.997661       1
9  0.659416       1
Threshold: 0.65
       class  binary
0  0.998225       1
1  0.668622       1
2  0.995486       1
3  0.998336       1
4  0.961274       1
5  0.999004       1
6  0.997197       1
7  0.991417       1
8  0.997661       1
9  0.659416       1
Threshold: 0.75
       class  binary
0  0.998225       1
1  0.668622       0
2  0.995486       1
3  0.998336       1
4  0.961274       1
5  0.999004       1
6  0.997197       1
7  0.991417       1
8  0.997661       1
9  0.659416       0
Threshold: 0.85
       class  binary
0  0.998225       1
1  0.668622       0
2  0.995486       1
3  0.998336       1
4  0.961274       1
5  0.999004       1
6  0.997197       1
7  0.991417       1
8  0.997661       1
9  0.659416       0
