<a href="https://colab.research.google.com/github/ktcraig/13-Mini-Project/blob/main/Mini_Project_13.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [103]:
# Import required dependencies
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [104]:
# Import data
# The path is relative, so the CSV must sit in the notebook's working directory
# (NOTE(review): on Colab this presumably means uploading the file first - confirm).
file_path = "vertebral-column.csv"
df = pd.read_csv(file_path)
# Preview the first rows to sanity-check that the file loaded as expected
df.head()

Unnamed: 0,pelvic_incidence,pelvic_tilt,lumbar_lordosis_angle,sacral_slope,pelvic_radius,degree_spondylolisthesis,class
0,63.027817,22.552586,39.609117,40.475232,98.672917,-0.2544,Hernia
1,39.056951,10.060991,25.015378,28.99596,114.405425,4.564259,Hernia
2,68.832021,22.218482,50.092194,46.613539,105.985135,-3.530317,Hernia
3,69.297008,24.652878,44.311238,44.64413,101.868495,11.211523,Hernia
4,49.712859,9.652075,28.317406,40.060784,108.168725,7.918501,Hernia


In [105]:
# Check the value_counts of the target column to see how many samples
# each class has (i.e. how balanced the target is)
df["class"].value_counts()

Unnamed: 0_level_0,count
class,Unnamed: 1_level_1
Spondylolisthesis,150
Normal,100
Hernia,60


## Preprocess the data

In [106]:
# Check the data types: numeric columns can be scaled directly, while an
# object (string) column has to be encoded before it can be used for modelling
df.dtypes

Unnamed: 0,0
pelvic_incidence,float64
pelvic_tilt,float64
lumbar_lordosis_angle,float64
sacral_slope,float64
pelvic_radius,float64
degree_spondylolisthesis,float64
class,object


In [107]:
# Get the target variable (the "class" column)
# y keeps the original string labels at this point; they are converted to
# integers with a LabelEncoder after the train/test split
y = df["class"]
y

Unnamed: 0,class
0,Hernia
1,Hernia
2,Hernia
3,Hernia
4,Hernia
...,...
305,Normal
306,Normal
307,Normal
308,Normal


In [108]:
# Build the feature matrix: every column of df except the "class" target.
# drop() returns a new DataFrame, so df itself is left untouched.
X = df.drop(columns="class")
X.head()

Unnamed: 0,pelvic_incidence,pelvic_tilt,lumbar_lordosis_angle,sacral_slope,pelvic_radius,degree_spondylolisthesis
0,63.027817,22.552586,39.609117,40.475232,98.672917,-0.2544
1,39.056951,10.060991,25.015378,28.99596,114.405425,4.564259
2,68.832021,22.218482,50.092194,46.613539,105.985135,-3.530317
3,69.297008,24.652878,44.311238,44.64413,101.868495,11.211523
4,49.712859,9.652075,28.317406,40.060784,108.168725,7.918501


In [109]:
# Split data into training and testing
# No test_size/train_size is passed, so scikit-learn's default split size applies;
# random_state=42 makes the shuffle (and therefore the split) reproducible.
# NOTE(review): the split is not stratified even though the classes are imbalanced -
# consider passing stratify=y so each split keeps the overall class proportions.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [110]:
# Use Standard Scaler to scale the data
# (StandardScaler is imported with the other dependencies in the first cell)
scaler = StandardScaler()

# Fit the scaler to the training data only, so that no statistics from the
# test split leak into the features the models are trained on
scaler.fit(X_train)

# Transform the training and testing data with the training-set statistics
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [111]:
# Since the target column is an object, we need to convert the data to numerical classes
# Encode the y data
# Create an instance of the label encoder
le = LabelEncoder()

# Fit the encoder on the training labels only, then reuse that same
# label -> integer mapping to encode the test labels
# (NOTE(review): transform() presumably raises if the test split contains a
# class that never appeared in y_train - confirm)
y_train_encoded = le.fit_transform(y_train)
y_test_encoded = le.transform(y_test)
# Display the encoded training labels; le.classes_ holds the original label
# behind each integer code
y_train_encoded

array([2, 2, 1, 2, 2, 2, 2, 1, 2, 2, 2, 1, 0, 2, 2, 1, 1, 2, 0, 0, 1, 1,
       2, 2, 1, 0, 0, 0, 2, 1, 2, 2, 2, 0, 2, 2, 1, 2, 0, 1, 2, 2, 2, 2,
       1, 2, 2, 1, 2, 1, 1, 2, 2, 1, 1, 2, 0, 1, 2, 2, 2, 2, 2, 1, 1, 1,
       2, 1, 0, 2, 2, 2, 2, 1, 1, 2, 2, 0, 2, 1, 2, 1, 2, 2, 1, 1, 2, 2,
       2, 1, 2, 0, 0, 0, 2, 2, 1, 2, 2, 0, 2, 1, 2, 1, 0, 2, 1, 2, 0, 1,
       2, 1, 0, 1, 1, 2, 2, 1, 2, 2, 2, 0, 2, 2, 1, 1, 1, 0, 1, 2, 1, 0,
       1, 0, 2, 0, 1, 2, 1, 1, 2, 2, 1, 2, 2, 1, 0, 2, 1, 2, 0, 1, 2, 0,
       2, 1, 1, 2, 0, 1, 2, 2, 1, 2, 1, 0, 1, 1, 1, 2, 0, 0, 2, 1, 1, 1,
       2, 2, 0, 1, 2, 2, 0, 0, 0, 2, 2, 0, 1, 2, 0, 1, 1, 0, 2, 1, 2, 1,
       2, 1, 1, 0, 0, 2, 2, 1, 2, 2, 0, 0, 1, 1, 0, 2, 1, 2, 1, 1, 2, 2,
       2, 2, 2, 1, 2, 1, 0, 2, 2, 2, 1, 2])

## Model and Fit to a Logistic Regression Classifier

In [112]:
# Build the logistic regression classifier (random_state of 1) and train it
# on the scaled training features in a single step; fit() hands back the
# fitted estimator, which is what gets stored in lr_model.
lr_model = LogisticRegression(random_state=1).fit(X_train_scaled, y_train_encoded)

# Show the fitted estimator as the cell output
lr_model

In [113]:
# Validate the logistic regression model: score() returns the mean accuracy
# on the given split, so compare the training split against the held-out one
lr_train_accuracy = lr_model.score(X_train_scaled, y_train_encoded)
lr_test_accuracy = lr_model.score(X_test_scaled, y_test_encoded)

print('Train Accuracy: %.3f' % lr_train_accuracy)
print('Test Accuracy: %.3f' % lr_test_accuracy)

Train Accuracy: 0.875
Test Accuracy: 0.808


## Model and Fit to a Support Vector Machine

In [114]:
# Build the support vector machine classifier with a 'poly' kernel and train
# it on the scaled training features in a single step; fit() hands back the
# fitted estimator, which is what gets stored in svm_model.
svm_model = SVC(kernel='poly').fit(X_train_scaled, y_train_encoded)

# Show the fitted estimator as the cell output
svm_model

In [115]:
# Validate the support vector machine: score() returns the mean accuracy
# on the given split, so compare the training split against the held-out one
svm_train_accuracy = svm_model.score(X_train_scaled, y_train_encoded)
svm_test_accuracy = svm_model.score(X_test_scaled, y_test_encoded)

print('Train Accuracy: %.3f' % svm_train_accuracy)
print('Test Accuracy: %.3f' % svm_test_accuracy)

Train Accuracy: 0.806
Test Accuracy: 0.718


## Model and Fit to a KNN Model

In [116]:
# Build the KNN classifier with 9 neighbors and train it on the scaled
# training features in a single step; fit() hands back the fitted estimator,
# which is what gets stored in knn_model.
knn_model = KNeighborsClassifier(n_neighbors=9).fit(X_train_scaled, y_train_encoded)

# Show the fitted estimator as the cell output
knn_model

In [127]:
# Validate the KNN model: score() returns the mean accuracy on the given
# split, so compare the training split against the held-out one
knn_train_accuracy = knn_model.score(X_train_scaled, y_train_encoded)
knn_test_accuracy = knn_model.score(X_test_scaled, y_test_encoded)

print('Train Accuracy: %.3f' % knn_train_accuracy)
print('Test Accuracy: %.3f' % knn_test_accuracy)

Train Accuracy: 0.836
Test Accuracy: 0.744


## Model and Fit to a Decision Tree Classifier

In [118]:
# Create the decision tree classifier model.
# random_state is pinned (to 1, like the other seeded models in this notebook)
# because scikit-learn permutes the candidate features at every split, so ties
# between equally good splits could otherwise be broken differently on each
# run and the reported accuracies would not be reproducible.
dt_model = DecisionTreeClassifier(random_state=1)

# Fit the model to the training data
dt_model.fit(X_train_scaled, y_train_encoded)

In [119]:
# Validate the decision tree: score() returns the mean accuracy on the given
# split, so compare the training split against the held-out one
dt_train_accuracy = dt_model.score(X_train_scaled, y_train_encoded)
dt_test_accuracy = dt_model.score(X_test_scaled, y_test_encoded)

print('Train Accuracy: %.3f' % dt_train_accuracy)
print('Test Accuracy: %.3f' % dt_test_accuracy)

Train Accuracy: 1.000
Test Accuracy: 0.731


## Model and Fit to a Random Forest Classifier

In [120]:
# Build the random forest classifier (n_estimators=128, random_state=1) and
# train it on the scaled training features in a single step; fit() hands back
# the fitted estimator, which is what gets stored in rf_model.
rf_model = RandomForestClassifier(n_estimators=128, random_state=1).fit(
    X_train_scaled, y_train_encoded
)

# Show the fitted estimator as the cell output
rf_model

In [121]:
# Validate the random forest: score() returns the mean accuracy on the given
# split, so compare the training split against the held-out one
rf_train_accuracy = rf_model.score(X_train_scaled, y_train_encoded)
rf_test_accuracy = rf_model.score(X_test_scaled, y_test_encoded)

print('Train Accuracy: %.3f' % rf_train_accuracy)
print('Test Accuracy: %.3f' % rf_test_accuracy)

Train Accuracy: 1.000
Test Accuracy: 0.821


## Compare all of the model scores

In [134]:
# Summarize train/test accuracy for every fitted model in one place.
# Each entry pairs the heading to print with the fitted estimator it describes;
# the order matches the order in which the models were trained above.
fitted_models = [
    ('LOGISTIC REGRESSION CLASSIFIER', lr_model),
    ('SUPPORT VECTOR MACHINE', svm_model),
    ('KNN', knn_model),
    ('DECISION TREE', dt_model),
    ('RANDOM FOREST', rf_model),
]

for position, (heading, model) in enumerate(fitted_models):
    # Separate consecutive reports with a single-space line; nothing is
    # printed after the last model's report.
    if position > 0:
        print(' ')
    print(heading)
    print('Train Accuracy: %.3f' % model.score(X_train_scaled, y_train_encoded))
    print('Test Accuracy: %.3f' % model.score(X_test_scaled, y_test_encoded))


LOGISTIC REGRESSION CLASSIFIER
Train Accuracy: 0.875
Test Accuracy: 0.808
 
SUPPORT VECTOR MACHINE
Train Accuracy: 0.806
Test Accuracy: 0.718
 
KNN
Train Accuracy: 0.836
Test Accuracy: 0.744
 
DECISION TREE
Train Accuracy: 1.000
Test Accuracy: 0.731
 
RANDOM FOREST
Train Accuracy: 1.000
Test Accuracy: 0.821


## Which is the most accurate model?

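**Logistic Regression Classifier**, because it generalizes best: its test accuracy (0.808) is close to its training accuracy (0.875), the smallest train–test gap of the five models. In comparison, we believe the Decision Tree and Random Forest are overfit because of their perfect training accuracy (1.000), even though the Random Forest's test accuracy (0.821) is slightly higher.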