In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the fertility dataset from CSV into a DataFrame with pandas.
# The summary printed by info() shows that the last column, 'Diagnosis' (the target we
# want to predict), is stored with dtype 'object', so it has to be converted to a
# numeric encoding before a model can be trained on it.

data = pd.read_csv(filepath_or_buffer='fertility_Diagnosis.csv')

# Column names, non-null counts and dtypes for a quick sanity check of the load.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 10 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Season                         100 non-null    float64
 1   Age                            100 non-null    float64
 2   Childish-Disease               100 non-null    int64  
 3   Accident-trauma                100 non-null    int64  
 4   Surgery                        100 non-null    int64  
 5   High-fevers                    100 non-null    int64  
 6   Alcohol-Consumption-frequency  100 non-null    float64
 7   Smoking-habit                  100 non-null    int64  
 8   Hours-sitting                  100 non-null    float64
 9   Diagnosis                      100 non-null    object 
dtypes: float64(4), int64(5), object(1)
memory usage: 7.9+ KB


In [3]:
# Split the frame into the independent variables X (every column except the last) and
# the dependent variable y (the last column, 'Diagnosis'), encoded as integers:
#   'N' (normal)   -> 1
#   anything else  -> 0  (altered / not normal)
#
# The encoding is a vectorised comparison instead of an in-place loop over
# `data.iloc[:, -1].values`. That array can be a view of `data`, so writing into it
# either silently rewrites the 'Diagnosis' column (re-running this cell would then
# label every row 0, because no 'N' is left) or fails when pandas hands back a
# read-only array (Copy-on-Write). Building a fresh array leaves `data` untouched and
# makes the cell safe to re-run.

X = data.iloc[:, :-1].values
y = (data.iloc[:, -1] == 'N').to_numpy().astype('int')

In [4]:
# Hold out a quarter of the rows as a test set before any model is fitted, so the
# evaluation below only uses rows the model has never seen (no information leakage).
# random_state is fixed so the same rows land in each subset on every run.
# NOTE(review): the split is not stratified; the recorded confusion matrix further down
# shows only 2 of the 25 held-out rows belong to class 0 -- consider passing stratify=y.

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

In [5]:
# K-nearest-neighbours classifier, fitted on the training split only.

from sklearn.neighbors import KNeighborsClassifier

classifier = KNeighborsClassifier(
    n_neighbors=5,       # each prediction is a vote among the 5 closest training rows
    metric='minkowski',
    p=2,                 # Minkowski distance with p=2 is the Euclidean distance
)
classifier.fit(X_train, y_train)

In [6]:
# Predict a class label (1 = normal, 0 = altered) for every row of the held-out test set.
y_pred = classifier.predict(X_test)

In [7]:
# Side-by-side view of the results, one row per test sample:
# left column = predicted label, right column = actual label.

print(np.column_stack((y_pred, y_test)))

[[1 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]]


In [8]:
# A more rigorous evaluation than eyeballing the table above: the confusion matrix
# (rows = actual class, columns = predicted class) and the overall accuracy.
# Read the two together. In the recorded run the first column of the matrix is all
# zeros, i.e. the model never predicts class 0, so the accuracy is simply the share of
# class-1 rows in the test set.

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

confusion = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(confusion)
accuracy_score(y_true=y_test, y_pred=y_pred)

[[ 0  2]
 [ 0 23]]


0.92