In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score


In [3]:
# loading the data from csv file to a Pandas DataFrame
anemia_data = pd.read_csv('anemia.csv')

# printing the first 5 rows of the dataframe
anemia_data.head()

# number of rows and columns in the dataframe
anemia_data.shape

# getting more information about the dataset
anemia_data.info()

# checking for missing values in each column
anemia_data.isnull().sum()

# getting some statistical measures about the data
anemia_data.describe()

# distribution of target Variable
anemia_data['Result'].value_counts()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1421 entries, 0 to 1420
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Gender      1421 non-null   int64  
 1   Hemoglobin  1421 non-null   float64
 2   MCH         1421 non-null   float64
 3   MCHC        1421 non-null   float64
 4   MCV         1421 non-null   float64
 5   Result      1421 non-null   int64  
dtypes: float64(4), int64(2)
memory usage: 66.7 KB


Unnamed: 0_level_0,count
Result,Unnamed: 1_level_1
0,801
1,620


In [4]:
X = anemia_data.drop(columns=['Result'], axis=1)
Y = anemia_data['Result']

print(X)
print(Y)


      Gender  Hemoglobin   MCH  MCHC   MCV
0          1        14.9  22.7  29.1  83.7
1          0        15.9  25.4  28.3  72.0
2          0         9.0  21.5  29.6  71.2
3          0        14.9  16.0  31.4  87.5
4          1        14.7  22.0  28.2  99.5
...      ...         ...   ...   ...   ...
1416       0        10.6  25.4  28.2  82.9
1417       1        12.1  28.3  30.4  86.9
1418       1        13.1  17.7  28.1  80.7
1419       0        14.3  16.2  29.5  95.2
1420       0        11.8  21.2  28.4  98.1

[1421 rows x 5 columns]
0       0
1       0
2       1
3       0
4       0
       ..
1416    1
1417    1
1418    1
1419    0
1420    1
Name: Result, Length: 1421, dtype: int64


In [5]:
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=2
)

print(X.shape, X_train.shape, X_test.shape)


(1421, 5) (1136, 5) (285, 5)


In [6]:
model = svm.SVC(kernel='linear')

# training the SVM model with training data
model.fit(X_train, Y_train)


In [7]:
# accuracy score on training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)
print('Accuracy score of training data : ', training_data_accuracy)

# accuracy score on test data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)
print('Accuracy score of test data : ', test_data_accuracy)


Accuracy score of training data :  0.9964788732394366
Accuracy score of test data :  0.9894736842105263


In [8]:
input_data = (12.5, 85.0, 29.0, 34.0, 4.6)

# changing input data to a numpy array
input_data_as_numpy_array = np.asarray(input_data)

# reshape the numpy array
input_data_reshaped = input_data_as_numpy_array.reshape(1, -1)

prediction = model.predict(input_data_reshaped)
print(prediction)

if prediction[0] == 0:
    print("The Person does NOT have Anemia")
else:
    print("The Person HAS Anemia")


[0]
The Person does NOT have Anemia




In [10]:
import pickle

filename = 'anemia_model.sav'
pickle.dump(model, open(filename, 'wb'))

# loading the saved model
loaded_model = pickle.load(open('anemia_model.sav', 'rb'))


In [11]:
for column in X.columns:
    print(column)


Gender
Hemoglobin
MCH
MCHC
MCV
