In [28]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
import os

In [29]:
# load the raw dataset; the path is relative to the current working directory
df = pd.read_csv(filepath_or_buffer='datasets/diabetes.csv')

In [30]:
# print a summary of the frame: column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 520 entries, 0 to 519
Data columns (total 17 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Age                 520 non-null    int64 
 1   Gender              520 non-null    object
 2   Polyuria            520 non-null    object
 3   Polydipsia          520 non-null    object
 4   sudden weight loss  520 non-null    object
 5   weakness            520 non-null    object
 6   Polyphagia          520 non-null    object
 7   Genital thrush      520 non-null    object
 8   visual blurring     520 non-null    object
 9   Itching             520 non-null    object
 10  Irritability        520 non-null    object
 11  delayed healing     520 non-null    object
 12  partial paresis     520 non-null    object
 13  muscle stiffness    520 non-null    object
 14  Alopecia            520 non-null    object
 15  Obesity             520 non-null    object
 16  class               520 non-null    object

All columns except `Age` (int64) are stored with the `object` dtype, i.e. as strings.

In [31]:
# preview the first five rows
df.head(5)

Unnamed: 0,Age,Gender,Polyuria,Polydipsia,sudden weight loss,weakness,Polyphagia,Genital thrush,visual blurring,Itching,Irritability,delayed healing,partial paresis,muscle stiffness,Alopecia,Obesity,class
0,40,Male,No,Yes,No,Yes,No,No,No,Yes,No,Yes,No,Yes,Yes,Yes,Positive
1,58,Male,No,No,No,Yes,No,No,Yes,No,No,No,Yes,No,Yes,No,Positive
2,41,Male,Yes,No,No,Yes,Yes,No,No,Yes,No,Yes,No,Yes,Yes,No,Positive
3,45,Male,No,No,Yes,Yes,Yes,Yes,No,Yes,No,Yes,No,No,No,No,Positive
4,60,Male,Yes,Yes,Yes,Yes,Yes,No,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Positive


We will need to encode the 'Yes'/'No' strings, as well as 'Male'/'Female' in `Gender` and 'Positive'/'Negative' in `class`, as numbers.

In [32]:
# True if at least one cell anywhere in the frame is missing
df.isna().to_numpy().any()

False

The dataset does not have any missing values so we don't need to impute them.

In [33]:
# True if at least one row exactly repeats an earlier row
df.duplicated().to_numpy().any()

True

The dataset contains duplicated rows. Let's investigate further.

In [34]:
# rows that exactly repeat an earlier row; the first occurrence of each
# repeated row is NOT included (duplicated() defaults to keep='first')
duplicates = df.loc[df.duplicated()]

In [35]:
# non-null count per column for the repeated rows
duplicates.count()

Age                   269
Gender                269
Polyuria              269
Polydipsia            269
sudden weight loss    269
weakness              269
Polyphagia            269
Genital thrush        269
visual blurring       269
Itching               269
Irritability          269
delayed healing       269
partial paresis       269
muscle stiffness      269
Alopecia              269
Obesity               269
class                 269
dtype: int64

In [36]:
# five most frequent ages among the repeated rows
(
    duplicates['Age']
    .value_counts()
    .sort_values(ascending=False)
    .head(5)
)

30    17
48    17
43    16
35    15
53    13
Name: Age, dtype: int64

In [37]:
# gender counts among the repeated rows, most frequent first
(
    duplicates['Gender']
    .value_counts()
    .sort_values(ascending=False)
    .head(5)
)

Male      168
Female    101
Name: Gender, dtype: int64

In [38]:
# class-label counts among the repeated rows, most frequent first
(
    duplicates['class']
    .value_counts()
    .sort_values(ascending=False)
    .head(5)
)

Positive    147
Negative    122
Name: class, dtype: int64

Apparently, there are a lot of duplicated rows (269 of 520). However, the dataset does not provide any unique identifier (such as the patient's name or ID number), and the features are only age, gender and symptoms that are common among diabetic patients, recorded in binary form ('Yes' or 'No'). It therefore seems reasonable to assume that these duplicates are actually different patients of the same age who have the same symptoms.

In [39]:
# rows whose class label is 'Negative'
neg_class = df.loc[df['class'].eq('Negative')]

In [40]:
# (rows, columns) of the 'Negative' subset
neg_class.shape

(200, 17)

In [41]:
# rows whose class label is 'Positive'
pos_class = df.loc[df['class'].eq('Positive')]

In [42]:
# (rows, columns) of the 'Positive' subset
pos_class.shape

(320, 17)

As we can see, the dataset is imbalanced: there are 320 'Positive' observations versus 200 'Negative' ones (120 more).

Now, let's check whether our categorical columns are binary.

In [44]:
# Take an independent copy: a plain `df_enc = df` would only bind a second
# name to the same DataFrame, so any in-place change made through df_enc
# would also alter the raw `df` that the earlier cells inspected.
df_enc = df.copy()

In [48]:
# names of the numeric columns -- Age is the only int64 column in df.info()
num_col = ["Age"]

In [50]:
# every column that is not listed in num_col is treated as categorical
cat_cols = df_enc.columns.drop(labels=num_col)

In [56]:
# number of distinct values in each categorical column; DataFrame.nunique()
# already works column-wise, so no apply/lambda is needed
df_enc[cat_cols].nunique()

Gender                2
Polyuria              2
Polydipsia            2
sudden weight loss    2
weakness              2
Polyphagia            2
Genital thrush        2
visual blurring       2
Itching               2
Irritability          2
delayed healing       2
partial paresis       2
muscle stiffness      2
Alopecia              2
Obesity               2
class                 2
dtype: int64

All of our categorical columns have exactly two unique values, so we can now encode them. We will use the following encoding:

- Male → 0
- Female → 1
- No → 0
- Yes → 1
- Negative → 0
- Positive → 1

In [None]:
# lookup table: categorical string -> 0/1 code, as listed in the plan above
label_encoders = {
    # Gender
    'Male': 0,
    'Female': 1,
    # Yes/No flags
    'No': 0,
    'Yes': 1,
    # class label
    'Negative': 0,
    'Positive': 1,
}