In [20]:
import pandas as pd
import matplotlib.pyplot as plt
import os

In [21]:
# Load the raw diabetes data; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='datasets/diabetes.csv')

In [22]:
# Print a summary of the frame: index range, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 520 entries, 0 to 519
Data columns (total 17 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Age                 520 non-null    int64 
 1   Gender              520 non-null    object
 2   Polyuria            520 non-null    object
 3   Polydipsia          520 non-null    object
 4   sudden weight loss  520 non-null    object
 5   weakness            520 non-null    object
 6   Polyphagia          520 non-null    object
 7   Genital thrush      520 non-null    object
 8   visual blurring     520 non-null    object
 9   Itching             520 non-null    object
 10  Irritability        520 non-null    object
 11  delayed healing     520 non-null    object
 12  partial paresis     520 non-null    object
 13  muscle stiffness    520 non-null    object
 14  Alopecia            520 non-null    object
 15  Obesity             520 non-null    object
 16  class               520 no

Apart from `Age` (int64), the columns are stored with dtype `object`, i.e. as strings.

In [23]:
# Preview the first five rows to see what the raw values look like.
df.head(n=5)

Unnamed: 0,Age,Gender,Polyuria,Polydipsia,sudden weight loss,weakness,Polyphagia,Genital thrush,visual blurring,Itching,Irritability,delayed healing,partial paresis,muscle stiffness,Alopecia,Obesity,class
0,40,Male,No,Yes,No,Yes,No,No,No,Yes,No,Yes,No,Yes,Yes,Yes,Positive
1,58,Male,No,No,No,Yes,No,No,Yes,No,No,No,Yes,No,Yes,No,Positive
2,41,Male,Yes,No,No,Yes,Yes,No,No,Yes,No,Yes,No,Yes,Yes,No,Positive
3,45,Male,No,No,Yes,Yes,Yes,Yes,No,Yes,No,Yes,No,No,No,No,Positive
4,60,Male,Yes,Yes,Yes,Yes,Yes,No,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Positive


We will need to encode the 'Yes'/'No' strings, as well as the 'Positive'/'Negative' class labels, as numeric values.

In [24]:
# Is there at least one missing (NaN/None) cell anywhere in the frame?
df.isna().to_numpy().any()

False

The dataset does not have any missing values so we don't need to impute them.

In [25]:
# Does any row repeat an earlier row exactly (all columns equal)?
df.duplicated().to_numpy().any()

True

It seems like the dataset has duplicated rows. Let's investigate further.

In [26]:
# Collect every row that has at least one exact copy elsewhere in the frame.
# keep=False flags all members of a duplicated group, not just the repeats.
duplicates = df.loc[df.duplicated(keep=False)]

In [27]:
# Peek at the first five duplicated rows.
duplicates.head(n=5)

Unnamed: 0,Age,Gender,Polyuria,Polydipsia,sudden weight loss,weakness,Polyphagia,Genital thrush,visual blurring,Itching,Irritability,delayed healing,partial paresis,muscle stiffness,Alopecia,Obesity,class
17,67,Male,No,Yes,No,Yes,Yes,No,Yes,No,Yes,Yes,Yes,Yes,Yes,Yes,Positive
18,66,Male,Yes,Yes,No,Yes,Yes,No,Yes,No,No,No,Yes,Yes,No,No,Positive
19,43,Male,Yes,Yes,Yes,Yes,No,Yes,No,No,No,No,No,No,No,No,Positive
20,62,Male,Yes,Yes,No,Yes,Yes,No,Yes,No,Yes,No,Yes,Yes,No,No,Positive
21,54,Male,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,No,Yes,No,Yes,Yes,No,Positive


In [28]:
# Print a summary of the duplicated rows: how many there are and the dtype of each column.
duplicates.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 376 entries, 17 to 504
Data columns (total 17 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Age                 376 non-null    int64 
 1   Gender              376 non-null    object
 2   Polyuria            376 non-null    object
 3   Polydipsia          376 non-null    object
 4   sudden weight loss  376 non-null    object
 5   weakness            376 non-null    object
 6   Polyphagia          376 non-null    object
 7   Genital thrush      376 non-null    object
 8   visual blurring     376 non-null    object
 9   Itching             376 non-null    object
 10  Irritability        376 non-null    object
 11  delayed healing     376 non-null    object
 12  partial paresis     376 non-null    object
 13  muscle stiffness    376 non-null    object
 14  Alopecia            376 non-null    object
 15  Obesity             376 non-null    object
 16  class               376 n

Apparently, there are a lot of duplicates: 376 of the 520 rows have at least one exact copy. However, the dataset does not provide any primary ID (such as a patient's name or ID number), and the features are only age plus symptoms that are common among diabetic patients, recorded in binary form ('Yes' or 'No'). It therefore seems reasonable to assume that these duplicates are actually different patients of the same age who have the same symptoms.

In [29]:
# Keep only the observations labelled 'Negative' in the target column.
neg_class = df.loc[df['class'].eq('Negative')]

In [30]:
# (rows, columns) of the 'Negative' subset
neg_class.shape

(200, 17)

In [31]:
# Keep only the observations labelled 'Positive' in the target column.
pos_class = df.loc[df['class'].eq('Positive')]

In [32]:
# (rows, columns) of the 'Positive' subset
pos_class.shape

(320, 17)

As we can see, the dataset is imbalanced: there are 320 'Positive' observations versus 200 'Negative' ones, a difference of 120.

In [33]:
# Output folder for the wrangled data.
# NOTE(review): this is an absolute, machine-specific path; consider moving it
# to a configurable location so the notebook runs on other machines.
datapath = 'D://Tutorials/SDST/My Projects/Capstone2/Data Wrangling'
# makedirs also creates any missing parent folders (os.mkdir raises
# FileNotFoundError if one is absent), and exist_ok=True makes re-running the
# cell a no-op without a separate, race-prone existence check.
os.makedirs(datapath, exist_ok=True)

In [34]:
# Export the frame as CSV into the output folder, without the index column.
datapath_datawrangling = os.path.join(datapath, 'Diabetes_Data_Wrangling.csv')
if os.path.exists(datapath_datawrangling):
    # An export from a previous run is already there; leave it untouched.
    pass
else:
    df.to_csv(datapath_datawrangling, index=False)