In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

#### Import the CSV Data as Pandas DataFrame

In [2]:
# Read the drug dataset from its repository-relative CSV file into a DataFrame.
csv_path = 'data/drug_data.csv'
df = pd.read_csv(csv_path)

#### Show Top 5 Records

In [9]:
# Preview the first five rows (the default row count, spelled out explicitly).
df.head(n=5)

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,DrugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,DrugY


#### Shape of the dataset

In [4]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(200, 6)

### Dataset information

The target feature is:

Drug type

The input features are:

- Age
- Sex
- Blood Pressure Levels (BP)
- Cholesterol Levels
- Sodium-to-Potassium Ratio (Na_to_K)

### Check Missing values

In [5]:
# Count missing entries per column (isnull is an alias of isna).
df.isnull().sum()

Age            0
Sex            0
BP             0
Cholesterol    0
Na_to_K        0
Drug           0
dtype: int64

#### There are no missing values in the data set

### Check Duplicates

In [6]:
# Count rows that exactly repeat an earlier row; the first occurrence is not
# counted as a duplicate (keep='first' is the default, made explicit here).
df.duplicated(keep='first').sum()

np.int64(0)

#### There are no duplicate rows in the data set

### Check data types

In [7]:
# Summarise the frame: per-column non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Age          200 non-null    int64  
 1   Sex          200 non-null    object 
 2   BP           200 non-null    object 
 3   Cholesterol  200 non-null    object 
 4   Na_to_K      200 non-null    float64
 5   Drug         200 non-null    object 
dtypes: float64(1), int64(1), object(4)
memory usage: 9.5+ KB


In [10]:
# Split the column names into categorical and numerical groups by dtype.
# Note: the target column 'Drug' ends up in the categorical list because the
# split is purely dtype-based.
categorical_features = df.select_dtypes(include=['object', 'category']).columns.tolist()
# 'number' matches every numeric dtype rather than only float64/int64, so
# columns stored as e.g. int32 or float32 are not silently left out.
numerical_features = df.select_dtypes(include='number').columns.tolist()

print("Numerical Features: ", numerical_features)
print("Categorical Features: ", categorical_features)

Numerical Features:  ['Age', 'Na_to_K']
Categorical Features:  ['Sex', 'BP', 'Cholesterol', 'Drug']


### Checking the number of unique values of each column

In [11]:
# Number of distinct values in each column.
df.nunique()

Age             57
Sex              2
BP               3
Cholesterol      2
Na_to_K        198
Drug             5
dtype: int64

### Check statistics of data set

In [12]:
# Summary statistics (count, mean, std, min, quartiles, max); with default
# arguments only the numeric columns (Age, Na_to_K) are described.
df.describe()

Unnamed: 0,Age,Na_to_K
count,200.0,200.0
mean,44.315,16.084485
std,16.544315,7.223956
min,15.0,6.269
25%,31.0,10.4455
50%,45.0,13.9365
75%,58.0,19.38
max,74.0,38.247


### Exploring Data

In [13]:
# Re-display the first rows as a reference before inspecting category levels.
df.head()

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,DrugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,DrugY


In [14]:
# List the distinct levels of each categorical column. One loop replaces the
# four copy-pasted print pairs; the emitted text is unchanged (the label keeps
# its five trailing spaces and print's default separator supplies the sixth).
for column in ('Sex', 'BP', 'Cholesterol', 'Drug'):
    print(f"Categories in '{column}' variable:     ", df[column].unique())

Categories in 'Sex' variable:      ['F' 'M']
Categories in 'BP' variable:      ['HIGH' 'LOW' 'NORMAL']
Categories in 'Cholesterol' variable:      ['HIGH' 'NORMAL']
Categories in 'Drug' variable:      ['DrugY' 'drugC' 'drugX' 'drugA' 'drugB']
