In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
df = pd.read_csv('drug.csv')
df

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,DrugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,DrugY
...,...,...,...,...,...,...
195,56,F,LOW,HIGH,11.567,drugC
196,16,M,LOW,HIGH,12.006,drugC
197,52,M,NORMAL,HIGH,9.894,drugX
198,23,M,NORMAL,NORMAL,14.020,drugX


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Age          200 non-null    int64  
 1   Sex          200 non-null    object 
 2   BP           200 non-null    object 
 3   Cholesterol  200 non-null    object 
 4   Na_to_K      200 non-null    float64
 5   Drug         200 non-null    object 
dtypes: float64(1), int64(1), object(4)
memory usage: 9.5+ KB


In [5]:
df.isnull().sum().sum()

np.int64(0)

In [7]:
df.duplicated().sum()

np.int64(0)

In [8]:
df.describe

<bound method NDFrame.describe of      Age Sex      BP Cholesterol  Na_to_K   Drug
0     23   F    HIGH        HIGH   25.355  DrugY
1     47   M     LOW        HIGH   13.093  drugC
2     47   M     LOW        HIGH   10.114  drugC
3     28   F  NORMAL        HIGH    7.798  drugX
4     61   F     LOW        HIGH   18.043  DrugY
..   ...  ..     ...         ...      ...    ...
195   56   F     LOW        HIGH   11.567  drugC
196   16   M     LOW        HIGH   12.006  drugC
197   52   M  NORMAL        HIGH    9.894  drugX
198   23   M  NORMAL      NORMAL   14.020  drugX
199   40   F     LOW      NORMAL   11.349  drugX

[200 rows x 6 columns]>

In [10]:
df['Drug'].unique()

array(['DrugY', 'drugC', 'drugX', 'drugA', 'drugB'], dtype=object)

In [11]:
df['Drug'].value_counts()

Drug
DrugY    91
drugX    54
drugA    23
drugC    16
drugB    16
Name: count, dtype: int64

In [13]:
# Label Encoding
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
for i in df.columns:
    if df[i].dtype == 'object':
        df[i] = le.fit_transform(df[i])

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Age          200 non-null    int64  
 1   Sex          200 non-null    int64  
 2   BP           200 non-null    int64  
 3   Cholesterol  200 non-null    int64  
 4   Na_to_K      200 non-null    float64
 5   Drug         200 non-null    int64  
dtypes: float64(1), int64(5)
memory usage: 9.5 KB


In [16]:
# Model Building

x = df.drop('Drug', axis = 1)
y = df['Drug']

In [17]:
x

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K
0,23,0,0,0,25.355
1,47,1,1,0,13.093
2,47,1,1,0,10.114
3,28,0,2,0,7.798
4,61,0,1,0,18.043
...,...,...,...,...,...
195,56,0,1,0,11.567
196,16,1,1,0,12.006
197,52,1,2,0,9.894
198,23,1,2,1,14.020


In [18]:
y

0      0
1      3
2      3
3      4
4      0
      ..
195    3
196    3
197    4
198    4
199    4
Name: Drug, Length: 200, dtype: int64

In [23]:
# Train test split

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, train_size = 0.8, random_state = 0)

In [24]:
# SVC --By Default --> Kernelv --> RBF--

from sklearn.svm import SVC
model = SVC()

# training
model.fit(x_train, y_train)

y_pred = model.predict(x_test)

# import evaluation part

from sklearn.metrics import *
print(f'Accuracy Score of SVM: {accuracy_score(y_test, y_pred) * 100}')

Accuracy Score: 82.5


In [25]:
# SVC --By Default --> Kernelv --> Linear--

from sklearn.svm import SVC
model = SVC(kernel = 'linear')

# training
model.fit(x_train, y_train)

y_pred = model.predict(x_test)

# import evaluation part

from sklearn.metrics import *
print(f'Accuracy Score of SVM: {accuracy_score(y_test, y_pred) * 100}')

Accuracy Score: 100.0


In [26]:
# SVC --By Default --> Kernelv --> Linear--

from sklearn.svm import SVC
model = SVC(kernel = 'poly')

# training
model.fit(x_train, y_train)

y_pred = model.predict(x_test)

# import evaluation part

from sklearn.metrics import *
print(f'Accuracy Score of SVM: {accuracy_score(y_test, y_pred) * 100}')

Accuracy Score: 80.0


In [30]:
# Logistic Regression

from sklearn.linear_model import LogisticRegression

model_lr = LogisticRegression(max_iter = 2000)
model_lr.fit(x_train, y_train)

y_pred_lr = model_lr.predict(x_test)
print(f'Accuracy Score of Logistic Regression: {accuracy_score(y_test, y_pred_lr) * 100}')

Accuracy Score of Logistic Regression: 97.5


In [32]:
# Decision Tree

from sklearn.tree import DecisionTreeClassifier

model_dt = DecisionTreeClassifier()
model_dt.fit(x_train, y_train)

y_pred_dt = model_dt.predict(x_test)
print(f'Accuracy Score of Decision Tree: {accuracy_score(y_test, y_pred_dt) * 100}')

Accuracy Score of Decision Tree: 100.0


In [34]:
# Random Forest

from sklearn.ensemble import RandomForestClassifier

model_rf = RandomForestClassifier()

model_rf.fit(x_train, y_train)

y_pred_rf = model_rf.predict(x_test)
print(f'Accuracy Score of Random Forest: {accuracy_score(y_test, y_pred_rf) * 100}')

Accuracy Score of Random Forest: 100.0
