In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, classification_report , accuracy_score
from sklearn.model_selection import train_test_split

In [3]:
df = pd.read_csv('drug200.csv')
df

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,DrugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,DrugY
...,...,...,...,...,...,...
195,56,F,LOW,HIGH,11.567,drugC
196,16,M,LOW,HIGH,12.006,drugC
197,52,M,NORMAL,HIGH,9.894,drugX
198,23,M,NORMAL,NORMAL,14.020,drugX


In [4]:
df.describe

<bound method NDFrame.describe of      Age Sex      BP Cholesterol  Na_to_K   Drug
0     23   F    HIGH        HIGH   25.355  DrugY
1     47   M     LOW        HIGH   13.093  drugC
2     47   M     LOW        HIGH   10.114  drugC
3     28   F  NORMAL        HIGH    7.798  drugX
4     61   F     LOW        HIGH   18.043  DrugY
..   ...  ..     ...         ...      ...    ...
195   56   F     LOW        HIGH   11.567  drugC
196   16   M     LOW        HIGH   12.006  drugC
197   52   M  NORMAL        HIGH    9.894  drugX
198   23   M  NORMAL      NORMAL   14.020  drugX
199   40   F     LOW      NORMAL   11.349  drugX

[200 rows x 6 columns]>

In [6]:
data_type = df.dtypes
data_type

Age              int64
Sex             object
BP              object
Cholesterol     object
Na_to_K        float64
Drug            object
dtype: object

In [7]:
null = df.isnull().sum()
null

Age            0
Sex            0
BP             0
Cholesterol    0
Na_to_K        0
Drug           0
dtype: int64

In [8]:
df.duplicated().sum()

0

In [9]:
df['Drug'].unique()


array(['DrugY', 'drugC', 'drugX', 'drugA', 'drugB'], dtype=object)

In [10]:
X=df.drop(columns='Drug')
y=df['Drug']

In [11]:
X=pd.get_dummies(X,dtype=float,drop_first=True)
X

Unnamed: 0,Age,Na_to_K,Sex_M,BP_LOW,BP_NORMAL,Cholesterol_NORMAL
0,23,25.355,0.0,0.0,0.0,0.0
1,47,13.093,1.0,1.0,0.0,0.0
2,47,10.114,1.0,1.0,0.0,0.0
3,28,7.798,0.0,0.0,1.0,0.0
4,61,18.043,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...
195,56,11.567,0.0,1.0,0.0,0.0
196,16,12.006,1.0,1.0,0.0,0.0
197,52,9.894,1.0,0.0,1.0,0.0
198,23,14.020,1.0,0.0,1.0,1.0


In [12]:
X_train , X_test , y_train , y_test = train_test_split(X , y , test_size=0.2 , random_state=42)

# Random Forest Classifier

In [13]:
model1 = RandomForestClassifier(n_estimators= 100)
model1.fit(X_train , y_train)

RandomForestClassifier()

In [14]:
model1.score(X_train , y_train)

1.0

In [17]:
model1_predictions = model1.predict(X_test)

In [18]:
model1_accuracy = accuracy_score(y_test , model1_predictions)

In [19]:
print(classification_report(y_test , model1_predictions))

              precision    recall  f1-score   support

       DrugY       1.00      1.00      1.00        15
       drugA       1.00      1.00      1.00         6
       drugB       1.00      1.00      1.00         3
       drugC       1.00      1.00      1.00         5
       drugX       1.00      1.00      1.00        11

    accuracy                           1.00        40
   macro avg       1.00      1.00      1.00        40
weighted avg       1.00      1.00      1.00        40



# support Vector Classifier

In [27]:
from sklearn import svm
model2 = svm.SVC(kernel='linear', random_state = 42)
model2.fit(X_train, y_train)

SVC(kernel='linear', random_state=42)

In [28]:
model2.score(X_train , y_train)

1.0

In [29]:
model2_predictions = model2.predict(X_test)

In [30]:
model2_accuracy = accuracy_score(y_test , model2_predictions)

In [31]:
print(classification_report(y_test , model2_predictions))

              precision    recall  f1-score   support

       DrugY       1.00      1.00      1.00        15
       drugA       1.00      1.00      1.00         6
       drugB       1.00      1.00      1.00         3
       drugC       1.00      1.00      1.00         5
       drugX       1.00      1.00      1.00        11

    accuracy                           1.00        40
   macro avg       1.00      1.00      1.00        40
weighted avg       1.00      1.00      1.00        40



# Decision Tree Classifier

In [32]:
model3 = DecisionTreeClassifier()
model3.fit(X_train, y_train)

DecisionTreeClassifier()

In [33]:
model3.score(X_train , y_train)

1.0

In [34]:
model3_predictions = model3.predict(X_test)

In [35]:
model3_accuracy=accuracy_score(y_test , model3_predictions)
model3_accuracy

1.0

In [36]:
print(classification_report(y_test , model3_predictions))

              precision    recall  f1-score   support

       DrugY       1.00      1.00      1.00        15
       drugA       1.00      1.00      1.00         6
       drugB       1.00      1.00      1.00         3
       drugC       1.00      1.00      1.00         5
       drugX       1.00      1.00      1.00        11

    accuracy                           1.00        40
   macro avg       1.00      1.00      1.00        40
weighted avg       1.00      1.00      1.00        40



# Scores

In [38]:
models=pd.DataFrame({
    'models':['Random_forest','SVC','Decision_Tree'],
    'scores':[model1_accuracy,model2_accuracy,model3_accuracy]})
models.sort_values(by='scores',ascending=False)

Unnamed: 0,models,scores
0,Random_forest,1.0
1,SVC,1.0
2,Decision_Tree,1.0
