In [1]:
# -------------------------------------------------------
# Week 6 – Final Project: Drug Classification Prediction
# -------------------------------------------------------

# Step 1: Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Step 2: Load dataset
# You can download dataset from: https://www.kaggle.com/datasets/prathamtripathi/drug-classification
# The CSV is read from the current working directory.
data = pd.read_csv("drug200.csv")

# Step 3: Explore dataset
print("First 5 rows of dataset:")
print(data.head())
print("\nDataset Info:")
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() appends a stray "None" line.
data.info()

# Step 4: Encode categorical columns
# Each text column gets its own LabelEncoder, bound to a separate name so the
# fitted label <-> integer mapping of every column stays available afterwards.
le_sex, le_BP, le_Chol, le_Drug = (LabelEncoder() for _ in range(4))

for column, encoder in (
    ("Sex", le_sex),
    ("BP", le_BP),
    ("Cholesterol", le_Chol),
    ("Drug", le_Drug),
):
    # Overwrite the text column in place with its integer codes.
    data[column] = encoder.fit_transform(data[column])

# Step 5: Split data into features and target
# "Drug" is the target; every remaining column is a feature.
y = data["Drug"]
X = data.drop(columns="Drug")

# Step 6: Train-Test Split
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 7: Build and train Decision Tree model
model = DecisionTreeClassifier(criterion="entropy", random_state=42)
model.fit(X_train, y_train)

# Step 8: Make predictions
y_pred = model.predict(X_test)

# Step 9: Evaluate model
acc = accuracy_score(y_test, y_pred)
print("\nModel Accuracy:", round(acc * 100, 2), "%")

# The target was label-encoded, so the metrics below are keyed by integer codes.
# Passing the full list of codes keeps the row/column layout fixed even if a
# class happens to be absent from the test split.
class_ids = np.arange(len(le_Drug.classes_))

print("\nConfusion Matrix:")
# Rows are true classes, columns are predicted classes, both in the order of
# le_Drug.classes_.
print(confusion_matrix(y_test, y_pred, labels=class_ids))

print("\nClassification Report:")
# Show the original drug names instead of the encoded integers.
print(
    classification_report(
        y_test,
        y_pred,
        labels=class_ids,
        target_names=le_Drug.classes_,
    )
)

# Step 10: Predict for a new sample
# Example: Age=45, Male, BP=High, Chol=Normal, Na_to_K=15.5
# The patient is described with the raw category labels and translated by the
# fitted encoders. LabelEncoder assigns codes in sorted label order, so
# hand-typed integers are easy to get wrong (e.g. BP code 1 is "LOW", not "HIGH").
sample = pd.DataFrame(
    {
        "Age": [45],
        "Sex": le_sex.transform(["M"]),
        "BP": le_BP.transform(["HIGH"]),
        "Cholesterol": le_Chol.transform(["NORMAL"]),
        "Na_to_K": [15.5],
    }
)
# Present the features under the same names and in the same order as the
# training data; a bare ndarray would drop the feature names the model was
# fitted with.
sample = sample[X.columns]

# model.predict returns the encoded class; map it back to the drug name.
predicted_drug = le_Drug.inverse_transform(model.predict(sample))
print("\nPredicted Drug for sample input:", predicted_drug[0])


First 5 rows of dataset:
   Age Sex      BP Cholesterol  Na_to_K   Drug
0   23   F    HIGH        HIGH   25.355  DrugY
1   47   M     LOW        HIGH   13.093  drugC
2   47   M     LOW        HIGH   10.114  drugC
3   28   F  NORMAL        HIGH    7.798  drugX
4   61   F     LOW        HIGH   18.043  DrugY

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Age          200 non-null    int64  
 1   Sex          200 non-null    object 
 2   BP           200 non-null    object 
 3   Cholesterol  200 non-null    object 
 4   Na_to_K      200 non-null    float64
 5   Drug         200 non-null    object 
dtypes: float64(1), int64(1), object(4)
memory usage: 9.5+ KB
None

Model Accuracy: 100.0 %

Confusion Matrix:
[[15  0  0  0  0]
 [ 0  6  0  0  0]
 [ 0  0  3  0  0]
 [ 0  0  0  5  0]
 [ 0  0  0  0 11]]

Classification Report:
            

