### Importing Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

### Creating Random Dataset with Categorical and Continous Variables

In [2]:
np.random.seed(0)
data = pd.DataFrame({
    'age': np.random.randint(18, 70, size=100),
    'income': np.random.randint(20000, 100000, size=100),
    'score': np.random.uniform(0, 1, size=100),
    'gender': np.random.choice(['Male', 'Female'], size=100),
    'city': np.random.choice(['New York', 'Los Angeles', 'Chicago'], size=100),
    'purchased': np.random.choice([0, 1], size=100)
})

### EDA

In [3]:
data.head()

Unnamed: 0,age,income,score,gender,city,purchased
0,62,57237,0.998847,Female,Chicago,0
1,65,99701,0.149448,Male,Los Angeles,1
2,18,28752,0.868126,Female,New York,0
3,21,91331,0.162493,Female,Chicago,0
4,21,70624,0.61556,Male,New York,0


In [4]:
data.describe()

Unnamed: 0,age,income,score,purchased
count,100.0,100.0,100.0,100.0
mean,41.57,61144.98,0.470791,0.46
std,15.537023,24686.931926,0.29133,0.500908
min,18.0,20469.0,0.011714,0.0
25%,29.0,36709.25,0.212445,0.0
50%,41.0,59264.0,0.45362,0.0
75%,55.25,83122.75,0.718159,1.0
max,69.0,99983.0,0.998847,1.0


In [5]:
data.isnull().sum()

age          0
income       0
score        0
gender       0
city         0
purchased    0
dtype: int64

In [8]:
for col in data.columns:
    print(f'{col}  ->  {type(col)}')

age  ->  <class 'str'>
income  ->  <class 'str'>
score  ->  <class 'str'>
gender  ->  <class 'str'>
city  ->  <class 'str'>
purchased  ->  <class 'str'>


### Transforming categorical and continous columns using StandardScalar and OneHotEncoder Transformers

In [9]:
numeric_features = ['age', 'income', 'score']
categorical_features = ['gender', 'city']

### Combine Transformers using ColumnTransformer

In [10]:
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder()

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

### Splitting the dataset

In [11]:
X = data.drop('purchased', axis=1)
y = data['purchased']

In [17]:
X

Unnamed: 0,age,income,score,gender,city
0,62,57237,0.998847,Female,Chicago
1,65,99701,0.149448,Male,Los Angeles
2,18,28752,0.868126,Female,New York
3,21,91331,0.162493,Female,Chicago
4,21,70624,0.615560,Male,New York
...,...,...,...,...,...
95,32,50752,0.716860,Male,Chicago
96,33,99464,0.396060,Male,Chicago
97,38,91892,0.565421,Male,Chicago
98,53,53930,0.183280,Male,New York


In [18]:
y

0     0
1     1
2     0
3     0
4     0
     ..
95    1
96    1
97    1
98    0
99    0
Name: purchased, Length: 100, dtype: int32

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

### Transforming the dataset

In [13]:
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [19]:
X_train

array([[-0.33370406, -1.52357347, -1.06562026,  1.        ,  0.        ,
         1.        ,  0.        ,  0.        ],
       [ 0.85977871, -1.14030493, -0.95844338,  1.        ,  0.        ,
         1.        ,  0.        ,  0.        ],
       [-1.33874219,  1.19152386, -1.14834894,  1.        ,  0.        ,
         1.        ,  0.        ,  0.        ],
       [-1.46437196,  0.06245833,  0.48000109,  1.        ,  0.        ,
         0.        ,  1.        ,  0.        ],
       [ 0.92259359,  1.22370118,  0.86595873,  0.        ,  1.        ,
         0.        ,  1.        ,  0.        ],
       [-0.96185289, -1.35906239, -0.82768535,  0.        ,  1.        ,
         0.        ,  0.        ,  1.        ],
       [-0.96185289,  0.5400036 ,  1.24454579,  0.        ,  1.        ,
         0.        ,  1.        ,  0.        ],
       [-0.08244453,  0.20256459, -1.20994209,  1.        ,  0.        ,
         1.        ,  0.        ,  0.        ],
       [ 0.67133406, -1.23603146

In [20]:
X_test

array([[-1.02466778, -0.40107228, -0.93391212,  1.        ,  0.        ,
         0.        ,  1.        ,  0.        ],
       [-0.83622313, -1.47560953, -0.23312628,  1.        ,  0.        ,
         0.        ,  0.        ,  1.        ],
       [-1.52718684, -1.32865703,  1.31477011,  1.        ,  0.        ,
         0.        ,  0.        ,  1.        ],
       [ 1.6135573 ,  1.07642651, -1.62941025,  0.        ,  1.        ,
         0.        ,  0.        ,  1.        ],
       [ 0.85977871, -0.99750046, -1.06756477,  0.        ,  1.        ,
         0.        ,  0.        ,  1.        ],
       [ 0.73414894, -0.86903281,  0.09043488,  1.        ,  0.        ,
         0.        ,  1.        ,  0.        ],
       [-1.46437196,  1.49142937,  1.27175581,  0.        ,  1.        ,
         1.        ,  0.        ,  0.        ],
       [ 1.04822336, -0.29221709, -0.76071527,  1.        ,  0.        ,
         0.        ,  1.        ,  0.        ],
       [-0.83622313,  1.10042862

### Defining models

In [14]:
models = {
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "KNN": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
    "SVM": SVC(),
    "Logistic Regression": LogisticRegression()
}

### Training the models

In [15]:
results = {}

for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    results[model_name] = {"accuracy": accuracy, "report": report}

### Results

In [16]:
results

{'Decision Tree': {'accuracy': 0.35,
  'report': '              precision    recall  f1-score   support\n\n           0       0.40      0.36      0.38        11\n           1       0.30      0.33      0.32         9\n\n    accuracy                           0.35        20\n   macro avg       0.35      0.35      0.35        20\nweighted avg       0.35      0.35      0.35        20\n'},
 'Random Forest': {'accuracy': 0.6,
  'report': '              precision    recall  f1-score   support\n\n           0       0.62      0.73      0.67        11\n           1       0.57      0.44      0.50         9\n\n    accuracy                           0.60        20\n   macro avg       0.59      0.59      0.58        20\nweighted avg       0.60      0.60      0.59        20\n'},
 'KNN': {'accuracy': 0.5,
  'report': '              precision    recall  f1-score   support\n\n           0       0.54      0.64      0.58        11\n           1       0.43      0.33      0.38         9\n\n    accuracy     