In [1]:
import sklearn

Day 3 – Classification Algorithms in Scikit-learn

🎯 Goal:

Understand and use the most popular classification algorithms:

Logistic Regression

K-Nearest Neighbors (KNN)

Decision Tree

Random Forest

In [2]:
import pandas as pd
import numpy as np

data = {
    'age': [22, 35, np.nan, 29, 41, 33, np.nan, 27],
    'income': [50000, 64000, 58000, np.nan, 72000, np.nan, 46000, 51000],
    'education': ['Bachelors', 'Masters', 'Bachelors', 'PhD', np.nan, 'Masters', 'PhD', 'Bachelors'],
    'city': ['Delhi', 'Mumbai', np.nan, 'Bangalore', 'Chennai', 'Delhi', 'Mumbai', 'Delhi'],
    'target': [1, 0, 1, 0, 1, 1, 0, 1]
}

df = pd.DataFrame(data)

X = df.drop('target', axis=1)
y = df['target']

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

numerical_features = ['age', 'income']
categorical_features = ['education', 'city']

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features)
])

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                test_size=0.3, random_state=42)

1. Logisitic Regrission

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

logistic_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression())
])

logistic_pipeline.fit(X_train, y_train)
y_pred_log = logistic_pipeline.predict(X_test)

print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_log))
print("Classification Report:\n", classification_report(y_test, y_pred_log))

Logistic Regression Accuracy: 0.6666666666666666
Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.67      1.00      0.80         2

    accuracy                           0.67         3
   macro avg       0.33      0.50      0.40         3
weighted avg       0.44      0.67      0.53         3



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


2. K-Nearest Neighbors

In [5]:
from sklearn.neighbors import KNeighborsClassifier

knn_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', KNeighborsClassifier())])

knn_pipeline.fit(X_train, y_train)
y_pred_knn = knn_pipeline.predict(X_test)

print("knn Accuaracy: ", accuracy_score(y_test, y_pred_knn))

knn Accuaracy:  0.6666666666666666


3. Decision Tree

In [6]:
from sklearn.tree import DecisionTreeClassifier

tree_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', DecisionTreeClassifier())
])

tree_pipeline.fit(X_train, y_train)
y_pred_tree = tree_pipeline.predict(X_test)

print("Tree Accuracy: ", accuracy_score(y_test, y_pred_tree))



Tree Accuracy:  0.6666666666666666


4. Random Forest

In [7]:
from sklearn.ensemble import RandomForestClassifier

forest_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier())
])

forest_pipeline.fit(X_train, y_train)

y_pred_forest = forest_pipeline.predict(X_test)

print("accuracy: ", accuracy_score(y_test, y_pred_forest))

accuracy:  0.6666666666666666
