Title: Popular Classification Algorithms


Decision Trees


Task 1: Predict the loan default risk based on borrower characteristics.

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix

# Sample dataset (replace with your real dataset)
data = {
    'income': [2500, 4000, 6000, 1200, 3000, 7000, 1500, 5000],
    'age': [25, 45, 35, 22, 30, 40, 23, 37],
    'credit_score': [600, 720, 680, 500, 650, 740, 520, 700],
    'loan_amount': [1000, 2000, 1500, 800, 1200, 3000, 700, 2500],
    'default': [1, 0, 0, 1, 0, 0, 1, 0]  # 1 = defaulted, 0 = paid
}
df = pd.DataFrame(data)

# Features and target
X = df.drop('default', axis=1)
y = df['default']

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.25, random_state=42, stratify=y)

# Train a Random Forest classifier
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

# Predict and evaluate
y_pred = clf.predict(X_test)
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=["No Default", "Default"]))

Confusion Matrix:
[[1 0]
 [1 0]]

Classification Report:
              precision    recall  f1-score   support

  No Default       0.50      1.00      0.67         1
     Default       0.00      0.00      0.00         1

    accuracy                           0.50         2
   macro avg       0.25      0.50      0.33         2
weighted avg       0.25      0.50      0.33         2



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Task 2: Determine if a patient should be tested for a disease based on symptoms.

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Sample dataset
data = {
    'fever': [1, 0, 1, 1, 0, 0, 1, 0],
    'cough': [1, 1, 1, 0, 0, 1, 1, 0],
    'fatigue': [0, 0, 1, 1, 0, 1, 1, 0],
    'shortness_of_breath': [1, 0, 0, 1, 0, 0, 1, 0],
    'should_test': [1, 0, 1, 1, 0, 0, 1, 0]  # 1 = should test, 0 = no need
}
df = pd.DataFrame(data)

# Split features and target
X = df.drop('should_test', axis=1)
y = df['should_test']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y)

# Train model
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=["No Test", "Test"]))

Confusion Matrix:
[[1 0]
 [0 1]]

Classification Report:
              precision    recall  f1-score   support

     No Test       1.00      1.00      1.00         1
        Test       1.00      1.00      1.00         1

    accuracy                           1.00         2
   macro avg       1.00      1.00      1.00         2
weighted avg       1.00      1.00      1.00         2



Task 3: Classify types of animals based on features like size, habitat, and diet.

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Sample dataset
data = {
    'size': ['Small', 'Large', 'Medium', 'Small', 'Large', 'Medium', 'Small', 'Large'],
    'habitat': ['Land', 'Water', 'Air', 'Water', 'Land', 'Air', 'Air', 'Water'],
    'diet': ['Herbivore', 'Carnivore', 'Omnivore', 'Carnivore', 'Omnivore', 'Herbivore', 'Carnivore', 'Omnivore'],
    'type': ['Mammal', 'Reptile', 'Bird', 'Reptile', 'Mammal', 'Bird', 'Bird', 'Reptile']
}
df = pd.DataFrame(data)

# Encode all categorical variables
label_encoders = {}
for col in ['size', 'habitat', 'diet', 'type']:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Features and target
X = df[['size', 'habitat', 'diet']]
y = df['type']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y)

# Train Random Forest Classifier
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

# Predict
y_pred = clf.predict(X_test)

# Decode predictions for readable output
type_decoder = label_encoders['type']
print("\nConfusion Matrix:")
print(confusion_matrix(type_decoder.inverse_transform(y_test), type_decoder.inverse_transform(y_pred)))

print("\nClassification Report:")
print(classification_report(type_decoder.inverse_transform(y_test), type_decoder.inverse_transform(y_pred)))

ValueError: The test_size = 2 should be greater or equal to the number of classes = 3