In [9]:
import os

import joblib
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier

# Read the cleaned dataset from disk.
df = pd.read_csv("../data/cleaned_data.csv")

# Separate the "target" label column from the remaining feature columns.
y = df["target"]
X = df.drop(columns="target")

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# identical from one run to the next.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Train initial model (default hyperparameters) as a baseline.
# scikit-learn's decision tree permutes the candidate features at random at
# every split, so ties between equally good splits can be broken differently
# on each run. Fixing random_state makes the fitted tree, and therefore the
# metrics printed below, reproducible after a kernel restart.
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

# Evaluate initial model on the held-out test rows
y_pred = model.predict(X_test)
print("Initial Model Accuracy:", accuracy_score(y_test, y_pred))
print("Initial Model Classification Report:")
print(classification_report(y_test, y_pred))

# Hyperparameter tuning
# Exhaustive search over 5 * 3 * 3 * 2 = 90 parameter combinations, each
# scored with 5-fold cross-validation on the training rows only.
param_grid = {
    'max_depth': [None, 5, 10, 15, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'criterion': ['gini', 'entropy']
}

# The base estimator is seeded so that every candidate is fitted
# deterministically; without a fixed random_state the random tie-breaking
# inside the tree can change the cross-validation scores, and with them the
# selected best_params_, from one run to the next.
# NOTE(review): scoring='f1' is the binary F1 score, so this assumes a
# two-class target -- confirm against the data.
grid_search = GridSearchCV(
    DecisionTreeClassifier(random_state=42),
    param_grid,
    cv=5,
    scoring='f1',
)
grid_search.fit(X_train, y_train)

# Pull out the winning estimator found by the grid search and report the
# parameter combination it was built with.
best_model = grid_search.best_estimator_
print("Best parameters:", grid_search.best_params_)

# Score the tuned model on the held-out test rows.
y_pred = best_model.predict(X_test)
best_accuracy = accuracy_score(y_test, y_pred)
best_report = classification_report(y_test, y_pred)
print("Best Model Accuracy:", best_accuracy)
print("Best Model Classification Report:")
print(best_report)

# Save best model
# joblib.dump does not create missing directories, so make sure the output
# folder exists first; otherwise the cell fails with FileNotFoundError on a
# fresh checkout where ../ml_model/ has not been created yet.
model_path = "../ml_model/decision_tree_model.pkl"
os.makedirs(os.path.dirname(model_path), exist_ok=True)
# Kept as the last expression so the cell still displays the written path.
joblib.dump(best_model, model_path)

Initial Model Accuracy: 0.9853658536585366
Initial Model Classification Report:
              precision    recall  f1-score   support

           0       0.97      1.00      0.99       102
           1       1.00      0.97      0.99       103

    accuracy                           0.99       205
   macro avg       0.99      0.99      0.99       205
weighted avg       0.99      0.99      0.99       205

Best parameters: {'criterion': 'gini', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2}
Best Model Accuracy: 0.9853658536585366
Best Model Classification Report:
              precision    recall  f1-score   support

           0       0.97      1.00      0.99       102
           1       1.00      0.97      0.99       103

    accuracy                           0.99       205
   macro avg       0.99      0.99      0.99       205
weighted avg       0.99      0.99      0.99       205



['../ml_model/decision_tree_model.pkl']