In [26]:
import os

import matplotlib.pyplot as plt
import mlflow
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree import DecisionTreeClassifier

In [27]:
def analyze_tree_depth_impact(X_train, X_val, y_train, y_val,
                            min_depth=1, max_depth=20,
                            random_state=42):
    """Train one decision tree per depth and chart validation ROC-AUC against depth.

    For every depth in ``[min_depth, max_depth]`` a ``DecisionTreeClassifier``
    is fitted on the training split inside its own nested MLflow run. Each
    nested run records the depth, the random state and the ROC-AUC measured
    on the validation split (metric key ``auc_depth_<depth>``). After the
    sweep, the AUC-vs-depth chart is saved as ``roc_auc_depth.png`` in the
    working directory and logged as an MLflow artifact.

    Args:
        X_train: Training feature matrix.
        X_val: Validation feature matrix.
        y_train: Training labels.
        y_val: Validation labels. Column 1 of ``predict_proba`` is used as the
            score, so the target is expected to be binary.
        min_depth: Smallest tree depth to evaluate (inclusive, at least 1).
        max_depth: Largest tree depth to evaluate (inclusive).
        random_state: Seed passed to every ``DecisionTreeClassifier``.

    Returns:
        list[float]: Validation ROC-AUC for each evaluated depth, ordered from
        ``min_depth`` to ``max_depth``.

    Raises:
        ValueError: If ``min_depth`` is below 1 or ``max_depth`` is below
            ``min_depth``.
    """
    # Validate up front so that a bad range does not open a nested run that
    # immediately fails, and does not silently produce an empty chart.
    if min_depth < 1:
        raise ValueError(f"min_depth must be >= 1, got {min_depth}")
    if max_depth < min_depth:
        raise ValueError(
            f"max_depth ({max_depth}) must be >= min_depth ({min_depth})"
        )

    max_depths = range(min_depth, max_depth + 1)
    auc_scores = []

    # Compute ROC-AUC for every depth, one nested run per depth.
    for depth in max_depths:
        with mlflow.start_run(nested=True):
            # Fit a tree limited to the current depth.
            dt = DecisionTreeClassifier(
                max_depth=depth,
                random_state=random_state
            )
            dt.fit(X_train, y_train)

            # Probability of the positive class (second column).
            y_pred_proba = dt.predict_proba(X_val)[:, 1]

            # ROC curve and the area under it.
            fpr, tpr, _ = roc_curve(y_val, y_pred_proba)
            roc_auc = auc(fpr, tpr)
            auc_scores.append(roc_auc)

            # Log the metric for the current depth.
            mlflow.log_metric(f"auc_depth_{depth}", roc_auc)

            # Log the parameters that produced it.
            mlflow.log_params({
                "depth": depth,
                "random_state": random_state
            })

    # Build the summary chart on an explicit figure/axes pair.
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(max_depths, auc_scores, marker='o')
    ax.set_xlabel('Максимальная глубина дерева')
    ax.set_ylabel('ROC-AUC Score')
    ax.set_title('Зависимость ROC-AUC от глубины дерева')
    ax.grid(True)

    # Save the chart and attach it to the currently active run. The figure is
    # left open, as before, so a notebook front end can still display it.
    fig.savefig('roc_auc_depth.png')
    mlflow.log_artifact('roc_auc_depth.png')

    return auc_scores

In [19]:
# Credentials and endpoint for the artifact store, exported through the
# process environment in a single update.
# NOTE(review): these look like local development defaults; confirm they are
# never reused for a shared or production store.
os.environ.update({
    "AWS_ACCESS_KEY_ID": "mlflow",
    "AWS_SECRET_ACCESS_KEY": "password",
    "MLFLOW_S3_ENDPOINT_URL": "http://localhost:9000",
})

In [20]:
# Point the MLflow client at the tracking server on localhost:9909.
mlflow.set_tracking_uri('http://localhost:9909')

In [21]:
# Select the experiment that subsequent runs are recorded under.
experiment_name = "credit_scoring"
mlflow.set_experiment(experiment_name)
# Enable MLflow autologging for the rest of the session; the log line printed
# by this cell confirms it was enabled for sklearn.
mlflow.autolog()

2024/12/02 20:55:30 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.


In [22]:
# Load the train and test CSV files from the sibling data directory.
# NOTE(review): test_data is not used anywhere in the visible cells.
train_data = pd.read_csv('../data/train.csv')
test_data = pd.read_csv('../data/test.csv')

In [23]:
# Preview the first rows of the training data.
train_data.head()

Unnamed: 0,id,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status
0,0,37,35000,RENT,0.0,EDUCATION,B,6000,11.49,0.17,N,14,0
1,1,22,56000,OWN,6.0,MEDICAL,C,4000,13.35,0.07,N,2,0
2,2,29,28800,OWN,8.0,PERSONAL,A,6000,8.9,0.21,N,10,0
3,3,30,70000,RENT,14.0,VENTURE,B,12000,11.11,0.17,N,5,0
4,4,22,60000,RENT,2.0,MEDICAL,A,6000,6.92,0.1,N,3,0


In [24]:
# Row identifiers carry no predictive signal and must not reach the model.
# 'id' is a plain row counter; 'Unnamed: 0' shows up in the rendered preview
# and is dropped only if it is a real column (errors='ignore' covers the case
# where it is just the displayed index).
id_columns = ['Unnamed: 0', 'id']

X = train_data.drop(columns=['loan_status', *id_columns], errors='ignore')
y = train_data['loan_status']

# Map every string-valued column to integer codes. This only fixes the
# category -> integer mapping, so it is done before the split to guarantee
# that both splits share the same codes.
categorical_features = X.select_dtypes(include=['object']).columns
for feature in categorical_features:
    le = LabelEncoder()
    X[feature] = le.fit_transform(X[feature].astype(str))

# Split first, so the scaler below never sees validation rows.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training split only and reuse its statistics for the
# validation split. The original index is kept so features stay aligned with
# y_train / y_val.
scaler = StandardScaler()
X_train = pd.DataFrame(
    scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index
)
X_val = pd.DataFrame(
    scaler.transform(X_val), columns=X_val.columns, index=X_val.index
)

In [29]:

# Baseline experiment: a depth-3 decision tree evaluated on the validation
# split, followed by the depth sweep, all inside one parent MLflow run.
with mlflow.start_run():
    # NOTE(review): despite the `rf_` prefix this is a single decision tree,
    # not a random forest.
    rf_model = DecisionTreeClassifier(max_depth=3, random_state=42)
    rf_model.fit(X_train, y_train)
    
    y_pred = rf_model.predict(X_val)
    
    accuracy = accuracy_score(y_val, y_pred)
    print(f"Accuracy: {accuracy}")
    print("\nClassification Report:")
    print(classification_report(y_val, y_pred))
    # Runs the depth sweep; its per-depth runs are started with nested=True
    # while this run is still active.
    analyze_tree_depth_impact(X_train, X_val, y_train, y_val)

Accuracy: 0.9260806547872794

Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.99      0.96     10087
           1       0.87      0.56      0.68      1642

    accuracy                           0.93     11729
   macro avg       0.90      0.77      0.82     11729
weighted avg       0.92      0.93      0.92     11729

🏃 View run polite-goose-834 at: http://localhost:9909/#/experiments/2/runs/f33d64f231484dd89a938cbc3ab1c0de
🧪 View experiment at: http://localhost:9909/#/experiments/2
🏃 View run serious-crane-307 at: http://localhost:9909/#/experiments/2/runs/1bac1219ae8144ec806a91c93b86073a
🧪 View experiment at: http://localhost:9909/#/experiments/2
🏃 View run invincible-mole-608 at: http://localhost:9909/#/experiments/2/runs/8adfadefa8a649d68a42a787026fe439
🧪 View experiment at: http://localhost:9909/#/experiments/2
🏃 View run rumbling-hound-442 at: http://localhost:9909/#/experiments/2/runs/d51a80672ae342179175ed67f4317c7e
🧪 

## 1. Глубина 6
Такое деревце у нас есть как побочный артефакт первого эксперимента

In [36]:
# Load version 1 of the model registered as "tiny_tree_6".
# The variable must be called `model_name` because the URI below reads it;
# the previous `name` left `model_name` undefined on a fresh kernel.
model_name, version = "tiny_tree_6", 1
tiny_tree_model = mlflow.sklearn.load_model(f"models:/{model_name}/{version}")

In [37]:
# Evaluate the loaded tree inside its own MLflow run.
with mlflow.start_run():
    # Note: this re-fits the loaded model on the current training split
    # rather than scoring it as loaded.
    tiny_tree_model.fit(X_train, y_train)
    
    y_pred_tiny_tree = tiny_tree_model.predict(X_val)
    
    accuracy = accuracy_score(y_val, y_pred_tiny_tree)
    print(f"Accuracy: {accuracy}")
    print("\nClassification Report:")
    print(classification_report(y_val, y_pred_tiny_tree))

Accuracy: 0.9465427572683093

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.99      0.97     10087
           1       0.93      0.67      0.78      1642

    accuracy                           0.95     11729
   macro avg       0.94      0.83      0.87     11729
weighted avg       0.95      0.95      0.94     11729

🏃 View run industrious-owl-913 at: http://localhost:9909/#/experiments/2/runs/6e928f2793df44c2a2cee2322d3793de
🧪 View experiment at: http://localhost:9909/#/experiments/2


## 2. XGBoost

In [None]:
# XGBoost experiment: train a 100-tree classifier, then log diagnostic plots,
# the classification report, summary metrics and the model to one MLflow run.
# All imports used here live in the notebook's top import cell.
with mlflow.start_run(run_name="xgboost_experiment"):
    model = xgb.XGBClassifier(
        n_estimators=100,
        random_state=42
    )

    # The validation split is handed to fit() as the evaluation set.
    eval_set = [(X_val, y_val)]
    model.fit(
        X_train, y_train,
        eval_set=eval_set,
        verbose=False
    )

    y_pred = model.predict(X_val)
    # Probability of the positive class (second column).
    y_pred_proba = model.predict_proba(X_val)[:, 1]

    fpr, tpr, _ = roc_curve(y_val, y_pred_proba)
    roc_auc = auc(fpr, tpr)

    # Figures and text are logged straight from memory with log_figure /
    # log_text, so no stray files are left in the working directory.

    # --- ROC curve ---
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.plot(fpr, tpr, label=f'ROC curve (AUC = {roc_auc:.2f})')
    ax.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('ROC Curve')
    ax.legend()
    mlflow.log_figure(fig, 'roc_curve.png')
    plt.close(fig)

    # --- Confusion matrix ---
    fig, ax = plt.subplots(figsize=(8, 6))
    cm = confusion_matrix(y_val, y_pred)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set_title('Confusion Matrix')
    ax.set_ylabel('True')
    ax.set_xlabel('Predicted')
    mlflow.log_figure(fig, 'confusion_matrix.png')
    plt.close(fig)

    # --- Top-10 feature importances ---
    feature_importance = pd.DataFrame({
        'feature': X_train.columns,
        'importance': model.feature_importances_
    }).sort_values('importance', ascending=False)

    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(data=feature_importance.head(10), x='importance', y='feature', ax=ax)
    ax.set_title('Top 10 Feature Importance')
    fig.tight_layout()
    mlflow.log_figure(fig, 'feature_importance.png')
    plt.close(fig)

    # --- Text report and scalar metrics ---
    report = classification_report(y_val, y_pred)
    mlflow.log_text(report, "classification_report.txt")

    mlflow.log_metrics({
        # Cast to plain floats so the metric values are not numpy scalars.
        "accuracy": float(accuracy_score(y_val, y_pred)),
        "roc_auc": float(roc_auc)
    })

    mlflow.xgboost.log_model(model, "model")

    print(f"ROC-AUC score: {roc_auc:.4f}")
    print("\nClassification Report:")
    print(report)

