# Setup

In [None]:
# Enable IPython's autoreload extension; mode 2 re-imports modules before each
# cell runs, so edits to the local `charts` and `utils` modules take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import altair as alt
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

from charts import score_dist_chart
from utils import make_classification_df

In [None]:
# Use Altair's default renderer, forwarding embed options that request SVG
# output with a scaleFactor of 5.
alt.renderers.enable("default", embed_options=dict(renderer="svg", scaleFactor=5))

In [None]:
# Single seed shared by data generation, the train/test split, and the model
# so the whole notebook is reproducible.
SEED = 0

# Data

In [None]:
# Build the modelling dataset with the project helper: 5000 samples, seeded.
# NOTE(review): target_dist=[0.8, 0.2] presumably requests an 80/20 class
# split -- confirm against utils.make_classification_df.
df_ml = make_classification_df(SEED, n_samples=5000, target_dist=[0.8, 0.2])
# Preview the first rows.
df_ml.head()

In [None]:
# Class counts for the target column (missing values counted too); the data
# was requested with target_dist=[0.8, 0.2], so the classes should be imbalanced.
df_ml["target"].value_counts(dropna=False)

In [None]:
# Separate the label column from the feature matrix.
y = df_ml["target"]
X = df_ml.drop(columns="target")

# Hold out 20% of the rows for evaluation; stratifying on y keeps the class
# proportions the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=SEED,
)

In [None]:
# Class proportions in the training split; the split is stratified on y, so
# these should match the proportions of the full dataset.
y_train.value_counts(normalize=True, dropna=False)

In [None]:
# Class proportions in the held-out split; the split is stratified on y, so
# these should match the training proportions.
y_test.value_counts(normalize=True, dropna=False)

# Model

In [None]:
# Train a random forest with default hyperparameters, seeded for
# reproducibility. `fit` returns the estimator itself, so construction and
# fitting are chained; the bare name displays the fitted model.
clf = RandomForestClassifier(random_state=SEED).fit(X_train, y_train)
clf

In [None]:
# Class-probability estimates for the held-out set (scikit-learn returns one
# column per class, ordered as in `clf.classes_`); displayed for inspection.
y_pred_proba = clf.predict_proba(X_test)
y_pred_proba

In [None]:
# Pair each held-out label with the model's score from the second
# probability column (index 1 of predict_proba's output).
df_scores = pd.DataFrame({"y_true": y_test}).assign(y_score=y_pred_proba[:, 1])
df_scores.head()

# Score Distribution Chart

In [None]:
# Plot the distribution of y_score split by y_true using the project helper.
# NOTE(review): main=1 / silhouette=0 presumably select which y_true class is
# drawn as the primary series vs. the background outline -- confirm against
# charts.score_dist_chart.
score_dist_chart(df_scores, "y_score", "y_true", main=1, silhouette=0)

---