In [0]:
%python
# Load the churn training table and preview it in the notebook.
source_table = "workspace.default.customer_churn_dataset_training_master"
df = spark.table(source_table)

display(df)

In [0]:
%python
# Report the DataFrame's dimensions. The printed labels are Vietnamese for
# "number of rows" and "number of columns".
n_rows = df.count()
n_cols = len(df.columns)
print("Số dòng:", n_rows)
print("Số cột:", n_cols)

In [0]:
%python
# Print the column names and data types of the loaded table.
df.printSchema()

In [0]:
%python
# Render the summary-statistics DataFrame returned by describe().
display(df.describe())

In [0]:
%python
from pyspark.sql.functions import col, when
# Import Spark's aggregate under an alias so it does not shadow Python's
# built-in sum() for every later cell in the notebook.
from pyspark.sql.functions import sum as spark_sum

# One-row DataFrame holding, for each column, the number of NULL values.
# Each cell is flagged 1 when NULL and 0 otherwise, then the flags are summed.
missing_df = df.select([
    spark_sum(when(col(c).isNull(), 1).otherwise(0)).alias(c)
    for c in df.columns
])

display(missing_df)

In [0]:
%python
# Drop every row that contains a missing value in any column.
df = df.na.drop()

In [0]:
%python
# Print the number of distinct values per column (one Spark job per column).
for c in df.columns:
    n_distinct = df.select(c).distinct().count()
    print(c, n_distinct)

In [0]:
%python
import matplotlib.pyplot as plt
import seaborn as sns

# Pull only the label column to the driver for plotting.
churn_pd = df.select("Churn").toPandas()

# Bar chart of the number of rows per Churn value.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x='Churn', data=churn_pd, ax=ax)

# Write each bar's count just above it, centred horizontally on the bar.
for bar in ax.patches:
    bar_height = bar.get_height()
    bar_center_x = bar.get_x() + bar.get_width()/2.
    ax.annotate(
        f'{int(bar_height)}',
        (bar_center_x, bar_height),
        ha='center',
        va='bottom'
    )

plt.show()

In [0]:
%python
from pyspark.ml.feature import StringIndexer, OneHotEncoder
from pyspark.ml import Pipeline

# Collect the names of the string-typed (categorical) columns.
categorical_cols = [
    field.name
    for field in df.schema.fields
    if field.dataType.simpleString() == 'string'
]

print("Categorical columns:", categorical_cols)

# One StringIndexer per categorical column: <col> -> <col>_index
indexers = [
    StringIndexer(inputCol=name, outputCol=name+"_index")
    for name in categorical_cols
]

# One OneHotEncoder per categorical column: <col>_index -> <col>_vec
encoders = [
    OneHotEncoder(inputCol=name+"_index", outputCol=name+"_vec")
    for name in categorical_cols
]

# Run all indexers first, then all encoders.
pipeline = Pipeline(stages=indexers + encoders)

# Fit and apply on the same DataFrame.
fitted_pipeline = pipeline.fit(df)
df_processed = fitted_pipeline.transform(df)

display(df_processed)

# Train and test data split

# MLflow

In [0]:
%python
import mlflow
import mlflow.spark
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import VectorAssembler

In [0]:
# Names of the columns in df_processed whose type is string.
string_cols = [
    field.name
    for field in df_processed.schema.fields
    if field.dataType.simpleString() == "string"
]

print(string_cols)

In [0]:
from pyspark.ml.feature import StringIndexer

# One StringIndexer per string column (<col> -> <col>_index), configured with
# handleInvalid="keep".
indexers = [
    StringIndexer(
        inputCol=name,
        outputCol=name+"_index",
        handleInvalid="keep",
    )
    for name in string_cols
]

In [0]:
from pyspark.ml.feature import OneHotEncoder

# One OneHotEncoder per string column: <col>_index -> <col>_vec
encoders = [
    OneHotEncoder(
        inputCol=name+"_index",
        outputCol=name+"_vec",
    )
    for name in string_cols
]

In [0]:
# df_processed already carries the <col>_index / <col>_vec columns produced by
# the first indexing/encoding pass. They are not string-typed, so without this
# filter they would be treated as raw numeric inputs: the index columns would
# become features and every *_vec column would be listed twice.
generated_suffixes = ("_index", "_vec")

# Raw non-string input columns, excluding the label and generated columns.
# NOTE(review): any identifier-like non-string column (e.g. a customer id) is
# still included here — confirm against the table schema.
numeric_cols = [
    f.name for f in df_processed.schema.fields
    if f.dataType.simpleString() != "string"
    and f.name != "Churn"
    and not f.name.endswith(generated_suffixes)
]

# Final assembler inputs: raw non-string columns plus one one-hot vector per
# string column.
feature_cols = numeric_cols + [c+"_vec" for c in string_cols]

In [0]:
from pyspark.ml.feature import VectorAssembler

# Combine every column listed in feature_cols into one "features" vector column.
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

In [0]:
# Remove the *_index / *_vec columns created by the earlier pipeline run so the
# stages below can recreate them without output-column name clashes.
cols_to_drop = [c for c in df_processed.columns 
                if c.endswith("_index") or c.endswith("_vec")]

df_clean = df_processed.drop(*cols_to_drop)

# Build the feature pipeline from the indexers, encoders and assembler defined
# in the preceding cells. The earlier `pipeline` object has no VectorAssembler
# stage, so reusing it would leave df_final without the "features" column that
# the classifiers read.
feature_pipeline = Pipeline(stages=indexers + encoders + [assembler])

df_final = feature_pipeline.fit(df_clean).transform(df_clean)

In [0]:
%python
# Seeded 80/20 random split into training and test sets.
train_df, test_df = df_final.randomSplit(weights=[0.8, 0.2], seed=42)

In [0]:
%python
from pyspark.ml.classification import (
    LogisticRegression,
    DecisionTreeClassifier,
    RandomForestClassifier,
    GBTClassifier
)

# Every candidate classifier reads the same label and feature columns.
_shared_cols = {"labelCol": "Churn", "featuresCol": "features"}

# Display name -> unfitted estimator; insertion order is the training order.
models = {
    "Logistic Regression": LogisticRegression(**_shared_cols),
    "Decision Tree": DecisionTreeClassifier(**_shared_cols),
    "Random Forest": RandomForestClassifier(**_shared_cols),
    "Gradient Boosted Trees": GBTClassifier(**_shared_cols),
}

In [0]:
%python
import os
import mlflow
import mlflow.spark
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Point MLflow's DFS temp location at a volume path
# (UC presumably means Unity Catalog — confirm the volume exists).
os.environ["MLFLOW_DFS_TMP"] = "/Volumes/main/default/mlflow_volume/tmp"

# Scores predictions by area under the ROC curve against the Churn label.
evaluator = BinaryClassificationEvaluator(labelCol="Churn", metricName="areaUnderROC")

# Train each candidate in its own MLflow run, score it on the test split,
# then log the model name, AUC, and fitted model.
for model_name in models:
    model = models[model_name]

    with mlflow.start_run(run_name=model_name):
        fitted_model = model.fit(train_df)
        predictions = fitted_model.transform(test_df)
        auc = evaluator.evaluate(predictions)

        mlflow.log_param("model_type", model_name)
        mlflow.log_metric("AUC", auc)
        mlflow.spark.log_model(fitted_model, "model")

        print(f"{model_name} - AUC: {auc}")