In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn as sk
from sklearn import (
    model_selection,
    preprocessing,
    dummy,
    metrics,
    ensemble,
    tree,
    neighbors,
    pipeline,
    compose,
    linear_model,
    svm,
)
import xgboost as xgb

In [21]:
CAT_COLS = [
    "gender",
    "SeniorCitizen",
    "Partner",
    "Dependents",
    "PhoneService",
    "MultipleLines",
    "InternetService",
    "OnlineSecurity",
    "OnlineBackup",
    "DeviceProtection",
    "TechSupport",
    "StreamingTV",
    "StreamingMovies",
    "Contract",
    "PaperlessBilling",
    "PaymentMethod",
]
NUM_COLS = ["tenure", "MonthlyCharges", "TotalCharges"]
TARGET = ["Churn"]
SEED = 69420

df = pd.read_csv(
    "../data/WA_Fn-UseC_-Telco-Customer-Churn.csv", usecols=CAT_COLS + NUM_COLS + TARGET
)
df.shape

(7043, 20)

In [5]:
CAT_COLS_OHE = ["PaymentMethod", "Contract", "InternetService"]
CAT_COLS_LE = list(set(CAT_COLS) - set(CAT_COLS_OHE))

In [35]:
X, y = df.drop(columns=TARGET), df[TARGET]


def to_object(x):
    return np.array(x).astype("float32")


df.TotalCharges = df.TotalCharges.replace(to_replace=" ", value="0")


input_preprocessor = compose.ColumnTransformer(
    transformers=[
        (
            "change_dtype_TotalCharges_",
            preprocessing.FunctionTransformer(to_object),
            ["TotalCharges"],
        ),
        ("num_col_StandardScaler_", preprocessing.StandardScaler(), NUM_COLS),
        ("cat_col_OneHotEncoder_", preprocessing.OneHotEncoder(), CAT_COLS_OHE),
        ("cat_col_LabelEncoder_", preprocessing.LabelEncoder(), CAT_COLS),
    ]
)

target_preprocessor = compose.ColumnTransformer(
    transformers=[("target_col_LabelEncoder_", preprocessing.LabelEncoder(), TARGET)]
)

In [28]:
print(df.shape)
df[df["TotalCharges"] == " "].shape

(7043, 20)


(11, 20)

In [26]:
df["TotalCharges"].unique()

array(['29.85', '1889.5', '108.15', ..., '346.45', '306.6', '6844.5'],
      dtype=object)

In [37]:
totalChanges_transformer = compose.ColumnTransformer(
    transformers=[
        (
            "change_dtype_TotalCharges_",
            preprocessing.FunctionTransformer(to_object),
            ["TotalCharges"],
        ),
    ]
)
totalChanges_transformer.fit(X["TotalCharges"])

IndexError: tuple index out of range

In [33]:
X, y = df.drop(columns=TARGET), df[TARGET]

X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.1, stratify=y, random_state=SEED
)

input_preprocessor.fit(X)

# X_train, y_train = input_preprocessor.fit(X_train).transform(
#     X_train
# ), target_preprocessor.fit(y_train).transform(y_train)
# X_test, y_test = input_preprocessor.transform(X_test), target_preprocessor.transform(
#     y_test
# )

TypeError: fit_transform() takes 2 positional arguments but 3 were given

In [9]:
def run_experiments(
    models, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test, seed=SEED
) -> pd.DataFrame:
    results = dict()
    for name, model in models:
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        accuracy = metrics.accuracy_score(y_test, predictions)
        precision, recall, fscore, _ = metrics.precision_recall_fscore_support(
            y_test, predictions, average="weighted"
        )
        print(f"{name} --> {fscore}")
        results[name] = (accuracy, precision, recall, fscore)
    return pd.DataFrame.from_dict(results)


models = [
    (
        "dummy_classifier",
        dummy.DummyClassifier(random_state=SEED, strategy="most_frequent"),
    ),
    ("k_nearest_neighbors", neighbors.KNeighborsClassifier()),
    (
        "logistic_regression",
        linear_model.LogisticRegression(
            random_state=SEED, solver="liblinear", class_weight="balanced"
        ),
    ),
    ("support_vector_machines", svm.SVC(random_state=SEED, kernel="rbf")),
    ("random_forest", ensemble.RandomForestClassifier(random_state=SEED)),
    ("gradient_boosting", ensemble.GradientBoostingClassifier(random_state=SEED)),
    ("decision_tree", tree.DecisionTreeClassifier(random_state=SEED)),
    ("adaboost", ensemble.AdaBoostClassifier()),
    (
        "voting",
        ensemble.VotingClassifier(
            estimators=[
                ("gbc", ensemble.GradientBoostingClassifier()),
                ("lr", linear_model.LogisticRegression()),
                ("abc", ensemble.AdaBoostClassifier()),
            ],
            voting="soft",
        ),
    ),
]

results = run_experiments(models)
results.columns = ["accuracy", "precision", "recall", "fscore"]
results = results.sort_values(by=["fscore"], ascending=False)
results

dummy_classifier --> 0.6224062443822016


  _warn_prf(average, modifier, msg_start, len(result))


ValueError: could not convert string to float: 'Male'