In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score, confusion_matrix, classification_report, roc_curve
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

In [2]:
data = pd.read_csv("WA_Fn-UseC_-Telco-Customer-Churn.csv")

data = pd.get_dummies(data, columns=["Churn"], drop_first=True)

data = data[data["TotalCharges"] != " "]
data["TotalCharges"] = pd.to_numeric(data["TotalCharges"], errors="ignore")

train_set, test_set = train_test_split(data, test_size = 0.2, random_state=44)

data = train_set.drop(["customerID", "Churn_Yes"], axis=1)
data_labels = train_set["Churn_Yes"].copy()

data_num = data[["tenure", "MonthlyCharges", "TotalCharges"]]
data_cat = data.drop(columns=["tenure", "MonthlyCharges", "TotalCharges"])


num_attribs = list(data_num)
cat_attribs = list(data_cat)

full_pipeline = ColumnTransformer([
    ("num", Pipeline([
        ("poly", PolynomialFeatures(degree=5,
                                    include_bias=False,
                                    interaction_only=False)),
        ("scaler", StandardScaler()),
    ]), num_attribs),
    ("cat", OneHotEncoder(drop="first"), cat_attribs),
])

new_data = full_pipeline.fit_transform(data)

class DropTransformer(BaseEstimator, TransformerMixin):
    
    def __init__(self, threshold):
        self.threshold = threshold
        print("It works!")
        
    def fit(self, X, y = None):
        
        return self
    
    def transform(self, X):

        data = pd.DataFrame(X)
        corr_matrix = data.corr(method="spearman")
        upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
        to_drop = [column for column in upper.columns if any(upper[column] > self.threshold)]
        transformed_data = np.array(data.drop(data.columns[to_drop], axis=1))
        
        return transformed_data

pipeline = Pipeline([("droper", DropTransformer(threshold = 0.95))])

data_prepared = pipeline.fit_transform(new_data)

X_train, X_val, y_train, y_val = train_test_split(data_prepared, data_labels, test_size = 0.25,
                                                  random_state=44, stratify=data_labels)

It works!


In [8]:
y = y_train

knn_clf = KNeighborsClassifier(n_neighbors = 41,
                               leaf_size = 10,
                               p = 1,
                               metric = "euclidean",
                               weights = "uniform")

knn_clf.fit(X_train, y)
y2 = y - knn_clf.predict(X_train)

In [9]:
log_clf = LogisticRegression(C=0.5,
                             solver="liblinear",
                             class_weight="balanced",
                             random_state=44)

log_clf.fit(X_train, y2)
y3 = y2 - log_clf.predict(X_train)

In [10]:
rnd_clf = RandomForestClassifier(max_depth=7,
                                 max_features="auto",
                                 min_samples_leaf=10,
                                 class_weight="balanced",
                                 random_state=44)

rnd_clf.fit(X_train, y3)
y4 = y3 - rnd_clf.predict(X_train)

In [11]:
svm_clf = SVC(kernel="rbf",
              C=1,
              gamma="auto",
              class_weight="balanced",
              random_state=44,
              probability=True)

svm_clf.fit(X_train, y4)
y5 = y4 - svm_clf.predict(X_train)

In [12]:
y_pred = sum(model.predict(X_val) for model in (knn_clf, log_clf, rnd_clf, svm_clf))

In [13]:
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.76      0.65      0.70      1035
           1       0.25      0.18      0.21       372
           2       0.00      0.00      0.00         0
           3       0.00      0.00      0.00         0
           4       0.00      0.00      0.00         0
           5       0.00      0.00      0.00         0
         252       0.00      0.00      0.00         0
         253       0.00      0.00      0.00         0
         254       0.00      0.00      0.00         0
         255       0.00      0.00      0.00         0

    accuracy                           0.52      1407
   macro avg       0.10      0.08      0.09      1407
weighted avg       0.62      0.52      0.57      1407



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
