In [17]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

In [2]:
# Load the Telco customer-churn CSV (path is relative to the working directory).
data = pd.read_csv(
    filepath_or_buffer="WA_Fn-UseC_-Telco-Customer-Churn.csv",
)

In [3]:
# Remove the rows whose TotalCharges is the single-space string " ", then
# convert the column to a numeric dtype.
# .copy() makes the filtered frame independent of the original, so the column
# assignment below does not raise SettingWithCopyWarning.
data = data[data["TotalCharges"] != " "].copy()
# Default errors="raise": any remaining unparsable value fails here instead of
# silently leaving the column as strings (errors="ignore" is deprecated).
data["TotalCharges"] = pd.to_numeric(data["TotalCharges"])

In [4]:
# 80/20 train/test split with a fixed seed.
train_set, test_set = train_test_split(data, test_size=0.2, random_state=44)

# Regression target: the MonthlyCharges column of the training rows.
data_labels = train_set["MonthlyCharges"].copy()
# NOTE: `data` is rebound to the training features here (identifier and target
# removed), so this cell cannot be re-run without re-running the load cells.
data = train_set.drop(columns=["customerID", "MonthlyCharges"])

In [26]:
# Summary statistics of the target, rounded to two decimals for display.
label_mean = round(data_labels.mean(), 2)
label_std = round(data_labels.std(), 2)
print("MonthlyCharges:\nmean: {}\nstd: {}".format(label_mean, label_std))

MonthlyCharges:
mean: 64.67
std: 30.1


In [5]:
# Split the features into the two numeric columns and everything else
# (the remaining columns are treated as categorical downstream).
data_num = data.loc[:, ["tenure", "TotalCharges"]]
data_cat = data.drop(["tenure", "TotalCharges"], axis=1)

In [6]:
# Column names routed to each branch of the preprocessing transformer.
num_attribs = data_num.columns.tolist()
cat_attribs = data_cat.columns.tolist()

# Standardise the numeric columns; one-hot encode the categorical columns,
# dropping the first level of each.
full_pipeline = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), num_attribs),
        ("cat", OneHotEncoder(drop="first"), cat_attribs),
    ]
)

new_data = full_pipeline.fit_transform(data)

In [7]:
class DropTransformer(BaseEstimator, TransformerMixin):
    """Drop features that are highly rank-correlated with an earlier feature.

    A column is dropped when its Spearman correlation with any column to its
    left (the strict upper triangle of the correlation matrix) is greater
    than ``threshold``. Only correlations *above* the threshold count, so
    strongly negative correlations never cause a drop.

    Parameters
    ----------
    threshold : float, default=0.8
        Correlation value above which the later column of a pair is dropped.

    Attributes
    ----------
    to_drop_ : list of int
        Positional indices of the columns selected for removal. Set by
        ``fit``.
    """

    def __init__(self, threshold = 0.8):
        self.threshold = threshold
        # NOTE(review): leftover debug output, kept so that constructing the
        # transformer behaves exactly as before. It fires on every
        # construction; consider removing it.
        print("It works!")

    def _correlated_positions(self, frame):
        """Return positional indices of the columns of ``frame`` to drop."""
        corr_matrix = frame.corr(method="spearman")
        # Keep only the strict upper triangle so each pair is examined once
        # and the diagonal (self-correlation) is ignored. The builtin `bool`
        # is used because the `np.bool` alias was removed from NumPy.
        upper_mask = np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
        upper = corr_matrix.where(upper_mask)
        # NaN entries (masked cells) compare as False, so they never flag.
        exceeds = (upper > self.threshold).any(axis=0).to_numpy()
        flagged_labels = upper.columns[exceeds]
        # Translate labels to positions so the selection works for both
        # integer-labelled arrays and DataFrames with named columns.
        return frame.columns.get_indexer(flagged_labels).tolist()

    def fit(self, X, y = None):
        """Learn which columns to drop from ``X``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Data used to compute the Spearman correlation matrix.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        self : DropTransformer
        """
        self.to_drop_ = self._correlated_positions(pd.DataFrame(X))
        return self

    def transform(self, X):
        """Remove the selected columns from ``X``.

        The columns chosen during ``fit`` are removed, so every dataset
        passed through a fitted transformer ends up with the same columns.
        If ``fit`` was never called, the selection is computed from ``X``
        itself, which is what this method always did previously.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        Returns
        -------
        numpy.ndarray
            ``X`` without the dropped columns.

        Raises
        ------
        ValueError
            If ``X`` has fewer columns than the data seen in ``fit`` such
            that a learned column position does not exist.
        """
        frame = pd.DataFrame(X)
        positions = getattr(self, "to_drop_", None)
        if positions is None:
            # Unfitted: fall back to the legacy per-call selection.
            positions = self._correlated_positions(frame)
        elif positions and max(positions) >= frame.shape[1]:
            raise ValueError(
                "X has {} columns but column position {} was selected "
                "during fit".format(frame.shape[1], max(positions))
            )

        dropped = set(positions)
        keep = [i for i in range(frame.shape[1]) if i not in dropped]
        transformed_data = np.array(frame.iloc[:, keep])

        return transformed_data

In [33]:
# Single-step pipeline: prune highly correlated columns (threshold 0.7) from
# the already scaled / one-hot encoded matrix.
pipeline = Pipeline(
    steps=[
        ("droper", DropTransformer(threshold=0.7)),
    ]
)

data_prepared = pipeline.fit_transform(new_data)

It works!


In [34]:
# Hold out 25% of the prepared training rows for validation (fixed seed).
X_train, X_val, y_train, y_val = train_test_split(
    data_prepared,
    data_labels,
    test_size=0.25,
    random_state=44,
)

In [35]:
# Ordinary least-squares fit on the training split; `fit` returns the
# estimator itself, so construction and fitting can be chained.
lin_reg = LinearRegression().fit(X_train, y_train)

In [36]:
# Root-mean-squared error of the fitted model on the training split.
y_pred = lin_reg.predict(X_train)
rmse = np.sqrt(
    mean_squared_error(y_true=y_train, y_pred=y_pred)
)
rmse

1.021280819363999

In [37]:
# Root-mean-squared error of the fitted model on the validation split.
y_pred = lin_reg.predict(X_val)
rmse = np.sqrt(
    mean_squared_error(y_true=y_val, y_pred=y_pred)
)
rmse

1.028813099444233