# Aim: To Detect Correlation, and Extract Non-Correlated Variables

In [113]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import scipy
import seaborn as sns

In [114]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [115]:
import sklearn
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, ParameterGrid
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

In [116]:
import sys
import os
import math
import csv
import json

## Helper Methods and Variables

In [117]:
def sentence_to_snake_case(sentence):
    """Convert a whitespace-separated phrase to lower-case snake_case.

    The input is split on runs of whitespace, every piece is lower-cased,
    and the pieces are joined with single underscores. Characters inside a
    piece (punctuation, hyphens, existing underscores) are left as they are.
    """
    lowered_words = []
    for token in sentence.split():
        lowered_words.append(token.lower())
    return '_'.join(lowered_words)

In [118]:
useless_cols = ['CustomerID', 'Count', 'Country', 'State', 'City', \
                'Zip Code', 'Lat Long', 'Churn Score', 'Churn Label', 'CLTV', 'Churn Reason', 'Total Charges']

In [119]:
y_cols = ['Churn Value', 'Tenure Months']

In [120]:
short_term_cols = ['Phone Service', 'Multiple Lines', 'Internet Service', \
                   'Online Security', 'Online Backup', 'Device Protection', 'Tech Support', \
                   'Streaming TV', 'Streaming Movies']

## Prepare the data

In [121]:
os.getcwd()

'C:\\D Drive\\University of Washington\\Study\\Quarter 2\\DATA 557\\Project\\ML'

In [122]:
data = pd.read_csv('../data/Telco_customer_churn_cleaned.csv')

In [123]:
data.head()

Unnamed: 0.1,Unnamed: 0,CustomerID,Count,Country,State,City,Zip Code,Lat Long,Latitude,Longitude,...,Contract,Paperless Billing,Payment Method,Monthly Charges,Total Charges,Churn Label,Churn Value,Churn Score,CLTV,Churn Reason
0,0,3668-QPYBK,1,United States,California,Los Angeles,90003,"33.964131, -118.272783",33.964131,-118.272783,...,Month-to-month,Yes,Mailed check,53.85,108.15,Yes,1,86,3239,Competitor made better offer
1,1,9237-HQITU,1,United States,California,Los Angeles,90005,"34.059281, -118.30742",34.059281,-118.30742,...,Month-to-month,Yes,Electronic check,70.7,151.65,Yes,1,67,2701,Moved
2,2,9305-CDSKC,1,United States,California,Los Angeles,90006,"34.048013, -118.293953",34.048013,-118.293953,...,Month-to-month,Yes,Electronic check,99.65,820.5,Yes,1,86,5372,Moved
3,3,7892-POOKP,1,United States,California,Los Angeles,90010,"34.062125, -118.315709",34.062125,-118.315709,...,Month-to-month,Yes,Electronic check,104.8,3046.05,Yes,1,84,5003,Moved
4,4,0280-XJGEX,1,United States,California,Los Angeles,90015,"34.039224, -118.266293",34.039224,-118.266293,...,Month-to-month,Yes,Bank transfer (automatic),103.7,5036.3,Yes,1,89,5340,Competitor had better devices


In [124]:
data = data.rename(columns = lambda column: sentence_to_snake_case(column))

In [125]:
data.head()

Unnamed: 0,unnamed:_0,customerid,count,country,state,city,zip_code,lat_long,latitude,longitude,...,contract,paperless_billing,payment_method,monthly_charges,total_charges,churn_label,churn_value,churn_score,cltv,churn_reason
0,0,3668-QPYBK,1,United States,California,Los Angeles,90003,"33.964131, -118.272783",33.964131,-118.272783,...,Month-to-month,Yes,Mailed check,53.85,108.15,Yes,1,86,3239,Competitor made better offer
1,1,9237-HQITU,1,United States,California,Los Angeles,90005,"34.059281, -118.30742",34.059281,-118.30742,...,Month-to-month,Yes,Electronic check,70.7,151.65,Yes,1,67,2701,Moved
2,2,9305-CDSKC,1,United States,California,Los Angeles,90006,"34.048013, -118.293953",34.048013,-118.293953,...,Month-to-month,Yes,Electronic check,99.65,820.5,Yes,1,86,5372,Moved
3,3,7892-POOKP,1,United States,California,Los Angeles,90010,"34.062125, -118.315709",34.062125,-118.315709,...,Month-to-month,Yes,Electronic check,104.8,3046.05,Yes,1,84,5003,Moved
4,4,0280-XJGEX,1,United States,California,Los Angeles,90015,"34.039224, -118.266293",34.039224,-118.266293,...,Month-to-month,Yes,Bank transfer (automatic),103.7,5036.3,Yes,1,89,5340,Competitor had better devices


In [126]:
# Convert the drop-list names to snake_case so they match the renamed DataFrame columns.
# Note: 'CustomerID' is one of the entries in useless_cols, so it is dropped along with the rest.
useless_cols = [sentence_to_snake_case(column) for column in useless_cols]

In [127]:
y_cols = [sentence_to_snake_case(column) for column in y_cols]

In [128]:
y_cols

['churn_value', 'tenure_months']

In [129]:
# Remove the leading "unnamed" column (selected by position) together with
# every column listed in useless_cols
data = data.drop(columns=[data.columns[0], *useless_cols])

In [130]:
data.head()

Unnamed: 0,latitude,longitude,gender,senior_citizen,partner,dependents,tenure_months,phone_service,multiple_lines,internet_service,...,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,contract,paperless_billing,payment_method,monthly_charges,churn_value
0,33.964131,-118.272783,Male,No,No,No,2,Yes,No,DSL,...,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,1
1,34.059281,-118.30742,Female,No,No,Yes,2,Yes,No,Fiber optic,...,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,1
2,34.048013,-118.293953,Female,No,No,Yes,8,Yes,Yes,Fiber optic,...,No,Yes,No,Yes,Yes,Month-to-month,Yes,Electronic check,99.65,1
3,34.062125,-118.315709,Female,No,Yes,Yes,28,Yes,Yes,Fiber optic,...,No,Yes,Yes,Yes,Yes,Month-to-month,Yes,Electronic check,104.8,1
4,34.039224,-118.266293,Male,No,No,Yes,49,Yes,Yes,Fiber optic,...,Yes,Yes,No,Yes,Yes,Month-to-month,Yes,Bank transfer (automatic),103.7,1


## Get either the best case or worst case scenario

### The data being set from here will decide how the model will treat its unknown churn customers

**Our y is a check on whether the user will churn within 12 months or not**

In [131]:
def best_case_churn(row):
    """Label a row for "churned within 12 months", best-case variant.

    Returns 1 only when the row shows an observed churn (``churn_value == 1``)
    with ``tenure_months`` below 12. Rows with a tenure of 12 or more return 0,
    and rows under 12 months without an observed churn are optimistically
    labelled 0 as well.
    """
    tenure = row['tenure_months']
    if tenure >= 12:
        # Stayed at least 12 months, so no churn inside the window.
        return 0
    churned_early = tenure < 12 and row['churn_value'] == 1
    # Everything that is not an observed early churn gets the optimistic label.
    return 1 if churned_early else 0

In [132]:
def worst_case_churn(row):
    """Label a row for "churned within 12 months", worst-case variant.

    Returns 0 when ``tenure_months`` is 12 or more, and 1 otherwise. A row
    under 12 months is either an observed churn (``churn_value == 1``) or a
    customer whose outcome is not yet known; the worst-case assumption treats
    the latter as a churn too, so both cases share the label 1.
    """
    # User has definitely not churned within the 12 months
    if row['tenure_months'] >= 12:
        return 0
    # Observed early churn, or worst case assumption for the undecided rows
    return 1

### Change the function passed to `data.apply` below (`best_case_churn` or `worst_case_churn`) to switch between best case and worst case

In [133]:
main_y_col = 'churn'

In [134]:
data[main_y_col] = data.apply(worst_case_churn, axis=1)

In [135]:
data['tenure_months'].unique()

array([ 2,  8, 28, 49, 10,  1, 47, 17,  5, 34, 11, 15, 18,  9,  7, 12, 25,
       68, 55, 37,  3, 27, 20,  4, 58, 53, 13,  6, 19, 59, 16, 52, 24, 32,
       38, 54, 43, 63, 21, 69, 22, 61, 60, 48, 40, 23, 39, 35, 56, 65, 33,
       30, 45, 46, 62, 70, 50, 44, 71, 26, 14, 41, 66, 64, 29, 42, 67, 51,
       31, 57, 36, 72,  0], dtype=int64)

In [136]:
# Register the derived target next to the raw outcome columns. The membership
# check keeps the cell idempotent: re-running it does not append a duplicate.
if main_y_col not in y_cols:
    y_cols.append(main_y_col)
y_cols

['churn_value', 'tenure_months', 'churn']

In [137]:
y_cols

['churn_value', 'tenure_months', 'churn']

## Divide the dataset into train and test sets

In [138]:
RNG = np.random.RandomState(seed=420)

In [139]:
# Shuffle the rows and hold out 10% of them as the test set.
# NOTE(review): RNG is a RandomState instance rather than an integer seed; the split
# presumably advances its state, so re-running only this cell would give a different
# split than a fresh "Restart & Run All" — confirm, and re-run the RNG cell first if so.
train, test = train_test_split(data, test_size=0.1, shuffle=True, random_state=RNG)

In [140]:
train.shape

(6338, 22)

In [141]:
test.shape

(705, 22)

## Save the dataset

In [142]:
# data.to_csv('./data/ml_best_case.csv')

## Pre-process the data

### Normalize

**Remember to save normalization details of train data, to apply to test data**

Pipeline transforms automatically do so for us.

In [143]:
# Collect the int64/float64 columns to standardize, leaving out every column
# listed in y_cols (the outcome columns and the derived target)
numeric_cols = list(data.select_dtypes(include=['int64', 'float64']))
numeric_cols = [column for column in numeric_cols if column not in y_cols]
numeric_cols

['latitude', 'longitude', 'monthly_charges']

In [144]:
# Pipeline is to maintain consistency
standardization_transformer = Pipeline(steps=[('scaler', StandardScaler())])

In [145]:
# ColumnTransformer helps standardize only selected columns
preprocessor = ColumnTransformer(
    transformers=[('num', standardization_transformer, numeric_cols)],
    remainder='passthrough'  # This leaves the rest of the columns in the dataset unchanged
)

In [146]:
preprocessed_train_array = preprocessor.fit(train).transform(train)

In [147]:
preprocessed_columns = [col.split('__')[1] for col in preprocessor.get_feature_names_out()]
# preprocessed_columns

## Warning!! Bad Coding Practice Alert

In [148]:
final_columns = [column for column in train.columns if (column not in y_cols or column == main_y_col)]
# final_columns

In [149]:
# Instead of mapping preprocessed_columns back to the old data.columns to recover the dtypes,
# the dtypes are hard-coded from the ColumnTransformer output names ("<transformer>__<column>"):
#   - columns with the "num" prefix went through the scaler, so they are float64
#   - every other column is treated as object
#     (the target column is the exception; it is switched to int64 separately afterwards)
preprocessed_column_types = ['float64' if col.split('__')[0]=='num' else 'object' for col in preprocessor.get_feature_names_out()]

In [150]:
# The derived target holds 0/1 labels, so store it as an integer column.
# It is located by name instead of being assumed to sit in the last position.
preprocessed_column_types[preprocessed_columns.index(main_y_col)] = 'int64'

In [151]:
# Pair each output column name with its dtype string (both lists are in the same order)
preprocessed_column_dtypes = dict(zip(preprocessed_columns, preprocessed_column_types))

In [152]:
preprocessed_train = pd.DataFrame(data=preprocessed_train_array, columns=preprocessed_columns)
preprocessed_train = preprocessed_train.astype(preprocessed_column_dtypes)

In [153]:
preprocessed_train.head()

Unnamed: 0,latitude,longitude,monthly_charges,gender,senior_citizen,partner,dependents,tenure_months,phone_service,multiple_lines,...,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,contract,paperless_billing,payment_method,churn_value,churn
0,-0.498158,-0.371946,1.295086,Male,No,Yes,Yes,60,Yes,Yes,...,Yes,No,Yes,Yes,Yes,Month-to-month,Yes,Bank transfer (automatic),0,0
1,-1.423199,1.358143,-1.516232,Female,No,No,No,20,Yes,No,...,No internet service,No internet service,No internet service,No internet service,No internet service,One year,Yes,Credit card (automatic),0,0
2,0.444907,-1.026476,0.965322,Male,No,No,No,17,Yes,No,...,No,No,No,Yes,Yes,Month-to-month,No,Electronic check,1,0
3,1.202459,-0.117674,-1.514566,Female,No,No,No,29,Yes,No,...,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,0,0
4,-0.899465,0.660012,0.653879,Female,No,No,No,49,Yes,Yes,...,No,No,No,No,Yes,Month-to-month,Yes,Electronic check,0,0


In [154]:
preprocessed_train = preprocessed_train[final_columns]

In [155]:
preprocessed_test_array = preprocessor.transform(test)

In [156]:
preprocessed_test = pd.DataFrame(data=preprocessed_test_array, columns=preprocessed_columns)
preprocessed_test = preprocessed_test.astype(preprocessed_column_dtypes)

In [157]:
preprocessed_test.head()

Unnamed: 0,latitude,longitude,monthly_charges,gender,senior_citizen,partner,dependents,tenure_months,phone_service,multiple_lines,...,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,contract,paperless_billing,payment_method,churn_value,churn
0,1.077896,-1.383298,1.331726,Female,Yes,Yes,No,62,Yes,Yes,...,No,Yes,No,Yes,Yes,One year,Yes,Electronic check,0,0
1,-0.903949,0.703809,-1.479591,Male,No,No,No,47,Yes,No,...,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,No,Bank transfer (automatic),0,0
2,-0.863037,0.738054,-0.165534,Male,No,No,No,9,Yes,Yes,...,No,No,No,No,Yes,Month-to-month,No,Mailed check,0,1
3,0.495985,-1.031974,0.868725,Female,Yes,No,No,32,Yes,No,...,No,No,No,Yes,Yes,Month-to-month,No,Electronic check,0,0
4,0.898898,-0.799467,0.184215,Female,No,No,No,4,Yes,No,...,No,No,No,No,No,Month-to-month,No,Electronic check,1,1


In [158]:
preprocessed_test = preprocessed_test[final_columns]

In [159]:
# preprocessed_test.head()

## Get One-hot Encoding of the Object Data Columns

### Train Dataset

In [160]:
object_columns = [column for column in preprocessed_train.columns if preprocessed_train[column].dtype=='object']

In [161]:
preprocessed_train_encoded = pd.get_dummies(preprocessed_train, columns=object_columns)

In [162]:
preprocessed_train_encoded

Unnamed: 0,latitude,longitude,monthly_charges,churn,gender_Female,gender_Male,senior_citizen_No,senior_citizen_Yes,partner_No,partner_Yes,...,streaming_movies_Yes,contract_Month-to-month,contract_One year,contract_Two year,paperless_billing_No,paperless_billing_Yes,payment_method_Bank transfer (automatic),payment_method_Credit card (automatic),payment_method_Electronic check,payment_method_Mailed check
0,-0.498158,-0.371946,1.295086,0,False,True,True,False,False,True,...,True,True,False,False,False,True,True,False,False,False
1,-1.423199,1.358143,-1.516232,0,True,False,True,False,True,False,...,False,False,True,False,False,True,False,True,False,False
2,0.444907,-1.026476,0.965322,0,False,True,True,False,True,False,...,True,True,False,False,True,False,False,False,True,False
3,1.202459,-0.117674,-1.514566,0,True,False,True,False,True,False,...,False,False,False,True,True,False,False,False,False,True
4,-0.899465,0.660012,0.653879,0,True,False,True,False,True,False,...,True,True,False,False,False,True,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6333,0.180098,-0.149465,-1.512901,0,False,True,True,False,False,True,...,False,False,True,False,False,True,True,False,False,False
6334,-1.035110,1.309279,-1.517897,1,True,False,True,False,False,True,...,False,True,False,False,True,False,False,False,False,True
6335,-0.844825,0.647366,-0.317092,0,False,True,True,False,False,True,...,True,False,True,False,False,True,False,True,False,False
6336,0.392377,-0.207744,-1.489584,0,False,True,True,False,True,False,...,False,False,True,False,True,False,False,True,False,False


In [163]:
preprocessed_train_encoded = preprocessed_train_encoded.rename(columns = lambda column: sentence_to_snake_case(column))

In [164]:
preprocessed_train_encoded

Unnamed: 0,latitude,longitude,monthly_charges,churn,gender_female,gender_male,senior_citizen_no,senior_citizen_yes,partner_no,partner_yes,...,streaming_movies_yes,contract_month-to-month,contract_one_year,contract_two_year,paperless_billing_no,paperless_billing_yes,payment_method_bank_transfer_(automatic),payment_method_credit_card_(automatic),payment_method_electronic_check,payment_method_mailed_check
0,-0.498158,-0.371946,1.295086,0,False,True,True,False,False,True,...,True,True,False,False,False,True,True,False,False,False
1,-1.423199,1.358143,-1.516232,0,True,False,True,False,True,False,...,False,False,True,False,False,True,False,True,False,False
2,0.444907,-1.026476,0.965322,0,False,True,True,False,True,False,...,True,True,False,False,True,False,False,False,True,False
3,1.202459,-0.117674,-1.514566,0,True,False,True,False,True,False,...,False,False,False,True,True,False,False,False,False,True
4,-0.899465,0.660012,0.653879,0,True,False,True,False,True,False,...,True,True,False,False,False,True,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6333,0.180098,-0.149465,-1.512901,0,False,True,True,False,False,True,...,False,False,True,False,False,True,True,False,False,False
6334,-1.035110,1.309279,-1.517897,1,True,False,True,False,False,True,...,False,True,False,False,True,False,False,False,False,True
6335,-0.844825,0.647366,-0.317092,0,False,True,True,False,False,True,...,True,False,True,False,False,True,False,True,False,False
6336,0.392377,-0.207744,-1.489584,0,False,True,True,False,True,False,...,False,False,True,False,True,False,False,True,False,False


In [165]:
preprocessed_train_encoded.to_csv('./data/preprocessed_train_encoded.csv')

### Test Dataset

In [166]:
preprocessed_test_encoded = pd.get_dummies(preprocessed_test, columns=object_columns)

In [167]:
preprocessed_test_encoded

Unnamed: 0,latitude,longitude,monthly_charges,churn,gender_Female,gender_Male,senior_citizen_No,senior_citizen_Yes,partner_No,partner_Yes,...,streaming_movies_Yes,contract_Month-to-month,contract_One year,contract_Two year,paperless_billing_No,paperless_billing_Yes,payment_method_Bank transfer (automatic),payment_method_Credit card (automatic),payment_method_Electronic check,payment_method_Mailed check
0,1.077896,-1.383298,1.331726,0,True,False,False,True,False,True,...,True,False,True,False,False,True,False,False,True,False
1,-0.903949,0.703809,-1.479591,0,False,True,True,False,True,False,...,False,False,False,True,True,False,True,False,False,False
2,-0.863037,0.738054,-0.165534,1,False,True,True,False,True,False,...,True,True,False,False,True,False,False,False,False,True
3,0.495985,-1.031974,0.868725,0,True,False,False,True,True,False,...,True,True,False,False,True,False,False,False,True,False
4,0.898898,-0.799467,0.184215,1,True,False,True,False,True,False,...,False,True,False,False,True,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
700,0.565070,-1.052524,0.917023,0,True,False,True,False,False,True,...,True,False,False,True,False,True,False,False,True,False
701,1.201881,-1.740768,-1.489584,1,False,True,True,False,True,False,...,False,True,False,False,True,False,True,False,False,False
702,0.935368,-0.782134,-0.168865,0,True,False,True,False,False,True,...,False,False,False,True,False,True,True,False,False,False
703,1.791356,-1.034135,-1.532887,1,False,True,True,False,False,True,...,False,True,False,False,True,False,False,False,False,True


In [168]:
# Snake-case the dummy column names, then align the test frame to the training columns.
# Train and test are one-hot encoded independently, so a category missing from the test
# split would otherwise leave a column out: such a column is added as all-False, and a
# category never seen in training is dropped. Both frames end up with the same columns
# in the same order.
preprocessed_test_encoded = (
    preprocessed_test_encoded
    .rename(columns=sentence_to_snake_case)
    .reindex(columns=preprocessed_train_encoded.columns, fill_value=False)
)

In [169]:
preprocessed_test_encoded

Unnamed: 0,latitude,longitude,monthly_charges,churn,gender_female,gender_male,senior_citizen_no,senior_citizen_yes,partner_no,partner_yes,...,streaming_movies_yes,contract_month-to-month,contract_one_year,contract_two_year,paperless_billing_no,paperless_billing_yes,payment_method_bank_transfer_(automatic),payment_method_credit_card_(automatic),payment_method_electronic_check,payment_method_mailed_check
0,1.077896,-1.383298,1.331726,0,True,False,False,True,False,True,...,True,False,True,False,False,True,False,False,True,False
1,-0.903949,0.703809,-1.479591,0,False,True,True,False,True,False,...,False,False,False,True,True,False,True,False,False,False
2,-0.863037,0.738054,-0.165534,1,False,True,True,False,True,False,...,True,True,False,False,True,False,False,False,False,True
3,0.495985,-1.031974,0.868725,0,True,False,False,True,True,False,...,True,True,False,False,True,False,False,False,True,False
4,0.898898,-0.799467,0.184215,1,True,False,True,False,True,False,...,False,True,False,False,True,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
700,0.565070,-1.052524,0.917023,0,True,False,True,False,False,True,...,True,False,False,True,False,True,False,False,True,False
701,1.201881,-1.740768,-1.489584,1,False,True,True,False,True,False,...,False,True,False,False,True,False,True,False,False,False
702,0.935368,-0.782134,-0.168865,0,True,False,True,False,False,True,...,False,False,False,True,False,True,True,False,False,False
703,1.791356,-1.034135,-1.532887,1,False,True,True,False,False,True,...,False,True,False,False,True,False,False,False,False,True


In [170]:
preprocessed_test_encoded.to_csv('./data/preprocessed_test_encoded.csv')

## Find Correlations and remove Perfectly correlated variables.

In [171]:
correlation_matrix = preprocessed_train_encoded.corr()

In [181]:
correlation_matrix.head()

Unnamed: 0,latitude,longitude,monthly_charges,churn,gender_female,gender_male,senior_citizen_no,senior_citizen_yes,partner_no,partner_yes,...,streaming_movies_yes,contract_month-to-month,contract_one_year,contract_two_year,paperless_billing_no,paperless_billing_yes,payment_method_bank_transfer_(automatic),payment_method_credit_card_(automatic),payment_method_electronic_check,payment_method_mailed_check
latitude,1.0,-0.877206,-0.020473,-0.017142,-0.008965,0.008965,0.011495,-0.011495,0.007535,-0.007535,...,-0.000805,-0.009192,0.003901,0.006999,0.017724,-0.017724,0.009987,-0.01063,-0.003595,0.004641
longitude,-0.877206,1.0,0.02568,0.011172,0.011392,-0.011392,-0.006748,0.006748,-0.008153,0.008153,...,0.002081,0.00855,0.002468,-0.012313,-0.021633,0.021633,-0.013828,0.003964,0.017923,-0.010499
monthly_charges,-0.020473,0.02568,1.0,-0.188319,0.009222,-0.009222,-0.213676,0.213676,-0.108773,0.108773,...,0.625604,0.05815,0.002542,-0.070189,-0.342714,0.342714,0.053603,0.029909,0.266553,-0.383736
churn,-0.017142,0.011172,-0.188319,1.0,-0.002507,0.002507,0.02238,-0.02238,0.304541,-0.304541,...,-0.219195,0.480506,-0.252479,-0.31964,0.003653,-0.003653,-0.188348,-0.185664,0.162249,0.184988
gender_female,-0.008965,0.011392,0.009222,-0.002507,1.0,-1.0,-0.001139,0.001139,-0.001135,0.001135,...,0.004105,0.007104,-0.013011,0.004107,-0.007656,0.007656,0.016176,0.000807,-0.007845,-0.007904


In [174]:
correlation_copy = correlation_matrix.copy(deep = True)

In [176]:
# Zero the diagonal (each variable's correlation with itself) so only cross-variable
# correlations remain. The fill is done on an explicit array copy and the frame is rebuilt
# from it: writing through `.values` is not guaranteed to reach the DataFrame, because the
# returned array can be a copy or a read-only view.
off_diagonal = correlation_copy.to_numpy(copy=True)
np.fill_diagonal(off_diagonal, 0)
correlation_copy = pd.DataFrame(
    off_diagonal,
    index=correlation_copy.index,
    columns=correlation_copy.columns,
)

In [180]:
correlation_copy.head()

Unnamed: 0,latitude,longitude,monthly_charges,churn,gender_female,gender_male,senior_citizen_no,senior_citizen_yes,partner_no,partner_yes,...,streaming_movies_yes,contract_month-to-month,contract_one_year,contract_two_year,paperless_billing_no,paperless_billing_yes,payment_method_bank_transfer_(automatic),payment_method_credit_card_(automatic),payment_method_electronic_check,payment_method_mailed_check
latitude,0.0,-0.877206,-0.020473,-0.017142,-0.008965,0.008965,0.011495,-0.011495,0.007535,-0.007535,...,-0.000805,-0.009192,0.003901,0.006999,0.017724,-0.017724,0.009987,-0.01063,-0.003595,0.004641
longitude,-0.877206,0.0,0.02568,0.011172,0.011392,-0.011392,-0.006748,0.006748,-0.008153,0.008153,...,0.002081,0.00855,0.002468,-0.012313,-0.021633,0.021633,-0.013828,0.003964,0.017923,-0.010499
monthly_charges,-0.020473,0.02568,0.0,-0.188319,0.009222,-0.009222,-0.213676,0.213676,-0.108773,0.108773,...,0.625604,0.05815,0.002542,-0.070189,-0.342714,0.342714,0.053603,0.029909,0.266553,-0.383736
churn,-0.017142,0.011172,-0.188319,0.0,-0.002507,0.002507,0.02238,-0.02238,0.304541,-0.304541,...,-0.219195,0.480506,-0.252479,-0.31964,0.003653,-0.003653,-0.188348,-0.185664,0.162249,0.184988
gender_female,-0.008965,0.011392,0.009222,-0.002507,0.0,-1.0,-0.001139,0.001139,-0.001135,0.001135,...,0.004105,0.007104,-0.013011,0.004107,-0.007656,0.007656,0.016176,0.000807,-0.007845,-0.007904


In [178]:
# Variables that have a correlation of +1 with at least one other variable (the diagonal
# was zeroed beforehand, so self-correlation does not count). A tolerance comparison is
# used because a computed coefficient can miss 1.0 by floating-point rounding.
# Only +1 is matched; perfectly negative pairs (-1) are not flagged.
perfect_corr_cols = correlation_copy.index[
    np.isclose(correlation_copy.to_numpy(), 1.0).any(axis=1)
].tolist()

In [179]:
perfect_corr_cols

['phone_service_no',
 'multiple_lines_no_phone_service',
 'internet_service_no',
 'online_security_no_internet_service',
 'online_backup_no_internet_service',
 'device_protection_no_internet_service',
 'tech_support_no_internet_service',
 'streaming_tv_no_internet_service',
 'streaming_movies_no_internet_service']

In [189]:
perfect_corr_cols2 = [column for column in perfect_corr_cols if column not in ['phone_service_no', 'internet_service_no']]

In [190]:
perfect_corr_cols2

['multiple_lines_no_phone_service',
 'online_security_no_internet_service',
 'online_backup_no_internet_service',
 'device_protection_no_internet_service',
 'tech_support_no_internet_service',
 'streaming_tv_no_internet_service',
 'streaming_movies_no_internet_service']

In [194]:
preprocessed_train_encoded.drop(columns=perfect_corr_cols2).to_csv('./data/preprocessed_train_encoded_no_corr.csv')

In [195]:
preprocessed_test_encoded.drop(columns=perfect_corr_cols2).to_csv('./data/preprocessed_test_encoded_no_corr.csv')

In [196]:
preprocessed_train_encoded.drop(columns=perfect_corr_cols2).columns

Index(['latitude', 'longitude', 'monthly_charges', 'churn', 'gender_female',
       'gender_male', 'senior_citizen_no', 'senior_citizen_yes', 'partner_no',
       'partner_yes', 'dependents_no', 'dependents_yes', 'phone_service_no',
       'phone_service_yes', 'multiple_lines_no', 'multiple_lines_yes',
       'internet_service_dsl', 'internet_service_fiber_optic',
       'internet_service_no', 'online_security_no', 'online_security_yes',
       'online_backup_no', 'online_backup_yes', 'device_protection_no',
       'device_protection_yes', 'tech_support_no', 'tech_support_yes',
       'streaming_tv_no', 'streaming_tv_yes', 'streaming_movies_no',
       'streaming_movies_yes', 'contract_month-to-month', 'contract_one_year',
       'contract_two_year', 'paperless_billing_no', 'paperless_billing_yes',
       'payment_method_bank_transfer_(automatic)',
       'payment_method_credit_card_(automatic)',
       'payment_method_electronic_check', 'payment_method_mailed_check'],
      dtype='obj