# Import the packages


In [1]:
import time
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from xgboost import XGBRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lars
from sklearn.linear_model import TheilSenRegressor
from sklearn.linear_model import HuberRegressor
from sklearn.linear_model import PassiveAggressiveRegressor
from sklearn.linear_model import ARDRegression
from sklearn.linear_model import BayesianRidge
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import OrthogonalMatchingPursuit
from sklearn.svm import SVR
from sklearn.svm import NuSVR
from sklearn.svm import LinearSVR
from sklearn.kernel_ridge import KernelRidge
from sklearn.isotonic import IsotonicRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from tabulate import tabulate


# Load data
## Data 1

In [2]:
# Read the first dataset from Data1.csv (relative path, resolved against the
# notebook's working directory) and preview its first five rows.
df1 = pd.read_csv("Data1.csv")
df1.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


The dataset has the following features:

- CustomerID: A unique ID that identifies each customer.

- Gender: The customer’s gender: Male, Female

- Senior Citizen: Indicates if the customer is 65 or older. In this file it is stored as an integer flag: 1 (yes), 0 (no)

- Partner: Indicates if the customer has a partner: Yes, No

- Dependents: Indicates if the customer lives with any dependents: Yes, No. Dependents could be children, parents, grandparents, etc.

- Tenure Months: Indicates the total amount of months that the customer has been with the company by the end of the quarter specified above.

- Phone Service: Indicates if the customer subscribes to home phone service with the company: Yes, No

- Multiple Lines: Indicates if the customer subscribes to multiple telephone lines with the company: Yes, No

- Internet Service: Indicates if the customer subscribes to Internet service with the company: No, DSL, Fiber Optic, Cable.

- Online Security: Indicates if the customer subscribes to an additional online security service provided by the company: Yes, No

- Online Backup: Indicates if the customer subscribes to an additional online backup service provided by the company: Yes, No

- Device Protection: Indicates if the customer subscribes to an additional device protection plan for their Internet equipment provided by the company: Yes, No

- Tech Support: Indicates if the customer subscribes to an additional technical support plan from the company with reduced wait times: Yes, No

- Streaming TV: Indicates if the customer uses their Internet service to stream television programming from a third party provider: Yes, No. The company does not charge an additional fee for this service.

- Streaming Movies: Indicates if the customer uses their Internet service to stream movies from a third party provider: Yes, No. The company does not charge an additional fee for this service.

- Contract: Indicates the customer’s current contract type: Month-to-Month, One Year, Two Year.

- Paperless Billing: Indicates if the customer has chosen paperless billing: Yes, No

- Payment Method: Indicates how the customer pays their bill: Bank Withdrawal, Credit Card, Mailed Check

- Monthly Charge: Indicates the customer’s current total monthly charge for all their services from the company.

- Total Charges: Indicates the customer’s total charges, calculated to the end of the quarter specified above.

- Churn Label: Yes = the customer left the company this quarter. No = the customer remained with the company. Directly related to Churn Value.

## Data 2

In [3]:
# Read the second dataset from the Excel workbook Data2.xlsx (relative path)
# and preview its first five rows.
# NOTE(review): pd.read_excel relies on an Excel reader engine being installed
# (presumably openpyxl for .xlsx) — confirm the environment provides one.
df2 = pd.read_excel("Data2.xlsx")
df2.head()

Unnamed: 0,CustomerID,Count,Country,State,City,Zip Code,Lat Long,Latitude,Longitude,Gender,...,Contract,Paperless Billing,Payment Method,Monthly Charges,Total Charges,Churn Label,Churn Value,Churn Score,CLTV,Churn Reason
0,3668-QPYBK,1,United States,California,Los Angeles,90003,"33.964131, -118.272783",33.964131,-118.272783,Male,...,Month-to-month,Yes,Mailed check,53.85,108.15,Yes,1,86,3239,Competitor made better offer
1,9237-HQITU,1,United States,California,Los Angeles,90005,"34.059281, -118.30742",34.059281,-118.30742,Female,...,Month-to-month,Yes,Electronic check,70.7,151.65,Yes,1,67,2701,Moved
2,9305-CDSKC,1,United States,California,Los Angeles,90006,"34.048013, -118.293953",34.048013,-118.293953,Female,...,Month-to-month,Yes,Electronic check,99.65,820.5,Yes,1,86,5372,Moved
3,7892-POOKP,1,United States,California,Los Angeles,90010,"34.062125, -118.315709",34.062125,-118.315709,Female,...,Month-to-month,Yes,Electronic check,104.8,3046.05,Yes,1,84,5003,Moved
4,0280-XJGEX,1,United States,California,Los Angeles,90015,"34.039224, -118.266293",34.039224,-118.266293,Male,...,Month-to-month,Yes,Bank transfer (automatic),103.7,5036.3,Yes,1,89,5340,Competitor had better devices


The dataset has the following features:

- CustomerID: A unique ID that identifies each customer.

- Count: A value used in reporting/dashboarding to sum up the number of customers in a filtered set.

- Country: The country of the customer’s primary residence.

- State: The state of the customer’s primary residence.

- City: The city of the customer’s primary residence.

- Zip Code: The zip code of the customer’s primary residence.

- Lat Long: The combined latitude and longitude of the customer’s primary residence.

- Latitude: The latitude of the customer’s primary residence.

- Longitude: The longitude of the customer’s primary residence.

- Gender: The customer’s gender: Male, Female

- Senior Citizen: Indicates if the customer is 65 or older: Yes, No

- Partner: Indicates if the customer has a partner: Yes, No

- Dependents: Indicates if the customer lives with any dependents: Yes, No. Dependents could be children, parents, grandparents, etc.

- Tenure Months: Indicates the total amount of months that the customer has been with the company by the end of the quarter specified above.

- Phone Service: Indicates if the customer subscribes to home phone service with the company: Yes, No

- Multiple Lines: Indicates if the customer subscribes to multiple telephone lines with the company: Yes, No

- Internet Service: Indicates if the customer subscribes to Internet service with the company: No, DSL, Fiber Optic, Cable.

- Online Security: Indicates if the customer subscribes to an additional online security service provided by the company: Yes, No

- Online Backup: Indicates if the customer subscribes to an additional online backup service provided by the company: Yes, No

- Device Protection: Indicates if the customer subscribes to an additional device protection plan for their Internet equipment provided by the company: Yes, No

- Tech Support: Indicates if the customer subscribes to an additional technical support plan from the company with reduced wait times: Yes, No

- Streaming TV: Indicates if the customer uses their Internet service to stream television programming from a third party provider: Yes, No. The company does not charge an additional fee for this service.

- Streaming Movies: Indicates if the customer uses their Internet service to stream movies from a third party provider: Yes, No. The company does not charge an additional fee for this service.

- Contract: Indicates the customer’s current contract type: Month-to-Month, One Year, Two Year.

- Paperless Billing: Indicates if the customer has chosen paperless billing: Yes, No

- Payment Method: Indicates how the customer pays their bill: Bank Withdrawal, Credit Card, Mailed Check

- Monthly Charge: Indicates the customer’s current total monthly charge for all their services from the company.

- Total Charges: Indicates the customer’s total charges, calculated to the end of the quarter specified above.

- Churn Label: Yes = the customer left the company this quarter. No = the customer remained with the company. Directly related to Churn Value.

- Churn Value: 1 = the customer left the company this quarter. 0 = the customer remained with the company. Directly related to Churn Label.

- Churn Score: A value from 0-100 that is calculated using the predictive tool IBM SPSS Modeler. The model incorporates multiple factors known to cause churn. The higher the score, the more likely the customer will churn.

- CLTV: Customer Lifetime Value. A predicted CLTV is calculated using corporate formulas and existing data. The higher the value, the more valuable the customer. High value customers should be monitored for churn.

- Churn Reason: A customer’s specific reason for leaving the company. Directly related to Churn Category.

# Data Preprocessing

## Data Cleaning


## Number of rows and columns in each dataframe

In [4]:
# (number of rows, number of columns) of the first dataset.
df1.shape

(7043, 21)

In [5]:
# (number of rows, number of columns) of the second dataset.
df2.shape

(7043, 33)

## Descriptive statistics before handling missing values


In [6]:
# Summary statistics of df1's numeric columns, transposed so that each
# column is shown as one row.
# NOTE(review): TotalCharges is missing from the recorded summary although its
# previewed values look numeric — presumably it is not parsed as a numeric
# column; confirm its dtype before using it in a model.
df1.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
SeniorCitizen,7043.0,0.162147,0.368612,0.0,0.0,0.0,0.0,1.0
tenure,7043.0,32.371149,24.559481,0.0,9.0,29.0,55.0,72.0
MonthlyCharges,7043.0,64.761692,30.090047,18.25,35.5,70.35,89.85,118.75


In [7]:
# Summary statistics of df2's numeric columns, transposed so that each
# column is shown as one row.
# NOTE(review): "Total Charges" is missing from the recorded summary although
# its previewed values look numeric — presumably it is not parsed as a numeric
# column; confirm its dtype before using it in a model.
df2.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Count,7043.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
Zip Code,7043.0,93521.964646,1865.794555,90001.0,92102.0,93552.0,95351.0,96161.0
Latitude,7043.0,36.282441,2.455723,32.555828,34.030915,36.391777,38.224869,41.962127
Longitude,7043.0,-119.79888,2.157889,-124.301372,-121.815412,-119.730885,-118.043237,-114.192901
Tenure Months,7043.0,32.371149,24.559481,0.0,9.0,29.0,55.0,72.0
Monthly Charges,7043.0,64.761692,30.090047,18.25,35.5,70.35,89.85,118.75
Churn Value,7043.0,0.26537,0.441561,0.0,0.0,0.0,1.0,1.0
Churn Score,7043.0,58.699418,21.525131,5.0,40.0,61.0,75.0,100.0
CLTV,7043.0,4400.295755,1183.057152,2003.0,3469.0,4527.0,5380.5,6500.0


## Data structure (column dtypes and non-null counts) before handling missing values


In [8]:
# Print df1's index range, column names, non-null counts and dtypes.
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [9]:
# Print df2's index range, column names, non-null counts and dtypes.
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 33 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   CustomerID         7043 non-null   object 
 1   Count              7043 non-null   int64  
 2   Country            7043 non-null   object 
 3   State              7043 non-null   object 
 4   City               7043 non-null   object 
 5   Zip Code           7043 non-null   int64  
 6   Lat Long           7043 non-null   object 
 7   Latitude           7043 non-null   float64
 8   Longitude          7043 non-null   float64
 9   Gender             7043 non-null   object 
 10  Senior Citizen     7043 non-null   object 
 11  Partner            7043 non-null   object 
 12  Dependents         7043 non-null   object 
 13  Tenure Months      7043 non-null   int64  
 14  Phone Service      7043 non-null   object 
 15  Multiple Lines     7043 non-null   object 
 16  Internet Service   7043 

## Check for duplicated values


In [10]:
# Select every row of df1 that repeats an earlier row in all columns
# (the first occurrence of each repeated row is not selected).
duplicate_rows = df1.loc[df1.duplicated(keep="first")]
# One print call; sep="\n" puts the table on the line after the heading.
print("Duplicate Rows across all columns:", duplicate_rows, sep="\n")

Duplicate Rows across all columns:
Empty DataFrame
Columns: [customerID, gender, SeniorCitizen, Partner, Dependents, tenure, PhoneService, MultipleLines, InternetService, OnlineSecurity, OnlineBackup, DeviceProtection, TechSupport, StreamingTV, StreamingMovies, Contract, PaperlessBilling, PaymentMethod, MonthlyCharges, TotalCharges, Churn]
Index: []

[0 rows x 21 columns]


In [11]:
# Select every row of df2 that repeats an earlier row in all columns
# (the first occurrence of each repeated row is not selected).
duplicate_rows = df2.loc[df2.duplicated(keep="first")]
# One print call; sep="\n" puts the table on the line after the heading.
print("Duplicate Rows across all columns:", duplicate_rows, sep="\n")

Duplicate Rows across all columns:
Empty DataFrame
Columns: [CustomerID, Count, Country, State, City, Zip Code, Lat Long, Latitude, Longitude, Gender, Senior Citizen, Partner, Dependents, Tenure Months, Phone Service, Multiple Lines, Internet Service, Online Security, Online Backup, Device Protection, Tech Support, Streaming TV, Streaming Movies, Contract, Paperless Billing, Payment Method, Monthly Charges, Total Charges, Churn Label, Churn Value, Churn Score, CLTV, Churn Reason]
Index: []

[0 rows x 33 columns]


## Check for missing values


In [12]:
# Boolean Series indexed by column name: True where the column of df1 holds
# at least one NaN/None entry.
# NOTE(review): isna() does not treat blank or whitespace-only strings as
# missing, so text columns may still hide empty values — verify separately.
missing_values = df1.isna().any(axis=0)
# Keep only the flagged columns and print their names under the heading.
print("Columns with missing values:", missing_values.loc[missing_values].index, sep="\n")

Columns with missing values:
Index([], dtype='object')


In [13]:
# Boolean Series indexed by column name: True where the column of df2 holds
# at least one NaN/None entry.
# NOTE(review): isna() does not treat blank or whitespace-only strings as
# missing, so text columns may still hide empty values — verify separately.
missing_values = df2.isna().any(axis=0)
# Keep only the flagged columns and print their names under the heading.
print("Columns with missing values:", missing_values.loc[missing_values].index, sep="\n")

Columns with missing values:
Index(['Churn Reason'], dtype='object')
