In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv('Telco-Customer-Churn.csv')
data

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,6840-RESVB,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,...,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.5,No
7039,2234-XADUH,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.9,No
7040,4801-JZAZL,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,No
7041,8361-LTMKD,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.6,Yes


In [3]:
target = data['Churn']
data = data.drop(['customerID','Churn'],axis=1)

In [4]:
data['TotalCharges'] = data['TotalCharges'].replace(' ',np.nan).astype(float)

In [5]:
obj = data.select_dtypes(include=object).columns
num = data.select_dtypes(exclude=object).columns
obj

Index(['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines',
       'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
       'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
       'PaperlessBilling', 'PaymentMethod'],
      dtype='object')

In [6]:
z = list(num)[1:]
z

['tenure', 'MonthlyCharges', 'TotalCharges']

In [7]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

In [8]:
sc.fit(data[z])
data[z] = sc.transform(data[z])

In [9]:
for i in num:
    data[i].fillna(data[i].mean(), inplace=True)

In [10]:
for i in data.columns[data.isna().sum()>0]:
    data[i].fillna(None, method='ffill',inplace=True)

In [11]:
from sklearn.preprocessing import LabelEncoder
enc = LabelEncoder()

for i in data.select_dtypes(include=object):
        data[i]= enc.fit_transform(data[i])

target = enc.fit_transform(target)

In [12]:
data.columns

Index(['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
       'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
       'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
       'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod',
       'MonthlyCharges', 'TotalCharges'],
      dtype='object')

In [13]:
from sklearn.model_selection import train_test_split

xtr, xtst, ytr, ytst = train_test_split(data, target, test_size=0.3)

In [14]:
from sklearn.svm import SVC

svc = SVC()


In [15]:
svc.fit(xtr,ytr)
svc.score(xtst,ytst)

0.8149550402271651

In [16]:
from sklearn.ensemble import RandomForestClassifier

rf= RandomForestClassifier()


In [17]:
rf.fit(xtr,ytr)
rf.score(xtst,ytst)

0.804070042593469

In [18]:
from sklearn.ensemble import GradientBoostingClassifier

gb = GradientBoostingClassifier()


In [19]:
gb.fit(xtr,ytr)
gb.score(xtst,ytst)

0.8239469947941316

In [20]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier()


In [21]:
dt.fit(xtr,ytr)
dt.score(xtst,ytst)

0.751538097491718

In [22]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

## SVC

In [23]:
svm_param_grid = {
    'C': [0.1, 1, 10],  # Regularization parameter
    'kernel': ['linear', 'rbf'],  # Kernel type
    'gamma': ['scale', 'auto']  # Kernel coefficient
}
svm_grid_search = GridSearchCV(svc, svm_param_grid, cv=3)
svm_grid_search.fit(xtr, ytr)
svm_best_model = svm_grid_search.best_estimator_
svm_predictions = svm_best_model.predict(xtst)
svm_accuracy = accuracy_score(ytst, svm_predictions)
print(f"SVM Accuracy: {svm_accuracy}")
print(f"Best SVM Parameters (GridSearchCV): {svm_grid_search.best_params_}")

SVM Accuracy: 0.8149550402271651
Best SVM Parameters (GridSearchCV): {'C': 1, 'gamma': 'scale', 'kernel': 'rbf'}


In [24]:
svm_param_dist = {
    'C': [0.1, 1, 10],  # Regularization parameter
    'kernel': ['linear', 'rbf'],  # Kernel type
    'gamma': ['scale', 'auto']  # Kernel coefficient
}
svm_random_search = RandomizedSearchCV(svc, svm_param_dist, n_iter=10, cv=3, random_state=42)
svm_random_search.fit(xtr, ytr)
svm_best_model_random = svm_random_search.best_estimator_
svm_predictions_random = svm_best_model_random.predict(xtst)
svm_accuracy_random = accuracy_score(ytst, svm_predictions_random)
print(f"SVM Accuracy (RandomizedSearchCV): {svm_accuracy_random}")
print(f"Best SVM Parameters (RandomizedSearchCV): {svm_random_search.best_params_}")

SVM Accuracy (RandomizedSearchCV): 0.8149550402271651
Best SVM Parameters (RandomizedSearchCV): {'kernel': 'rbf', 'gamma': 'scale', 'C': 1}


### Decision Tree

In [25]:
dt_param_grid = {
    'criterion': ['gini', 'entropy'],  # The function to measure the quality of a split
    'max_depth': [None, 10, 20],  # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4]  # Minimum number of samples required to be at a leaf node
}
dt_grid_search = GridSearchCV(dt, dt_param_grid, cv=3)
dt_grid_search.fit(xtr, ytr)
dt_best_model = dt_grid_search.best_estimator_
dt_predictions = dt_best_model.predict(xtst)
dt_accuracy = accuracy_score(ytst, dt_predictions)
print(f"Decision Tree Accuracy: {dt_accuracy}")
print(f"Best Decision Tree Parameters (GridSearchCV): {dt_grid_search.best_params_}")

Decision Tree Accuracy: 0.7761476573592049
Best Decision Tree Parameters (GridSearchCV): {'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 10}


In [26]:
dt_param_dist = {
    'criterion': ['gini', 'entropy'],  
    'max_depth': [None, 10, 20],  
    'min_samples_split': [2, 5, 10],  
    'min_samples_leaf': [1, 2, 4]  
}
dt_random_search = RandomizedSearchCV(dt, dt_param_dist, n_iter=10, cv=3, random_state=42)
dt_random_search.fit(xtr, ytr)
dt_best_model_random = dt_random_search.best_estimator_
dt_predictions_random = dt_best_model_random.predict(xtst)
dt_accuracy_random = accuracy_score(ytst, dt_predictions_random)
print(f"Decision Tree Accuracy (RandomizedSearchCV): {dt_accuracy_random}")
print(f"Best Decision Tree Parameters (RandomizedSearchCV): {dt_random_search.best_params_}")

Decision Tree Accuracy (RandomizedSearchCV): 0.7879791765262659
Best Decision Tree Parameters (RandomizedSearchCV): {'min_samples_split': 10, 'min_samples_leaf': 4, 'max_depth': 10, 'criterion': 'gini'}


## Random Forest

In [27]:
rf_param_grid = {
    'n_estimators': [50, 100, 150],  # Number of trees in the forest
    'max_features': ['sqrt', 'log2']  # Number of features to consider when looking for the best split
}
rf_grid_search = GridSearchCV(rf, rf_param_grid, cv=3)
rf_grid_search.fit(xtr, ytr)
rf_best_model = rf_grid_search.best_estimator_
rf_predictions = rf_best_model.predict(xtst)
rf_accuracy = accuracy_score(ytst, rf_predictions)
print(f"Random Forest Accuracy: {rf_accuracy}")
print(f"Best Random Forest Parameters (GridSearchCV): {rf_grid_search.best_params_}")

Random Forest Accuracy: 0.8045433033601515
Best Random Forest Parameters (GridSearchCV): {'max_features': 'sqrt', 'n_estimators': 50}


In [28]:
rf_param_dist = {
    'n_estimators': [50, 100, 150],  # Number of trees in the forest
    'max_features': ['sqrt', 'log2']  # Number of features to consider when looking for the best split
}
rf_random_search = RandomizedSearchCV(rf, rf_param_dist, n_iter=10, cv=3, random_state=42)
rf_random_search.fit(xtr, ytr)
rf_best_model_random = rf_random_search.best_estimator_
rf_predictions_random = rf_best_model_random.predict(xtst)
rf_accuracy_random = accuracy_score(ytst, rf_predictions_random)
print(f"Random Forest Accuracy (RandomizedSearchCV): {rf_accuracy_random}")
print(f"Best Random Forest Parameters (RandomizedSearchCV): {rf_random_search.best_params_}")



Random Forest Accuracy (RandomizedSearchCV): 0.8069096071935636
Best Random Forest Parameters (RandomizedSearchCV): {'n_estimators': 50, 'max_features': 'log2'}


## Gradient Boosting

In [29]:
gb_param_grid = {
    'n_estimators': [50, 100, 150],  
    'learning_rate': [0.1, 0.5, 1.0],  
    'max_depth': [3, 5, 8]  
}

gb_grid_search = GridSearchCV(gb, gb_param_grid, cv=3)
gb_grid_search.fit(xtr, ytr)

# Get the best model and make predictions
gb_best_model = gb_grid_search.best_estimator_
gb_predictions = gb_best_model.predict(xtst)
gb_accuracy = accuracy_score(ytst, gb_predictions)

# Print results
print(f"Gradient Boosting Accuracy: {gb_accuracy}")
print(f"Best Gradient Boosting Parameters (GridSearchCV): {gb_grid_search.best_params_}")


Gradient Boosting Accuracy: 0.8220539517274018
Best Gradient Boosting Parameters (GridSearchCV): {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 50}


In [33]:
gb_param_dist = {
    'n_estimators': [50, 100, 150],  
    'learning_rate': [0.1, 0.5, 1.0],  
    'max_depth': [3, 5, 8]  
}

# Perform Randomized Search with Gradient Boosting
gb_random_search = RandomizedSearchCV(gb, param_distributions=gb_param_dist, cv=3, n_iter=100)
gb_random_search.fit(xtr, ytr)

# Get the best model and make predictions
gb_best_model = gb_random_search.best_estimator_
gb_predictions = gb_best_model.predict(xtst)
gb_accuracy = accuracy_score(ytst, gb_predictions)

# Print results
print(f"Gradient Boosting Accuracy (RandomizedSearchCV): {gb_accuracy}")
print(f"Best Gradient Boosting Parameters (RandomizedSearchCV): {gb_random_search.best_params_}")


