In [17]:
# Importing the necessary libraries
import warnings

import pandas as pd  # For data manipulation and analysis
from sklearn.model_selection import train_test_split  # To split data into training and testing sets
from sklearn.preprocessing import MinMaxScaler  # To normalize data
from sklearn.feature_selection import RFE  # Recursive Feature Elimination (RFE) for feature selection
from sklearn.linear_model import LogisticRegression  # Logistic Regression model
from sklearn.tree import DecisionTreeClassifier  # Decision Tree model
from sklearn.ensemble import RandomForestClassifier  # Random Forest model
from sklearn.svm import SVC  # Support Vector Machine (SVM) model
from sklearn.neighbors import KNeighborsClassifier  # K-Nearest Neighbors (KNN) model
from sklearn.metrics import accuracy_score  # To calculate prediction accuracy

# Imports for building the neural network model with Keras
from tensorflow.keras.models import Sequential  # To create sequential models
from tensorflow.keras.layers import Dense, Dropout  # Fully connected (Dense) layers and Dropout for regularization
from tensorflow.keras.optimizers import Adam  # Adam optimizer for network training
from tensorflow.keras.utils import set_random_seed  # Seeds Python, NumPy and TensorFlow RNGs in one call
warnings.filterwarnings("ignore")

In [18]:
# --------------------------------------------------------------------
# 1. Dataset Loading
# --------------------------------------------------------------------
# Location of the churn CSV on disk.
# NOTE(review): this is an absolute, machine-specific path, so the notebook
# only runs where this exact file exists. Consider a path relative to the
# repository or a configurable data directory.
churn_csv_path = r'C:\Users\julia\Repos\juliano\TensorFlow\Churn.csv'

# Reads the customer records (including the 'Churn' label used later as the
# target) into a DataFrame.
df = pd.read_csv(churn_csv_path)

In [19]:
# --------------------------------------------------------------------
# 2. Data Preprocessing
# --------------------------------------------------------------------
# Cleans the 'Total Charges' column in a single expression:
#   1. empty strings ('') become 0
#   2. missing values (NaN) become 0
#   3. everything is parsed as a number; entries that still cannot be
#      parsed are coerced to NaN ...
#   4. ... and those leftover NaN values are replaced by 0 as well
df['Total Charges'] = pd.to_numeric(
    df['Total Charges'].replace('', 0).fillna(0),
    errors='coerce',
).fillna(0)

In [20]:
# --------------------------------------------------------------------
# 3. Normalization of Numerical Data
# --------------------------------------------------------------------
# Numeric columns that are rescaled to the [0, 1] range
numeric_columns = ['tenure', 'Monthly Charges', 'Total Charges']

# Min-max scaler: maps each column's smallest value to 0 and largest to 1
scaler = MinMaxScaler()

# Overwrites the numeric columns of df with their scaled values.
# NOTE(review): the scaler is fitted on the whole DataFrame, i.e. before the
# train/test split performed later in the notebook, so rows that end up in
# the test set contribute to the min/max statistics. Consider fitting the
# scaler on the training rows only.
df[numeric_columns] = scaler.fit_transform(df[numeric_columns])

In [21]:
# --------------------------------------------------------------------
# 4. Data Preparation for Modeling
# --------------------------------------------------------------------
# Explanatory variables: every column except the label ('Churn') and the
# row identifier ('Customer ID', which carries no predictive signal)
feature_frame = df.drop(columns=['Churn', 'Customer ID'])

# One-hot encodes the categorical columns into dummy indicator columns
X = pd.get_dummies(feature_frame)

# Binary target: 1 when the customer churned ('Yes'), 0 for any other value
y = df['Churn'].apply(lambda label: int(label == 'Yes'))

In [22]:
# --------------------------------------------------------------------
# 5. Splitting Data into Training and Testing Sets
# --------------------------------------------------------------------
# Holds out 20% of the rows for testing and keeps 80% for training.
# The fixed random_state makes the shuffle, and therefore the split,
# identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [23]:
# --------------------------------------------------------------------
# 6. Feature Selection Using RFE (Recursive Feature Elimination)
# --------------------------------------------------------------------
# Logistic regression is the base estimator whose fitted importances RFE
# uses to rank the features
model_lr = LogisticRegression()

# Builds the selector and fits it on the training split in one step,
# keeping the 10 features it ranks highest
selector = RFE(estimator=model_lr, n_features_to_select=10).fit(X_train, y_train)

# support_ is a boolean mask over the columns; use it to recover their names
selected_columns = X_train.columns[selector.support_]
print("Selected columns:", selected_columns)

# Restricts both splits to the selected features only
X_train_selected, X_test_selected = (
    split[selected_columns] for split in (X_train, X_test)
)

Selected columns: Index(['tenure', 'Total Charges', 'Multiple Lines_No',
       'Internet Service_Fiber optic', 'Online Security_No', 'Tech Support_No',
       'Contract_Month-to-month', 'Contract_Two year', 'Paperless Billing_No',
       'Payment Method_Electronic check'],
      dtype='object')


In [24]:
# --------------------------------------------------------------------
# 7. Function for Training and Evaluating Classic Models
# --------------------------------------------------------------------
def train_and_evaluate(model, model_name, *, X_fit=None, y_fit=None, X_eval=None, y_eval=None):
    """
    Trains a machine learning model, makes predictions on an evaluation set,
    and prints the model's accuracy.

    By default the function uses the notebook-level splits
    (X_train_selected / y_train for fitting, X_test_selected / y_test for
    evaluation), so existing calls `train_and_evaluate(model, name)` behave
    exactly as before. Passing the data explicitly removes the dependency on
    those global names.

    Parameters:
        model : the model object to be trained (e.g., LogisticRegression, DecisionTreeClassifier, etc.);
            must expose `fit(X, y)` and `predict(X)`
        model_name : name of the model (string) for displaying results
        X_fit : optional training features; defaults to the global X_train_selected
        y_fit : optional training labels; defaults to the global y_train
        X_eval : optional evaluation features; defaults to the global X_test_selected
        y_eval : optional evaluation labels; defaults to the global y_test

    Returns:
        The accuracy of the fitted model on the evaluation data, as returned
        by accuracy_score.
    """
    # Fall back to the notebook's global splits only for arguments the caller
    # left out; the globals are looked up lazily, at call time.
    X_fit = X_train_selected if X_fit is None else X_fit
    y_fit = y_train if y_fit is None else y_fit
    X_eval = X_test_selected if X_eval is None else X_eval
    y_eval = y_test if y_eval is None else y_eval

    # Trains the model (the model object is fitted in place)
    model.fit(X_fit, y_fit)

    # Makes predictions on the evaluation set
    y_hat = model.predict(X_eval)

    # Calculates accuracy by comparing predictions with the actual labels
    accuracy = accuracy_score(y_eval, y_hat)

    # Prints accuracy formatted to 4 decimal places
    print(f"Accuracy of the {model_name} model: {accuracy:.4f}")

    return accuracy

In [25]:
# --------------------------------------------------------------------
# 8. Training and Evaluating Classic Machine Learning Models
# --------------------------------------------------------------------
# Creates a dictionary containing the models to be tested.
# The tree-based models are stochastic, so they get a fixed random_state
# (the same seed value used for the train/test split). Without it their
# accuracies change from run to run.
models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "SVM": SVC(),
    "KNN": KNeighborsClassifier()
}

# Trains and evaluates each model in dictionary order. train_and_evaluate
# prints the accuracy; the returned value is also kept, keyed by model name,
# so the scores can be compared later instead of being discarded.
accuracies = {
    name: train_and_evaluate(model, name)
    for name, model in models.items()
}


Accuracy of the Logistic Regression model: 0.8013
Accuracy of the Decision Tree model: 0.7410
Accuracy of the Random Forest model: 0.7800
Accuracy of the SVM model: 0.7935
Accuracy of the KNN model: 0.7850


In [26]:
# --------------------------------------------------------------------
# 9. Building, Training, and Evaluating a Neural Network Model with Keras
# --------------------------------------------------------------------
# Seeds the Python, NumPy and TensorFlow random number generators so that
# weight initialization, dropout masks and batch shuffling repeat across
# runs (same seed value as the train/test split).
# NOTE(review): some GPU kernels may still be non-deterministic.
set_random_seed(42)

# Creates a sequential model (linear stacking of layers)
model_nn = Sequential()

# Adds the first hidden layer:
# - 64 neurons
# - Activation function: 'relu'
# - 'input_dim' set to the number of selected features
model_nn.add(Dense(units=64, activation='relu', input_dim=len(X_train_selected.columns)))

# Adds a Dropout layer to reduce overfitting, randomly discarding 50% of neurons during training
model_nn.add(Dropout(0.5))

# Adds a second hidden layer with 128 neurons and 'relu' activation function
model_nn.add(Dense(units=128, activation='relu'))

# Adds the output layer:
# - 1 neuron, as this is a binary classification problem
# - Activation function: 'sigmoid' to produce an output between 0 and 1 (probability)
model_nn.add(Dense(units=1, activation='sigmoid'))

# Defines the Adam optimizer with a learning rate of 0.001
optimizer = Adam(learning_rate=0.001)

# Compiles the model specifying:
# - Loss function: 'binary_crossentropy', appropriate for binary classification
# - Optimizer: Adam
# - Metric: accuracy
model_nn.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Trains the neural network:
# - epochs: number of iterations over the dataset
# - batch_size: number of samples per weight update
# - validation_data: the held-out test split, reused here only to monitor
#   loss/accuracy after each epoch (no early stopping or checkpointing in
#   this cell depends on it)
model_nn.fit(
    X_train_selected,
    y_train,
    epochs=200,
    batch_size=32,
    validation_data=(X_test_selected, y_test),
    verbose=1,
)

# Predicted churn probabilities for the test set (one sigmoid output per row)
y_prob_nn = model_nn.predict(X_test_selected)

# Converts probabilities into class labels:
# If probability is less than 0.5, assign 0; otherwise, assign 1.
# Kept in a separate variable so the raw probabilities are not overwritten.
y_hat_nn = [0 if prob < 0.5 else 1 for prob in y_prob_nn]

# Calculates and prints the neural network model's accuracy
print(f"Neural Network model accuracy: {accuracy_score(y_test, y_hat_nn):.4f}")

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200


Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78/200
Epoch 79/200
Epoch 80/200
Epoch 81/200
Epoch 82/200
Epoch 83/200
Epoch 84/200
Epoch 85/200
Epoch 86/200
Epoch 87/200
Epoch 88/200
Epoch 89/200
Epoch 90/200
Epoch 91/200
Epoch 92/200
Epoch 93/200
Epoch 94/200
Epoch 95/200
Epoch 96/200
Epoch 97/200
Epoch 98/200
Epoch 99/200
Epoch 100/200
Epoch 101/200
Epoch 102/200
Epoch 103/200
Epoch 104/200
Epoch 105/200
Epoch 106/200
Epoch 107/200
Epoch 108/200
Epoch 109/200
Epoch 110/200
Epoch 111/200
Epoch 112/200
Epoch 113/200
Epoch 114/200


Epoch 115/200
Epoch 116/200
Epoch 117/200
Epoch 118/200
Epoch 119/200
Epoch 120/200
Epoch 121/200
Epoch 122/200
Epoch 123/200
Epoch 124/200
Epoch 125/200
Epoch 126/200
Epoch 127/200
Epoch 128/200
Epoch 129/200
Epoch 130/200
Epoch 131/200
Epoch 132/200
Epoch 133/200
Epoch 134/200
Epoch 135/200
Epoch 136/200
Epoch 137/200
Epoch 138/200
Epoch 139/200
Epoch 140/200
Epoch 141/200
Epoch 142/200
Epoch 143/200
Epoch 144/200
Epoch 145/200
Epoch 146/200
Epoch 147/200
Epoch 148/200
Epoch 149/200
Epoch 150/200
Epoch 151/200
Epoch 152/200
Epoch 153/200
Epoch 154/200
Epoch 155/200
Epoch 156/200
Epoch 157/200
Epoch 158/200
Epoch 159/200
Epoch 160/200
Epoch 161/200
Epoch 162/200
Epoch 163/200
Epoch 164/200
Epoch 165/200
Epoch 166/200
Epoch 167/200
Epoch 168/200
Epoch 169/200
Epoch 170/200
Epoch 171/200


Epoch 172/200
Epoch 173/200
Epoch 174/200
Epoch 175/200
Epoch 176/200
Epoch 177/200
Epoch 178/200
Epoch 179/200
Epoch 180/200
Epoch 181/200
Epoch 182/200
Epoch 183/200
Epoch 184/200
Epoch 185/200
Epoch 186/200
Epoch 187/200
Epoch 188/200
Epoch 189/200
Epoch 190/200
Epoch 191/200
Epoch 192/200
Epoch 193/200
Epoch 194/200
Epoch 195/200
Epoch 196/200
Epoch 197/200
Epoch 198/200
Epoch 199/200
Epoch 200/200
Neural Network model accuracy: 0.7970
