In [93]:
#imports
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import (
    MinMaxScaler,
    OneHotEncoder,
    OrdinalEncoder,
    StandardScaler,
)
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
import numpy as np
import pandas as pd
import torch
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neighbors import NearestNeighbors
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
import torch.nn.functional as F


In [100]:
#reading in dataset
full_dataset = pd.read_csv("Telco.csv")
dataset = pd.read_csv("Telco.csv")

In [101]:
#Preprocessing data 

def replace_values(df):
    df = df.map(lambda x: x.replace("No internet service", "No") if isinstance(x, str) else x)
    df = df.map(lambda x: x.replace("No phone service", "No") if isinstance(x, str) else x)
    return df

dataset = replace_values(dataset)
categorical_feats = ['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 
                       'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 
                       'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 
                       'PaperlessBilling', 'PaymentMethod']  # apply one-hot encoding
dataset = dataset.drop("customerID", axis = 1)
le = LabelEncoder()
for col in categorical_feats:
    dataset[col] = le.fit_transform(dataset[col])
labels = dataset['Churn'].values  # Target variable
dataset = dataset.drop("Churn", axis = 1)
dataset[['tenure', 'MonthlyCharges', 'TotalCharges']] = dataset[['tenure', 'MonthlyCharges', 'TotalCharges']].replace(r'^\s*$', np.nan, regex=True)
dataset[['tenure', 'MonthlyCharges', 'TotalCharges']] = dataset[['tenure', 'MonthlyCharges', 'TotalCharges']].apply(pd.to_numeric, errors='coerce')
dataset[['tenure', 'MonthlyCharges', 'TotalCharges']] = dataset[['tenure', 'MonthlyCharges', 'TotalCharges']].fillna(dataset[['tenure', 'MonthlyCharges', 'TotalCharges']].mean())

print(dataset[['tenure', 'MonthlyCharges', 'TotalCharges']].isna().sum())
scaler = StandardScaler()
dataset[['tenure', 'MonthlyCharges', 'TotalCharges']] = scaler.fit_transform(dataset[['tenure', 'MonthlyCharges', 'TotalCharges']])


tenure            0
MonthlyCharges    0
TotalCharges      0
dtype: int64


In [102]:
dataset.head(5)
print(dataset.dtypes)

gender                int64
SeniorCitizen         int64
Partner               int64
Dependents            int64
tenure              float64
PhoneService          int64
MultipleLines         int64
InternetService       int64
OnlineSecurity        int64
OnlineBackup          int64
DeviceProtection      int64
TechSupport           int64
StreamingTV           int64
StreamingMovies       int64
Contract              int64
PaperlessBilling      int64
PaymentMethod         int64
MonthlyCharges      float64
TotalCharges        float64
dtype: object


In [103]:
# Create a KNN-based graph (10 nearest neighbors)
knn = NearestNeighbors(n_neighbors=10)
knn.fit(dataset)
edges = knn.kneighbors_graph(dataset, n_neighbors=10, mode='connectivity')
edge_index = torch.tensor(edges.nonzero(), dtype=torch.long)  

labels = full_dataset['Churn'].map({'Yes': 1, 'No': 0}).values  

# Convert features and labels to tensors
x = torch.tensor(dataset.values, dtype=torch.float)
y = torch.tensor(labels, dtype=torch.long)


In [104]:
class GNN(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels):
        super(GNN, self).__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=1)  # Softmax for classification

In [106]:
from sklearn.metrics import accuracy_score, f1_score

# Define cross-validation strategy (Stratified K-Fold)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Store accuracy and F1-score for each fold
accuracies = []
f1_scores = []

for fold, (train_idx, test_idx) in enumerate(skf.split(dataset, labels)):
    print(f"\nFold {fold + 1}")

    # Create PyTorch Geometric Data object for this fold
    train_mask = torch.zeros(y.shape, dtype=torch.bool)
    test_mask = torch.zeros(y.shape, dtype=torch.bool)
    train_mask[train_idx] = True
    test_mask[test_idx] = True

    data = Data(x=x, edge_index=edge_index, y=y)

    # Initialize GNN model
    model = GNN(in_channels=x.shape[1], hidden_channels=16, out_channels=2)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    loss_fn = torch.nn.CrossEntropyLoss()

    # Train GNN
    for epoch in range(50):
        model.train()
        optimizer.zero_grad()
        out = model(data.x, data.edge_index)  # Forward pass
        loss = loss_fn(out[train_mask], data.y[train_mask])  # Compute loss only on training set
        loss.backward()
        optimizer.step()

    # Evaluate Model
    model.eval()
    with torch.no_grad():
        out = model(data.x, data.edge_index)
        pred = out.argmax(dim=1)

    # Compute metrics for this fold
    accuracy = accuracy_score(data.y[test_mask].numpy(), pred[test_mask].numpy())
    f1 = f1_score(data.y[test_mask].numpy(), pred[test_mask].numpy())

    print(f"Fold {fold + 1}: Accuracy = {accuracy:.4f}, F1-Score = {f1:.4f}")
    accuracies.append(accuracy)
    f1_scores.append(f1)

# Print Final Cross-Validation Scores
print("\nFinal Cross-Validation Results:")
print(f"Mean Accuracy: {np.mean(accuracies):.4f} ± {np.std(accuracies):.4f}")
print(f"Mean F1-Score: {np.mean(f1_scores):.4f} ± {np.std(f1_scores):.4f}")



Fold 1
Fold 1: Accuracy = 0.8091, F1-Score = 0.6096

Fold 2
Fold 2: Accuracy = 0.7970, F1-Score = 0.5879

Fold 3
Fold 3: Accuracy = 0.8190, F1-Score = 0.6244

Fold 4
Fold 4: Accuracy = 0.7891, F1-Score = 0.5438

Fold 5
Fold 5: Accuracy = 0.7884, F1-Score = 0.5512

Final Cross-Validation Results:
Mean Accuracy: 0.8005 ± 0.0119
Mean F1-Score: 0.5834 ± 0.0316


In [108]:
from sklearn.metrics import accuracy_score, f1_score

# Predicted labels
out = model(data.x, data.edge_index)
pred = out.argmax(dim=1)

# Compute metrics
accuracy = accuracy_score(data.y.numpy(), pred.numpy())
f1 = f1_score(data.y.numpy(), pred.numpy())

print(f'Accuracy: {accuracy}')

Accuracy: 0.8047706943064035
