In [55]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import matthews_corrcoef
from imblearn.over_sampling import SMOTE
from sklearn.base import BaseEstimator, TransformerMixin
from torch.utils.data import DataLoader, TensorDataset
from sklearn.utils.validation import check_is_fitted

# Load the data
X_train = pd.read_csv(f'{BASE_URL}/train.csv')
X_test = pd.read_csv(f'{BASE_URL}/test.csv')

# Target column (assuming it's in your train data)
y = X_train['target_class']
X_train = X_train.drop('target_class', axis=1)

# Train-test split
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(X_train, y, test_size=0.2, random_state=42)

# 1. Define the PyTorch Model (Neural Network)
class ChurnNet(nn.Module):
    def __init__(self, input_size, hidden_size=128, num_hidden_layers=2, dropout_rate=0.3):
        super(ChurnNet, self).__init__()
        self.layers = nn.ModuleList()
        self.layers.append(nn.Linear(input_size, hidden_size))
        for _ in range(num_hidden_layers - 1):
            self.layers.append(nn.Linear(hidden_size, hidden_size))
        self.output = nn.Linear(hidden_size, 1)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout_rate)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        for layer in self.layers:
            x = self.relu(layer(x))
            x = self.dropout(x)
        x = self.sigmoid(self.output(x))
        return x

# 2. Custom Pytorch Estimator to integrate with the sklearn pipeline
class PytorchModel(BaseEstimator, TransformerMixin):
    def __init__(self, input_size, hidden_size, num_hidden_layers, dropout_rate, lr, num_epochs, batch_size=64):
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.dropout_rate = dropout_rate
        self.lr = lr
        self.num_epochs = num_epochs
        self.batch_size = batch_size
        self.build_model()
    
    def build_model(self):
        layers = []
        layers.append(nn.Linear(self.input_size, self.hidden_size))  # Adjusted input size
        layers.append(nn.ReLU())
        layers.append(nn.Dropout(self.dropout_rate))

        for _ in range(self.num_hidden_layers - 1):
            layers.append(nn.Linear(self.hidden_size, self.hidden_size))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(self.dropout_rate))

        layers.append(nn.Linear(self.hidden_size, 1))
        layers.append(nn.Sigmoid())
        
        self.model = nn.Sequential(*layers)
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.lr)
        self.criterion = nn.BCELoss()

    def fit(self, X, y):
        dataset = TensorDataset(torch.tensor(X, dtype=torch.float32), torch.tensor(y, dtype=torch.float32))
        loader = DataLoader(dataset, batch_size=self.batch_size, shuffle=True)

        for epoch in range(self.num_epochs):
            self.model.train()
            running_loss = 0.0
            for batch_X, batch_y in loader:
                self.optimizer.zero_grad()
                outputs = self.model(batch_X)
                loss = self.criterion(outputs.squeeze(), batch_y)
                loss.backward()
                self.optimizer.step()
                running_loss += loss.item()

            if epoch % 5 == 0:
                print(f"Epoch {epoch+1}/{self.num_epochs}, Loss: {running_loss / len(loader):.4f}")

    def transform(self, X):
        self.model.eval()
        with torch.no_grad():
            outputs = self.model(torch.tensor(X, dtype=torch.float32))
            return outputs.squeeze().round().numpy()

# 3. Preprocessing steps using ColumnTransformer
preprocessor = ColumnTransformer(transformers=[
    ('imputer', SimpleImputer(strategy='median'), X_train.columns),
    ('scaler', StandardScaler(), X_train.columns)
])

# 4. SMOTE integration (Modified to accept both X and y during fit_transform)
class SMOTETransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.smote = SMOTE(random_state=42)

    def fit(self, X, y):
        self.X_resampled, self.y_resampled = self.smote.fit_resample(X, y)
        return self

    def transform(self, X):
        return self.X_resampled

    def fit_resample(self, X, y):
        return self.X_resampled, self.y_resampled

    def get_y(self):
        return self.y_resampled

# 5. Define a custom pipeline to ensure X and y are handled correctly
class CustomPipeline(Pipeline):
    def fit(self, X, y=None, **fit_params):
        # Extract y after SMOTE and pass it to the model
        Xt, yt = X, y
        for name, transform in self.steps[:-1]:
            Xt = transform.fit_transform(Xt, yt) if name == 'smote' else transform.fit_transform(Xt)
            if isinstance(transform, SMOTETransformer):
                yt = transform.get_y()
        self.steps[-1][-1].fit(Xt, yt)
        return self

    def predict(self, X):
        Xt = X
        for name, transform in self.steps[:-1]:
            Xt = transform.transform(Xt)
        return self.steps[-1][-1].transform(Xt)

# 6. Define the full pipeline with preprocessing, SMOTE, and model training
pipeline = CustomPipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', SMOTETransformer()),  # SMOTE for class imbalance handling
     ('pytorch_model', PytorchModel(input_size=X_train_resampled.shape[1], hidden_size=128, num_hidden_layers=2, dropout_rate=0.3, lr=1e-3, num_epochs=20))
])

# 7. Train the pipeline
pipeline.fit(X_train_split, y_train_split)

# 8. Evaluate the pipeline on the validation data
y_val_pred = pipeline.predict(X_val_split)

# 9. Calculate the Matthews Correlation Coefficient (MCC)
mcc_val_score = matthews_corrcoef(y_val_split, y_val_pred)
print(f'Matthews Correlation Coefficient (MCC) on validation data: {mcc_val_score:.4f}')

# 10. Make Predictions on Test Data
y_test_pred = pipeline.predict(X_test)

RuntimeError: mat1 and mat2 shapes cannot be multiplied (64x96 and 48x128)

In [56]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import matthews_corrcoef
from imblearn.over_sampling import SMOTE
from sklearn.base import BaseEstimator, TransformerMixin
from torch.utils.data import DataLoader, TensorDataset
from sklearn.utils.validation import check_is_fitted

In [57]:
BASE_URL = '/kaggle/input/test-task-for-ds-customer-churn-predict-2024-10/archive'

In [58]:
# Load the data
X_train = pd.read_csv(f'{BASE_URL}/train.csv')
X_test = pd.read_csv(f'{BASE_URL}/test.csv')

In [59]:
# Target column
y = X_train['target_class']

In [60]:
X_train.drop('target_class',axis=1, inplace=True)

In [63]:
# Scaling the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [64]:
# Handle class imbalance using SMOTE
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y)

In [65]:
# Split the training data into train and validation sets
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train_resampled, y_train_resampled, test_size=0.2, random_state=42)

In [66]:
# Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train_split, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train_split.values, dtype=torch.float32)

X_val_tensor = torch.tensor(X_val_split, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val_split.values, dtype=torch.float32)

X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)

In [67]:
# Update the DataLoader for the new training set
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)

In [77]:
# 2. Define the Neural Network Model
class ChurnNet(nn.Module):
    def __init__(self, input_size):
        super(ChurnNet, self).__init__()
        self.fc1 = nn.Linear(input_size, 256)  # Increased to 256 neurons
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, 1)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.3)  # Dropout regularization
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.dropout(self.relu(self.fc1(x)))
        x = self.dropout(self.relu(self.fc2(x)))
        x = self.sigmoid(self.fc3(x))
        return x

In [78]:
# Initialize the model
input_size = X_train.shape[1]
model = ChurnNet(input_size)

In [83]:
# 3. Loss Function and Optimizer
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)

# Learning rate scheduler
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', patience=2, factor=0.5)

num_epochs = 50  # Train for more epochs
best_mcc = -1
patience = 10  # Increase patience

In [84]:
# # 4. Train the Model
# num_epochs = 20
# best_mcc = -1  # Initialize best MCC score
# patience = 5   # Number of epochs to wait for improvement before stopping
# counter = 0

In [85]:
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs.squeeze(), batch_y)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(train_loader):.4f}')

    # Validation phase
    model.eval()
    with torch.no_grad():
        val_preds = model(X_val_tensor).squeeze().round().numpy()
        mcc_val_score = matthews_corrcoef(y_val_tensor.numpy(), val_preds)
        print(f'MCC on validation set after epoch {epoch+1}: {mcc_val_score:.4f}')

    if mcc_val_score > best_mcc:
        best_mcc = mcc_val_score
        counter = 0
        torch.save(model.state_dict(), 'best_model.pth')
    else:
        counter += 1
        if counter >= patience:
            print(f'Early stopping at epoch {epoch+1}')
            break

    # Step the scheduler
    scheduler.step(mcc_val_score)

Epoch [1/50], Loss: 0.1823
MCC on validation set after epoch 1: 0.8583
Epoch [2/50], Loss: 0.1798
MCC on validation set after epoch 2: 0.8543
Epoch [3/50], Loss: 0.1871
MCC on validation set after epoch 3: 0.8574
Epoch [4/50], Loss: 0.1786
MCC on validation set after epoch 4: 0.8591
Epoch [5/50], Loss: 0.1742
MCC on validation set after epoch 5: 0.8599
Epoch [6/50], Loss: 0.1724
MCC on validation set after epoch 6: 0.8659
Epoch [7/50], Loss: 0.1726
MCC on validation set after epoch 7: 0.8724
Epoch [8/50], Loss: 0.1695
MCC on validation set after epoch 8: 0.8699
Epoch [9/50], Loss: 0.1693
MCC on validation set after epoch 9: 0.8764
Epoch [10/50], Loss: 0.1662
MCC on validation set after epoch 10: 0.8786
Epoch [11/50], Loss: 0.1667
MCC on validation set after epoch 11: 0.8668
Epoch [12/50], Loss: 0.1640
MCC on validation set after epoch 12: 0.8640
Epoch [13/50], Loss: 0.1661
MCC on validation set after epoch 13: 0.8732
Epoch [14/50], Loss: 0.1439
MCC on validation set after epoch 14: 0.8

In [86]:
model.load_state_dict(torch.load('best_model.pth')) 
model.eval()

with torch.no_grad():
    val_preds = model(X_val_tensor).squeeze().round().numpy()
    mcc_val_score = matthews_corrcoef(y_val_tensor.numpy(), val_preds)
    print(f'Best Matthews Correlation Coefficient (MCC) on validation data: {mcc_val_score:.4f}')

Best Matthews Correlation Coefficient (MCC) on validation data: 0.9091


  model.load_state_dict(torch.load('best_model.pth'))


In [88]:
# 6. Make Predictions on Test Data
with torch.no_grad():
    test_preds = model(X_test_tensor).squeeze().round().numpy()

In [89]:
client_ids = X_test.index 
results = pd.DataFrame({
    'ID': client_ids,
    'target': test_preds
})
results.to_csv('/kaggle/working/submission.csv', index=False)