In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt

print("TensorFlow Version:", tf.__version__)

# Load the dataset from a CSV file in the current working directory.
# Only the read itself sits inside the try block so that unrelated failures
# are not reported as loading errors.
try:
    df = pd.read_csv("churn-bigml-80.csv")
except Exception as e:
    print(f"Error loading dataset: {e}")
    # Re-raise rather than calling exit(): inside a notebook exit() does not
    # reliably halt the running cell, so the statements below would still run
    # and fail with "NameError: name 'df' is not defined", hiding the real cause.
    raise
else:
    print("Dataset loaded successfully.")
    print(df.head())
    # DataFrame.info() writes its report directly and returns None, so it is
    # called on its own instead of being wrapped in print().
    df.info()

# Separate features (X) and target (y).
# The target is the 'churn' column. An exact match is preferred; otherwise the
# header is matched ignoring case and surrounding whitespace.
# NOTE(review): the case-insensitive fallback assumes some exports of this
# dataset capitalize the header (e.g. 'Churn') - confirm against the CSV.
if 'churn' in df.columns:
    target_column = 'churn'
else:
    target_column = next(
        (col for col in df.columns if str(col).strip().lower() == 'churn'),
        None,
    )

if target_column is None:
    # Raise instead of exit(): exit() does not reliably stop a notebook cell,
    # which would let later statements run against undefined X and y.
    raise KeyError("Error: 'churn' column not found in the dataset. Please check the dataset structure.")

# DataFrame.drop always returns a DataFrame, so no extra type check is needed
# before the DataFrame-specific operations that follow.
X = df.drop(columns=target_column)
raw_target = df[target_column]

# Encode the target as 1 (churned) / 0 (not churned).
if pd.api.types.is_bool_dtype(raw_target):
    # Target was parsed as real booleans.
    y = raw_target.astype(int)
else:
    # Target is textual: accept 'True.'/'False.' as well as variants without
    # the trailing period or with different letter case.
    normalized_labels = raw_target.astype(str).str.strip().str.rstrip('.').str.lower()
    y = normalized_labels.map({'true': 1, 'false': 0})
    unmapped = y.isna()
    if unmapped.any():
        # Fail loudly here; an all-NaN target would otherwise only surface
        # later as an obscure error during the stratified split.
        unexpected_labels = sorted(raw_target[unmapped].astype(str).unique())
        raise ValueError(f"Unrecognized values in target column {target_column!r}: {unexpected_labels}")
    y = y.astype(int)

print("\nTarget variable 'churn' identified and preprocessed.")

# Identify categorical and numerical features.
# 'state', 'area code', 'international plan' and 'voice mail plan' are treated
# as categorical (one-hot encoded), even when pandas parsed them as numbers.
# Every remaining numeric column is treated as a numerical feature.
categorical_candidates = ['state', 'area code', 'international plan', 'voice mail plan']

# Map a normalized (stripped, lower-cased) header to the actual column label so
# that candidates still match when the file capitalizes its headers.
columns_by_key = {str(col).strip().lower(): col for col in X.columns}

categorical_features = []
missing_categoricals = []
for candidate in categorical_candidates:
    # Prefer an exact header match; fall back to the normalized lookup.
    matched_column = candidate if candidate in X.columns else columns_by_key.get(candidate)
    if matched_column is None:
        missing_categoricals.append(candidate)
    else:
        categorical_features.append(matched_column)

if missing_categoricals:
    # Listing an absent column would make the ColumnTransformer fail at fit
    # time, so absent candidates are skipped and reported here.
    print(f"Warning: expected categorical columns not found and skipped: {missing_categoricals}")

# Numeric columns, minus any that were claimed as categorical above.
numerical_features = [
    col for col in X.select_dtypes(include=np.number).columns
    if col not in categorical_features
]

# Print identified features for verification
print(f"\nNumerical features: {numerical_features}")
print(f"Categorical features: {categorical_features}")

# Per-type transformers: standardize numerical columns, one-hot encode
# categorical ones. Categories unseen during fit are ignored at transform time
# (encoded as all zeros) instead of raising.
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Combine both transformers. Columns not listed in either feature list are
# dropped (ColumnTransformer's default remainder behaviour).
# sparse_threshold=0 makes the combined output always a dense array: the
# one-hot block is sparse, and by default the stacked result may be returned as
# a SciPy sparse matrix depending on its density. A dense array keeps the later
# model.fit(..., validation_split=...) call independent of how the installed
# Keras version handles sparse inputs.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features),
    ],
    sparse_threshold=0,
)

# Hold out 20% of the rows for testing. The split is stratified on the target
# and seeded so that it is reproducible.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
X_train, X_test, y_train, y_test = split_parts
print(f"\nData split into training ({X_train.shape[0]} samples) and testing ({X_test.shape[0]} samples).")

# Fit the preprocessor on the training rows only, then reuse the fitted
# transformation for the test rows.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# Width of the processed feature matrix (after scaling and one-hot encoding);
# this is the network's input size.
input_shape = X_train_processed.shape[-1]
print(f"Input shape for the neural network: {input_shape} features.")

# Hidden part of the network: a ReLU Dense layer followed by 30% dropout,
# repeated for 128 and then 64 units.
hidden_stack = []
for hidden_units in (128, 64):
    hidden_stack.append(tf.keras.layers.Dense(hidden_units, activation='relu'))
    hidden_stack.append(tf.keras.layers.Dropout(0.3))

# Full model: input -> hidden stack -> single sigmoid unit for binary output.
model = tf.keras.Sequential(
    [
        tf.keras.layers.Input(shape=(input_shape,)),
        *hidden_stack,
        tf.keras.layers.Dense(1, activation='sigmoid'),
    ]
)

# Binary cross-entropy loss with the Adam optimizer; accuracy is tracked as
# the reporting metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

model.summary()

# Training configuration: 50 epochs, mini-batches of 32, and 20% of the
# training data held back for per-epoch validation.
fit_settings = {
    'epochs': 50,
    'batch_size': 32,
    'validation_split': 0.2,
    'verbose': 1,
}

print("\nStarting model training...")
history = model.fit(X_train_processed, y_train, **fit_settings)
print("Model training finished.")

# Score the trained model on the held-out test set. evaluate() returns the
# loss followed by the compiled metrics (here: accuracy).
print("\nEvaluating model on the test set...")
test_scores = model.evaluate(X_test_processed, y_test, verbose=0)
loss, accuracy = test_scores
print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")

# One figure per tracked quantity, each showing the training curve and the
# validation curve over the epochs.
# Each entry: (training key, training label, validation key, validation label,
#              figure title, y-axis label).
curve_specs = (
    ('loss', 'Training Loss', 'val_loss', 'Validation Loss',
     'Model Loss Over Epochs', 'Loss'),
    ('accuracy', 'Training Accuracy', 'val_accuracy', 'Validation Accuracy',
     'Model Accuracy Over Epochs', 'Accuracy'),
)

for train_key, train_label, val_key, val_label, figure_title, y_axis_label in curve_specs:
    plt.figure(figsize=(10, 6))
    plt.plot(history.history[train_key], label=train_label)
    plt.plot(history.history[val_key], label=val_label)
    plt.title(figure_title)
    plt.xlabel('Epoch')
    plt.ylabel(y_axis_label)
    plt.legend()
    plt.grid(True)
    plt.show()

2025-07-20 06:59:22.768408: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-07-20 06:59:22.769044: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-07-20 06:59:22.772869: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-07-20 06:59:22.782408: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1752994762.806508   57648 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1752994762.81

TensorFlow Version: 2.19.0
Error loading dataset: [Errno 2] No such file or directory: 'churn-bigml-80.csv'


NameError: name 'df' is not defined

: 