In [None]:
from google.colab import drive
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.impute import SimpleImputer
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV, StratifiedKFold

In [None]:
# Mount Google Drive inside the Colab filesystem at /content/drive so that the
# CSV paths under '/content/drive/My Drive/' can be opened by later cells.
drive.mount('/content/drive')

In [None]:
# Both CSV files are expected to sit in the root folder of the mounted Google Drive.
DRIVE_ROOT = '/content/drive/My Drive'
train_path = DRIVE_ROOT + '/train.csv'
test_path = DRIVE_ROOT + '/test.csv'

In [None]:
# Read the CSV files into pandas dataframes
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

# Keep an untouched copy of the test data: its customerID column is needed
# when the prediction file is written. Copying the frame that is already in
# memory avoids parsing the same CSV from disk a second time.
original_test_df = test_df.copy()

In [None]:
# Materialise both dataframes as NumPy arrays (used for quick shape inspection).
train_array = train_df.to_numpy()
test_array = test_df.to_numpy()

In [None]:
# Sanity check: show the (rows, columns) shape of the training data as loaded.
print(train_array.shape)

(5343, 21)


In [None]:
# --- Random forest hyper-parameter search ------------------------------------
# GridSearchCV and the other scikit-learn names come from the import cell at
# the top of the notebook.
#
# The cleaned data lives in its own frame (`train_clean`) instead of
# overwriting `train_df`. Overwriting made the cell fail with a KeyError on
# 'customerID' when re-run, and broke any later cell that starts from the raw
# training frame.

# Drop the customerID column as it's not useful for prediction
train_clean = train_df.drop(columns='customerID')

# Fill in missing values of TotalCharges before encoding and scaling.
# The result is assigned back rather than using fillna(inplace=True) on the
# selected column: the in-place form is chained assignment, which pandas has
# deprecated and which does not update the frame under Copy-on-Write.
total_charges_median = train_clean['TotalCharges'].median()
train_clean['TotalCharges'] = train_clean['TotalCharges'].fillna(total_charges_median)

# Dictionary to hold one fitted LabelEncoder per categorical column, so the
# same text -> integer mapping can be re-applied to the test set later.
label_encoders = {}

# Columns with dtype 'object' are treated as categorical (most likely text).
categorical_features = [
    column for column in train_clean.columns if train_clean[column].dtype == 'object'
]
for column in categorical_features:
    le = LabelEncoder()  # Maps each unique value in the column to an integer
    train_clean[column] = le.fit_transform(train_clean[column])  # Learn the mapping and replace the column
    label_encoders[column] = le  # Save the encoder for this column

# Separate features and target
X = train_clean.drop(columns='Discontinued')
y = train_clean['Discontinued']

# Split the dataset into training and validation sets (80% / 20%)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features. The scaler learns the mean and standard deviation from
# the training split only and re-uses them for the validation split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Define the parameter grid to search.
# 6 values for each of 4 parameters = 1,296 combinations; with cv=5 that is
# 6,480 model fits, so expect this cell to take a long time.
param_grid = {
    'n_estimators': [100, 200, 300, 400, 500, 600],  # Number of trees in the random forest
    'max_depth': [10, 20, 30, 40, 50, None],  # Maximum number of levels in tree
    'min_samples_split': [2, 5, 10, 15, 20, 30],  # Minimum number of samples required to split a node
    'min_samples_leaf': [1, 2, 4, 6, 7, 8],  # Minimum number of samples required at each leaf node
}

# Initialize the classifier
rf = RandomForestClassifier(random_state=42)

# Create the grid search object (5-fold cross-validation, scored by ROC AUC, all cores)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='roc_auc', n_jobs=-1)

# Perform the grid search and fit the model
grid_search.fit(X_train_scaled, y_train)

# Get the best estimator and report which parameter combination won, so the
# result of the long search is not lost.
best_rf = grid_search.best_estimator_
print(f'Best parameters: {grid_search.best_params_}')

# Predict probabilities for the validation set with the best model
# (column 1 of predict_proba is the second class in the model's class order)
y_pred_probs_best = best_rf.predict_proba(X_val_scaled)[:, 1]

# Calculate ROC AUC with the best model
roc_auc_best = roc_auc_score(y_val, y_pred_probs_best)
print(f'Best ROC AUC Score: {roc_auc_best}')


In [None]:
# --- Random forest with fixed hyper-parameters --------------------------------
# Rebuild the training features from a fresh read of the CSV. `train_df` may
# already have been cleaned and encoded by an earlier cell. Dropping
# 'customerID' from it a second time raises a KeyError, and re-encoding
# already-numeric columns would leave `label_encoders` empty. Starting from
# the file makes this cell safe to run on its own, after the grid-search cell,
# or repeatedly.
train_raw = pd.read_csv(train_path)

# Drop the customerID column as it's not useful for prediction
train_prepared = train_raw.drop(columns='customerID')

# Fill in missing values of TotalCharges before encoding and scaling.
# The result is assigned back rather than using fillna(inplace=True) on the
# selected column: the in-place form is chained assignment, which pandas has
# deprecated and which does not update the frame under Copy-on-Write.
total_charges_median = train_prepared['TotalCharges'].median()
train_prepared['TotalCharges'] = train_prepared['TotalCharges'].fillna(total_charges_median)

# Dictionary to hold one fitted LabelEncoder per categorical column, so the
# same text -> integer mapping can be re-applied to the test set later.
label_encoders = {}

# Columns with dtype 'object' are treated as categorical (most likely text).
categorical_features = [
    column for column in train_prepared.columns if train_prepared[column].dtype == 'object'
]
for column in categorical_features:
    le = LabelEncoder()  # Maps each unique value in the column to an integer
    train_prepared[column] = le.fit_transform(train_prepared[column])  # Learn the mapping and replace the column
    label_encoders[column] = le  # Save the encoder for this column

# Separate features and target
X = train_prepared.drop(columns='Discontinued')
y = train_prepared['Discontinued']

# Split the dataset into training and validation sets (80% / 20%)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features. The scaler learns the mean and standard deviation from
# the training split only and re-uses them for the validation split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Initialize and train the Random Forest Classifier with 100 trees and a
# maximum tree depth of 10
rf_tuned = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
rf_tuned.fit(X_train_scaled, y_train)

# Predict probabilities for the validation set with the tuned model
# (column 1 of predict_proba is the second class in the model's class order)
y_pred_probs_tuned = rf_tuned.predict_proba(X_val_scaled)[:, 1]

# Calculate ROC AUC with the tuned model
roc_auc_tuned = roc_auc_score(y_val, y_pred_probs_tuned)
print(f'Tuned ROC AUC Score: {roc_auc_tuned}')



In [None]:
# Show the detected categorical columns, leaving out the last entry of the list.
# NOTE(review): presumably the last entry is the target 'Discontinued' — confirm
# against the column order of the training data.
print(categorical_features[:-1])

In [None]:
# TESTING
# Build the test features from the untouched copy of the test data, so this
# cell can be re-run without failing on an already-removed 'customerID' column.
# drop() returns a new frame; original_test_df keeps its customerID column.
test_df = original_test_df.drop(columns='customerID')

# Fill in missing values of TotalCharges with the *training* median, so the
# test set is imputed with the same statistic the model's inputs were prepared
# with. median() skips missing values, so this gives the same number whether or
# not train_df has already been filled. The result is assigned back instead of
# using fillna(inplace=True) on the selected column, which is chained
# assignment and does not update the frame under pandas Copy-on-Write.
train_total_charges_median = train_df['TotalCharges'].median()
test_df['TotalCharges'] = test_df['TotalCharges'].fillna(train_total_charges_median)

# Exclude 'Discontinued' when listing categorical features (the target is not
# a test-set column)
categorical_features = [column for column in categorical_features if column != 'Discontinued']

# Use saved LabelEncoders to transform categorical features in the test set.
# Note: LabelEncoder.transform raises a ValueError if the test data contains a
# category that was not present when the encoder was fitted.
for column in categorical_features:
    le = label_encoders[column]  # Retrieve the saved encoder for this column
    test_df[column] = le.transform(test_df[column])  # Transform test data using the saved encoder

# Scale the features of the test set using the same scaler as the training set
X_test = test_df  # No need to drop 'Discontinued' as it's not present
X_test_scaled = scaler.transform(X_test)

# Predict probabilities for the test set
# (column 1 of predict_proba is the second class in the model's class order)
y_test_pred_probs = rf_tuned.predict_proba(X_test_scaled)[:, 1]


In [None]:
# Inspect the predicted probabilities computed for the test set.
print(y_test_pred_probs)

In [None]:
# Output CSV
# Pair every customerID from the untouched test data with its predicted
# probability: the ID column comes first, then the TARGET column.
output_df = (
    original_test_df[['customerID']]
    .rename(columns={'customerID': 'ID'})
    .assign(TARGET=y_test_pred_probs)
)

# Write the result to disk without the dataframe index
output_df.to_csv('predicted_probabilities.csv', index=False)