In [1]:
import os

import pandas as pd
import numpy as np

# store elements as dictionary keys and their counts as dictionary values
from collections import Counter

# scikit-learn
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import make_pipeline

# Classification metrics
from sklearn.metrics import confusion_matrix, classification_report

# Function for creating model pipelines - sklearn
from sklearn.pipeline import make_pipeline

# Function for creating model pipelines - imblearn
from imblearn.pipeline import make_pipeline as imbl_pipe

# Over-sampling using SMOTE
from imblearn.over_sampling import SMOTE


## Load Analytical Base Table

In [2]:
df = pd.read_csv(os.path.join("../Resources", "analytical_base_table.csv"))
print(f"Dataframe dimensions: {df.shape}")
df.head()

Dataframe dimensions: (10000, 11)


Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


### Separate dataframe into separate object

In [3]:
# Object for target variable
y = df.Exited

# object for input features
X = df.drop(['Exited'], axis=1)

# display shapes of X and y
print(X.shape, y.shape)

(10000, 10) (10000,)


In [4]:
# List numerical features
num_columns = X.select_dtypes(include='number').columns.tolist()
num_columns

['CreditScore',
 'Age',
 'Tenure',
 'Balance',
 'NumOfProducts',
 'HasCrCard',
 'IsActiveMember',
 'EstimatedSalary']

In [5]:
# List categorical features
cat_columns = X.select_dtypes(include='object').columns.tolist()
cat_columns

['Geography', 'Gender']

In [6]:
def class_count(a):
    counter=Counter(a)
    kv=[list(counter.keys()),list(counter.values())]
    dff = pd.DataFrame(np.array(kv).T, columns=['Exited','Count'])
    dff['Count'] = dff['Count'].astype('int64')
    dff['%'] = round(dff['Count'] / a.shape[0] * 100, 2)
    return dff.sort_values('Count',ascending=False)

In [7]:
class_count(y)

Unnamed: 0,Exited,Count,%
1,0,7963,79.63
0,1,2037,20.37


## Create a Train Test Split

In [8]:
random_state = 10

# Split X and y into train and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                    random_state=random_state,
                                                   stratify=df.Exited)

# Print number of observations in X_train, X_test, y_train, and y_test
print(len(X_train), len(X_test), len(y_train), len(y_test))

7000 3000 7000 3000


In [9]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7000 entries, 8061 to 4741
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CreditScore      7000 non-null   int64  
 1   Geography        7000 non-null   object 
 2   Gender           7000 non-null   object 
 3   Age              7000 non-null   int64  
 4   Tenure           7000 non-null   int64  
 5   Balance          7000 non-null   float64
 6   NumOfProducts    7000 non-null   int64  
 7   HasCrCard        7000 non-null   int64  
 8   IsActiveMember   7000 non-null   int64  
 9   EstimatedSalary  7000 non-null   float64
dtypes: float64(2), int64(6), object(2)
memory usage: 601.6+ KB


## Pre-processing Pipeline

### Scale numerical data and encode categorical data
Construct a pre-processing pipeline from the given transformers: MinMaxScaler and OneHotEncoder

Create lists of indexes from the list of column names

Need to be numeric not string to specify columns name in column transformer

In [10]:
num_features = [] 

for i in num_columns:
    location = X.columns.get_loc(i)
    num_features.append(location)
print(num_features)  

[0, 3, 4, 5, 6, 7, 8, 9]


In [11]:
cat_features = []

for i in cat_columns:
    location = X.columns.get_loc(i)
    cat_features.append(location)
print(cat_features)  

[1, 2]


In [12]:
# Define column transformer
# Need to be numeric not string to specify columns name 
preprocess = make_column_transformer(
    (MinMaxScaler(), num_features),
    (OneHotEncoder(sparse=False), cat_features)
)
preprocess

ColumnTransformer(transformers=[('minmaxscaler', MinMaxScaler(),
                                 [0, 3, 4, 5, 6, 7, 8, 9]),
                                ('onehotencoder', OneHotEncoder(sparse=False),
                                 [1, 2])])

## Build Model Pipeline with SMOTE

* We are going to use the Pipeline from the imblearn package in place of scikit-learn Pipeline.

* It takes care automatically to re-sample when called fit() on the pipeline, and does not re-sample test data (when called transform() or predict()).

In [13]:
# Import classifier
from sklearn.svm import SVC 

# Define model with pipeline
model = imbl_pipe(preprocess,
                  SMOTE(sampling_strategy='auto', random_state=random_state),
                  SVC(random_state=random_state))

model

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('minmaxscaler',
                                                  MinMaxScaler(),
                                                  [0, 3, 4, 5, 6, 7, 8, 9]),
                                                 ('onehotencoder',
                                                  OneHotEncoder(sparse=False),
                                                  [1, 2])])),
                ('smote', SMOTE(random_state=10)),
                ('svc', SVC(random_state=10))])

In [14]:
# Create the GridSearchCV model
# Create the GridSearch estimator along with a parameter object containing the values to adjust
from sklearn.model_selection import GridSearchCV

param_grid = {'svc__kernel' : ['linear', 'rbf', 'poly', 'sigmoid'],
              'svc__C': [0.0005,0.001, 0.01, 0.1, 0.5],
              'svc__gamma': [5, 1, 0.1, 0.01]}
grid = GridSearchCV(model, param_grid, verbose=3, cv= 5, n_jobs=4, scoring='accuracy')

In [15]:
X_train = X_train.values
X_test = X_test.values

In [16]:
print(type(X_train))  # Should print: <class 'pandas.core.frame.DataFrame'>


<class 'numpy.ndarray'>


In [None]:
# Train the model with GridSearch
grid.fit(X_train, y_train)


Fitting 5 folds for each of 80 candidates, totalling 400 fits


In [None]:
print(grid.best_params_)

In [None]:
 # List the best score
print(grid.best_score_)

In [None]:
print(f"Training Data Score: {grid.score(X_train, y_train)}")
print(f"Testing Data Score: {grid.score(X_test, y_test)}")

In [None]:
import matplotlib.pyplot as plt

# Data
scores = [grid.score(X_train, y_train) * 100, grid.score(X_test, y_test) * 100]
labels = ['Training Data', 'Testing Data']

# Create a figure with smaller size
plt.figure(figsize=(6, 2))  # Adjust the width and height as needed

# Plot with pastel pink color for the line
plt.plot(labels, scores, marker='o', color='#F7934C', linestyle='-', linewidth=3, markersize=8)

# Set the Y-axis limits to center the line
plt.ylim(min(scores) - 10, max(scores) + 10)  # Adjust this to make sure the line is centered within the plot

# Add labels and title
plt.ylabel('Accuracy (%)')
plt.title('Model Accuracy: Training vs Testing')

# Adding the text on the plot
for i in range(len(scores)):
    plt.text(labels[i], scores[i] + 2, f'{int(scores[i])}%', ha='center', va='bottom', fontsize=12)

plt.show()


In [None]:
# Make predictions with the hypertuned model
predictions = grid.predict(X_test)
predictions

In [None]:
# Classification metrics
from sklearn.metrics import confusion_matrix, classification_report

cm = confusion_matrix(y_test, predictions)
print(cm)

In [None]:
cm = np.around(cm / cm.sum(axis=1)[:, np.newaxis], 2)
print(cm)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Example normalized confusion matrix
cm = np.array([[0.83, 0.17], 
               [0.32, 0.68]])

# Plotting the confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='.2f', cmap=sns.light_palette("#F7934C", as_cmap=True), cbar=False)

# Add labels, title, and axes
plt.title('Normalized Confusion Matrix', fontsize=16)
plt.xlabel('Predicted Label', fontsize=12)
plt.ylabel('True Label', fontsize=12)
plt.xticks(ticks=[0.5, 1.5], labels=['Class 0', 'Class 1'], fontsize=10)
plt.yticks(ticks=[0.5, 1.5], labels=['Class 0', 'Class 1'], fontsize=10, rotation=0)

plt.show()


In [None]:
print(classification_report(y_test, predictions))

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import classification_report

# Sample classification report data
report = classification_report(y_test, predictions, output_dict=True)

# Extract precision, recall, and f1-score for each class
precision_class_0 = report['0']['precision']
recall_class_0 = report['0']['recall']
f1_class_0 = report['0']['f1-score']

precision_class_1 = report['1']['precision']
recall_class_1 = report['1']['recall']
f1_class_1 = report['1']['f1-score']

# Set up the positions for each bar
metrics = ['Precision', 'Recall', 'F1-Score']
class_0_values = [precision_class_0, recall_class_0, f1_class_0]
class_1_values = [precision_class_1, recall_class_1, f1_class_1]

# Bar colors
class_0_color = '#F7934C'  # Purple for class 0
class_1_color = '#A5B452'  # Olive for class 1

# Set the figure size
plt.figure(figsize=(10, 6))

# Create horizontal bars
bar_width = 0.2  # Reduced width of bars
index = np.arange(len(metrics))  # Position for each metric

# Plot bars for class 0 and class 1, side by side
bars_class_0 = plt.barh(index - bar_width / 2, class_0_values, bar_width, label='Customers who did not churn.', color=class_0_color)
bars_class_1 = plt.barh(index + bar_width / 2, class_1_values, bar_width, label='Customers who did churn.', color=class_1_color)

# Add labels and title
plt.xlabel('Score')
plt.title('Classification Report: Precision, Recall, F1-Score by Class')
plt.yticks(index, metrics)  # Set y-axis to show metrics
plt.legend()

# Adding text labels for the values on top of the bars
for i, bar in enumerate(bars_class_0):
    # Add value for class 0
    plt.text(bar.get_width() + 0.01, bar.get_y() + bar.get_height() / 2, 
             f'{class_0_values[i]:.2f}', va='center', ha='left', fontsize=12, color='black')

for i, bar in enumerate(bars_class_1):
    # Add value for class 1
    plt.text(bar.get_width() + 0.01, bar.get_y() + bar.get_height() / 2, 
             f'{class_1_values[i]:.2f}', va='center', ha='left', fontsize=12, color='black')

# Display the plot
plt.show()


In [None]:
predictions

In [None]:
pred = grid.predict(X_test[:1])

In [None]:
print(f"Predicted classes: {pred}")
print(f"Actual Labels: {list(y_test[:1])}")

## Save the Model

In [None]:
import joblib

# We are saving our grid model
filename = '../models/SVM_model.sav'
joblib.dump(grid, filename)

## Loading the Model

In [None]:
# load the model
svm_model = joblib.load(filename)
print(svm_model.score(X_test, y_test))

### Predict class for new data

In [None]:
# Let's use the first X_test record as new data
X_test[:1]

In [None]:
pred_new = grid.predict(X_test[:1])

In [None]:
print(f"Predicted classes: {pred_new}")
print(f"Actual Labels: {list(y_test[:1])}")

In [None]:
from sklearn.svm import SVC

# Set probability=True to enable predict_proba
svc = SVC(probability=True)

# Train your model (assuming grid is your grid search object)
grid = GridSearchCV(svc, param_grid, cv=5)  # Make sure to define your param_grid
grid.fit(X_train, y_train)


In [None]:
from sklearn.svm import SVC

# Set probability=True to enable predict_proba
svc = SVC(probability=True)

# Train your model (assuming grid is your grid search object)
grid = GridSearchCV(svc, param_grid, cv=5)  # Define your param_grid accordingly
grid.fit(X_train, y_train)

# Predict probabilities for the positive class (class 1)
fpr, tpr, thresholds = roc_curve(y_test, grid.predict_proba(X_test)[:, 1])

# Calculate AUC (Area Under the Curve)
roc_auc = auc(fpr, tpr)

# Plot ROC curve
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='#F7934C', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='#60435F', lw=2, linestyle='--')  # Diagonal line
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic - SVC')
plt.legend(loc="lower right")
plt.show()


In [83]:
from sklearn.metrics import precision_recall_curve, average_precision_score
import matplotlib.pyplot as plt

# Predict probabilities for the positive class (class 1)
precision, recall, _ = precision_recall_curve(y_test, grid.predict_proba(X_test)[:, 1])

# Calculate the Average Precision (AUC)
pr_auc = average_precision_score(y_test, grid.predict_proba(X_test)[:, 1])

# Plot Precision-Recall curve
plt.figure(figsize=(8, 6))
plt.plot(recall, precision, color='#F7934C', lw=2, label='PR curve (AUC = %0.2f)' % pr_auc)
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve - SVC')
plt.legend(loc="lower left")
plt.show()


AttributeError: predict_proba is not available when  probability=False

In [84]:
import numpy as np
import matplotlib.pyplot as plt

def plot_feature_importance_svm(model, original_num_features, original_cat_features, X_train):
    """
    Plots a feature importance graph for a trained Support Vector Machine (SVM) model.

    Parameters:
    model : Trained SVM model (GridSearchCV object)
    original_num_features : List of original numerical feature names
    original_cat_features : List of original categorical feature names
    X_train : Training data (used to fit the preprocessor)
    """
    # Access the pipeline
    pipeline = model.best_estimator_

    # Extract the preprocessor and SVM model components
    preprocessor = pipeline.named_steps['columntransformer']
    svm_model = pipeline.named_steps['svc']

    # Get the transformed feature names from the preprocessor
    # Use `get_feature_names_out` for one-hot encoded features
    num_feature_names = original_num_features
    cat_feature_names = preprocessor.named_transformers_['onehotencoder'].get_feature_names_out(original_cat_features)

    # Combine numerical and one-hot encoded categorical feature names
    all_feature_names = np.concatenate([num_feature_names, cat_feature_names])

    # Extract coefficients from the SVM model (for linear SVM only)
    coefficients = np.abs(svm_model.coef_).flatten()

    # Normalize importance to percentage
    importance = 100 * (coefficients / np.sum(coefficients))

    # Sort features by importance
    sorted_idx = np.argsort(importance)[::-1]
    sorted_features = all_feature_names[sorted_idx]
    sorted_importance = importance[sorted_idx]

    # Plot the feature importance
    plt.figure(figsize=(12, 8))
    plt.bar(sorted_features, sorted_importance, color='#F7934C')
    plt.xlabel("Features")
    plt.ylabel("Importance (%)")
    plt.title("Feature Importance in SVM")
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()

# Call the function with necessary arguments
plot_feature_importance_svm(grid, num_columns, cat_columns, X_train)


AttributeError: coef_ is only available when using a linear kernel