# Import the data and libraries

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, Lasso, Ridge
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA, QuadraticDiscriminantAnalysis as QDA
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from pygam import LogisticGAM

In [3]:
file_path = "./data/Dataset_10Percent.xlsx"
dataset = pd.read_excel(file_path)
print(dataset.head())


         ID  DemAffl  DemAge DemClusterGroup DemGender    DemReg DemTVReg  \
0  17147654      5.0     NaN             NaN       NaN       NaN      NaN   
1   8415498     15.0     NaN             NaN         M       NaN      NaN   
2  12107603      NaN     NaN             NaN         M  Midlands     East   
3  14400995      8.0    28.0             NaN         F       NaN      NaN   
4  28724674     14.0    67.0             NaN       NaN       NaN      NaN   

  LoyalClass  LoyalSpend  LoyalTime  TargetBuy  
0        Tin        0.01        5.0          0  
1       Gold     8000.00        5.0          1  
2        Tin        0.01        NaN          1  
3        Tin        0.01        NaN          1  
4        Tin        0.01        7.0          0  


In [24]:
dataset.shape
# we have 22223 rows and 11 colums

(22223, 11)

In [25]:
dataset.head()

Unnamed: 0,ID,DemAffl,DemAge,DemClusterGroup,DemGender,DemReg,DemTVReg,LoyalClass,LoyalSpend,LoyalTime,TargetBuy
0,17147654,5.0,,,,,,Tin,0.01,5.0,0
1,8415498,15.0,,,M,,,Gold,8000.0,5.0,1
2,12107603,,,,M,Midlands,East,Tin,0.01,,1
3,14400995,8.0,28.0,,F,,,Tin,0.01,,1
4,28724674,14.0,67.0,,,,,Tin,0.01,7.0,0


- ID: customer ID
- DemAffl: affluence grade on a scale from 1 to 30
- DemAge: age, in years
- DemCLusterGroup: neighborhood grouo
- DemGender: M/F/unkoun
- DemReg: Geographic region
- DemTVReg: tv region
- LoyalClass: loyalty status (tin, silver, gold, platinum)
- LoyalSpend: total amount spent
- LoyalTime: time as a loyalty card member
- Target buy: purchased? yes/no

# Preparing the dataset

In [4]:
# Drop customer ID as it is a leaker variable
dataset=dataset.drop(['ID'],axis=1)
dataset.head()

Unnamed: 0,DemAffl,DemAge,DemClusterGroup,DemGender,DemReg,DemTVReg,LoyalClass,LoyalSpend,LoyalTime,TargetBuy
0,5.0,,,,,,Tin,0.01,5.0,0
1,15.0,,,M,,,Gold,8000.0,5.0,1
2,,,,M,Midlands,East,Tin,0.01,,1
3,8.0,28.0,,F,,,Tin,0.01,,1
4,14.0,67.0,,,,,Tin,0.01,7.0,0


In [27]:
# look at NAs
dataset.isna().sum()

DemAffl            1085
DemAge             1508
DemClusterGroup     674
DemGender          2512
DemReg              465
DemTVReg            465
LoyalClass            0
LoyalSpend            0
LoyalTime           281
TargetBuy             0
dtype: int64

In [5]:
# due to the high number of NAs and the importance of the variables we will fill them with mean/mode*

dataset['DemAffl']=dataset['DemAffl'].fillna(dataset['DemAffl'].mode()[0])
dataset['DemAge']=dataset['DemAge'].fillna(dataset['DemAge'].mode()[0])
dataset['DemClusterGroup']=dataset['DemClusterGroup'].fillna(dataset['DemClusterGroup'].mode()[0])
dataset['DemGender']=dataset['DemGender'].fillna(dataset['DemGender'].mode()[0])
dataset['DemReg']=dataset['DemReg'].fillna(dataset['DemReg'].mode()[0])
dataset['DemTVReg']=dataset['DemTVReg'].fillna(dataset['DemTVReg'].mode()[0])
dataset['LoyalTime']=dataset['LoyalTime'].fillna(dataset['LoyalTime'].mean())

In [6]:
# check that all valued have been converted
dataset.isna().sum()

DemAffl            0
DemAge             0
DemClusterGroup    0
DemGender          0
DemReg             0
DemTVReg           0
LoyalClass         0
LoyalSpend         0
LoyalTime          0
TargetBuy          0
dtype: int64

# Converting categorial variables to numeric

We convert categorical variables to numerical format for Machine Learning (ML) in Python because most ML algorithms require numerical input to calculate distances, correlations, and other mathematical operations.

In [7]:
from sklearn.preprocessing import LabelEncoder
number = LabelEncoder()

dataset['DemClusterGroup'] = number.fit_transform(dataset['DemClusterGroup'].astype('str'))
integer_mapping = {l: i for i, l in enumerate(number.classes_)}
print(integer_mapping)

dataset['DemGender'] = number.fit_transform(dataset['DemGender'].astype('str'))
integer_mapping = {l: i for i, l in enumerate(number.classes_)}
print(integer_mapping)

dataset['DemReg'] = number.fit_transform(dataset['DemReg'].astype('str'))
integer_mapping = {l: i for i, l in enumerate(number.classes_)}
print(integer_mapping)

dataset['DemTVReg'] = number.fit_transform(dataset['DemTVReg'].astype('str'))
integer_mapping = {l: i for i, l in enumerate(number.classes_)}
print(integer_mapping)

dataset['LoyalClass'] = number.fit_transform(dataset['LoyalClass'].astype('str'))
integer_mapping = {l: i for i, l in enumerate(number.classes_)}
print(integer_mapping)

{'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'U': 6}
{'F': 0, 'M': 1, 'U': 2}
{'Midlands': 0, 'North': 1, 'Scottish': 2, 'South East': 3, 'South West': 4}
{'Border': 0, 'C Scotland': 1, 'East': 2, 'London': 3, 'Midlands': 4, 'N East': 5, 'N Scot': 6, 'N West': 7, 'S & S East': 8, 'S West': 9, 'Ulster': 10, 'Wales & West': 11, 'Yorkshire': 12}
{'Gold': 0, 'Platinum': 1, 'Silver': 2, 'Tin': 3}


In [31]:
# check they have been transformed
dataset.head()

Unnamed: 0,DemAffl,DemAge,DemClusterGroup,DemGender,DemReg,DemTVReg,LoyalClass,LoyalSpend,LoyalTime,TargetBuy
0,5.0,51.0,2,0,3,3,3,0.01,5.0,0
1,15.0,51.0,2,1,3,3,0,8000.0,5.0,1
2,8.0,51.0,2,1,0,2,3,0.01,6.56467,1
3,8.0,28.0,2,0,3,3,3,0.01,6.56467,1
4,14.0,67.0,2,0,3,3,3,0.01,7.0,0


# Check for multicollinearity
We check for multicollinearity to ensure that independent variables in a model are not highly correlated, as this can distort the model's coefficients and reduce its predictive accuracy.
In this case we will use thr VIF coefficient.

In [33]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

def calc_vif(z):

    # Calculating VIF
    vif = pd.DataFrame()
    vif["variables"] = z.columns
    vif["VIF"] = [variance_inflation_factor(z.values, i) for i in range(z.shape[1])]

    return(vif)

z = dataset.iloc[:,0:10]
calc_vif(z)
# all levels are under the desiderable levels of 10 so all is good to proceed

Unnamed: 0,variables,VIF
0,DemAffl,7.549453
1,DemAge,11.18571
2,DemClusterGroup,3.672943
3,DemGender,1.469044
4,DemReg,2.478548
5,DemTVReg,3.754735
6,LoyalClass,3.866562
7,LoyalSpend,1.863544
8,LoyalTime,3.153694
9,TargetBuy,1.643008


In [8]:
# define our varibale of interest: 

y = dataset.iloc[:, 9].values
X = dataset.iloc[:, 0:9].values

In [9]:
# splitting dataset into training and test (in ratio 80:20)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
# we fit the model on the train set and calculate performance on the test set

In [10]:
# standardizing the variables 
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [11]:
# define a dictionary to store results
results = []

def compute_metrics(y_true, y_pred):
    cm = confusion_matrix(y_true, y_pred)
    TN, FP, FN, TP = cm.ravel()
    accuracy = accuracy_score(y_true, y_pred)
    total_error = 1 - accuracy
    sensitivity = recall_score(y_true, y_pred)
    specificity = TN / (TN + FP)
    fdr = FP / (FP + TP) if (FP + TP) != 0 else 0
    precision = precision_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    return accuracy, total_error, FN, FP, sensitivity, specificity, fdr, precision, f1

def append_results(model_name, y_true, y_pred):
    metrics = compute_metrics(y_true, y_pred)
    results.append([model_name] + list(metrics))

# Modelling
We will fit different models to the data and then choose the top performing one to predict the output on the remaining 90% of clients.

Various performance indicators will be taken into consideration:
- Accuracy
- Total error
- False positives
- False negatives
- Specificity
- Sensitivity
- False discovery rate
- Precision
- F1 score

In [49]:
# Logistic regression
logistic = LogisticRegression(max_iter=200)
logistic.fit(X_train_scaled, y_train)
y_pred = logistic.predict(X_test_scaled)
append_results("Logistic Regression", y_test, y_pred)

In [50]:
# Lasso Regression
lasso = Lasso()
lasso.fit(X_train_scaled, y_train)
y_pred = np.round(lasso.predict(X_test_scaled))
append_results("Lasso", y_test, y_pred)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [51]:
# Ridge Regression
ridge = Ridge()
ridge.fit(X_train_scaled, y_train)
y_pred = np.round(ridge.predict(X_test_scaled))
append_results("Ridge", y_test, y_pred)

In [52]:
# Decision Tree Classifier
tree = DecisionTreeClassifier()
tree.fit(X_train, y_train)
y_pred = tree.predict(X_test)
append_results("Decision Tree", y_test, y_pred)

In [53]:
# Random Forest Classifier
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
append_results("Random Forest", y_test, y_pred)

In [54]:
# Support Vector Machine
svm = SVC(probability=True)
svm.fit(X_train_scaled, y_train)
y_pred = svm.predict(X_test_scaled)
append_results("SVM", y_test, y_pred)

In [55]:
# Linear Discriminant Analysis
lda = LDA()
lda.fit(X_train, y_train)
y_pred = lda.predict(X_test)
append_results("LDA", y_test, y_pred)

In [56]:
# Quadratic Discriminant Analysis
qda = QDA()
qda.fit(X_train, y_train)
y_pred = qda.predict(X_test)
append_results("QDA", y_test, y_pred)

In [57]:
# K-Nearest Neighbors
knn = KNeighborsClassifier()
knn.fit(X_train_scaled, y_train)
y_pred = knn.predict(X_test_scaled)
append_results("KNN", y_test, y_pred)

In [12]:
# Generalized Additive Model
gam = LogisticGAM()
gam.fit(X_train_scaled, y_train)
y_pred = gam.predict(X_test_scaled) > 0.5
append_results("GAM", y_test, y_pred)

In [59]:
# Neural Network
nn = MLPClassifier()
nn.fit(X_train_scaled, y_train)
y_pred = nn.predict(X_test_scaled)
append_results("Neural Network", y_test, y_pred)



In [60]:
# Append results to a DataFrame
results_df = pd.DataFrame(results, columns=[
    "Method", "Accuracy", "Total Error", "FN", "FP", "Sensitivity", "Specificity", "FDR", "Precision", "F1 Score"
])

print(results_df)

                 Method  Accuracy  Total Error    FN   FP  Sensitivity  \
0   Logistic Regression  0.804049     0.195951   691  180     0.358998   
1                 Lasso  0.757480     0.242520  1078    0     0.000000   
2                 Ridge  0.806749     0.193251   755  104     0.299629   
3         Decision Tree  0.714286     0.285714   602  668     0.441558   
4         Random Forest  0.792576     0.207424   645  277     0.401670   
5                   SVM  0.803825     0.196175   752  120     0.302412   
6                   LDA  0.805174     0.194826   691  175     0.358998   
7                   QDA  0.804049     0.195951   548  323     0.491651   
8                   KNN  0.779978     0.220022   678  300     0.371058   
9                   GAM  0.815073     0.184927   613  209     0.431354   
10       Neural Network  0.802475     0.197525   670  208     0.378479   

    Specificity       FDR  Precision  F1 Score  
0      0.946540  0.317460   0.682540  0.470517  
1      1.0000

In this case, the model one chooses depends on the metric one chooses, so it is very important to choose the correct metric for the specific problem we are analizing.

Some important considerations are:
- Minimize false negatives: Sensitivity (Recall) is crucial here because you want to ensure you don’t miss out on potential customers who might be interested in the new product. A high sensitivity means that most of the true positives (customers who would like the product) are correctly identified
- Ensure precision: Precision is also important to avoid wasting samples on customers who are unlikely to be interested. High precision means that the customers identified as interested in the product are more likely to be genuinely interested.
- Balance Sensitivity and Precision: F1 Score provides a balance between sensitivity and precision. It is useful when you need to balance between not missing out on potential interested customers and not wasting resources on uninterested ones.
- Overall Accuracy: Accuracy is important but should be considered along with sensitivity and precision. It tells you the proportion of correctly classified customers, but it might not be as informative if your dataset is imbalanced.
-False Discovery Rate (FDR): Lower FDR means fewer of the samples you distribute will be wasted. A model with a lower FDR will help ensure that the samples are targeted more effectively.

Given the resulting metrics here are the models we should prioritize:
1. GAM: 
- Sensitivity: 0.431
- Precision: 0.690
- F1 Score: 0.531
- Accuracy: 0.815
- FDR: 0.310
GAM has the highest F1 score and good sensitivity, making it a strong candidate for balancing recall and precision.

2. Neural network:
- Sensitivity: 0.378
- Precision: 0.662
- F1 Score: 0.482
- Accuracy: 0.802
- FDR: 0.338
Neural Network performs well with a high accuracy and reasonable precision, but slightly lower sensitivity compared to GAM.

3. Random forest:
- Sensitivity: 0.402
- Precision: 0.610
- F1 Score: 0.484
- Accuracy: 0.793
- FDR: 0.390
Random Forest is also a good option, with decent performance in sensitivity and precision.

4. Quadratic Discriminant Analysis:
- Sensitivity: 0.492
- Precision: 0.621
- F1 Score: 0.549
- Accuracy: 0.804
- FDR: 0.379
QDA has the highest sensitivity and F1 score, which could be particularly useful for identifying potential customers who might be interested in the new product.

Final considerations:
- GAM (Generalized Additive Model) appears to be the best model overall in your case. It offers a good balance between sensitivity, precision, and F1 score, making it effective for identifying customers who are likely to be interested in your new product.
- QDA (Quadratic Discriminant Analysis) is also a strong contender with high sensitivity and F1 score, which makes it valuable for identifying potential customers without missing too many.
- Neural Network is a robust model but might require more tuning and computational resources compared to GAM or QDA.

I will choose GAM as ti is generally preffered over QDA when it comes to interpretability. GAMs in fact offer a more transparent view of how features contribute to the model.


In [13]:
probabilities = gam.predict_proba(X_test_scaled)
df = pd.DataFrame(X_test_scaled)  
df['actual_outcome'] = y_test
df['prob_1'] = probabilities
df['prob_0'] = 1 - probabilities

In [82]:
print(df.head())


          0         1         2         3         4         5         6  \
0  0.397576  0.339697  0.240428  0.920658  0.980466  0.766951 -1.439121   
1  0.397576 -0.207581 -0.400594 -0.662575  0.980466  0.766951 -1.439121   
2  2.191889  0.886975 -1.041616 -0.662575  0.241876  0.157286 -1.439121   
3 -1.097685  0.496062  0.240428 -0.662575  0.980466 -0.757212  0.269943   
4  0.098523 -0.129398  0.240428 -0.662575 -1.235304 -0.452379  0.269943   

          7         8  actual_outcome    prob_1    prob_0  
0  1.060786 -0.553946               0  0.101506  0.898494  
1  0.202419 -0.123501               0  0.267650  0.732350  
2  0.209323  0.091721               0  0.562185  0.437815  
3  0.073003 -1.199614               0  0.087344  0.912656  
4 -0.121120 -0.123501               0  0.214226  0.785774  


In [83]:
df.to_excel("./data/ModelOutput_10Percent.xlsx")

In [14]:
# Exporting GAM Classifier to later use in prediction
import joblib
joblib.dump(gam, 'Classifier')

['Classifier_LoyalCustomers']