We will now predict the buying probabilities for the remaining 90% of customers.

# Load data and libraries

In [2]:
import numpy as np
import pandas as pd

In [3]:
# Read the customer workbook into a DataFrame; the relative path is resolved
# against the notebook's current working directory.
file_path = "./data/Dataset_90Percent.xlsx"
dataset = pd.read_excel(file_path)
# Print the first five rows as plain text to check that the columns loaded as expected.
print(dataset.head())

     ID  DemAffl  DemAge DemClusterGroup DemGender    DemReg      DemTVReg  \
0   140     10.0    76.0               C         U  Midlands  Wales & West   
1   620      4.0    49.0               D         U  Midlands  Wales & West   
2   868      5.0    70.0               D         F  Midlands  Wales & West   
3  1120     10.0    65.0               F         M  Midlands      Midlands   
4  2313     11.0    68.0               A         F  Midlands      Midlands   

  LoyalClass  LoyalSpend  LoyalTime  
0       Gold    16000.00        4.0  
1       Gold     6000.00        5.0  
2     Silver        0.02        8.0  
3        Tin        0.01        7.0  
4        Tin        0.01        8.0  


# Prepare dataset for prediction

In [4]:
# Impute missing values: the columns listed below get their most frequent
# value (mode); LoyalTime gets its mean.
# NOTE(review): LoyalClass and LoyalSpend are not imputed here — presumably
# they have no missing values; the isna() summary at the end shows what remains.
mode_filled_columns = [
    'DemAffl',
    'DemAge',
    'DemClusterGroup',
    'DemGender',
    'DemReg',
    'DemTVReg',
]
for column in mode_filled_columns:
    # mode() returns a Series of the most frequent values; [0] takes the first.
    most_frequent = dataset[column].mode()[0]
    dataset[column] = dataset[column].fillna(most_frequent)

loyal_time_mean = dataset['LoyalTime'].mean()
dataset['LoyalTime'] = dataset['LoyalTime'].fillna(loyal_time_mean)

# Count of remaining missing values per column.
dataset.isna().sum()

In [5]:
# Convert the categorical columns to numeric codes.
# The same LabelEncoder instance is re-fitted for each column, so after the
# loop `number` and `integer_mapping` describe the last column only.
# NOTE(review): the encoder is fitted on this file's values, so the codes
# depend on which labels are present here — presumably they need to match the
# encoding used when the classifier was trained; confirm.
from sklearn.preprocessing import LabelEncoder
number = LabelEncoder()

categorical_columns = [
    'DemClusterGroup',
    'DemGender',
    'DemReg',
    'DemTVReg',
    'LoyalClass',
]
for column in categorical_columns:
    labels_as_text = dataset[column].astype('str')
    dataset[column] = number.fit_transform(labels_as_text)
    # Label -> integer code lookup for the column just encoded.
    integer_mapping = dict(zip(number.classes_, range(len(number.classes_))))
    print(integer_mapping)

{'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'U': 6}
{'F': 0, 'M': 1, 'U': 2}
{'Midlands': 0, 'North': 1, 'Scottish': 2, 'South East': 3, 'South West': 4}
{'Border': 0, 'C Scotland': 1, 'East': 2, 'London': 3, 'Midlands': 4, 'N East': 5, 'N Scot': 6, 'N West': 7, 'S & S East': 8, 'S West': 9, 'Ulster': 10, 'Wales & West': 11, 'Yorkshire': 12}
{'Gold': 0, 'Platinum': 1, 'Silver': 2, 'Tin': 3}


In [6]:
# Show the first rows again to confirm the categorical columns now hold integer codes.
dataset.head()

Unnamed: 0,ID,DemAffl,DemAge,DemClusterGroup,DemGender,DemReg,DemTVReg,LoyalClass,LoyalSpend,LoyalTime
0,140,10.0,76.0,2,2,0,11,0,16000.0,4.0
1,620,4.0,49.0,3,2,0,11,0,6000.0,5.0
2,868,5.0,70.0,3,0,0,11,2,0.02,8.0
3,1120,10.0,65.0,5,1,0,4,3,0.01,7.0
4,2313,11.0,68.0,0,0,0,4,3,0.01,8.0


# Prediction

In [7]:
# Build the feature matrix from the columns at positions 1-9, i.e. everything
# after the leading ID column in the frame printed when the data was loaded.
# The selection is positional, so it relies on the workbook's column order.
X_fresh = dataset.iloc[:, 1:10].values

# Load the previously saved model from the file named 'Classifier' in the
# working directory.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code — only load model files from a trusted source.
import joblib
classifier = joblib.load('Classifier')

In [21]:
# Score every customer.
# scikit-learn style classifiers return one probability column per class from
# predict_proba, i.e. an array of shape (n_samples, n_classes). The following
# cells store `predictions` as a single DataFrame column and compute
# `1 - predictions`, so a 1-D vector with one probability per row is required.
# NOTE(review): assumes column 1 corresponds to the "buy" class — confirm
# against classifier.classes_.
class_probabilities = np.asarray(classifier.predict_proba(X_fresh))
if class_probabilities.ndim == 2:
    predictions = class_probabilities[:, 1]
else:
    # Already one probability per row; keep it unchanged.
    predictions = class_probabilities
predictions

array([0.00000000e+000, 0.00000000e+000, 1.00000000e+000, 1.00000000e+000,
       1.00000000e+000, 0.00000000e+000, 1.00000000e+000, 1.00000000e+000,
       2.07588735e-181, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
       1.37760102e-006, 1.00000000e+000, 1.00000000e+000, 1.45983694e-122,
       0.00000000e+000, 1.00000000e+000, 1.00000000e+000, 1.00000000e+000,
       6.53485916e-024, 0.00000000e+000, 0.00000000e+000, 1.93177094e-077,
       1.00000000e+000, 1.00000000e+000, 0.00000000e+000, 1.47381159e-110,
       1.00000000e+000, 0.00000000e+000, 1.87850839e-086, 1.00000000e+000,
       0.00000000e+000, 1.00000000e+000, 0.00000000e+000, 1.00000000e+000,
       3.56903745e-139, 8.06231596e-124, 2.48498672e-083, 1.00000000e+000,
       0.00000000e+000, 1.00000000e+000, 0.00000000e+000, 0.00000000e+000,
       1.00000000e+000, 0.00000000e+000, 0.00000000e+000, 1.00000000e+000,
       2.74592262e-073, 1.13317424e-075, 0.00000000e+000, 3.45544202e-213,
       0.00000000e+000, 0

In [22]:
# Attach the predicted probabilities to a new frame rather than aliasing
# `dataset`: `df = dataset` only binds a second name to the same object, so
# adding columns through `df` would also modify `dataset` and make earlier
# cells show a different frame when re-run.
# prob_0 is the complement of prob_1.
df = dataset.assign(prob_1=predictions, prob_0=1 - predictions)

In [20]:
# Preview the first rows of the scored frame, including the prob_1 / prob_0 columns.
df.head()

Unnamed: 0,ID,DemAffl,DemAge,DemClusterGroup,DemGender,DemReg,DemTVReg,LoyalClass,LoyalSpend,LoyalTime,prob_1,prob_0
0,140,10.0,76.0,2,2,0,11,0,16000.0,4.0,0.0,1.0
1,620,4.0,49.0,3,2,0,11,0,6000.0,5.0,0.0,1.0
2,868,5.0,70.0,3,0,0,11,2,0.02,8.0,1.0,0.0
3,1120,10.0,65.0,5,1,0,4,3,0.01,7.0,1.0,0.0
4,2313,11.0,68.0,0,0,0,4,3,0.01,8.0,1.0,0.0


In [23]:
# Write the scored customers to an Excel workbook in the current working directory.
# With the default arguments the DataFrame index is written as the first column.
df.to_excel("BuyProb_90Percent.xlsx")