# Task: Marketing Campaign

Source of the data and task: https://www.kaggle.com/datasets/rodsaldanha/arketing-campaign

"A response model can provide a significant boost to the efficiency of a marketing campaign by increasing responses or reducing expenses. The objective is to predict who will respond to an offer for a product or service."

# Imports

In [1]:
import pandas as pd
import seaborn as sns

import category_encoders as ce

from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RepeatedStratifiedKFold

# Reading and processing data

In [2]:
data = pd.read_csv("../data/marketing_campaign.csv", sep=';', index_col=0)

In [3]:
data["Dt_Customer"].max()

'2014-06-29'

In [4]:
CostContact = data["Z_CostContact"].drop_duplicates().values[0]
Revenue = data["Z_Revenue"].drop_duplicates().values[0]

In [5]:
print(CostContact, Revenue)

3 11


In [6]:
data["Age"] = 2014 - data["Year_Birth"]
data = data.drop(columns = ["Year_Birth"])
data["Dt_Customer"] =  pd.to_datetime(data['Dt_Customer'], format='%Y-%m-%d')
data = data.drop(columns = ['Z_CostContact', 'Z_Revenue'])

In [7]:
basedate = pd.Timestamp('2014-06-30')
data['DaysSinceJoin'] = data["Dt_Customer"].apply(lambda x: (basedate - x).days)
data = data.drop(columns=["Dt_Customer"])

In [8]:
data.head()

Unnamed: 0_level_0,Education,Marital_Status,Income,Kidhome,Teenhome,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,...,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Response,Age,DaysSinceJoin
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5524,Graduation,Single,58138.0,0,0,58,635,88,546,172,...,7,0,0,0,0,0,0,1,57,664
2174,Graduation,Single,46344.0,1,1,38,11,1,6,2,...,5,0,0,0,0,0,0,0,60,114
4141,Graduation,Together,71613.0,0,0,26,426,49,127,111,...,4,0,0,0,0,0,0,0,49,313
6182,Graduation,Together,26646.0,1,0,26,11,4,20,10,...,6,0,0,0,0,0,0,0,30,140
5324,PhD,Married,58293.0,1,0,94,173,43,118,46,...,5,0,0,0,0,0,0,0,33,162


In [9]:
target = data['Response']
data = data.drop(columns=["Response"])

# Encoding category features

In [10]:
data["Education"].drop_duplicates()

ID
5524    Graduation
5324           PhD
7446        Master
387          Basic
2278      2n Cycle
Name: Education, dtype: object

In [11]:
encoder = ce.OrdinalEncoder(cols=["Education"], mapping=[{"col": "Education", 
                                                          "mapping":{None: 0, "Basic": 1, "2n Cycle": 2,
                                                                     "Graduation":3, "Master": 4,"PhD": 5}}],
                           return_df=True)
encoder 
encoder.fit(data, target)

In [12]:
data_cleaned = encoder.transform(data)

In [13]:
data["Marital_Status"].drop_duplicates()

ID
5524      Single
4141    Together
5324     Married
965     Divorced
8595       Widow
433        Alone
7734      Absurd
492         YOLO
Name: Marital_Status, dtype: object

In [14]:
data.loc[data["Marital_Status"] == "Alone", "Marital_Status"] = "Single"
data.loc[data["Marital_Status"] == "Together", "Marital_Status"] = "Married"
data.loc[data["Marital_Status"] == "Absurd", "Marital_Status"] = "Other"
data.loc[data["Marital_Status"] == "YOLO", "Marital_Status"] = "Other"

In [15]:
data["Marital_Status"].drop_duplicates()

ID
5524      Single
4141     Married
965     Divorced
8595       Widow
7734       Other
Name: Marital_Status, dtype: object

In [16]:
encoder = ce.OneHotEncoder(cols=["Marital_Status"], use_cat_names=True)
encoder 
encoder.fit(data, target)

In [17]:
data_cleaned_2 = encoder.transform(data_cleaned)

In [18]:
data_cleaned_2 = data_cleaned_2.drop(columns="Marital_Status_Other")

In [19]:
data_cleaned_2.head()

Unnamed: 0_level_0,Education,Marital_Status_Single,Marital_Status_Married,Marital_Status_Divorced,Marital_Status_Widow,Income,Kidhome,Teenhome,Recency,MntWines,...,NumStorePurchases,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Age,DaysSinceJoin
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5524,3,1,0,0,0,58138.0,0,0,58,635,...,4,7,0,0,0,0,0,0,57,664
2174,3,1,0,0,0,46344.0,1,1,38,11,...,2,5,0,0,0,0,0,0,60,114
4141,3,0,0,0,0,71613.0,0,0,26,426,...,10,4,0,0,0,0,0,0,49,313
6182,3,0,0,0,0,26646.0,1,0,26,11,...,4,6,0,0,0,0,0,0,30,140
5324,5,0,1,0,0,58293.0,1,0,94,173,...,6,5,0,0,0,0,0,0,33,162


# Training and evaluating ML model (XGBoost)

In [20]:
rskf = RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=0)

n = rskf.get_n_splits(data_cleaned_2, target)
acc_list = []
for i, (train_index, test_index) in enumerate(rskf.split(data_cleaned_2, target)):
    X_train = data_cleaned_2.iloc[train_index]
    Y_train = target.iloc[train_index]
    train = xgb.DMatrix(X_train, label=Y_train)
    
    X_test = data_cleaned_2.iloc[test_index]
    Y_test = target.iloc[test_index]    
    test = xgb.DMatrix(X_test, label=Y_test)
    
    param = {
        'max_depth': 1,
        'eta': 0.1,
        'objective': 'multi:softmax',
        'num_class': 3} 
    epochs = 1
    
    model = xgb.train(param, train, epochs)
    predictions = model.predict(test)
    acc_list.append(accuracy_score(Y_test, predictions))
acc_list
print("Mean accuracy of the 10 repeated 10 fold CV:", sum(acc_list)/n)

Mean accuracy of the 10 repeated 10 fold CV: 0.860267857142857
