In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
import numpy as np
import pandas as pd


!pip install sklearn
from sklearn.model_selection import  StratifiedKFold
from sklearn.naive_bayes import BernoulliNB, GaussianNB
from sklearn.linear_model import SGDClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from nltk.classify.scikitlearn import SklearnClassifier
import random



path="/content/drive/My Drive/term-deposit-marketing-2020.csv"
df = pd.read_csv(path)




In [2]:
header = list(df.head(0))
data = df.drop([0])

Categorical features are transformed into numerical values (unique integers) in order to be able to transform the data into the form of a numpy array.

In [3]:
features = {}
for i in range(len(header)):
  features[i]= header[i]
#all features = ['age', 'job', 'marital', 'education', 'default', 'balance', 'housing', 'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'y']
numeric_values = ['age', 'balance',  'day', 'duration', 'campaign']
binary = ['default', 'housing', 'loan','y']
samples = np.zeros(data.shape,dtype = np.int32)
categories = {}
 
for i in range(len(header)):
  num = {}
  if features[i] not in numeric_values:
    if features[i] in binary:
      num['yes'] = 1
      num['no'] = 0
    elif features[i] == 'month':
      num = { 'jan': 1, 'feb': 2, 'mar': 3,'apr': 4, 'may': 5,'jun': 6,   'jul': 7, 'aug': 8, 'sep':9, 'oct': 10, 'nov': 11, 'dec': 12}
    else:
      values = list(set(data[features[i]]))
      
      for j in range(len(values)):
        num[values[j]]= j
    categories[i] = num




['age', 'job', 'marital', 'education', 'default', 'balance', 'housing', 'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'y']


In [4]:
#data is tranfsformed into its numerical equivalent
for i in range(len(header)):
  feat = list(data[features[i]])
  try:
    dct = categories[i]
    for j in range(len(feat)):
      samples[j][i] = dct[feat[j]]

  except KeyError:
    for j in range(len(feat)):
      samples[j][i] = feat[j]

np.random.shuffle(samples)
train = samples[:36000]
test = samples[36000:]


In [5]:
labels = train[:,-1]
smp = train[:,:-1]

In [6]:
#various classifiers are evaluated before choosing the top 5
# following the evalation score will be calculated as the average score for all classifiers
cls_dict = {}
sgd = SGDClassifier()
cls_dict['sgd'] = sgd
bernb = BernoulliNB()
cls_dict['bernb'] = bernb
#gaussnb = GaussianNB()
#cls_dict['gaussnb'] = gaussnb
knn = KNeighborsClassifier()
cls_dict['knn'] = knn 
svc = SVC()
cls_dict['svc'] = svc 
#dt =  DecisionTreeClassifier(max_depth=6)
#cls_dict['dt'] = dt 
#rf =  RandomForestClassifier(max_depth=6)
#cls_dict['rf'] = rf
nn = MLPClassifier(max_iter=1000)
cls_dict['nn'] = nn 

classifiers = list(cls_dict.values())

def train_and_test(clsifiers,xtrain,ytrain,xtest,ytest):
  n =len(clsifiers)
  scores = np.zeros(n)
  for i in range(n):
    c = clsifiers[i]
    c.fit(xtrain, ytrain)
    score = c.score(xtest, ytest)
    scores[i] = score
  return np.average(scores)


In [17]:
#Average accuracy is calculated through cross validation. Due to the fact that 
#we are dealing with an imbalanced dataset StratifiedKFold is used to ensure 
#that the original ratio of the two classes is maintained for each fold.

folds = 5
skf = StratifiedKFold(n_splits=folds)
k=0
results = np.zeros(folds)
for i,j in skf.split(smp,labels):
  train_X, test_X = smp[i], smp[j]
  train_y, test_y = labels[i], labels[j]
  results[k] = train_and_test(classifiers,train_X,train_y,test_X, test_y)
  k+=1

accuracy = np.average(results)

In [8]:
#final evaluation on test data
test_labels = test[:,-1]
test_smp = test[:,:-1]
def predict(clf,x):  
  x=x.reshape(1, -1)
  s = 0 
  for c in clf:
    y = c.predict(x)
    s+= y
  p = np.round(s/len(clf))
  if p == 1:
    confidence = s/len(clf)
  elif p == 0:
    confidence = 1 -s/len(clf)
  return p, confidence

#evaluating models
s = 0
for i in range(len(test_smp)):
  p,c = predict(classifiers,test_smp[i])
  if p==test_labels[i]:
    s+=1

test_acc = s/len(test_smp)
print(test_acc)

In [16]:
# customers most probable to buy an investment product
# Confidence is based on the number of classifiers that agree will the final resut.
# Since for a class to be selected it must have the vote of at least 3 classifiers and it can have at most 5 votes (all classifiers), therefore  confidence varies between 0.6 to 1.0.
# After the customers that are probable to buy an investment are identified, they are sorted according to the confidence in classification.

approved = {}
for i in range(len(test_smp)):
  p,c = predict(classifiers,test_smp[i])
  if p == 1:
    approved[i] = c

final = sorted(approved.items(), key=lambda kv: kv[1],reverse = True)
top = final[:20]
for t in top:
  print('Index:\t'+ str(t[0]) + '\n\tProbability:\t'+ str(t[1][0])+'\n')


Index:	1679
	Probability:	0.8

Index:	17
	Probability:	0.6

Index:	132
	Probability:	0.6

Index:	137
	Probability:	0.6

Index:	448
	Probability:	0.6

Index:	556
	Probability:	0.6

Index:	561
	Probability:	0.6

Index:	630
	Probability:	0.6

Index:	734
	Probability:	0.6

Index:	752
	Probability:	0.6

Index:	876
	Probability:	0.6

Index:	1145
	Probability:	0.6

Index:	1282
	Probability:	0.6

Index:	1291
	Probability:	0.6

Index:	1334
	Probability:	0.6

Index:	1359
	Probability:	0.6

Index:	1409
	Probability:	0.6

Index:	1413
	Probability:	0.6

Index:	1513
	Probability:	0.6

Index:	1556
	Probability:	0.6



In [12]:
#To find the most relevant features decision trees are used. The higher the score the more important is a feature. 
#The trees are built several times to obtain a more accurate score based on the average of all the runs.
#In the end features are sorted according to their score.
dtclf = ExtraTreesClassifier(n_estimators=10)
n_runs = 10
pts = np.zeros(smp.shape[1])  #number of features
for i in range(n_runs):
  dtclf.fit(test_smp, test_labels)
  pts += dtclf.feature_importances_

pts = np.divide(pts,n_runs)


feature_importance = {}
for h in range(len(header)-1):
  feature_importance[header[h]] = pts[h]

sorted_feature_importance = sorted(feature_importance.items(), key=lambda kv: kv[1],reverse = True)
for tup in sorted_feature_importance:
  print('Feature:\t'+ tup[0] + '\n\tScore:\t'+ str(tup[1])+'\n')

Feature:	duration
	Score:	0.34207190820997674

Feature:	age
	Score:	0.10173567687024625

Feature:	day
	Score:	0.09866619286825086

Feature:	balance
	Score:	0.09772042328262333

Feature:	month
	Score:	0.08813337294846087

Feature:	job
	Score:	0.06661890238749578

Feature:	campaign
	Score:	0.06458157220617229

Feature:	education
	Score:	0.040702984266063336

Feature:	marital
	Score:	0.03334754486450181

Feature:	contact
	Score:	0.024829719983953374

Feature:	housing
	Score:	0.02018343795028625

Feature:	loan
	Score:	0.015506264606576156

Feature:	default
	Score:	0.005901999555392961

