In [1]:
import os
# Install java
! sudo apt-get install -y openjdk-8-jdk-headless -qq > /dev/null
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]

os.environ["SPARK_HOME"] = "/opt/conda/lib/python3.7/site-packages/pyspark/"
os.environ["PATH"] = os.environ["SPARK_HOME"] + "/bin:" + os.environ["PATH"]
os.environ["geospark.global.charset"]="utf8"

In [26]:
import matplotlib.pyplot as plt
import seaborn as sns
import json
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession

def start():
    builder = SparkSession.builder \
        .appName("Spark Data Processor") \
        .master("local[*]") \
        .config("spark.driver.memory", "22G") \
        .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \
        .config("spark.kryoserializer.buffer.max", "2000M") \
        .config("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem") \
        .config("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")
    return builder.getOrCreate()
spark = start()
spark.version

'2.4.5'

In [27]:
rawDataDF = spark.read.format('csv').options(header='true').load("gs://marketing-campaigns/bank-full.csv")

In [18]:
rawDataDF.head(1)

[Row("age";"job";"marital";"education";"default";"balance";"housing";"loan";"contact";"day";"month";"duration";"campaign";"pdays";"previous";"poutcome";"y"='58;"management";"married";"tertiary";"no";2143;"yes";"no";"unknown";5;"may";261;1;-1;0;"unknown";"no"')]

In [56]:
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import pandas as pd

dataFile="gs://marketing-campaigns/bank-full.csv"
RANDOM_STATE=545510477

In [59]:
def loadTrainingData(dataFile):
    pdBankDF = pd.read_csv(dataFile,sep=";")
    pdBankDF.head(2)
    return pdBankDF

In [60]:
def buildPreprocessor(pdBankDF):
    scaler = RobustScaler()
    pdBankDF['age_scaled'] = scaler.fit_transform(pdBankDF['age'].values.reshape(-1,1))
    pdBankDF['bal_scaled'] = scaler.fit_transform(pdBankDF['balance'].values.reshape(-1,1))
    pdBankDF['dur_caled'] = scaler.fit_transform(pdBankDF['duration'].values.reshape(-1,1))
    pdBankDF.drop(['age','balance','duration'], axis=1, inplace=True)
    pdBankDF.head(2)
    categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])
    numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])

    numeric_features = pdBankDF.select_dtypes(include=['int64', 'float64']).columns
    categorical_features = pdBankDF.select_dtypes(include=['object']).columns

    preprocessor = ColumnTransformer(transformers=[('num', numeric_transformer, numeric_features),
                                                   ('cat', categorical_transformer, categorical_features)])
    return preprocessor

In [61]:
def splitTrainingData(pdBankDF):
    Y = pdBankDF.pop('y')
    X_train, X_test, y_train, y_test = train_test_split(pdBankDF, Y, test_size=0.3, random_state=123)
    return X_train, X_test, y_train, y_test

def buildAndRunLogRegModel(X_train, X_test, y_train, y_test, preprocessor):
    lrModel = Pipeline(steps=[('preprocessor', preprocessor),                      
                           ('classifier',LogisticRegression(max_iter=1000))])
    lrModel.fit(X_train, y_train)  
    print("model score: %.2f" % lrModel.score(X_train, y_train))
    return lrModel
    
def buildAndRunSVMModel(X_train, X_test, y_train, y_test, preprocessor):
    svmModel = Pipeline(steps=[('preprocessor', preprocessor),                      
                           ('classifier',LinearSVC(max_iter=5000))])
    svmModel.fit(X_train, y_train)  
    print("model score: %.2f" % svmModel.score(X_train, y_train))
    return svmModel
    
def buildAndRunDTreeModel(X_train, X_test, y_train, y_test, preprocessor):
    dtreeModel = Pipeline(steps=[('preprocessor', preprocessor),                      
                           ('classifier',DecisionTreeClassifier(max_depth=5))])
    dtreeModel.fit(X_train, y_train) 
    print("model score: %.2f" % dtreeModel.score(X_train, y_train))
    return dtreeModel


In [62]:
def predictAndMeasurePerformence(X_test, userSignUpPredModel):
    pred = userSignUpPredModel.predict(X_test)
    print(classification_report(pred,y_test))
    # Confusion Matrix for the model
    
    confusionMatrix = confusion_matrix(y_test, pred)
    print(confusionMatrix)
    #print('Percentage of negative class :',(y_train[y_train=='yes'].value_counts()/len(y_train) ) * 100)
    #print('Percentage of positive class :',(y_train[y_train=='no'].value_counts()/len(y_train) ) * 100)

In [None]:
bankDF = loadTrainingData(dataFile)
updatedBankDF, preprocessor = buildPreprocessor(bankDF)


In [None]:
X_train, X_test, y_train, y_test = splitTrainingData(updatedBankDF)

In [None]:
lrModel = buildAndRunLogRegModel(X_train, X_test, y_train, y_test, preprocessor)
predictAndMeasurePerformence(X_test,lrModel)

In [None]:
svmModel = buildAndRunSVMModel(X_train, X_test, y_train, y_test, preprocessor)
predictAndMeasurePerformence(X_test,svmModel)

In [None]:
dtreeModel = buildAndRunDTreeModel(X_train, X_test, y_train, y_test, preprocessor)
predictAndMeasurePerformence(X_test,dtreeModel)

In [88]:
def buildAndRunLogRegModelV2(RANDOM_STATE, X_train, X_test, y_train, y_test, preprocessor):
    from sklearn.model_selection import GridSearchCV
    from sklearn.model_selection import StratifiedKFold
    import numpy as np
    import matplotlib.pyplot as plt
    from IPython.display import display
    
    lrPipe = Pipeline(steps=[('preprocessor', preprocessor),                      
                           ('classifier',
                            LogisticRegression(C=0.0001,dual=False,tol=0.001,
                                                            fit_intercept=True,intercept_scaling=1.0,penalty='l2',solver='lbfgs',random_state=RANDOM_STATE,max_iter=1000)
                           )])
    #param_grid = {'kernel':('linear', 'rbf'), 'C':[1, 10]}
    param_grid = [
        {'classifier' : [LogisticRegression()],
         'classifier__penalty' : ['l1', 'l2'],
        'classifier__C' : np.logspace(-4, 4, 20),
        'classifier__solver' : ['liblinear','rbf']}]

    clf = GridSearchCV(lrPipe, param_grid = param_grid, cv = 5, verbose=True, n_jobs=-1)

    #Fit on data

    best_clf = clf.fit(X_train, y_train)

    print("model score: %.2f" % best_clf.score(X_train, y_train))
    return best_clf

In [89]:
dtreeModelV2 = buildAndRunLogRegModelV2(RANDOM_STATE,X_train, X_test, y_train, y_test, preprocessor)
predictAndMeasurePerformence(X_test,dtreeModelV2)

Fitting 5 folds for each of 40 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   21.2s
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:   22.0s finished


model score: 0.90
              precision    recall  f1-score   support

          no       0.98      0.92      0.95     12773
         yes       0.32      0.64      0.43       791

    accuracy                           0.90     13564
   macro avg       0.65      0.78      0.69     13564
weighted avg       0.94      0.90      0.92     13564

[[11710   288]
 [ 1063   503]]


In [96]:
def buildAndRunRandomForestModel(RANDOM_STATE, X_train, X_test, y_train, y_test, preprocessor):
    from sklearn.model_selection import GridSearchCV
    from sklearn.model_selection import StratifiedKFold
    import numpy as np
    import matplotlib.pyplot as plt
    from IPython.display import display
    
    rfPipe = Pipeline(steps=[('preprocessor', preprocessor),                      
                           ('classifier',RandomForestClassifier())])
    #param_grid = {'kernel':('linear', 'rbf'), 'C':[1, 10]}
    param_grid = [{'classifier' : [RandomForestClassifier()],
    'classifier__n_estimators' : list(range(10,101,10)),
    'classifier__max_features' : list(range(6,32,5))}]

    clf = GridSearchCV(rfPipe, param_grid = param_grid, cv = 5, verbose=True, n_jobs=-1)

    #Fit on data

    best_clf = clf.fit(X_train, y_train)

    print("model score: %.2f" % best_clf.score(X_train, y_train))
    return best_clf

In [97]:
rfModel = buildAndRunRandomForestModel(RANDOM_STATE,X_train, X_test, y_train, y_test, preprocessor)
predictAndMeasurePerformence(X_test,rfModel)

Fitting 5 folds for each of 60 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   12.3s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  4.0min finished


model score: 0.90
              precision    recall  f1-score   support

          no       0.96      0.93      0.95     12397
         yes       0.46      0.62      0.53      1167

    accuracy                           0.90     13564
   macro avg       0.71      0.77      0.74     13564
weighted avg       0.92      0.90      0.91     13564

[[11549   449]
 [  848   718]]


In [None]:
!pip install smote-variants

In [31]:
def preProcessDataV2(pdBankDF):
    from sklearn import preprocessing
    #le = preprocessing.LabelEncoder()
    #le.fit(['job','marital','education','default','housing','loan','contact','month','poutcome'])
    
    scaler = RobustScaler()
    pdBankDF['age_scaled'] = scaler.fit_transform(pdBankDF['age'].values.reshape(-1,1))
    pdBankDF['bal_scaled'] = scaler.fit_transform(pdBankDF['balance'].values.reshape(-1,1))
    pdBankDF['dur_caled'] = scaler.fit_transform(pdBankDF['duration'].values.reshape(-1,1))
    pdBankDF.drop(['age','balance','duration'], axis=1, inplace=True)
    bankCat = pd.get_dummies(pdBankDF[['job','marital','education','default','housing','loan','contact','month','poutcome']])
    bankNum = pdBankDF[['age_scaled','bal_scaled','day','dur_caled','campaign','pdays','previous']]
    X = pd.concat([bankCat, bankNum], axis=1)
    Y = pdBankDF['y']
    return X, Y

In [109]:
from sklearn import preprocessing
pdBankDF = loadTrainingData(dataFile)
X, Y = preProcessDataV2(pdBankDF)
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=123)

print("Before OverSampling count of yes: {}".format(sum(y_train=='yes')))
print("Before OverSampling count of no: {} \n".format(sum(y_train=='no')))

Before OverSampling count of yes: 3723
Before OverSampling count of no: 27924 



In [110]:
import smote_variants as sv
import numpy as np
oversampler= sv.SMOTE()
X_train_balanced, y_train_balanced = oversampler.sample(np.array(X_train), np.array(y_train))

print("After OverSampling, counts of label 'Yes': {}".format(sum(y_train_balanced=='yes')))
print("After OverSampling, counts of label 'no': {}".format(sum(y_train_balanced=='no')))

2020-10-25 06:02:47,105:INFO:SMOTE: Running sampling via ('SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'n_jobs': 1, 'random_state': None}")


After OverSampling, counts of label 'Yes': 27924
After OverSampling, counts of label 'no': 27924


In [112]:
bankModel2 = LogisticRegression()
bankModel2.fit(X_train_balanced, y_train_balanced)
pred = bankModel2.predict(X_test)
print('Accuracy of Logistic regression model prediction on test set for Smote balanced data set: {:.2f}'.format(bankModel2.score(X_test, y_test)))

confusionMatrix = confusion_matrix(y_test, pred)
print(confusionMatrix)
print(classification_report(y_test, pred))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Accuracy of Logistic regression model prediction on test set for Smote balanced data set: 0.85
[[10225  1773]
 [  329  1237]]
              precision    recall  f1-score   support

          no       0.97      0.85      0.91     11998
         yes       0.41      0.79      0.54      1566

    accuracy                           0.85     13564
   macro avg       0.69      0.82      0.72     13564
weighted avg       0.90      0.85      0.86     13564



In [48]:
def evaluateModelLRV2(X_train_balanced, y_train_balanced, X_test, y_test):
    from sklearn.model_selection import GridSearchCV
    from sklearn.model_selection import StratifiedKFold
    import numpy as np

    lrPipe = Pipeline(steps=[('classifier',LogisticRegression())])
    param_grid = [{'classifier' : [LogisticRegression()],
         'classifier__penalty' : ['l1', 'l2'],
        'classifier__C' : np.logspace(-4, 4, 20),
        'classifier__solver' : ['liblinear','rbf']}]

    clf = GridSearchCV(lrPipe,param_grid = param_grid, cv = 5, verbose=True, n_jobs=-1)

    #Fit on data

    best_clf = clf.fit(X_train_balanced, y_train_balanced)

    print("model score: %.2f" % best_clf.score(X_train_balanced, y_train_balanced))

    pred = best_clf.predict(X_test)
    confusionMatrix = confusion_matrix(y_test, pred)
    print(confusionMatrix)
    print(classification_report(y_test, pred))


In [51]:
def evaluateRFModelV2(X_train_balanced, y_train_balanced, X_test, y_test):
    from sklearn.model_selection import GridSearchCV
    from sklearn.model_selection import StratifiedKFold
    import numpy as np

    rfPipe = Pipeline(steps=[('classifier',RandomForestClassifier())])
    
    param_grid = [{'classifier' : [RandomForestClassifier()],
    'classifier__n_estimators' : list(range(10,101,10)),
    'classifier__max_features' : list(range(6,32,5))}]

    clf = GridSearchCV(rfPipe, param_grid = param_grid, cv = 5, verbose=True, n_jobs=-1)

    #Fit on data

    best_clf = clf.fit(X_train_balanced, y_train_balanced)

    print("model score: %.2f" % best_clf.score(X_train_balanced, y_train_balanced))

    pred = best_clf.predict(X_test)
    confusionMatrix = confusion_matrix(y_test, pred)
    print(confusionMatrix)
    print(classification_report(y_test, pred))

In [121]:
evaluateModelLRV2(X_train_balanced, y_train_balanced, X_test, y_test)

Fitting 5 folds for each of 80 candidates, totalling 400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   10.3s
[Parallel(n_jobs=-1)]: Done 400 out of 400 | elapsed:   25.9s finished


model score: 0.85
[[10258  1740]
 [  315  1251]]
              precision    recall  f1-score   support

          no       0.97      0.85      0.91     11998
         yes       0.42      0.80      0.55      1566

    accuracy                           0.85     13564
   macro avg       0.69      0.83      0.73     13564
weighted avg       0.91      0.85      0.87     13564



In [None]:
evaluateRFModelV2(X_train_balanced, y_train_balanced, X_test, y_test)

Fitting 5 folds for each of 60 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   22.9s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  3.9min


In [8]:
pdBankDF = loadTrainingData(dataFile)
X, Y = preProcessData(pdBankDF)
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=123)

trainData = pd.concat([X_train,y_train],axis=1)

# Finding the indexes of the sample data set where the propensity is 'yes'
ind = trainData[trainData['y']=='yes'].index
print(len(ind))

# Seperate the minority classes
minData = trainData.loc[ind]
print(minData.shape)

# Finding indexes of majority class
ind1 = trainData[trainData['y']=='no'].index
print(len(ind1))
# Seperating the majority class
majData = trainData.loc[ind1]
print(majData.shape)
majData.head()

3723
(3723, 52)
27924
(27924, 52)


Unnamed: 0,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,...,poutcome_success,poutcome_unknown,age_scaled,bal_scaled,day,dur_caled,campaign,pdays,previous,y
19100,1,0,0,0,0,0,0,0,0,0,...,0,1,0.8,-0.162979,5,0.236111,1,-1,0,no
37958,1,0,0,0,0,0,0,0,0,0,...,0,0,0.733333,-0.238938,14,0.865741,2,289,19,no
12451,0,1,0,0,0,0,0,0,0,0,...,0,1,0.0,0.385693,1,1.347222,3,-1,0,no
18263,0,0,0,0,1,0,0,0,0,0,...,0,1,1.333333,-0.330383,31,-0.592593,8,-1,0,no
5128,0,0,0,0,0,0,0,1,0,0,...,0,1,-0.466667,-0.14233,21,-0.435185,2,-1,0,no


In [9]:
majSample = majData.sample(n=len(ind),random_state = 123)
print(majSample.shape)
majSample.head()

(3723, 52)


Unnamed: 0,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,...,poutcome_success,poutcome_unknown,age_scaled,bal_scaled,day,dur_caled,campaign,pdays,previous,y
17387,0,0,0,0,1,0,0,0,0,0,...,0,1,0.666667,0.752212,28,-0.425926,3,-1,0,no
34679,0,1,0,0,0,0,0,0,0,0,...,0,0,0.8,0.086283,5,-0.106481,7,250,3,no
26572,1,0,0,0,0,0,0,0,0,0,...,0,1,0.466667,1.785398,20,-0.134259,2,-1,0,no
3280,0,0,0,0,0,1,0,0,0,0,...,0,1,1.2,1.972714,15,-0.009259,1,-1,0,no
4434,0,0,0,0,1,0,0,0,0,0,...,0,1,-0.133333,2.011062,20,-0.055556,1,-1,0,no


In [10]:

balData = pd.concat([minData,majSample],axis = 0)
print('balanced data set',balData.shape)

from sklearn.utils import shuffle

balData = shuffle(balData)
balData.head()

balanced data set shape (7446, 52)


Unnamed: 0,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,...,poutcome_success,poutcome_unknown,age_scaled,bal_scaled,day,dur_caled,campaign,pdays,previous,y
9320,0,0,0,0,0,0,0,0,0,1,...,0,1,1.266667,-0.512537,6,-0.00463,1,-1,0,no
8266,1,0,0,0,0,0,0,0,0,0,...,0,1,-0.4,-0.330383,2,-0.685185,2,-1,0,no
31153,0,0,0,0,1,0,0,0,0,0,...,0,1,-0.733333,2.332596,26,-0.180556,2,-1,0,yes
33933,0,1,0,0,0,0,0,0,0,0,...,0,0,0.2,-0.182153,30,3.060185,1,85,2,yes
41122,0,0,0,0,1,0,0,0,0,0,...,0,0,-0.866667,0.280236,17,-0.111111,1,199,1,yes


In [11]:
X_trainNew = balData.iloc[:,0:51]
X_trainNew.head()

y_trainNew = balData['y']
y_trainNew.head()

9320      no
8266      no
31153    yes
33933    yes
41122    yes
Name: y, dtype: object

In [19]:
evaluateModelLRV2(X_trainNew, y_trainNew, X_test, y_test)

Fitting 5 folds for each of 80 candidates, totalling 400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done 400 out of 400 | elapsed:    4.1s finished


model score: 0.84
[[10201  1797]
 [  298  1268]]
              precision    recall  f1-score   support

          no       0.97      0.85      0.91     11998
         yes       0.41      0.81      0.55      1566

    accuracy                           0.85     13564
   macro avg       0.69      0.83      0.73     13564
weighted avg       0.91      0.85      0.87     13564



In [20]:
evaluateRFModelV2(X_trainNew, y_trainNew, X_test, y_test)

Fitting 5 folds for each of 60 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    4.6s
[Parallel(n_jobs=-1)]: Done 208 tasks      | elapsed:   26.2s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:   47.9s finished


model score: 1.00
[[9981 2017]
 [ 187 1379]]
              precision    recall  f1-score   support

          no       0.98      0.83      0.90     11998
         yes       0.41      0.88      0.56      1566

    accuracy                           0.84     13564
   macro avg       0.69      0.86      0.73     13564
weighted avg       0.92      0.84      0.86     13564



In [24]:
#!pip install graphviz
import matplotlib.pyplot as plt
import graphviz
from sklearn import tree

dec_tree = tree.DecisionTreeClassifier(max_depth= 5, class_weight="balanced", random_state = 1234)
dec_tree.fit(X_train,y_train)

dot_data = tree.export_graphviz(dec_tree, out_file=None, feature_names=list(X_train.columns.values),
                                     class_names=["No", "Yes"], filled=True, rounded=True, special_characters=True)

graph = graphviz.Source(dot_data)
graph.format = 'png'
graph.render("Decision-Tree-full", view=False)


'Decision-Tree-full.png'

In [64]:
#Further fine-tune the model and select only required elements

X_train, X_test, y_train, y_test = splitTrainingData(updatedBankDF)

cols = ['job_entrepreneur','job_housemaid',
       'job_management', 'job_retired', 'job_services',
        'job_student', 'job_technician', 'job_unemployed', 'job_unknown',
       'marital_divorced', 'marital_married', 'marital_single',
       'education_primary', 'education_secondary', 'education_tertiary',
       'education_unknown', 'default_no', 'default_yes', 'housing_no',
       'housing_yes', 'loan_no', 'loan_yes', 'contact_cellular',
       'contact_telephone', 'contact_unknown', 'month_apr', 'month_aug',
       'month_dec', 'month_feb', 'month_jan', 'month_jul', 'month_jun',
       'month_mar', 'month_may', 'month_nov', 'month_oct', 'month_sep',
       'poutcome_failure', 'poutcome_other', 'poutcome_success',
       'poutcome_unknown','age_scaled', 'bal_scaled', 'day','dur_caled', 'campaign', ]

X, Y = preProcessDataV2(pdBankDF)
X_selected = X[cols]
X_train_trimmed, X_test, y_train_trimmed, y_test = train_test_split(X, Y, test_size=0.3, random_state=123)

dtreeModelV2 = buildAndRunLogRegModelV2(RANDOM_STATE,X_train, X_test, y_train, y_test, preprocessor)
predictAndMeasurePerformence(X_test,dtreeModelV2)

NameError: name 'updatedBankDF' is not defined