## Import Libraries

In [1]:
!pip install liac-arff



In [2]:
# !pip install -U scikit-learn scipy matplotlib

In [3]:
# !pip install requests

In [4]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import arff
import requests

from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

## Import Data

In [5]:
# get data from online
# A timeout keeps the cell from hanging forever on a dead connection, and
# raise_for_status() stops here on an HTTP error instead of handing an error
# page to the ARFF parser below.
training_response = requests.get('https://raw.githubusercontent.com/juwon0502/MIS-373-Predictive-Analytics/master/datasets/bank-training.arff', timeout=30)
training_response.raise_for_status()
testing_response = requests.get('https://raw.githubusercontent.com/juwon0502/MIS-373-Predictive-Analytics/master/datasets/bank-NewCustomers.arff', timeout=30)
testing_response.raise_for_status()

# read as arff file (loads() is the string-input variant of load())
training_arff = arff.loads(training_response.text)
testing_arff = arff.loads(testing_response.text)
# attribute entries are (name, type) pairs; keep only the names
col_val = [attribute[0] for attribute in training_arff['attributes']]

# transform arff file into pandas dataframe
# NOTE: the testing frame is labelled with the *training* attribute names, so
# both files are assumed to share the same column layout.
training_df = pd.DataFrame(training_arff['data'], columns = col_val)
testing_df = pd.DataFrame(testing_arff['data'], columns = col_val)
meta = training_arff['attributes']

def clean_df(df):
  """Return a copy of ``df`` with the strings 'YES'/'NO' turned into True/False.

  Columns that contain neither string are left untouched. A column made up
  entirely of 'YES'/'NO' is returned with a real ``bool`` dtype, so that
  ``pd.get_dummies`` keeps it as a single column instead of expanding it.
  Columns that mix 'YES'/'NO' with other values keep those other values and
  come back with object dtype.

  Parameters
  ----------
  df : pandas.DataFrame
      Input frame; it is not modified.

  Returns
  -------
  pandas.DataFrame
      A new frame with the same columns and index.
  """
  yes_no = {'YES': True, 'NO': False}
  cleaned = df.copy()
  for col in cleaned.columns:
    column = cleaned[col]
    hits = column.isin(list(yes_no))
    if not hits.any():
      # nothing to convert (e.g. numeric columns) -- leave dtype as-is
      continue
    # Only string cells are looked up, so missing values and any other
    # non-string objects pass through unchanged.
    converted = column.astype(object).map(
      lambda value: yes_no.get(value, value) if isinstance(value, str) else value
    )
    # Cast explicitly rather than relying on replace() to downcast implicitly.
    cleaned[col] = converted.astype(bool) if hits.all() else converted
  return cleaned

# Turn YES/NO flags into booleans, then one-hot encode what is still categorical.
training_df = clean_df(training_df)
training_df_dummy, testing_df_dummy = (
    pd.get_dummies(frame) for frame in (training_df, clean_df(testing_df))
)

## Sample Model

In [6]:
# Features are every column except the target `pep`.
X = training_df_dummy.drop(columns = ['pep'])
y = training_df_dummy.pep
# random_state is fixed (as in the later cells) so the fitted tree is the same on every run.
model = DecisionTreeClassifier(criterion = 'entropy', max_depth = 5, random_state = 0).fit(X, y)

## Filter Method

In [7]:
#Libraries for feature ranking based on filter methods
from functools import partial

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif

X = training_df_dummy.drop(columns = ['pep'])
y = training_df_dummy.pep

# Use mutual information to rank the features.
# mutual_info_classif adds small random noise to continuous features, so the
# scores (and therefore the ranking) change between runs unless random_state
# is fixed.
selector = SelectKBest(partial(mutual_info_classif, random_state=0))
selector.fit(X, y)
scores = selector.scores_

# Print the scores for each feature (the index is the column position in X)
for i, score in enumerate(scores):
    print("Feature %d: %f" % (i, score))

# Get the indices of the selected features (SelectKBest is left at its default k)
selected_features_indices = selector.get_support(indices=True)

# Get the names of all features
feature_names = list(X.columns)

# Create a dictionary that maps feature names to their scores
score_dict = dict(zip(feature_names, scores))


#To Test: drop one feature at a time (each iteration starts again from the full X)
#Sort by score in descending order; despite its name, sorted_dict is a list of
#(feature name, score) tuples
sorted_dict = sorted(score_dict.items(), key=lambda item: item[1], reverse=True)
print (sorted_dict)

# Same tree settings for the baseline and for every drop-one model
tree_params = dict(criterion = 'entropy', max_depth = 5, random_state = 0)

# Baseline: 10-fold CV accuracy with all features
model = DecisionTreeClassifier(**tree_params).fit(X, y)
mycv = cross_val_score(model, X, y, cv = 10)
print('CV with all attributes:', mycv.mean())

print('*********************')
for x in sorted_dict:
    #Drop the feature we are analyzing
    X1 = X.drop(columns = [x[0]])

    #Fit the model without the feature
    model = DecisionTreeClassifier(**tree_params).fit(X1, y)
    mycv = cross_val_score(model, X1, y, cv = 10)
    #Compare with the 'CV with all attributes' baseline printed above: a higher
    #value here means the tree does better without this feature
    print(f'CV without {x[0]} is {mycv.mean()}')


# #To Test: dropping from the least important to the most important
# #Sort the dictionary by scores in ascending order
# sorted_dict = sorted(score_dict.items(), key=lambda x: x[1], reverse=False)
# print (sorted_dict)
# model = DecisionTreeClassifier(criterion = 'entropy', max_depth = 5,random_state=0).fit(X, y)
# mycv = cross_val_score(model, X, y, cv = 10)
# print('CV with all attributes:', mycv.mean())
# print('*********************')
# for x in sorted_dict:
#     #print (x[0],x[1])
#     #Drop the current least important feature
#     #Ensure we have at least one feature left
#     if len(X.columns) > 1:
#         X = X.drop(columns = [x[0]])

#     #Fit the model without the feature
#     model = DecisionTreeClassifier(criterion = 'entropy', max_depth = 5,random_state=0).fit(X, y)
#     mycv = cross_val_score(model, X, y, cv = 10)
#     #We can see when CV without region_TOWN, the accuracy rate is the highest
#     print(f'CV without {x[0]} is {mycv.mean()}')    

Feature 0: 0.015596
Feature 1: 0.001894
Feature 2: 0.000000
Feature 3: 0.067190
Feature 4: 0.002569
Feature 5: 0.037338
Feature 6: 0.022639
Feature 7: 0.004780
Feature 8: 0.000000
Feature 9: 0.008924
Feature 10: 0.048065
Feature 11: 0.034976
Feature 12: 0.009533
Feature 13: 0.016434
[('children', 0.0671895933178468), ('region_INNER_CITY', 0.04806519023889355), ('save_act', 0.037337882143938295), ('region_RURAL', 0.03497593955579603), ('current_act', 0.02263898929701913), ('region_TOWN', 0.016433912239000925), ('age', 0.015595932816462543), ('region_SUBURBAN', 0.009532940081643293), ('sex_MALE', 0.008923852410055666), ('mortgage', 0.004779755765959015), ('car', 0.0025691370001930114), ('income', 0.0018936945902052749), ('married', 0.0), ('sex_FEMALE', 0.0)]
CV with all attributes: 0.8433333333333334
*********************
CV without children is 0.6166666666666667
CV without region_INNER_CITY is 0.8433333333333334
CV without save_act is 0.8333333333333334
CV without region_RURAL is 0.8416

## Wrapper Method

In [8]:
#RFE is for feature ranking here, one of the wrapper methods
from sklearn.feature_selection import RFE

X = training_df_dummy.drop(columns = ['pep'])
y = training_df_dummy.pep
# random_state is fixed (as in the other cells) so the tree, and the ranking
# derived from it, are the same on every run
model = DecisionTreeClassifier(criterion = 'entropy', max_depth = 5, random_state = 0).fit(X, y)

# Remove one feature per iteration (step=1) until two remain
selector = RFE(model, n_features_to_select=2, step=1)
selector.fit(X, y)

# Print the ranking of each feature (the index is the column position in X;
# the features that were kept have rank 1)
for i, rank in enumerate(selector.ranking_):
    print("Feature %d: Rank %d" % (i, rank))

Feature 0: Rank 5
Feature 1: Rank 1
Feature 2: Rank 3
Feature 3: Rank 1
Feature 4: Rank 13
Feature 5: Rank 4
Feature 6: Rank 12
Feature 7: Rank 2
Feature 8: Rank 11
Feature 9: Rank 10
Feature 10: Rank 6
Feature 11: Rank 9
Feature 12: Rank 8
Feature 13: Rank 7


In [9]:
# Import the libraries for feature selection based on wrapper methods
# https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
from sklearn.feature_selection import SequentialFeatureSelector as SFS
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import roc_auc_score

X = training_df_dummy.drop(columns = ['pep'])
y = training_df_dummy.pep
folds=3


#Classification tree with all features
model = DecisionTreeClassifier(criterion = 'entropy', max_depth = 5,random_state=0).fit(X, y)
#Print the mean of the per-fold AUC scores with all features
print('AUC with all features: ', cross_val_score(model, X, y, cv=folds, scoring='roc_auc').mean())
#Out-of-fold predicted probabilities from cross validation. These are computed
#once and reused; the tree is seeded and the folds are fixed, so a second call
#would return the same array.
b = cross_val_predict(model, X, y, method='predict_proba', cv=folds)
#Single AUC over all out-of-fold predictions (not a mean of per-fold AUCs)
#Need [:, 1] to select the probability of being positive
a = roc_auc_score(y, b[:, 1])


#Backward feature selection model, need the tree model first
#(the number of features to keep is left at the SFS default)
sfs = SFS(model,direction='backward',scoring='roc_auc', cv=folds)
#Get the names of all features
feature_names=list(X.columns)

#Feature selection based on our data
sfs1 = sfs.fit(X, y)
#X1 holds only the selected features of X
X1=sfs1.transform(X)
#Store the selected features' names
fn = [name for name, keep in zip(feature_names, sfs1.get_support()) if keep]

#Classification tree with only the selected features
model = DecisionTreeClassifier(criterion = 'entropy', max_depth = 5,random_state=0).fit(X1, y)
#Print the mean of the per-fold AUC scores with only the selected features
print('AUC with only the selected features: ', cross_val_score(model, X1, y, cv=folds,scoring='roc_auc').mean())
#Out-of-fold predicted probabilities, computed once and reused as above
b1 = cross_val_predict(model, X1, y, method='predict_proba', cv=folds)
#Single AUC over all out-of-fold predictions for the reduced feature set
a1 = roc_auc_score(y, b1[:, 1])


print('Prediction from CV, AUC with all features: ', a)
print('Prediction from CV, AUC with selected features: ', a1)
print('Selected features: ', fn)

AUC with all features:  0.8589720776805274




AUC with only the selected features:  0.8750020918906373
Prediction from CV, AUC with all features:  0.8561864672428463
Prediction from CV, AUC with selected features:  0.8726825937038198
Selected features:  ['income', 'married', 'children', 'car', 'save_act', 'mortgage', 'region_INNER_CITY']


In [10]:
# Cross-validated predict_proba output for the tree that uses all features:
# one row per sample, one column per class (column 1 = positive class).
b

array([[0.12121212, 0.87878788],
       [0.875     , 0.125     ],
       [0.89473684, 0.10526316],
       ...,
       [0.89473684, 0.10526316],
       [0.45      , 0.55      ],
       [0.57142857, 0.42857143]])

In [11]:
# Cross-validated predict_proba output for the tree that uses only the selected
# features: one row per sample, one column per class (column 1 = positive class).
b1

array([[0.12121212, 0.87878788],
       [0.875     , 0.125     ],
       [0.83333333, 0.16666667],
       ...,
       [0.89473684, 0.10526316],
       [0.45      , 0.55      ],
       [0.57142857, 0.42857143]])