In [None]:
# Install the PyPI `arff` package that the imports cell loads with `import arff`.
# NOTE(review): the imports cell later rebinds the name `arff` via
# `from scipy.io import arff`, so this package may be unnecessary — confirm.
!pip install -q arff

In [None]:
# Delete the output folder of a previous run. The ensemble appends to dataset
# CSV files that already exist under this folder, so stale files would
# duplicate rows on a re-run.
!rm -rf 'bank-additional-full'

In [None]:
# -*- coding: utf-8 -*-
import arff
import re
import pandas as pd
import numpy as np
import os
import statsmodels.api as sm
import random
import matplotlib.pyplot as plot
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import pickle 
from sklearn import preprocessing
import shutil
from scipy.io import arff
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score

In [None]:
class ContextBasedRandomForestClassifierEnsemble:
    """Ensemble of random forests, one forest per partition of every context.

    Each feature other than the target is treated as a "context":

    * a numerical context is split into three partitions (``low``, ``medium``,
      ``high``);
    * a non-numeric context is split into one partition per distinct value.

    Partitions are written as CSV files below ``<files_path>/datasets`` and one
    ``RandomForestClassifier`` per partition is pickled below
    ``<files_path>/models``. At prediction time every context contributes one
    column of predictions, each row being routed to the model of the partition
    its context value falls into.

    Typical use: ``set_contexts`` -> ``split_dataset`` ->
    ``create_context_models`` -> ``create_test_set`` -> ``make_predictions``.

    NOTE(review): features are standardised with a scaler fitted separately on
    every training partition, while ``make_predictions`` fits a fresh scaler on
    the whole test set. The scalers are not persisted, so train and test
    features are not on exactly the same scale.
    """

    # Names of the three partitions every numerical context is split into.
    _INTERVAL_LABELS = ('low', 'medium', 'high')

    def __init__(self, target_feature, files_path):
        """Create an empty ensemble.

        Args:
            target_feature: name of the column holding the class label.
            files_path: folder for datasets and models, taken relative to the
                current working directory (an absolute path is used as given).
        """
        self.target_feature = target_feature
        self.files_path = os.path.join(os.getcwd(), files_path)
        self.models = {}
        self.numerical_contexts = []
        self.non_numeric_contexts = []
        self.context_models = {}
        self.contexts = []
        self.intervals = {}
        self.encoders = {}

    # ------------------------------------------------------------------ helpers
    @staticmethod
    def _is_numeric(series):
        """Return True when ``series`` has a numeric NumPy dtype."""
        dtype = series.dtype
        # Extension dtypes are not NumPy dtypes and would make issubdtype raise.
        return isinstance(dtype, np.dtype) and np.issubdtype(dtype, np.number)

    @staticmethod
    def _as_text(series):
        """Decode ``bytes`` values to ``str`` (UTF-8); leave other values as-is."""
        return series.map(lambda v: v.decode('utf-8') if isinstance(v, bytes) else v)

    @staticmethod
    def _interval_label(value, intervals):
        """Map ``value`` to 'low'/'medium'/'high' using the stored bin edges.

        Values below the first edge or above the last edge (unseen during
        training) are clamped to the 'low' / 'high' model respectively.
        """
        if value <= intervals[1]:
            return 'low'
        if value <= intervals[2]:
            return 'medium'
        return 'high'

    def _dataset_dir(self, feature_name):
        """Folder holding the partition CSV files of ``feature_name``."""
        return os.path.join(self.files_path, 'datasets', feature_name)

    def _model_dir(self, feature_name):
        """Folder holding the pickled models of ``feature_name``."""
        return os.path.join(self.files_path, 'models', feature_name)

    def _scale_numeric_part(self, frame):
        """Standardise the non-encoded columns of ``frame``.

        Label-encoded columns are split off first and re-attached unscaled
        (under their un-suffixed names) after the scaled columns, so training
        and prediction see the same column layout. ``frame`` is modified.
        """
        encoded, numeric = self.separate_encoded_columns(frame)
        scaled = pd.DataFrame(
            preprocessing.StandardScaler().fit_transform(numeric),
            columns=numeric.columns,
            index=numeric.index,
        )
        return pd.concat([scaled, encoded], axis=1)

    def _train_and_save(self, dataset_file, model_file):
        """Fit a random forest on one partition CSV and pickle it."""
        data = pd.read_csv(dataset_file)
        target = data.pop(self.target_feature)
        features = self._scale_numeric_part(data)
        model = RandomForestClassifier(max_depth=2, random_state=0)
        model.fit(features, target)
        with open(model_file, 'wb') as handle:
            pickle.dump(model, handle)

    @staticmethod
    def _predict_by_model(features, model_keys, model_dir):
        """Predict every row with the model named by its entry in ``model_keys``.

        Each model is unpickled once and applied to all of its rows at once.
        Returns the predictions as an array in the row order of ``features``.
        """
        pieces = []
        for key in model_keys.unique():
            mask = (model_keys == key).to_numpy()
            # SECURITY: pickle.load executes arbitrary code; only load model
            # files written by this class / from a trusted location.
            with open(os.path.join(model_dir, key + '.sav'), 'rb') as handle:
                model = pickle.load(handle)
            pieces.append(pd.Series(model.predict(features[mask].to_numpy()),
                                    index=features.index[mask]))
        return pd.concat(pieces).sort_index().to_numpy()

    # ------------------------------------------------------------ configuration
    def set_numerical_contexts(self, numerical_contexts=None):
        """Set the list of numerical context names (empty when omitted)."""
        self.numerical_contexts = [] if numerical_contexts is None else list(numerical_contexts)

    def set_non_numeric_contexts(self, non_numeric_contexts):
        """Set the list of non-numeric context names."""
        self.non_numeric_contexts = list(non_numeric_contexts)

    def set_contexts(self, df, manually=False, contexts=None):
        """Choose the context features.

        With ``manually=True`` only ``self.contexts`` is set (from
        ``contexts``). Otherwise every column of ``df`` except the target
        becomes a context and is classified as numerical or non-numeric by
        dtype; previously detected lists are replaced, so calling this twice
        does not duplicate entries.

        Raises:
            ValueError: if the target feature is not a column of ``df``.
        """
        if manually:
            self.contexts = [] if contexts is None else list(contexts)
            return

        columns = list(df.columns)
        if self.target_feature not in columns:
            raise ValueError("target feature %r is not a column of the data frame"
                             % (self.target_feature,))
        columns.remove(self.target_feature)

        self.contexts = columns
        self.numerical_contexts = []
        self.non_numeric_contexts = []
        for feature in self.contexts:
            if self._is_numeric(df[feature]):
                self.numerical_contexts.append(feature)
            else:
                self.non_numeric_contexts.append(feature)

    # --------------------------------------------------------- dataset creation
    def create_dataset_by_non_numeric_feature(self, feature_name, feature_values, df):
        """Write one CSV per value of ``feature_name``.

        Each file holds the rows of ``df`` with that value, without the
        ``feature_name`` column itself, and is named after the value.
        """
        path = self._dataset_dir(feature_name)
        os.makedirs(path, exist_ok=True)
        for value in feature_values:
            # `columns=` keyword: the positional axis argument was removed in pandas 2.
            subset = df[df[feature_name] == value].drop(columns=feature_name)
            self.create_dataset_file(subset, path, value)

    def create_dataset_by_numeric_feature(self, feature_name, df):
        """Write ``low``/``medium``/``high`` CSVs for a numerical feature.

        Rows are split into terciles of the feature values. When the tercile
        edges are not unique (many tied values) the split is done on the rank
        of the values instead. The bin edges are stored in
        ``self.intervals[feature_name]``. ``df`` itself is not modified.
        """
        path = self._dataset_dir(feature_name)
        os.makedirs(path, exist_ok=True)

        labels = list(self._INTERVAL_LABELS)
        values = df[feature_name]
        try:
            # qcut's first bin includes the minimum, so no row is lost.
            binned, cut_offs = pd.qcut(values, 3, labels=labels, retbins=True)
        except ValueError:
            # Duplicate bin edges: split on ranks (ties broken by order).
            ranks = values.rank(method='first')
            binned = pd.qcut(ranks, 3, labels=labels)
            # NOTE(review): these equal-width edges are what prediction uses to
            # pick a model, but they do not coincide with the rank-based split
            # used for the training files above — confirm this is intended.
            _, cut_offs = pd.cut(values, 3, retbins=True)

        for label in labels:
            self.create_dataset_file(df[binned == label], path, label)
        self.intervals[feature_name] = list(cut_offs)

    def create_dataset_file(self, df, new_dataset_path, file_name=''):
        """Write ``df`` to ``<new_dataset_path>/<file_name>.csv``.

        If the file already exists the rows are appended without a header.
        """
        target = '%s/%s.csv' % (new_dataset_path, file_name)
        if os.path.isfile(target):
            df.to_csv(target, mode='a', header=False, index=None)
        else:
            df.to_csv(target, index=None)

    def split_dataset(self, df, manually=False, contexts=None, datasets_path='',
                      number_of_intervals=3, intervals=None):
        """Label-encode the non-numeric contexts and write all partition CSVs.

        One ``LabelEncoder`` per non-numeric context is fitted and kept in
        ``self.encoders``; the encoded column replaces the raw one under the
        name ``<attribute>_encoded``. ``df`` is not modified.

        ``manually=True`` makes the call a no-op. ``contexts``,
        ``datasets_path``, ``number_of_intervals`` and ``intervals`` are
        accepted but currently unused.
        """
        if manually:
            return

        X = df.copy()
        for attribute in self.non_numeric_contexts:
            encoder = LabelEncoder()
            # Fit on decoded text so the later lookups by decoded value work
            # for bytes columns as well.
            X[attribute + '_encoded'] = encoder.fit_transform(self._as_text(df[attribute]))
            X.pop(attribute)
            self.encoders[attribute] = encoder

        for attribute in self.contexts:
            if self._is_numeric(df[attribute]):
                self.create_dataset_by_numeric_feature(attribute, X)
            else:
                encoded_name = attribute + '_encoded'
                self.create_dataset_by_non_numeric_feature(
                    encoded_name, X[encoded_name].unique(), X)

    def separate_encoded_columns(self, df):
        """Pop the ``<context>_encoded`` columns out of ``df``.

        Returns ``(encoded, df)`` where ``encoded`` holds the popped columns
        under their un-suffixed context names. Encoded columns that are not
        present in ``df`` are skipped.
        """
        temp = pd.DataFrame(index=df.index)
        for column in self.non_numeric_contexts:
            encoded_name = column + '_encoded'
            if encoded_name in df.columns:
                temp[column] = df.pop(encoded_name)
        return temp, df

    def separate_non_numeric_columns(self, df):
        """Pop the raw non-numeric context columns out of ``df``.

        Returns ``(popped, df)``.
        """
        temp = pd.DataFrame(index=df.index)
        for column in self.non_numeric_contexts:
            temp[column] = df.pop(column)
        return temp, df

    # ----------------------------------------------------------------- training
    def create_context_models(self, df):
        """Train and pickle one random forest per partition of every context.

        Numerical contexts produce ``low.sav``/``medium.sav``/``high.sav``;
        non-numeric contexts produce ``<value>.sav`` for every distinct value
        of the context in ``df``. Only the non-encoded columns are
        standardised, which is the layout ``make_predictions`` feeds back in.
        """
        for attribute in self.contexts:
            if self._is_numeric(df[attribute]):
                dataset_dir = self._dataset_dir(attribute)
                model_dir = self._model_dir(attribute)
                os.makedirs(model_dir, exist_ok=True)
                for label in self._INTERVAL_LABELS:
                    self._train_and_save('%s/%s.csv' % (dataset_dir, label),
                                         os.path.join(model_dir, label + '.sav'))
            else:
                encoder = self.encoders[attribute]
                dataset_dir = self._dataset_dir(attribute + '_encoded')
                model_dir = self._model_dir(attribute + '_encoded')
                os.makedirs(model_dir, exist_ok=True)
                for val in df[attribute].unique():
                    val_name = val.decode('utf-8') if isinstance(val, bytes) else val
                    # Dataset files are named after the encoded value, model
                    # files after the readable value.
                    code = encoder.transform([val_name])[0]
                    self._train_and_save('%s/%s.csv' % (dataset_dir, code),
                                         os.path.join(model_dir, val_name + '.sav'))

    # --------------------------------------------------------------- prediction
    def create_test_set(self, df):
        """Prepare a held-out frame for ``make_predictions``.

        Works on a copy of ``df``: bytes values of non-numeric contexts are
        decoded, an ``<context>_encoded`` column is added using the encoders
        fitted in ``split_dataset`` (they are *not* re-fitted, so the codes
        match the training data), and the target column is split off.

        Returns:
            ``(features, target)``.

        Raises:
            ValueError: from the encoder, if a category was not seen in training.
        """
        X = df.copy()
        for non_num in self.non_numeric_contexts:
            X[non_num] = self._as_text(X[non_num])
            X[non_num + '_encoded'] = self.encoders[non_num].transform(X[non_num])
        y_test = X.pop(self.target_feature)
        return (X, y_test)

    def make_predictions(self, X_test):
        """Predict the target once per context.

        Args:
            X_test: frame as returned by ``create_test_set`` (raw and encoded
                context columns, no target). It is not modified.

        Returns:
            DataFrame with one column per context and one row per test row
            (in the row order of ``X_test``, with a fresh 0..n-1 index).
        """
        predictions = pd.DataFrame()
        X_test = X_test.reset_index(drop=True)

        # Keep the raw category values to pick the per-category model, and
        # standardise only the non-encoded columns.
        non_numeric_columns, remaining = self.separate_non_numeric_columns(X_test.copy())
        scaled_df = self._scale_numeric_part(remaining)

        for attribute in self.contexts:
            print(attribute)
            if self._is_numeric(X_test[attribute]):
                intervals = self.intervals[attribute]
                model_keys = X_test[attribute].map(
                    lambda value: self._interval_label(value, intervals))
                features = scaled_df
                model_dir = self._model_dir(attribute)
            else:
                model_keys = self._as_text(non_numeric_columns[attribute])
                # The context's own column was removed from its training files.
                features = scaled_df.drop(columns=attribute)
                model_dir = self._model_dir(attribute + '_encoded')
            predictions[attribute] = self._predict_by_model(features, model_keys, model_dir)
        return predictions

In [None]:
# Load the semicolon-separated bank data from the working directory and split
# off the target column.
data = pd.read_csv(os.getcwd() +'/bank-additional-full.csv', sep=';')
df = pd.DataFrame(data)
y = df.pop('y')

# Hold out a third of the rows, then re-attach the target so that both frames
# carry the 'y' column the ensemble expects.
X_train, X_test, y_train, y_test = train_test_split(
    df, y, test_size=0.33, random_state=42
)
train, test = (
    pd.concat([features, target], axis=1).reindex(features.index)
    for features, target in ((X_train, y_train), (X_test, y_test))
)

# Fit: detect contexts, write the per-context datasets, train one model per
# partition.
context_based_rand_forest_ensemble = ContextBasedRandomForestClassifierEnsemble(
    'y', 'bank-additional-full'
)
context_based_rand_forest_ensemble.set_contexts(train)
context_based_rand_forest_ensemble.split_dataset(train)
context_based_rand_forest_ensemble.create_context_models(train)

# Evaluate: encode the held-out rows and collect one prediction column per
# context.
X_test_proc, y_test_proc = context_based_rand_forest_ensemble.create_test_set(test)
predictions = context_based_rand_forest_ensemble.make_predictions(X_test_proc)



age
job
marital
education
default
housing
loan
contact
month
day_of_week
duration
campaign
pdays
previous
poutcome
emp.var.rate
cons.price.idx
cons.conf.idx
euribor3m
nr.employed


In [None]:
# Display the frame returned by make_predictions: one column per context.
predictions

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed
0,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no
1,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no
2,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no
3,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no
4,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13588,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no
13589,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no
13590,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no
13591,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no


In [None]:
# Accuracy (in percent) of every context's prediction column against the
# held-out labels. The accuracies are averaged at full precision and rounded
# only for display; rounding each one first loses precision and
# `round(x, 2) * 100` produces float artefacts such as 56.99999999999999.
# Note: this is the mean of the per-context accuracies, not the accuracy of a
# combined (e.g. majority-vote) prediction.
context_accuracies = [
    accuracy_score(y_test_proc, predictions[col]) * 100
    for col in predictions.columns
]
mean_accuracy = np.mean(context_accuracies)
print('CBM predicts with ' + str(round(mean_accuracy, 2)) + '% accuracy')

CBM predicts with 89.8% accuracy


# Single model
Below, a single RandomForestClassifier model is trained on the same dataset, preprocessed in the same way.

In [None]:
# Baseline: one random forest trained on the whole (label-encoded, scaled)
# dataset, using the same split and hyper-parameters as the context models.
data = pd.read_csv(os.getcwd() +'/bank-additional-full.csv', sep=';')
bank_df = pd.DataFrame(data)
y = bank_df.pop('y')

# Replace every categorical column by integer codes (the encoder is re-fitted
# per column).
labelencoder_X = LabelEncoder()
categorical_columns = [
    'contact', 'month', 'day_of_week', 'job', 'marital',
    'education', 'default', 'housing', 'loan', 'poutcome',
]
for column in categorical_columns:
    bank_df[column] = labelencoder_X.fit_transform(bank_df[column])

X_train, X_test, y_train, y_test = train_test_split(bank_df, y, test_size=0.33, random_state=42)

# Fit the scaler on the training rows only and reuse it for the test rows;
# re-fitting on the test set would put the two sets on different scales.
sc_X = preprocessing.StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

rfc = RandomForestClassifier(max_depth=2, random_state=0)
rfc.fit(X_train, y_train)
rfcpred = rfc.predict(X_test)

print(confusion_matrix(y_test, rfcpred))
# Convert to percent before rounding to avoid float artefacts from round(x, 2) * 100.
print(round(accuracy_score(y_test, rfcpred) * 100, 2))

[[12048     8]
 [ 1476    61]]
89.0
