In [1]:
#importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random
import math
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier

%matplotlib inline

# Reading Data

In [2]:
# reading dataset
df = pd.read_csv('datasets/divorce.csv',sep=';')
df.head()

Unnamed: 0,Atr1,Atr2,Atr3,Atr4,Atr5,Atr6,Atr7,Atr8,Atr9,Atr10,...,Atr46,Atr47,Atr48,Atr49,Atr50,Atr51,Atr52,Atr53,Atr54,Class
0,2,2,4,1,0,0,0,0,0,0,...,2,1,3,3,3,2,3,2,1,1
1,4,4,4,4,4,0,0,4,4,4,...,2,2,3,4,4,4,4,2,2,1
2,2,2,2,2,1,3,2,1,1,2,...,3,2,3,1,1,1,2,2,2,1
3,3,2,3,2,3,3,3,3,3,3,...,2,2,3,3,3,3,2,2,2,1
4,2,2,1,1,1,1,0,0,0,0,...,2,1,2,3,2,2,2,1,0,1


In [3]:
# setting class type as category
df['Class'] = df.Class.astype('category')

In [4]:
df.tail()

Unnamed: 0,Atr1,Atr2,Atr3,Atr4,Atr5,Atr6,Atr7,Atr8,Atr9,Atr10,...,Atr46,Atr47,Atr48,Atr49,Atr50,Atr51,Atr52,Atr53,Atr54,Class
165,0,0,0,0,0,0,0,0,0,0,...,1,0,4,1,1,4,2,2,2,0
166,0,0,0,0,0,0,0,0,0,0,...,4,1,2,2,2,2,3,2,2,0
167,1,1,0,0,0,0,0,0,0,1,...,3,0,2,0,1,1,3,0,0,0
168,0,0,0,0,0,0,0,0,0,0,...,3,3,2,2,3,2,4,3,1,0
169,0,0,0,0,0,0,0,1,0,0,...,3,4,4,0,1,3,3,3,1,0


In [5]:
# accuracy metrics (correctley predicted)/(total)
def accuracy(y_pred,y_actual):
    return ((y_actual == y_pred)).sum()/len(y_actual)

In [6]:
# column name of traget variable
label = 'Class'

In [7]:
# function that returns average accuracy after applying 5 fold cross validation on given data with random forest model
def CV(data):
    
    K = 5 # Value of K in K fold
    avg =0 
    
    # from 0 to k-1
    for i in range(K):
        
        # special case when i==k-1
        if i == K-1:
             # iterate through all unique values in target variable
            for c in list(np.unique(df[label])):
                # test_size = total_data/K (example : 20/5 = 4 <-- 4 instance in test set)
                test_size = len(data[df[label]==c])/K
                
                # select dataframe with label as c
                df_of_c = data[df[label]==c]
                # select i*test_size to end (exampel : for i=4 --> [4*4 to end]=>[16 to end(:)] )
                test = df_of_c[int(i*test_size):].copy()
        else:
            # iterate through all unique values in target variable
            for c in list(np.unique(df[label])):
                
                # test_size = total_data/K (example : 20/5 = 4 <-- 4 instance in test set)
                test_size = len(data[df[label]==c])/K
                
                # select dataframe with label as c
                df_of_c = data[df[label]==c]
                # select i*test_size to (i+1)*test_size (exampel : for i=0 --> [0*4 to (0+1)*4]=>[0 to 4] )
                test = df_of_c[int(i*test_size):int((i+1)*test_size)].copy()
                
        
        # except instances in test all other instances will be in train
        train = data[~data.isin(test).all(1)].copy()
        
        # initialize random forest
        model = RandomForestClassifier(n_estimators=3,min_samples_leaf=15, random_state=0)
        # fit train data
        model.fit(train.drop(label,axis=1),train[label])
        
        # predict output for test
        P = model.predict(test.drop(label,axis=1))
        
        # get accuracy
        acc = accuracy(P,test[label].values)
        
        avg+=acc
    
    # calculate average accuracy
    avg=avg/K
    
    return avg

# Sequential forward Search (SFS)

In [8]:
# list that contains names of column
features = list(df.columns)[0:-1]

acc_best = 0 # best accuracy till now
Y = []  # final feature list

# while Y < all features
while len(Y) < len(features):
    
    # lst stores accuracy for each feature
    lst = []
    
    # iterate through each feature
    for f in features:
        
        # if feature is in the Y then give its accuray as -1 so that when selecting feature with max accuracy it does not get selected
        if f in Y:
            lst.append(-1)
            
        else:
            # add f to Y and get the accuracy
            Y_temp = Y.copy()
            Y_temp.append(f)
            Y_temp.append(label)
            
            # get 5 fold CV accuacy for feature
            f_acc =CV(df[Y_temp])
            #print(f,CV(df[Y_temp]))
            
            # add f_acc to lst
            lst.append(f_acc)
    
    # if no improvment then break
    if(acc_best > lst[np.argmax(lst)]):
        break
        
    # select feature with best accuracy    
    feat_best = features[np.argmax(lst)]
    
    print('best feature selected : ',feat_best)
    
    # add best feature
    Y.append(feat_best)
    
    
print('Final feature list:\n', Y)

best feature selected :  Atr10
best feature selected :  Atr7
best feature selected :  Atr1
best feature selected :  Atr2
best feature selected :  Atr14
best feature selected :  Atr3
best feature selected :  Atr5
best feature selected :  Atr4
best feature selected :  Atr6
best feature selected :  Atr17
best feature selected :  Atr8
best feature selected :  Atr12
best feature selected :  Atr9
best feature selected :  Atr26
best feature selected :  Atr15
best feature selected :  Atr19
best feature selected :  Atr11
best feature selected :  Atr13
best feature selected :  Atr18
best feature selected :  Atr16
best feature selected :  Atr20
best feature selected :  Atr21
best feature selected :  Atr22
best feature selected :  Atr24
best feature selected :  Atr36
best feature selected :  Atr23
best feature selected :  Atr25
best feature selected :  Atr40
best feature selected :  Atr30
best feature selected :  Atr27
best feature selected :  Atr28
best feature selected :  Atr29
best feature sele