In [1]:
### import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import aix360.algorithms.rbm
from aix360.algorithms.rbm import FeatureBinarizer
from aix360.algorithms.rbm import BooleanRuleCG

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn import metrics
from sklearn.metrics import confusion_matrix
# ignore warning related to deprecated modules inside packages
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)
# train with fixed seed to eliminate different results
np.random.seed(777)

In [2]:
data = pd.read_csv("../datasets/Placement_Data_Full_Class.csv",
                    dtype={"sl_no": int,
                            "gender": 'category',
                            "ssc_p" : float,
                            "ssc_b" : 'category',
                            "hsc_p" : float,
                            "hsc_b" : 'category',
                            "hsc_s" : 'category',
                            "degree_p" : float,
                            "degree_t" : 'category',
                            "workex" : 'category',
                            "etest_p" : float,
                            "specialisation" : 'category',
                            "mba_p" : float,
                            "status" : 'category',
                            "salary" : float
                           })


In [3]:
data = data.drop(columns=['salary'])

In [4]:
categorical_features = ['gender', 'ssc_b', 'hsc_b', 'hsc_s', 'degree_t', 'workex', 'specialisation']
fb = FeatureBinarizer(colCateg=categorical_features, negations=True, returnOrd=True)

X = data.drop(columns=["sl_no", "status"])
X_bin, X_std = fb.fit_transform(X)

In [5]:
Y = data["status"].map(lambda x: 1 if x == "Placed" else 0).astype(int)

In [6]:
def explain_with_BRCG(X, y, CNF=False, lambda0=1e-1, lambda1=1e-2, verbose=True):
    # Instantiate BRCG with small complexity penalty
    br_model = BooleanRuleCG(lambda0, lambda1, CNF=False)
    # Train, print, and evaluate model
    br_model.fit(X, y)
    if verbose:
        print('Training accuracy:', metrics.accuracy_score(y, br_model.predict(X)))
    if br_model.CNF:
        print('Predict Y=0 if ANY of the following rules are satisfied, otherwise Y=1:')
    else:
        print('Predict Y=1 if ANY of the following rules are satisfied, otherwise Y=0:')
    print(br_model.explain()['rules'])
    return br_model

In [7]:
data.groupby(by="status").size()["Placed"]

148

# Subtracting dataframes

In [8]:
rules_s = explain_with_BRCG(X_bin, Y, lambda0=0.01, lambda1=0.01)

Learning DNF rule with complexity parameters lambda0=0.01, lambda1=0.01
Initial LP solved
Iteration: 1, Objective: 0.2060
Iteration: 2, Objective: 0.1946
Iteration: 3, Objective: 0.1881
Training accuracy: 0.8418604651162791
Predict Y=1 if ANY of the following rules are satisfied, otherwise Y=0:
['ssc_p > 58.00 AND hsc_p > 52.00']


In [9]:
first_filter = pd.DataFrame(columns=data.columns)

for i, sample in X_bin.iterrows():
    if(rules_s.predict(sample) == 1):
        first_filter.loc[i] = data.loc[i]

In [10]:
Y = []
for i, row in data.iterrows():
    if i in first_filter.index:
        Y.append(1)
    else:
        Y.append(0)

In [11]:
Y

[1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1]

# Figuring out which lambda is which

In [12]:
import seaborn as sns 
from sklearn.metrics import confusion_matrix

def create_confusion_matrix(y, y_pred, title_name=""):
    conf_matrix = confusion_matrix(y, y_pred)
    grid_label = ['True Negative','False Positive','False Negative','True Positive']
    grid = []
    for i, count in enumerate(conf_matrix.flatten()):
        count_str = f'samples: {count:.0f}'
        percentages = f'{count/len(y) * 100:.2f}%'
        grid.append(f'{grid_label[i]}\n{count_str}\n{percentages}')
    grid = np.asarray(grid).reshape(2,2)
    conf_matrix = pd.DataFrame(conf_matrix,
    columns=["might stay", "might find a new job"],
    index=["will stay", "will find a new job"])
    ax = sns.heatmap(conf_matrix, annot=grid, fmt='', cmap='Reds', cbar=False)
    ax.set_title(f"{title_name}\nConfusion matrix", pad=40, fontsize=17)
    ax.xaxis.set_ticks_position('top')
    return ax

In [13]:
## elso jo parameter:
l0 = 0.00000000001
l1 = 0.000005

In [14]:
rules_l0 = explain_with_BRCG(X_bin, Y, lambda0=1, lambda1=0.0000000000001)

Learning DNF rule with complexity parameters lambda0=1, lambda1=1e-13


TypeError: '>' not supported between instances of 'list' and 'float'

In [None]:
create_confusion_matrix(Y, rules_l0.predict(X_bin))

# Filtering data according to the ruleset

In [None]:
rules_l0.predict(X_bin.loc[0])

In [None]:
first_filter = pd.DataFrame(columns=data.columns)
first_filter.head()

In [None]:
for i, sample in X_bin.iterrows():
    if(rules_l0.predict(sample) == 1):
        first_filter.loc[i] = data.loc[i]

In [None]:
data

In [None]:
first_filter

# Binarizing the rest

In [None]:
X1 = first_filter.drop(columns=["sl_no", "status"])
X1_bin, X1_std = fb.fit_transform(X1)

In [None]:
Y1 = first_filter["status"].map(lambda x: 1 if x == "Placed" else 0).astype(int)

# Calculating the second layer

In [None]:
rules_l1 = explain_with_BRCG(X1_bin, Y1, lambda0=0.0000001, lambda1=0.001)

In [None]:
create_confusion_matrix(Y1, rules_l1.predict(X1_bin))

# Building a framework for it

In [None]:
class NestedCavities:
    def __init__(self, dataset, cat_list, drop_list, target_n, target_v, l1_null, l1_inf, l2_init, l2_rate, max_iter):
        self.df = dataset
        self.categories = cat_list
        self.to_drop = drop_list
        self.target_n = target_n
        self.target_v = target_v
        self.l1_bot = l1_null
        self.l1_top = l1_inf
        self.l2 = l2_init
        self.l2_rate = l2_rate
        self.max_iter = max_iter
        
        
    def process(self, data, cat_list, drop_list, target_n, target_v, pos):
        fb = FeatureBinarizer(colCateg=cat_list, negations=True, returnOrd=True)
        X = data.drop(columns=drop_list)
        X_bin, X_std = fb.fit_transform(X)
        
        if pos:
            Y = data[target_n].map(lambda x: 1 if x == target_v else 0).astype(int)
        else:
            Y = data[target_n].map(lambda x: 1 if x != target_v else 0).astype(int)
        return X_bin, Y
        
    def fit_cavity(self, bin_v, target_v, l0, l1):
        
        br_model = BooleanRuleCG(l0, l1, CNF=False)
        # Train, print, and evaluate model
        br_model.fit(bin_v, target_v)
        return br_model
    
    def filter_data(self, dataset, bin_v, rules, pos):

        target = 1 if pos else 0
        filtered_df = pd.DataFrame(columns=dataset.columns)
        
        for i, sample in bin_v.iterrows():
            if(rules.predict(sample) == target):
                filtered_df.loc[i] = dataset.loc[i]
                
        return filtered_df
    
    def subtract_rules(self, ruleset, neg_rules):
        pass
    
    def good_enough(self, prev_set, current_set, n_iter):
        # gondolkodni kell rajta:
        """
        - detektalni mikor ugyanazt rakjuk ki es be
          - osszevetni az elozo iteracioval az eredmeny halmazt, ha ugyanaz vege
        - osszes eredetileg pozitivkent jelolt mintat kivalasztottuk (valoszinuleg tulilleszt)
          - szamoljuk meg az algoritmus elejen hany pozitiv van
          - ha annyi pozitiv van a mi halmazunkban, es az osszes pozitiv akkor vege
        - szabalyok komplexitasara megkotes?
          - szabalyok komplexitasa ~ aix
        """
        
        if n_iter == self.max_iter or current_set.equals(prev_set):
            return True
        else:
            return False
    
    def evaluate(self, filtered_data):
        Y_pred = []
        for i, row in self.df.iterrows():
            if i in filtered_data.index:
                Y_pred.append(1)
            else:
                Y_pred.append(0)
        
        Y_org = self.df[self.target_n].map(lambda x: 1 if x == self.target_v else 0).astype(int)
        stats = [] 
        conf_matrix = confusion_matrix(Y_org, Y_pred)
        for i, count in enumerate(conf_matrix.flatten()):
            stats.append(count)
            
        return (stats, len(Y_org))
    
    def rule_complexity(self, rules):
        c = 0
        for rule in rules:
            c += self.l2 
            for term in rule:
                c += self.l1_top 
                
        return c
            
    
    def learn(self):
        ruleset = []
        conf_stats = []
        prev_samples = pd.DataFrame()
        pos_class_samples = pd.DataFrame(columns=self.df.columns)
        data = self.df
        n_iter = 0
        while not self.good_enough(prev_samples, pos_class_samples, n_iter):
            # getting all the positives + some false positives
            tr_X, tr_Y = self.process(data, self.categories, self.to_drop, self.target_n, self.target_v, True)
            
            if tr_Y.sum() == 0:
                print("No more positive samples found among the remaining samples I.")
                print(f"Phase {n_iter} / I.")
                break
            rules = self.fit_cavity(tr_X, tr_Y, self.l2, self.l1_top)
            filtered_data = self.filter_data(data, tr_X, rules, True)
            
            ruleset += rules.explain()['rules']
            print("Compound rule complexity")
            print(self.rule_complexity(ruleset))
            
            # removing all the negatives + some false negatives
            tr_X, tr_Y = self.process(filtered_data, self.categories, self.to_drop, self.target_n, self.target_v, False)
            if tr_Y.sum() == 0:
                print("No more positive samples found among the remaining samples.")
                print(f"Phase {n_iter} / II.")
                break
            rules = self.fit_cavity(tr_X, tr_Y, self.l2, self.l1_top)
            print("Selector rule complexity")
            print(self.rule_complexity(rules.explain()['rules']))
            
            filtered_data2 = self.filter_data(filtered_data, tr_X, rules, False)
            removed_data = self.filter_data(filtered_data, tr_X, rules, True)
            
            self.subtract_rules(ruleset, rules)
            
            # end of the iteration
            prev_samples = pos_class_samples
            pos_class_samples = pd.concat([pos_class_samples, filtered_data2])
            self.l2 *= self.l2_rate
            data = removed_data
            conf_stats.append(self.evaluate(pos_class_samples))
            n_iter += 1
                
        return pos_class_samples, ruleset, conf_stats
        

In [None]:
cat_list = ['gender', 'ssc_b', 'hsc_b', 'hsc_s', 'degree_t', 'workex', 'specialisation']
drop_list = ["sl_no", "status", "salary"]
target_n = "status"
target_v = "Placed"
# egyelőre..
l1_null = None
l1_inf = 0.000005
l2_init = 0.00000000001
l2_rate = 0.1
max_iter = 10
nc = NestedCavities(data, cat_list, drop_list, target_n, target_v, l1_null, l1_inf, l2_init, l2_rate, max_iter)

In [None]:
pos_class_samples, _, conf_stats = nc.learn()

In [None]:
conf_stats