Tesing on binary classification problems and have data with nominal-valued attributes and no missing
values (weather.nominal, titanic, vote.noUnknowns

In [365]:
import pandas as pd
import numpy as np

In [366]:
def readArff(filename):
    with open ('./NominalData/'+filename+'.arff', 'r') as f:
        # split lines, remove ones with comments
        lines = [line.lower() for line in f.read().split('\n') if not line.startswith('%')]
        
    # remove empty lines
    lines = [line for line in lines if line != '']
    
    columns = []
    data = []
    for index, line in enumerate(lines):
        if line.startswith('@attribute'):
            columns.append(line)
            
        if line.startswith('@data'):
            # get the rest of the lines excluding the one that says @data
            data = lines[index+1:]
            break
            
    # clean column names -- '@attribute colname  \t\t\t{a, b, ...}'
    cleaned_columns = [c[11:c.index('{')].strip() for c in columns]
    
    # clean and split data
    cleaned_data = [d.replace(', ', ',').split(',') for d in data]
    
    # create dataframe
    return pd.DataFrame(cleaned_data, columns = cleaned_columns)

In [367]:
vals = readArff('weather.nominal')

In [368]:
vals

Unnamed: 0,outlook,temperature,humidity,windy,play
0,sunny,hot,high,False,no
1,sunny,hot,high,True,no
2,overcast,hot,high,False,yes
3,rainy,mild,high,False,yes
4,rainy,cool,normal,False,yes
5,rainy,cool,normal,True,no
6,overcast,cool,normal,True,yes
7,sunny,mild,high,False,no
8,sunny,cool,normal,False,yes
9,rainy,mild,normal,False,yes


In [403]:
def preprocess_data(df):
    # change class values to {-1, 1}
    y, unique = pd.factorize(df.iloc[:,-1])
    new_y = np.where(y==0, -1, 1)
    assert set(new_y) == {-1, 1}, 'Response variable must be ±1'
    
    # change xs to 2d numpy array
    xs = df.iloc[:,:-1]
    xs = xs.values
    
    return xs, new_y

In [405]:
X,y = preprocess_data(vals)
X,y

(array([['sunny', 'hot', 'high', 'false'],
        ['sunny', 'hot', 'high', 'true'],
        ['overcast', 'hot', 'high', 'false'],
        ['rainy', 'mild', 'high', 'false'],
        ['rainy', 'cool', 'normal', 'false'],
        ['rainy', 'cool', 'normal', 'true'],
        ['overcast', 'cool', 'normal', 'true'],
        ['sunny', 'mild', 'high', 'false'],
        ['sunny', 'cool', 'normal', 'false'],
        ['rainy', 'mild', 'normal', 'false'],
        ['sunny', 'mild', 'normal', 'true'],
        ['overcast', 'mild', 'high', 'true'],
        ['overcast', 'hot', 'normal', 'false'],
        ['rainy', 'mild', 'high', 'true']], dtype=object),
 array([-1, -1,  1,  1,  1, -1,  1, -1,  1,  1,  1,  1,  1, -1]))

In [525]:
    # Iterate throught every unique feature value and see what value makes the best prediction for y
    # We use information gain as the metric for picking the best attribute
#         unique_values = np.unique(X[:, f])

        # We need to calcualte the weak hypothesis here
        
        # how do we select which attribute to use to update the weights? 
            # smallest classification error or information gain or entropy?
            # use the information gain -- cut it off after 1
            # pick highest information gain
            # calculate error
            # think of error of learned model as seperate from process of learning weak model
            # error is evaluation of that model
            
        # given feature, calculate information gain
        # then calculate error, keep attribute with smallest error
        
        

In [707]:
class DecisionStump():
    def __init__(self, X, y):
        self.X = X
        self.y = y
        self.n_features = np.shape(self.X)[1]
        self.info_gain = None
        self.error = None
        self.best_attribute = None
        self.tree = dict()
        self.predictions = None

    def __str__(self):
        return f"""information_gain: {self.info_gain}, error: {self.error}"""
    
    
    def _entropy(self, col):
        """
        Calculate the entropy with respect to the target column.
        """
        vals, counts = np.unique(col, return_counts = True)

        entropy = np.sum([(-counts[i]/np.sum(counts)) * np.log2(counts[i]/np.sum(counts)) 
                          for i in range(len(vals))])
        return entropy
    
    
    def _information_gain(self, attr): 
        # calculate the entropy of the total dataset
        total_entropy = self._entropy(self.y)

        # calculate the sum of the weighted entropy of the attributes
        vals, counts = np.unique(attr, return_counts=True)


        weighted_entropy = sum([(counts[i]/np.sum(counts)) * 
                            self._entropy(self.y[(attr == vals[i])]) for i in range(len(vals))])

        # calculate information gain
        info_gain = total_entropy - weighted_entropy
        return info_gain
    
    def _make_tree(self):
        # predict values based on self.best_attribute
        attr = self.X[:, self.best_attribute]
        vals, counts = np.unique(attr, return_counts=True)
        
        # tree = {attr_val1: p(1), attr_val2, p(1)}
        # keys represent branches, values represent probability of 1
        # we know the y's are {-1, 1}
        for val in vals:
            subset = self.y[(attr == val)]
            new_subset = np.where(subset == -1, 0, 1) # replace -1 with 0
            prob = sum(new_subset) / len(new_subset)
            self.tree[val] = prob
            
    def _predict(self): 
        # predict values based on self.best_attribute
        attr = self.X[:, self.best_attribute]
        self.predictions = np.ones(np.shape(self.y))

        for i, x_i in enumerate(attr):
            if self.tree[x_i] < 0.5:
                self.predictions[i] = -1
            # if == 0.5 then could break tie with majority over everything -- add in at the end
    
    
    def _calculate_error(self, w):        
        self._make_tree()
        self._predict()
        
        # calculate percent inaccuracy
        assert np.shape(self.predictions) == np.shape(self.y) # sanity check
        n_total = len(self.predictions)
        n_incorrect = sum([self.predictions[i] != self.y[i] for i in range(n_total)])
        self.error = n_incorrect / n_total
    
    def learn(self, w):
        max_gain = float('-inf')
        
        for f in range(self.n_features):
            gain = self._information_gain(self.X[:, f])
            
            if max_gain < gain:
                self.info_gain = gain
                self.best_attribute = f
                max_gain = gain
        self._calculate_error(w)

In [708]:
def train(X, y):
    T = 10
    
    n_instances = np.shape(X)[0]
    weights = np.zeros(shape=(T, n_instances))
    stumps = np.zeros(shape=T, dtype=object)
    alphas = np.zeros(shape=T)
    errors = np.zeros(shape=T)
    
    # initialize weights uniformly
    weights[0] = np.ones(shape=n_instances) / n_instances
    
    for t in range(T):
        current_weights = weights[t]
        print(current_weights)
        
        learner = DecisionStump(X, y)
        learner.learn(current_weights)
                            
        alpha = np.log((1 - learner.error) / learner.error) / 2
        
        new_weights = (current_weights * np.exp(-alpha * y * learner.predictions))
        new_weights /= new_weights.sum()
        
        # If not final iteration, update sample weights for t+1
        if t+1 < T:
            weights[t+1] = new_weights

        stumps[t] = learner
        alphas[t] = alpha
        errors[t] = learner.error
        print(learner)

In [709]:
train(X, y)

[0.07142857 0.07142857 0.07142857 0.07142857 0.07142857 0.07142857
 0.07142857 0.07142857 0.07142857 0.07142857 0.07142857 0.07142857
 0.07142857 0.07142857]
information_gain: 0.24674981977443933, error: 0.2857142857142857
[0.05  0.05  0.05  0.05  0.05  0.125 0.05  0.05  0.125 0.05  0.125 0.05
 0.05  0.125]
information_gain: 0.24674981977443933, error: 0.2857142857142857
[0.02857143 0.02857143 0.02857143 0.02857143 0.02857143 0.17857143
 0.02857143 0.02857143 0.17857143 0.02857143 0.17857143 0.02857143
 0.02857143 0.17857143]
information_gain: 0.24674981977443933, error: 0.2857142857142857
[0.0137931  0.0137931  0.0137931  0.0137931  0.0137931  0.21551724
 0.0137931  0.0137931  0.21551724 0.0137931  0.21551724 0.0137931
 0.0137931  0.21551724]
information_gain: 0.24674981977443933, error: 0.2857142857142857
[0.00601504 0.00601504 0.00601504 0.00601504 0.00601504 0.23496241
 0.00601504 0.00601504 0.23496241 0.00601504 0.23496241 0.00601504
 0.00601504 0.23496241]
information_gain: 0.246

In [551]:
X[:,[1,3]]

array([['hot', 'false'],
       ['hot', 'true'],
       ['hot', 'false'],
       ['mild', 'false'],
       ['cool', 'false'],
       ['cool', 'true'],
       ['cool', 'true'],
       ['mild', 'false'],
       ['cool', 'false'],
       ['mild', 'false'],
       ['mild', 'true'],
       ['mild', 'true'],
       ['hot', 'false'],
       ['mild', 'true']], dtype=object)