Testing on binary classification problems whose data have nominal-valued attributes and no missing
values (weather.nominal, titanic, vote.noUnknowns).

In [218]:
import pandas as pd
import numpy as np

In [219]:
def readArff(filename):
    """Load a nominal-attribute ARFF file into a DataFrame of strings.

    The file is read from ``./NominalData/<filename>.arff``. Every line is
    lower-cased before parsing, so the ``@attribute`` / ``@data`` keywords
    match case-insensitively; as a consequence attribute names and data
    values are returned in lower case as well.

    Parameters
    ----------
    filename : str
        File name without the directory and without the ``.arff`` extension.

    Returns
    -------
    pandas.DataFrame
        One column per ``@attribute`` line found before ``@data`` (in file
        order) and one row per data line. All cells are strings.

    Raises
    ------
    ValueError
        If an ``@attribute`` line carries no ``{...}`` value list, i.e. the
        attribute is not nominal.
    """
    with open('./NominalData/' + filename + '.arff', 'r') as f:
        # Strip first so that indented keywords/comments and whitespace-only
        # lines are recognised, then normalise case.
        lines = [line.strip().lower() for line in f]

    # remove empty lines and '%' comment lines
    lines = [line for line in lines if line and not line.startswith('%')]

    columns = []
    data = []
    for index, line in enumerate(lines):
        if line.startswith('@attribute'):
            columns.append(line)
        elif line.startswith('@data'):
            # everything after the @data marker is a data row
            data = lines[index + 1:]
            break

    # clean column names -- '@attribute colname  \t\t\t{a, b, ...}'
    keyword_length = len('@attribute')
    cleaned_columns = []
    for declaration in columns:
        brace = declaration.find('{')
        if brace == -1:
            raise ValueError(
                'Only nominal attributes are supported, got: ' + declaration
            )
        cleaned_columns.append(declaration[keyword_length:brace].strip())

    # split each row on commas and trim whitespace around every value
    cleaned_data = [[value.strip() for value in row.split(',')] for row in data]

    # create dataframe
    return pd.DataFrame(cleaned_data, columns=cleaned_columns)

In [220]:
# Parse ./NominalData/weather.nominal.arff into a DataFrame of strings.
vals = readArff('weather.nominal')

In [221]:
# Show the parsed frame (last column is the class label).
vals

Unnamed: 0,outlook,temperature,humidity,windy,play
0,sunny,hot,high,False,no
1,sunny,hot,high,True,no
2,overcast,hot,high,False,yes
3,rainy,mild,high,False,yes
4,rainy,cool,normal,False,yes
5,rainy,cool,normal,True,no
6,overcast,cool,normal,True,yes
7,sunny,mild,high,False,no
8,sunny,cool,normal,False,yes
9,rainy,mild,normal,False,yes


In [222]:
def preprocess_data(df):
    """Convert a nominal DataFrame into a one-hot feature matrix and ±1 labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose last column is the class label and whose remaining
        columns are the attributes to one-hot encode.

    Returns
    -------
    xs : numpy.ndarray
        2-D ``uint8`` array holding the one-hot encoding of every column
        except the last.
    new_y : numpy.ndarray
        1-D integer array of labels. The class that appears first in ``df``
        is mapped to -1 and every other class to +1, so a target with more
        than two classes is collapsed into "first class vs. rest".

    Raises
    ------
    AssertionError
        If the labels do not contain both -1 and +1 (for example when the
        class column is empty or holds a single distinct value).
    """
    # change class values to {-1, 1}; factorize numbers classes by first appearance
    codes, _ = pd.factorize(df.iloc[:, -1])
    new_y = np.where(codes == 0, -1, 1)
    assert set(new_y) == {-1, 1}, 'Response variable must be ±1'

    # change xs to 2d numpy array; pin the dtype because pandas >= 2.0
    # would otherwise produce boolean indicator columns
    xs = pd.get_dummies(df.iloc[:, :-1], dtype=np.uint8).values

    return xs, new_y

In [223]:
# One-hot encode the attributes and map the class column to -1/+1,
# then display both arrays.
X,y = preprocess_data(vals)
X,y

(array([[0, 0, 1, 0, 1, 0, 1, 0, 1, 0],
        [0, 0, 1, 0, 1, 0, 1, 0, 0, 1],
        [1, 0, 0, 0, 1, 0, 1, 0, 1, 0],
        [0, 1, 0, 0, 0, 1, 1, 0, 1, 0],
        [0, 1, 0, 1, 0, 0, 0, 1, 1, 0],
        [0, 1, 0, 1, 0, 0, 0, 1, 0, 1],
        [1, 0, 0, 1, 0, 0, 0, 1, 0, 1],
        [0, 0, 1, 0, 0, 1, 1, 0, 1, 0],
        [0, 0, 1, 1, 0, 0, 0, 1, 1, 0],
        [0, 1, 0, 0, 0, 1, 0, 1, 1, 0],
        [0, 0, 1, 0, 0, 1, 0, 1, 0, 1],
        [1, 0, 0, 0, 0, 1, 1, 0, 0, 1],
        [1, 0, 0, 0, 1, 0, 0, 1, 1, 0],
        [0, 1, 0, 0, 0, 1, 1, 0, 0, 1]], dtype=uint8),
 array([-1, -1,  1,  1,  1, -1,  1, -1,  1,  1,  1,  1,  1, -1]))

In [226]:
def _stump_predict(feature_column, threshold, label_shape):
    """Return +1 everywhere except -1 where the feature is below ``threshold``."""
    prediction = np.ones(label_shape)
    prediction[feature_column < threshold] = -1
    return prediction


def train(X, y):
    """Enumerate single-feature threshold stumps and print their error masks.

    Work in progress (looks like the start of a boosting loop -- TODO
    confirm): for each of the ``T`` rounds, every column of ``X`` and every
    distinct value in that column is used as a threshold. The stump predicts
    -1 where the feature is below the threshold and +1 otherwise, and the
    element-wise mask ``y != prediction`` is printed.

    The per-round instance weights and the running minimum error are set up
    but not used yet, and nothing is returned.

    Parameters
    ----------
    X : 2-D array
        Feature matrix; each column must contain exactly the values 0 and 1,
        otherwise the sanity-check assertion fails.
    y : 1-D array
        Labels compared element-wise against the stump predictions.
    """
    T = 1  # T = 10

    n_instances, n_features = np.shape(X)

    # one weight vector per round; the first round starts out uniform
    weights = np.zeros(shape=(T, n_instances))
    weights[0] = np.ones(shape=n_instances) / n_instances

    for round_index in range(T):
        round_weights = weights[round_index]  # not consumed yet
        best_error = float('inf')  # never updated yet

        # Iterate through every unique feature value and see what value
        # makes the best threshold for predicting y
        for feature_index in range(n_features):
            feature_column = X[:, feature_index]
            candidate_thresholds = np.unique(feature_column)

            # sanity check: one-hot columns hold both 0 and 1
            assert np.array_equal(candidate_thresholds, [0, 1])

            # Try every unique feature value as threshold for decision stump
            # for nominal attributes???
            for threshold in candidate_thresholds:
                prediction = _stump_predict(feature_column, threshold, np.shape(y))
                print(y != prediction)
        

In [None]:
# Prints one boolean mask (y != prediction) per feature/threshold pair; returns nothing.
train(X,y)