## From-Scratch Naive Bayes Classifier

In [12]:
import pandas as pd
from collections import Counter

### Read Data

In [13]:
# Load the training set, then separate the 'play' label column from the features
data = pd.read_csv('./data/tennis_train.csv')
targets = data['play']  # labels
data = data.drop(columns='play')  # features only
data.head()

Unnamed: 0,outlook,temp,humidity,windy
0,sunny,hot,high,False
1,sunny,hot,high,True
2,overcast,hot,high,False
3,rainy,mild,high,False
4,rainy,cool,normal,False


In [50]:
# Generate the input file used to exercise the classifier: write the current
# contents of `data` to input.csv, omitting the DataFrame index (index=False).
data.to_csv("input.csv", index=False)

### Determine P(Yes) and P(No)

In [15]:
# Count how many training examples carry each label, keyed by label
target_occurances = {label: count for label, count in Counter(targets).items()}
target_occurances

{'no': 4, 'yes': 7}

In [4]:
# Turn the label counts into prior probabilities P(yes) and P(no)
target_prob = {
    label: target_occurances[label] / len(targets)
    for label in ('yes', 'no')
}
target_prob

{'yes': 0.6363636363636364, 'no': 0.36363636363636365}

### Determine P(ai|vj) for all events ai

In [16]:
# Start with one empty table per attribute (column); each table is later
# filled with P(ai | vj) for every event ai of that attribute
attribute_prob = {col: {} for col in data.columns}
attribute_prob

{'outlook': {}, 'temp': {}, 'humidity': {}, 'windy': {}}

In [17]:
# Estimate P(ai | vj) for every attribute value ai and class vj, with the
# Laplace correction: (nc + 1) / (n + |a|).

# Class masks and class sizes depend on neither the attribute nor the event,
# so they are computed once up front rather than once per event.
is_yes = targets == 'yes'
is_no = targets == 'no'
num_yes = int(is_yes.sum())  # n: number of training examples for which v = yes
num_no = int(is_no.sum())    # n: number of training examples for which v = no

# For each attribute (column) of the data
for attr in data:
    events = set(data[attr])
    a = len(events)  # |a| - the cardinality of this type of attribute

    for event in events:
        has_event = data[attr] == event

        # nc: number of examples for which v = vj and a = ai.
        # int(...) keeps the stored probabilities plain Python floats.
        nc_yes = int((has_event & is_yes).sum())
        nc_no = int((has_event & is_no).sum())

        # Laplace-corrected probability of this event under each class
        attribute_prob[attr][event] = {
            'yes': (nc_yes + 1) / (num_yes + a),
            'no': (nc_no + 1) / (num_no + a),
        }

# Visualize output: one section per attribute, one line per event
for attr_name, event_table in attribute_prob.items():
    print(attr_name, ":")
    print("----------")
    for event_name, class_probs in event_table.items():
        print("\t", event_name, class_probs)

outlook :
----------
	 sunny {'yes': 0.3, 'no': 0.5714285714285714}
	 rainy {'yes': 0.4, 'no': 0.2857142857142857}
	 overcast {'yes': 0.3, 'no': 0.14285714285714285}
temp :
----------
	 hot {'yes': 0.2, 'no': 0.42857142857142855}
	 cool {'yes': 0.4, 'no': 0.2857142857142857}
	 mild {'yes': 0.4, 'no': 0.2857142857142857}
humidity :
----------
	 normal {'yes': 0.6666666666666666, 'no': 0.3333333333333333}
	 high {'yes': 0.3333333333333333, 'no': 0.6666666666666666}
windy :
----------
	 False {'yes': 0.6666666666666666, 'no': 0.5}
	 True {'yes': 0.3333333333333333, 'no': 0.5}


### Predict Labels based on Bayes Model

In [68]:
# Apply Bayes' theorem to each row and write the more probable label to a file
def predict(test_data, outfile):
    """Classify every row of ``test_data`` and write one label per line.

    For each row the score of a class is its prior from the module-level
    ``target_prob`` multiplied by the per-attribute likelihoods stored in the
    module-level ``attribute_prob``. The label with the larger score is
    written; a tie is resolved as "yes".

    Parameters
    ----------
    test_data : pandas.DataFrame
        Rows to classify. Columns that are not keys of ``attribute_prob``
        are ignored.
    outfile : str
        Path of the text file to (over)write with "yes"/"no" lines, one per
        row of ``test_data`` in row order.

    Returns
    -------
    None
    """
    # Only columns the model holds likelihoods for can contribute a factor;
    # other columns (e.g. a label or a saved index column) are skipped
    # instead of raising KeyError.
    model_attrs = [attr for attr in test_data if attr in attribute_prob]

    with open(outfile, 'w') as f:

        for i in range(len(test_data)):
            # Fetch the row once rather than once per attribute
            row = test_data.iloc[i]

            prob_yes = target_prob['yes']  # P(y)
            prob_no = target_prob['no']  # P(n)

            for attr in model_attrs:
                # Likelihoods for this occurrence (ex: attr='outlook', event='overcast')
                likelihood = attribute_prob[attr].get(row[attr])
                if likelihood is None:
                    # Value never seen in training: no estimate exists, so it
                    # is treated as uninformative for both classes.
                    continue

                # Multiply probabilities
                prob_yes *= likelihood['yes']  # P(event | y)
                prob_no *= likelihood['no']  # P(event | n)

            prediction = "yes" if prob_yes >= prob_no else "no"
            f.write(prediction + '\n')


In [71]:
# Generate predictions for a CSV file and write them to a text file
def naive_bayes(infile, outfile="output.txt"):
    """Read test rows from ``infile`` and write their predicted labels.

    Parameters
    ----------
    infile : str
        Path of a CSV file readable by ``pd.read_csv``; its rows are passed
        to ``predict``.
    outfile : str, optional
        Path the predictions are written to. Defaults to "output.txt", the
        location that used to be hard-coded.

    Returns
    -------
    None
    """
    test_data = pd.read_csv(infile)
    predict(test_data, outfile)

In [72]:
# Execution: run the classifier on the rows stored in input.csv
naive_bayes('input.csv')