In [1]:
# IMPORTS
import csv
import os 

# FUNCTIONS

In [2]:
def loadCSVFile(filename) :
    filepath = "D:\\Documents\\jupyter" + filename
    features = []
    data_dict = []

    print("Loading CSV File...")
    
    with open(filename, mode='r') as csvfile:
        csv_reader = csv.DictReader(csvfile)
        features = csv_reader.fieldnames
        data_dict = [row for row in csv_reader]

    print("CSV File Loaded Successfully.\n")
    print(f"Features: {features}")
    print(f"Class Label: {features[len(features) - 1]}\n")

    return features, data_dict

In [3]:
def helloworld():
    print("hello world")

In [4]:
def bayes_agivenb(a, b, bgivena) :
    return (a * bgivena) / b

In [5]:
def arrayProduct(floatarray) :
    product = 1
    for value in floatarray :
        product *= value
    return product

In [6]:
# ===== class probabilty function =====
# given:
#   class probability, or P(c)
#   conditional probabilities, or P(c|xi) for i = [0,n-1]
#   feature probabilities, or P(xi) for i = [0,n-1]
# parameters:
#   class_probability : float
#   conditional_probabilities : float[n]
#   feature_probabilities : float[n]
def NaiveBayes(class_probability, conditional_probabilities, feature_probabilities) :
    product_conditional = arrayProduct(conditional_probabilities)
    product_features = arrayProduct(feature_probabilities)
    return bayes_agivenb(class_probability, product_features, product_conditional)

In [7]:
# ===== add row to probability matrix =====
# row: must be a dictionary with the following format:
#    feature : value

def addRowProbMatrix(matrix, features, row) :
    feature_count = len(features) - 1 # subtract 1 to ignore class on far right of csv
    rowClass = row[features[feature_count]]  # features[feature_count] is the farthest column, aka class
    
    for index in range(feature_count) :
            feature = features[index]
            value = row[feature]

            # create value entry if not there
            if not (value in matrix[feature]) :
                value_entry_dict = {}
                # include counts for each class
                for data_class in classes :
                    value_entry_dict[data_class] = 0
                # include total
                value_entry_dict["total"] = 0
                # build value entry
                matrix[feature][value] = value_entry_dict
                # print(f"New value entry [{feature} : {value}]")

            # update value entry
            matrix[feature][value][rowClass] += 1
            matrix[feature][value]["total"] += 1
            matrix["%class_counts%"][rowClass] += 1
            matrix["%total%"] += 1

In [8]:
# ===== probability_matrix =====
# feature:
#   value:
#       class1 : count
#       class2 : count
#       "total" : count
# "%class_counts%":
#   class1 : count
#   class2 : count
# ASSUME LAST FEATURE IN DICT IS CLASS
def buildProbabilityMatrix(features, csvdict, classes) :
    print("Building Probability Matrix...")
    matrix = {}
    feature_count = len(features) - 1 # subtract 1 to ignore class on far right of csv

    # add each feature
    for index in range(feature_count) :
        matrix[features[index]] = dict()
    # add class counts
    matrix["%class_counts%"] = {}
    for data_class in classes :
        matrix["%class_counts%"][data_class] = 0
    # add total count
    matrix["%total%"] = 0
    
    # add each value to probability
    for row in csvdict :
        addRowProbMatrix(matrix, features, row) # add row to matrix
    
    print("Matrix Completed")
    return matrix


In [9]:
def ClassifyRow(prob_matrix, features, row, classes) :
    total = prob_matrix["%total%"]
    best_probability = 0
    best_class = ""

    # generate feature probabilites
    feature_probabilities = []
    for index in range(len(features) - 1):
        feature = features[index]
        feature_probabilities.append(prob_matrix[feature][row[feature]]["total"] / total)
    
    for data_class in classes:
        #generate class probability
        class_prob = prob_matrix["%class_counts%"][data_class] / total
        # generate conditional probabilities
        conditional_probabilities = []
        for index in range(len(features) - 1):
            feature = features[index]
            conditional_probabilities.append(prob_matrix[feature][row[feature]][data_class] / prob_matrix[feature][row[feature]]["total"])
        # generate P(C|x)
        p = NaiveBayes(class_prob, conditional_probabilities, feature_probabilities)
        
        print(f"{data_class} : {p}")
        
        if (p > best_probability) :
            best_probability = p
            best_class = data_class
    
    # return chosen class and probability
    return best_class, best_probability

# TESTS / RUNS

In [10]:
helloworld()

hello world


In [11]:
print(bayes_agivenb(0.20, 0.30, 0.02))

0.013333333333333334


In [12]:
# MAKE PROBABILITY MATRIX
features, csvdata = loadCSVFile("matchups-2007-trunc.csv")
classes = ["-1", "1"]

prob_matrix = buildProbabilityMatrix(features, csvdata, classes)

Loading CSV File...
CSV File Loaded Successfully.

Features: ['home_team', 'away_team', 'starting_min', 'end_min', 'home_0', 'home_1', 'home_2', 'home_3', 'home_4', 'away_0', 'away_1', 'away_2', 'away_3', 'away_4', 'fga_home', 'fta_home', 'fgm_home', 'fga_2_home', 'fgm_2_home', 'fga_3_home', 'fgm_3_home', 'ast_home', 'blk_home', 'pf_home', 'reb_home', 'dreb_home', 'oreb_home', 'to_home', 'pts_home', 'pct_home', 'pct_2_home', 'pct_3_home', 'fga_visitor', 'fta_visitor', 'fgm_visitor', 'fga_2_visitor', 'fgm_2_visitor', 'fga_3_visitor', 'fgm_3_visitor', 'ast_visitor', 'blk_visitor', 'pf_visitor', 'reb_visitor', 'dreb_visitor', 'oreb_visitor', 'to_visitor', 'pts_visitor', 'pct_visitor', 'pct_2_visitor', 'pct_3_visitor', 'outcome']
Class Label: outcome

Building Probability Matrix...
Matrix Completed


In [13]:
c1 = prob_matrix["%class_counts%"]["-1"]
c2 = prob_matrix["%class_counts%"]["1"]
total = prob_matrix["%total%"]
print(f"{c1} - {c1/total*100}%")
print(f"{c2} - {c2/total*100}%")
print(total)
print(c1 + c2 == total)

897100 - 65.24363636363636%
477900 - 34.75636363636364%
1375000
True


In [14]:
print(csvdata[5])
print(csvdata[100])

{'home_team': 'LAL', 'away_team': 'PHO', 'starting_min': '12', 'end_min': '12', 'home_0': 'Brian Cook', 'home_1': 'Maurice Evans', 'home_2': 'Sasha Vujacic', 'home_3': 'Smush Parker', 'home_4': 'Vladimir Radmanovic', 'away_0': 'Boris Diaw', 'away_1': 'James Jones', 'away_2': 'Kurt Thomas', 'away_3': 'Leandro Barbosa', 'away_4': 'Marcus Banks', 'fga_home': '1', 'fta_home': '0', 'fgm_home': '1', 'fga_2_home': '0', 'fgm_2_home': '0', 'fga_3_home': '1', 'fgm_3_home': '1', 'ast_home': '1', 'blk_home': '0', 'pf_home': '0', 'reb_home': '1', 'dreb_home': '1', 'oreb_home': '0', 'to_home': '0', 'pts_home': '3', 'pct_home': '1', 'pct_2_home': '0', 'pct_3_home': '1', 'fga_visitor': '2', 'fta_visitor': '0', 'fgm_visitor': '1', 'fga_2_visitor': '2', 'fgm_2_visitor': '1', 'fga_3_visitor': '0', 'fgm_3_visitor': '0', 'ast_visitor': '1', 'blk_visitor': '0', 'pf_visitor': '1', 'reb_visitor': '0', 'dreb_visitor': '0', 'oreb_visitor': '0', 'to_visitor': '0', 'pts_visitor': '2', 'pct_visitor': '0.5', 'pct_2

In [15]:
new_row = {'home_team': 'SAS', 'away_team': 'PHO', 'starting_min': '12', 'end_min': '12', 'home_0': 'Brian Cook', 'home_1': 'Maurice Evans', 'home_2': 'Sasha Vujacic', 'home_3': 'Smush Parker', 'home_4': 'Vladimir Radmanovic', 'away_0': 'Boris Diaw', 'away_1': 'James Jones', 'away_2': 'Kurt Thomas', 'away_3': 'Leandro Barbosa', 'away_4': 'Marcus Banks', 'fga_home': '1', 'fta_home': '0', 'fgm_home': '2', 'fga_2_home': '3', 'fgm_2_home': '0', 'fga_3_home': '1', 'fgm_3_home': '1', 'ast_home': '1', 'blk_home': '0', 'pf_home': '0', 'reb_home': '1', 'dreb_home': '1', 'oreb_home': '0', 'to_home': '0', 'pts_home': '3', 'pct_home': '1', 'pct_2_home': '0', 'pct_3_home': '1', 'fga_visitor': '2', 'fta_visitor': '0', 'fgm_visitor': '1', 'fga_2_visitor': '2', 'fgm_2_visitor': '1', 'fga_3_visitor': '0', 'fgm_3_visitor': '0', 'ast_visitor': '1', 'blk_visitor': '0', 'pf_visitor': '1', 'reb_visitor': '0', 'dreb_visitor': '0', 'oreb_visitor': '0', 'to_visitor': '0', 'pts_visitor': '2', 'pct_visitor': '0.5', 'pct_2_visitor': '0.5', 'pct_3_visitor': '0'}

c, p = ClassifyRow(prob_matrix, features, new_row, classes)
print(f"Predicted Class: {c}")
print(f"Prediction Probability: {p}")

-1 : 1.0625435295244503e+124
1 : 6.05721822932815e+110
Predicted Class: -1
Prediction Probability: 1.0625435295244503e+124
