In [19]:
import os
import pandas as pd
from sklearn.model_selection import KFold
import math
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, precision_score, recall_score




**Load The Data**

In [22]:
# Locate the preprocessed CSVs relative to the notebook's working directory.
current_dir = os.getcwd()
data_dir_parts = ('..', 'data')
relative_path_train = os.path.join(*data_dir_parts, 'preprocessed_train_data.csv')
relative_path_test = os.path.join(*data_dir_parts, 'preprocessed_test_data.csv')

train_csv_path = os.path.join(current_dir, relative_path_train)
test_csv_path = os.path.join(current_dir, relative_path_test)

# Read the training split first, then the held-out test split.
preprocessed_train_data = pd.read_csv(train_csv_path)
preprocessed_test_data = pd.read_csv(test_csv_path)

In [23]:
# Separate the "satisfaction" target from the feature columns in each split.
y_train = preprocessed_train_data["satisfaction"]
x_train = preprocessed_train_data.drop(columns=["satisfaction"])

y_test = preprocessed_test_data["satisfaction"]
x_test = preprocessed_test_data.drop(columns=["satisfaction"])

# Disabled string -> int label mapping, kept for reference.
# NOTE(review): presumably the preprocessed CSVs already store encoded labels - confirm.
#y_test = y_test.replace({"satisfied":1, "neutral or dissatisfied":0})

# 10-fold splitter; shuffling uses a fixed seed so the folds are repeatable.
k_fold = KFold(n_splits=10, shuffle=True, random_state=42)

**Mapper**

In [35]:
# Step 1: Mapping
def mapper(data, label_column="satisfaction"):
    """Count class labels and (label, feature, value) triples in one data partition.

    Iterates over each row of ``data`` and tallies how often each class label
    occurs and how often each feature value co-occurs with each label.

    Parameters
    ----------
    data : pandas.DataFrame
        Partition of the training data. Every column other than
        ``label_column`` is treated as a feature.
    label_column : str, default "satisfaction"
        Name of the column that holds the class label.

    Returns
    -------
    list of tuple
        A one-element list ``[(class_counts, feature_counts)]`` where
        ``class_counts`` maps ``label -> number of rows`` and
        ``feature_counts`` maps
        ``(label, feature_name, feature_value) -> number of rows``.
        The list wrapper lets the result be passed straight to ``reducer``.

    Raises
    ------
    KeyError
        If ``data`` has at least one row but no ``label_column`` column.
    """
    class_counts = {}
    feature_counts = {}

    for _, row in data.iterrows():
        label = row[label_column]
        class_counts[label] = class_counts.get(label, 0) + 1

        for feature_name, feature_value in row.items():
            # The label column is the target, not a feature.
            if feature_name == label_column:
                continue
            key = (label, feature_name, feature_value)
            feature_counts[key] = feature_counts.get(key, 0) + 1

    return [(class_counts, feature_counts)]

**Reducer**

In [36]:
# Step 2: Reducing
def reducer(mapped_results):
    """Merge the partial counts emitted by one or more mappers.

    Sums the class counts and the per-(label, feature, value) counts across
    every mapper output, and reshapes the flat feature-count keys into a
    nested dictionary.

    Parameters
    ----------
    mapped_results : iterable of (dict, dict)
        Each element is ``(class_counts, feature_counts)`` as produced by
        ``mapper``: ``class_counts`` maps ``label -> count`` and
        ``feature_counts`` maps
        ``(label, feature_name, feature_value) -> count``.

    Returns
    -------
    tuple of (dict, dict)
        ``(class_counts, feature_counts)`` where ``class_counts`` maps
        ``label -> total count`` and ``feature_counts`` is nested as
        ``feature_counts[label][feature_name][feature_value] -> total count``.
    """
    class_counts = {}
    feature_counts = {}

    for partial_class_counts, partial_feature_counts in mapped_results:
        for label, count in partial_class_counts.items():
            class_counts[label] = class_counts.get(label, 0) + count

        for (label, feature_name, feature_value), count in partial_feature_counts.items():
            value_counts = feature_counts.setdefault(label, {}).setdefault(feature_name, {})
            # Accumulate instead of assigning: the same triple can be reported
            # by several mappers and every partial count must contribute.
            value_counts[feature_value] = value_counts.get(feature_value, 0) + count

    return (class_counts, feature_counts)

**Train Naive Bayes Classifier**

In [26]:
# Step 3: Train the Naive Bayes classifier
def train_naive_bayes(class_counts, feature_counts):
    """Turn reduced counts into class priors and per-class value likelihoods.

    Parameters
    ----------
    class_counts : dict
        Maps ``label -> count``.
    feature_counts : dict
        Nested as ``feature_counts[label][feature][value] -> count``.

    Returns
    -------
    tuple of (dict, dict)
        ``(class_probabilities, feature_probabilities)``.
        ``class_probabilities[label]`` is that label's count divided by the
        sum of all class counts. ``feature_probabilities[label][feature][value]``
        is that value's count divided by the sum of the counts of all values
        recorded for the same label and feature.
    """
    n_samples = sum(class_counts.values())

    # Priors: share of all samples that carry each label.
    priors = {label: n_label / n_samples for label, n_label in class_counts.items()}

    def _normalise(value_counts):
        # Scale one feature's value counts so they sum to one.
        denominator = sum(value_counts.values())
        return {value: n_value / denominator for value, n_value in value_counts.items()}

    # Likelihoods: relative frequency of each value within a (label, feature) pair.
    likelihoods = {
        label: {feature: _normalise(value_counts) for feature, value_counts in per_feature.items()}
        for label, per_feature in feature_counts.items()
    }

    return priors, likelihoods

**Make Predictions**

In [27]:
# Step 4: Make predictions
def predict_naive_bayes(class_probabilities, feature_probabilities, sample, var_smoothing=1e-9):
    """Return the class label with the highest log-probability score for ``sample``.

    For each class the score starts at the log of the class probability and
    adds one log term per feature in ``sample``.

    Parameters
    ----------
    class_probabilities : dict
        Maps ``label -> probability``.
    feature_probabilities : dict
        Nested as ``feature_probabilities[label][feature][value] -> probability``.
    sample : dict
        Maps ``feature -> value`` for the row to classify.
    var_smoothing : float, default 1e-9
        Added to every known probability before taking the log, and used on
        its own as the probability of a feature or value that was not recorded
        for the class.

    Returns
    -------
    object or None
        The label with the strictly highest score; on a tie the label that
        comes first in ``class_probabilities`` wins. ``None`` when
        ``class_probabilities`` is empty.
    """
    best_label, best_score = None, float('-inf')

    for label in class_probabilities:
        score = math.log(class_probabilities[label])

        for feature in sample:
            value = sample[feature]
            # Looked up per feature so an empty sample never touches the table.
            by_feature = feature_probabilities[label]
            seen = feature in by_feature and value in by_feature[feature]
            if seen:
                likelihood = by_feature[feature][value] + var_smoothing
            else:
                # Unseen feature or value: fall back to the smoothing constant.
                likelihood = var_smoothing
            score += math.log(likelihood)

        if score > best_score:
            best_label, best_score = label, score

    return best_label


**Apply Map Reducer**

In [37]:
# Step 5: Apply map-reduce
# One mapper pass over the full training frame, then merged by the reducer.
mapped_results = mapper(preprocessed_train_data)
final_class_counts, final_feature_counts = reducer(mapped_results)

# Step 6: Train the Naive Bayes classifier
class_probabilities, feature_probabilities = train_naive_bayes(
    final_class_counts, final_feature_counts
)

# Step 7: Make predictions
# Each test row is classified from its feature values only (target dropped).
predictions = [
    predict_naive_bayes(
        class_probabilities,
        feature_probabilities,
        test_row.drop("satisfaction").to_dict(),
    )
    for _, test_row in preprocessed_test_data.iterrows()
]

# Step 8: Evaluate the classifier
accuracy = accuracy_score(y_test, predictions)
f1 = f1_score(y_test, predictions)
precision = precision_score(y_test, predictions)
recall = recall_score(y_test, predictions)

for metric_label, metric_value in (
    ("Accuracy:", accuracy),
    ("f1_score:", f1),
    ("precision:", precision),
    ("recall:", recall),
):
    print(metric_label, metric_value)


Accuracy: 0.7626516019436653
f1_score: 0.7399809573271012
precision: 0.7018307199737296
recall: 0.7825171624713959
