In [64]:
#!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m358.7 kB/s[0m eta [36m0:00:00[0m00:01[0m00:21[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting py4j==0.10.9.7 (from pyspark)
  Downloading py4j-0.10.9.7-py2.py3-none-any.whl.metadata (1.5 kB)
Downloading py4j-0.10.9.7-py2.py3-none-any.whl (200 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.5/200.5 kB[0m [31m320.8 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25ldone
[?25h  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=9d441740999c28ad01f6e075de071df3129d0117f5e83bb7e89c3c74393178ae
  Stored in directory: /home/donia/.cache/pip/wheels/da/78/6d/54350e0243f65f77dccf6ebe2ed5559faf6900559e904fb957
Successfully built pyspark
Installing collecte

In [2]:
import os
import pandas as pd
from sklearn.model_selection import KFold
import math
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, precision_score, recall_score
from sklearn.preprocessing import MinMaxScaler

**Load The Data**

In [3]:
current_dir = os.getcwd() 
relative_path_train = os.path.join('..', 'data', 'preprocessed_train_data.csv')
relative_path_test = os.path.join('..', 'data', 'preprocessed_test_data.csv')

preprocessed_train_data = pd.read_csv(os.path.join(current_dir, relative_path_train))
preprocessed_test_data = pd.read_csv(os.path.join(current_dir, relative_path_test))

In [4]:
x_train = preprocessed_train_data.drop(["satisfaction"], axis = 1)
y_train = preprocessed_train_data["satisfaction"]

x_test = preprocessed_test_data.drop(["satisfaction"], axis = 1)
y_test = preprocessed_test_data["satisfaction"]

#y_test = y_test.replace({"satisfied":1, "neutral or dissatisfied":0})

k_fold = KFold(n_splits=10, shuffle=True, random_state=42)

**Scale The Data**

In [5]:
# Step 3: Scale the data
scaler = MinMaxScaler()
train_scaled = scaler.fit_transform(preprocessed_train_data)
test_scaled = scaler.transform(preprocessed_test_data)

# Convert scaled arrays back to DataFrame
train_scaled_df = pd.DataFrame(train_scaled, columns=preprocessed_train_data.columns)
test_scaled_df = pd.DataFrame(test_scaled, columns=preprocessed_test_data.columns)

**Prior Probabilities**

Prior Probability mapper

In [6]:
def mapper_prior_probability(data):
    class_counts = {}

    for index, row in data.iterrows():
        label = row["satisfaction"]
        if label not in class_counts:
            class_counts[label] = 1
        else:
            class_counts[label] += 1

    return [(class_counts, {})]

Prior Probability reducer

In [7]:
def reducer_prior_probability(mapped_results):
    class_counts_total = {}

    for c_count, _ in mapped_results:
        for label, count in c_count.items():
            if label not in class_counts_total:
                class_counts_total[label] = count
            else:
                class_counts_total[label] += count

    total_samples = sum(class_counts_total.values())
    class_probabilities = {label: count / total_samples for label, count in class_counts_total.items()}

    return class_probabilities

**Features Count**

features count mapper

In [8]:
'''
The mapper iterates over each row in the training data 
and counts occurrences of each feature-value pair for each class.
'''
def mapper(data):
    feature_counts = {}

    for index, row in data.iterrows():
        label = row["satisfaction"]
        for feature_name, feature_value in row.items():
            if feature_name != "satisfaction":
                key = (label, feature_name, feature_value)
                if key not in feature_counts:
                    feature_counts[key] = 1
                else:
                    feature_counts[key] += 1
                    
    return [feature_counts]

features count Reducer

In [9]:
'''
aggregates the counts from all mappers.
sum up the counts for each class and each feature-value pair.
'''
def reducer(mapped_results):
    '''
    the outer keys represent classes, the inner keys represent features, and the innermost keys represent feature values. 
    '''
    feature_counts = {}

    for f_count in mapped_results:
        for key, count in f_count.items():
            if key[0] not in feature_counts:
                feature_counts[key[0]] = {}
            if key[1] not in feature_counts[key[0]]:
                feature_counts[key[0]][key[1]] = {}
            feature_counts[key[0]][key[1]][key[2]] = count
            
    return feature_counts

**Train Naive Bayes Classifier**

In [10]:
def mapper_train_naive_bayes_feature_probabilities(feature_counts):
    '''
    It transforms the feature_counts dictionary into a list of tuples, 
    where each tuple contains a class label and the associated features.
    '''
    partial_feature_probabilities = []

    for label, features in feature_counts.items():
        partial_feature_probabilities.append((label, features))

    return partial_feature_probabilities

In [11]:
def reducer_train_naive_bayes_feature_probabilities(mapped_results):
    feature_probabilities = {}

    for label, features in mapped_results:
        # Calculate feature probabilities
        if label not in feature_probabilities:
            feature_probabilities[label] = {}
        for feature, values in features.items():
            total_feature_count = sum(values.values())
            if feature not in feature_probabilities[label]:
                feature_probabilities[label][feature] = {}
            for value, count in values.items():
                if value not in feature_probabilities[label][feature]:
                    feature_probabilities[label][feature][value] = count / total_feature_count
                else:
                    feature_probabilities[label][feature][value] += count / total_feature_count

    return feature_probabilities

**Make Predictions**

In [12]:
'''
calculates the probability of each class for a given sample 
'''
def mapper_predict_naive_bayes(class_probabilities, feature_probabilities, sample):
    log_probs = {}

    for label, class_prob in class_probabilities.items():
        log_prob = math.log(class_prob)
        for feature, value in sample.items():
            if feature in feature_probabilities[label] and value in feature_probabilities[label][feature]:
                log_prob += math.log(feature_probabilities[label][feature][value])
            else:
                # Handle unseen feature values
                log_prob += 0
        
        log_probs[label] = log_prob

    return log_probs

In [13]:
'''
selects the class with the highest probability as the predicted class. 
'''
def reducer_predict_naive_bayes(mapped_results):
    predicted_class = None
    max_log_prob = float('-inf')

    for label, log_prob in mapped_results.items():
        if log_prob > max_log_prob:
            max_log_prob = log_prob
            predicted_class = label

    return predicted_class

**Apply Map Reducer**

In [14]:
# Mapping for Prior Probability
mapped_results_prior = mapper_prior_probability(train_scaled_df)
class_probabilities_prior = reducer_prior_probability(mapped_results_prior)

# Mapping for Feature Counts
mapped_results_feature_counts = mapper(train_scaled_df)
final_feature_counts = reducer(mapped_results_feature_counts)

# Apply map-reduce for Training Naive Bayes Classifier (Feature Probabilities)
mapped_results_train_nb = mapper_train_naive_bayes_feature_probabilities(final_feature_counts)
feature_probabilities = reducer_train_naive_bayes_feature_probabilities(mapped_results_train_nb)

# Make predictions
predictions = []
for index, row in test_scaled_df.iterrows():
    sample = row.drop("satisfaction").to_dict()
    log_probs = mapper_predict_naive_bayes(class_probabilities_prior, feature_probabilities, sample)
    predicted_class = reducer_predict_naive_bayes(log_probs)
    predictions.append(predicted_class)


# Evaluate the classifier
accuracy = accuracy_score(y_test, predictions)
f1 = f1_score(y_test, predictions)
precision = precision_score(y_test, predictions)
recall = recall_score(y_test, predictions)
auc_score = roc_auc_score(y_test, predictions)


# Make predictions on the training dataset
train_predictions = []
for index, row in train_scaled_df.iterrows():
    sample = row.drop("satisfaction").to_dict()
    log_probs = mapper_predict_naive_bayes(class_probabilities_prior, feature_probabilities, sample)
    predicted_class = reducer_predict_naive_bayes(log_probs)
    train_predictions.append(predicted_class)

# Calculate training accuracy
train_accuracy = accuracy_score(y_train, train_predictions)
print("Balanced Accuracy: ", auc_score)
print("Training Accuracy:", train_accuracy)
print("Testing Accuracy:", accuracy)
print("f1_score:", f1)
print("precision:", precision)
print("recall:", recall)

Balanced Accuracy:  0.7650422898817919
Training Accuracy: 0.8885220973206036
Testing Accuracy: 0.7626516019436653
f1_score: 0.7399809573271012
precision: 0.7018307199737296
recall: 0.7825171624713959
