# In [1]:
import numpy as np
import pandas as pd
import math

# Read the training and test sets (training file first, then test file)
train_data, test_data = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

# Column names used by the test set, paired with the matching training-set columns
feature_mapping = dict(
    zip(
        ('Verb', 'Noun', 'Prep', 'Prep_obj'),
        ('Var1', 'Var2', 'Var3', 'Var4'),
    )
)

# Function to compute conditional probabilities using MLE with add-one (Laplace) smoothing
def compute_probabilities(data, class_label):
    """Estimate smoothed conditional probabilities P(value | label) per feature.

    Every column of ``data`` other than ``class_label`` is treated as a
    categorical feature, so the label column may sit at any position in the
    frame (it does not have to be the last column).

    Args:
        data: pandas DataFrame holding the feature columns and the label column.
        class_label: Name of the column in ``data`` that holds the class label.

    Returns:
        Nested dict ``probabilities[feature][label][value]`` whose entries are
        ``(count(value, label) + 1) / (count(label) + V)``, where ``V`` is the
        number of distinct values the feature takes anywhere in ``data``.
    """
    probabilities = {}
    labels = data[class_label].unique()
    # One boolean row mask per class, built once and reused for every feature.
    rows_by_label = {label: data[class_label] == label for label in labels}

    for feature in data.columns:
        if feature == class_label:
            # Exclude the label by name rather than by position.
            continue
        feature_values = data[feature].unique()
        vocabulary_size = len(feature_values)
        probabilities[feature] = {}
        for label in labels:
            values_for_label = data.loc[rows_by_label[label], feature]
            # A single counting pass replaces re-filtering the frame per value.
            counts = values_for_label.value_counts().to_dict()
            denominator = len(values_for_label) + vocabulary_size
            # Values never seen with this label still get the +1 pseudo-count.
            probabilities[feature][label] = {
                value: (counts.get(value, 0) + 1) / denominator
                for value in feature_values
            }
    return probabilities

# Fit the probability tables on the training set, then persist them to disk
model = compute_probabilities(data=train_data, class_label='Var5')
np.save(file='model.npy', arr=model)

# Function to compute corpus cross entropy
def compute_cross_entropy(instance, probabilities, label, feature_mapping):
    """Return the negative log2-likelihood of ``instance`` under ``label``.

    For each ``(test_feature, train_feature)`` pair in ``feature_mapping`` the
    instance's value is looked up in ``probabilities[train_feature][label]``
    and ``-log2`` of that probability is accumulated. No class prior is
    included in the score.

    Args:
        instance: Mapping-like row (e.g. a pandas Series) indexed by the
            test-side feature names.
        probabilities: Nested dict ``probabilities[feature][label][value]``.
            It is only read, never modified.
        label: Class label whose probability tables are used.
        feature_mapping: Dict mapping test-side feature names to the feature
            names used as keys of ``probabilities``.

    Returns:
        The summed ``-log2`` probability over all mapped features; lower means
        the instance is more likely under ``label``.
    """
    entropy = 0
    for test_feature, train_feature in feature_mapping.items():
        value_probabilities = probabilities[train_feature][label]
        attribute_value = instance[test_feature]
        probability = value_probabilities.get(attribute_value)
        if probability is None:
            # Unseen value: fall back to 1 / (known values + 1). The fallback
            # is deliberately NOT stored in the model, so scoring leaves the
            # shared tables untouched and is independent of evaluation order.
            probability = 1 / (len(value_probabilities) + 1)
        entropy -= math.log2(probability)
    return entropy

# Load model from file
# NOTE: allow_pickle=True unpickles arbitrary Python objects; only load model
# files that were produced by this script, never files from an untrusted source.
loaded_model = np.load('model.npy', allow_pickle=True).item()

# Testing procedures
accurate_predictions = 0
for _, instance in test_data.iterrows():
    entropies = {}
    for label in ['V', 'N']:  # Class labels
        entropies[label] = compute_cross_entropy(instance, loaded_model, label, feature_mapping)
    # The label with the lowest cross entropy wins; on a tie min() keeps the
    # first key inserted, i.e. 'V'.
    assigned_label = min(entropies, key=entropies.get)
    if assigned_label == instance['Class_label']:
        accurate_predictions += 1

# Calculate accuracy
total_instances = len(test_data)
# Accuracy is undefined for an empty test set; report NaN instead of raising
# ZeroDivisionError.
accuracy = accurate_predictions / total_instances if total_instances else float('nan')
print("Accuracy:", accuracy*100)

# Output: Accuracy: 78.60000000000001
