In [61]:
import sys

# Add the parent directory to the import search path so that modules living
# one level above the working directory can be imported by the cells below.
# The path is relative, so it resolves against the current working directory.
sys.path.append("..")

In [62]:
from model_definitions import LDAPolynomial
from dataset.utils import get_cached_data_with_test

In [63]:
# 0-based column positions of the features the model is trained on.
i = [101, 102, 103, 105]

# Load the cached full training features, training target and full test
# features from the artifacts directory.
x_full, y, x_test_full = get_cached_data_with_test("../artifacts")

# Keep only the selected columns, identically for train and test.
x_train, x_test = x_full[:, i], x_test_full[:, i]

In [70]:
from abc import ABC, abstractmethod
from sklearn.naive_bayes import GaussianNB
import numpy as np

from metrics import calculate_gain


class BaseModel(ABC):
    """
    Abstract model interface: features are passed through
    ``preprocess_features`` and then handed to the subclass hooks ``_fit`` /
    ``_predict``. Subclasses must implement ``__init__``, ``_fit`` and
    ``_predict``.
    """

    name: str

    @abstractmethod
    def __init__(self, *args, **kwargs):
        """
        Set up the model. Every concrete subclass has to override this.
        """

    def preprocess_features(self, x: np.ndarray, is_train: bool) -> np.ndarray:
        """
        Hook applied to the features before fitting and before predicting.
        The base implementation hands ``x`` back untouched.
        :param x: features
        :param is_train: True when called from ``fit``, False when called
            from ``predict``
        :return: features to pass on to ``_fit`` / ``_predict``
        """
        return x

    @abstractmethod
    def _fit(self, x: np.ndarray, y: np.ndarray) -> None:
        """
        Train on already preprocessed features.
        :param x: features - n x m array, where n is the number of samples
        :param y: target - n element array
        """

    def fit(self, x: np.ndarray, y: np.ndarray) -> None:
        """
        Preprocess the features in training mode and train the model on them.
        :param x: features
        :param y: target
        """
        self._fit(self.preprocess_features(x, is_train=True), y)

    @abstractmethod
    def _predict(self, x: np.ndarray) -> np.ndarray:
        """
        Score already preprocessed features. Implementations should return a
        1D array holding the probability of the positive class per sample.
        :param x: features
        :return: target
        """

    def predict(self, x: np.ndarray) -> np.ndarray:
        """
        Preprocess the features in prediction mode and score them.
        :param x: features
        :return: target
        """
        return self._predict(self.preprocess_features(x, is_train=False))

    def calculate_gain(self, x: np.ndarray, y: np.ndarray) -> float:
        """
        Predict on ``x`` and evaluate the module-level ``calculate_gain``
        against the ground truth.
        :param x: features
        :param y: target
        :return: gain
        """
        scores = self.predict(x)
        # The number of feature columns of the raw (unprocessed) input is
        # forwarded as the third argument of the gain function.
        n_features = x.shape[1]

        return calculate_gain(y, scores, n_features)


class NaiveBayes(BaseModel):
    """
    Gaussian naive Bayes classifier exposed through the BaseModel interface.
    """

    name = "NaiveBayes"

    def __init__(self, var_smoothing: float = 1e-9):
        """
        Create the underlying ``GaussianNB`` estimator.
        :param var_smoothing: forwarded unchanged to ``GaussianNB``
        """
        self.model = GaussianNB(var_smoothing=var_smoothing)

    def _fit(self, x: np.ndarray, y: np.ndarray) -> None:
        """
        Fit the wrapped estimator.
        :param x: features
        :param y: target
        """
        self.model.fit(x, y)

    def _predict(self, x: np.ndarray) -> np.ndarray:
        """
        Return one probability per sample.
        :param x: features
        :return: second column of the estimator's class-probability matrix
        """
        class_probabilities = self.model.predict_proba(x)
        # Column 1 is used as the positive-class probability; this assumes
        # binary labels with the positive class sorted second - TODO confirm.
        return class_probabilities[:, 1]


# Train the naive Bayes wrapper (default var_smoothing) on the selected
# feature columns of the training data.
model = NaiveBayes()
model.fit(x_train, y)

# Score the test rows; NaiveBayes returns column 1 of predict_proba per row.
predictions = model.predict(x_test)

In [71]:
predictions

array([0.30133582, 0.57438052, 0.398232  , ..., 0.61794736, 0.34423665,
       0.34038532])

In [74]:
import pandas as pd
import numpy as np

# Sort the test rows by predicted score (ascending) and keep the last 1000,
# i.e. the highest-scored ones; +1 turns 0-based positions into 1-based
# observation numbers.
ranking = np.argsort(predictions)
top_indices = ranking[-1000:] + 1

# Prefix shared by both output files.
student_id = "335724NB"  # replace with the actual student ID

# Selected observations: one number per line, no index column, no header.
obs_frame = pd.DataFrame(top_indices)
obs_frame.to_csv(f"{student_id}_obs.txt", index=False, header=False)

# Feature columns used for training, likewise shifted to 1-based numbering.
vars_frame = pd.DataFrame({"FeatureIndex": np.array(i) + 1})
vars_frame.to_csv(f"{student_id}_vars.txt", index=False, header=False)

In [75]:
def count_common_elements(file1, file2):
    """
    Count the distinct whitespace-separated tokens that appear in both files.
    Tokens are compared as strings, exactly as they are written in the files.
    :param file1: path to the first text file
    :param file2: path to the second text file
    :return: number of tokens present in both files
    """

    def _distinct_tokens(path):
        # Repeated tokens within one file collapse into a single set entry.
        with open(path, "r") as handle:
            return set(handle.read().split())

    first_tokens = _distinct_tokens(file1)
    second_tokens = _distinct_tokens(file2)
    return len(first_tokens & second_tokens)


# Compare two previously saved observation lists and report how many
# observation numbers they share.
# NOTE(review): the export cell of this notebook writes "335724NB_obs.txt",
# which is neither of the two files compared here - confirm this pair is the
# intended one.
file1 = "335724_obs.txt"
file2 = "335724SVM_obs.txt"
print("Number of common elements:", count_common_elements(file1, file2))

Number of common elements: 922
