In [2]:
import numpy as np


def calculate_gain(
    ground_truth: np.ndarray,
    predictions: np.ndarray,
    features_count: int,
    top_fraction: float = 0.2,
    reward_scale: float = 10_000,
    feature_cost: float = 200,
) -> float:
    """
    Calculate the gain of the predictions compared to the ground truth.

    The samples with the highest predictions are selected (a ``top_fraction``
    share of all samples), the mean of their ground-truth values is scaled by
    ``reward_scale``, and ``feature_cost`` is subtracted for every feature used.

    :param ground_truth: true target values, one per sample
    :param predictions: predicted scores, one per sample (higher = selected first)
    :param features_count: number of features charged against the gain
    :param top_fraction: share of samples to select, in (0, 1]
    :param reward_scale: multiplier applied to the hit rate among selected samples
    :param feature_cost: cost subtracted per feature
    :return: gain as a plain Python float
    :raises ValueError: if ``top_fraction`` is outside (0, 1] or if it selects
        no samples (which would otherwise divide by zero and yield NaN)
    """
    if not 0 < top_fraction <= 1:
        raise ValueError(f"top_fraction must be in (0, 1], got {top_fraction}")

    preds_to_pick = int(ground_truth.shape[0] * top_fraction)
    if preds_to_pick < 1:
        # Without this guard the hit rate below is 0 / 0, i.e. a silent NaN.
        raise ValueError(
            f"top_fraction={top_fraction} selects no samples out of "
            f"{ground_truth.shape[0]}; cannot compute a gain"
        )

    # argsort is ascending, so reverse it to get the highest predictions first.
    sorted_preds = np.argsort(predictions)[::-1][:preds_to_pick]
    hit_rate = np.sum(ground_truth[sorted_preds]) / preds_to_pick
    return float(hit_rate * reward_scale - features_count * feature_cost)


In [3]:
from abc import ABC, abstractmethod

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.preprocessing import PolynomialFeatures, SplineTransformer
from sklearn.svm import SVC
from sklearn.linear_model import LinearRegression


class BaseModel(ABC):
    """
    Abstract interface shared by the models in this notebook.

    Concrete subclasses provide a constructor, ``_fit`` and ``_predict``. The
    public ``fit`` and ``predict`` wrappers run ``preprocess_features`` on the
    input before delegating to them.
    """

    # Model identifier; concrete subclasses assign a value.
    name: str

    @abstractmethod
    def __init__(self, *args, **kwargs):
        """Initialise the model. Must be overridden by every subclass."""

    def preprocess_features(self, x: np.ndarray, is_train: bool) -> np.ndarray:
        """
        Hook applied to the features before fitting and before predicting.
        The base implementation returns ``x`` unchanged.
        :param x: features
        :param is_train: True when invoked from ``fit``, False from ``predict``
        :return: features passed on to ``_fit`` / ``_predict``
        """
        return x

    @abstractmethod
    def _fit(self, x: np.ndarray, y: np.ndarray) -> None:
        """
        Train the model on already preprocessed features.
        :param x: features - n x m array, where n is the number of samples
        :param y: target - n element array
        """

    def fit(self, x: np.ndarray, y: np.ndarray) -> None:
        """
        Preprocess the features in training mode, then fit the model.
        :param x: features
        :param y: target
        """
        self._fit(self.preprocess_features(x, is_train=True), y)

    @abstractmethod
    def _predict(self, x: np.ndarray) -> np.ndarray:
        """
        Score already preprocessed features. Implementations must return a 1D
        array with the probability of the positive class for each sample.
        :param x: features
        :return: target
        """

    def predict(self, x: np.ndarray) -> np.ndarray:
        """
        Preprocess the features in inference mode, then predict.
        :param x: features
        :return: target
        """
        features = self.preprocess_features(x, is_train=False)
        return self._predict(features)

    def calculate_gain(self, x: np.ndarray, y: np.ndarray) -> float:
        """
        Predict on ``x`` and evaluate the module-level ``calculate_gain``
        against ``y``. The feature count passed on is the number of columns
        of the raw ``x`` (before preprocessing).
        :param x: features
        :param y: target
        :return: gain
        """
        return calculate_gain(y, self.predict(x), x.shape[1])

class LDA(BaseModel):
    """``BaseModel`` wrapper around scikit-learn's ``LinearDiscriminantAnalysis``."""

    name = "LDA"

    def __init__(self):
        """Create the wrapped estimator with its default settings."""
        self.model = LinearDiscriminantAnalysis()

    def _fit(self, x: np.ndarray, y: np.ndarray) -> None:
        """Fit the wrapped estimator on the given features and target."""
        self.model.fit(x, y)

    def _predict(self, x: np.ndarray) -> np.ndarray:
        """
        Return column 1 of the estimator's ``predict_proba`` output for each
        sample (assumes a binary target whose positive class is listed second
        - TODO confirm against the label encoding).
        """
        class_probabilities = self.model.predict_proba(x)
        return class_probabilities[:, 1]


class LDAPolynomial(LDA):
    """LDA fitted on a ``PolynomialFeatures`` expansion of the input features."""

    name = "LDAPolynomial"

    def __init__(self, degree: int = 2, interactions_only: bool = False):
        """
        :param degree: passed to ``PolynomialFeatures`` as ``degree``
        :param interactions_only: passed to ``PolynomialFeatures`` as
            ``interaction_only``
        """
        super().__init__()
        self.poly = PolynomialFeatures(
            interaction_only=interactions_only, degree=degree
        )

    def preprocess_features(self, x: np.ndarray, is_train: bool) -> np.ndarray:
        """
        Expand the features with the polynomial transformer. In training mode
        the transformer is (re)fitted on ``x`` first; otherwise the previously
        fitted transformer is reused.
        """
        if not is_train:
            return self.poly.transform(x)
        return self.poly.fit_transform(x)

In [4]:
import pandas as pd
# Load training features/labels and test features from header-less text files.
# The feature files are whitespace-separated; sep=r'\s+' is the documented
# replacement for the deprecated delim_whitespace=True flag.
x_train = pd.read_csv('x_train.txt', header=None, sep=r'\s+').values
y_train = pd.read_csv('y_train.txt', header=None)[0].values
x_test = pd.read_csv('x_test.txt', header=None, sep=r'\s+').values

# Fail early on inconsistent inputs rather than deep inside the estimator.
assert x_train.shape[0] == y_train.shape[0], "x_train / y_train row count mismatch"
assert x_train.shape[1] == x_test.shape[1], "x_train / x_test column count mismatch"

# Fit LDA on a full degree-2 polynomial expansion of the features.
# The explicit preprocess + _fit / _predict calls mirror what model.fit() and
# model.predict() do; they are kept so the expanded matrices stay available.
model = LDAPolynomial(degree=2, interactions_only=False)
x_train_poly = model.preprocess_features(x_train, is_train=True)
model._fit(x_train_poly, y_train)

# Reuse the transformer fitted on the training data for the test features.
x_test_poly = model.preprocess_features(x_test, is_train=False)
predictions = model._predict(x_test_poly)

In [None]:
# Keep the 1000 test observations with the highest predicted scores.
# argsort is ascending, so the largest predictions sit at the tail.
ascending_order = np.argsort(predictions)
top_indices = ascending_order[-1000:]

# Both output files are prefixed with the student ID.
student_id = "335724999"  # replace with the actual student ID

# One selected observation index per line, no header and no row index.
obs_frame = pd.DataFrame(top_indices)
obs_frame.to_csv(f'{student_id}_obs.txt', index=False, header=False)

# One feature index per line, no header and no row index.
# NOTE(review): this list is hard-coded, while the model in cell In [4] is
# fitted on every column of x_train - confirm it matches the features used.
vars_frame = pd.DataFrame({'FeatureIndex': [101, 102, 103, 105]})
vars_frame.to_csv(f'{student_id}_vars.txt', index=False, header=False)