The idea is to model the probability distributions of the significant features and use a bayesian approach for classification.
To model the probability distributions we'll use:
 - A multivariate 2 component gaussian mixture model for " Sensor_beta", "Sensor_gamma" and "Sensor_alpha_plus".
 - A piecewise constant distribution for "hour".

The 2 distributions will be considered independent

In [1]:
import pandas as pd
import joblib

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.mixture import GaussianMixture

from piecewise_constant import OuterInnerConstantDistribution
from bayes_classifier import BayesClassifier
from joint_distribution import JointDistribution

In [2]:
data_path = "train.csv"
data = pd.read_csv(data_path)

X = data
y = X.pop("Insect")

X_train, X_val, y_train, y_val = train_test_split(X, y)
print(f"train samples: {len(X_train)}, validation samples: {len(X_val)}")

train samples: 5250, validation samples: 1751


In [3]:
class InsectClassifier(BayesClassifier):
    def __init__(self):
        time_models = [
            OuterInnerConstantDistribution(1, 24, 18, 23),
            OuterInnerConstantDistribution(1, 24, 21, 22),
            OuterInnerConstantDistribution(1, 24, 18, 24)
        ]

        sensor_features = ["Sensor_beta", "Sensor_gamma", "Sensor_alpha_plus"]
        sensor_models = [
            GaussianMixture(
                n_components=2,
                n_init=30,
                tol=1e-4,
            )
        for _ in range(3)]

        likelihood_models = []
        for insect in (0, 1, 2):
            joint_likelihood_model = JointDistribution()
            joint_likelihood_model.add_model(time_models[insect], ["Hour"])
            joint_likelihood_model.add_model(sensor_models[insect], sensor_features)
            likelihood_models.append(joint_likelihood_model)

        super().__init__(likelihood_models)

In [4]:
def evaluate_classifier(model, X, y):
    pred = model.predict(X)
    print(classification_report(pred, y))

In [5]:
classifier = InsectClassifier()
classifier.fit(X_train, y_train)

evaluate_classifier(classifier, X_val, y_val)

              precision    recall  f1-score   support

           0       0.95      0.93      0.94       904
           1       0.98      0.95      0.96       702
           2       0.71      0.88      0.78       145

    accuracy                           0.94      1751
   macro avg       0.88      0.92      0.90      1751
weighted avg       0.94      0.94      0.94      1751



Train the classifier on all the available data and save to disk

In [7]:
final_classifier = InsectClassifier()
final_classifier.fit(X, y)

evaluate_classifier(final_classifier, X, y)

              precision    recall  f1-score   support

           0       0.96      0.94      0.95      3603
           1       0.98      0.96      0.97      2852
           2       0.71      0.89      0.79       546

    accuracy                           0.94      7001
   macro avg       0.88      0.93      0.90      7001
weighted avg       0.95      0.94      0.94      7001



In [8]:
joblib.dump(final_classifier, "classifier.joblib")

['classifier.joblib']