In [None]:
import pandas as pd
import numpy as np
import errorlda
import random
import matplotlib.pyplot as plt

# Class means of the true (noise-free) features: m0 is used for class 0, m1 for class 1.
m0 = np.array([0, 1])
m1 = np.array([1, 0])

# Covariance of the noise added around the class mean; the same matrix is used for both classes.
Sigma = np.array([
    [1, 0.5],
    [0.5, 1],
])

# Multiplies each sample's error matrix to give the covariance of the
# measurement noise that is actually added to that sample.
error_scaler = np.diag([0.9, 0.6])

# Number of simulated observations.
n_samples = 10_000

#def get_random_matrix():
#    L = np.zeros(shape=(2,2))
#    L[1,0] = np.random.randn()
#    L[0,0] = np.exp(np.random.randn())
#    L[1,1] = np.exp(np.random.randn())
#
#    return np.matmul(L, L.T) * 0.1

def get_random_diagonal_matrix():
    """Return a random 2x2 diagonal matrix.

    The two diagonal entries are drawn independently from Uniform(0, 2) using
    NumPy's global random state; the off-diagonal entries are zero.

    Returns:
        np.ndarray: float array of shape (2, 2).
    """
    # Draw in a fixed order (top-left first, then bottom-right).
    top_left = np.random.uniform(0, 2)
    bottom_right = np.random.uniform(0, 2)
    return np.diag([top_left, bottom_right])

# Seed both random sources used in this cell -- the stdlib `random` module
# (class labels) and NumPy's global RNG (all noise draws, including the ones
# made inside get_random_diagonal_matrix) -- so that re-running the cell
# regenerates exactly the same dataset and the metrics below are reproducible.
RANDOM_SEED = 0
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# One 2x2 error matrix per sample.
X_errors = np.zeros(shape=(n_samples, 2, 2))
# First two cols are ACTUAL x,y. Next two are with measurement error.
X = np.zeros(shape=(n_samples, 4))
y = np.zeros(shape=n_samples)

for i in range(n_samples):
    # Split between two classes. Uneven distribution: randint(1, 10) is
    # inclusive on both ends, so a sample is class 1 with probability 0.7.
    y[i] = 1 if random.randint(1, 10) <= 7 else 0

    # First, without measurement error: class mean plus noise with covariance Sigma.
    X[i, 0:2] = m1 if y[i] == 1 else m0
    X[i, 0:2] += np.random.multivariate_normal(np.zeros(shape=2), Sigma)

    # Now, with measurement error. The noise covariance is this sample's error
    # matrix multiplied by error_scaler.
    X_errors[i] = get_random_diagonal_matrix()  # Or a completely random matrix if you're not multiplying by a diagonal.
    X[i, 2:4] = X[i, 0:2] + np.random.multivariate_normal(
        np.zeros(shape=2), np.matmul(X_errors[i], error_scaler)
    )

df_X = pd.DataFrame(X, columns=['x1', 'x2', 'x1_err', 'x2_err'])
df_y = pd.Series(y)

# Scatter the observed (noisy) features, one colour per class. Markers are
# semi-transparent so the class drawn first stays visible under the second one,
# and the axes are labelled so the figure can be read on its own.
fig, ax = plt.subplots()
for label in (0, 1):
    in_class = df_y == label
    ax.scatter(df_X.loc[in_class, 'x1_err'], df_X.loc[in_class, 'x2_err'],
               label=label, alpha=0.3, s=10)
ax.set_xlabel('x1_err')
ax.set_ylabel('x2_err')
ax.set_title('Simulated features with measurement error')
ax.legend(title='class');  # trailing ';' suppresses the Legend repr in the cell output

In [None]:
from sklearn.model_selection import train_test_split

# Note that (currently) ErrorLDA expects pretty much everything to be DataFrames and Series.
# X_errors is wrapped in a Series holding one 2x2 matrix per row so that it is
# split with the same row selection as df_X and df_y.
# stratify=df_y keeps the class proportions equal in the train and test parts;
# random_state pins the shuffle so re-running this cell yields the same split
# and the model comparison below stays repeatable.
X_train, X_test, X_train_errors, X_test_errors, y_train, y_test = train_test_split(
    df_X,
    pd.Series(list(X_errors)),
    df_y,
    stratify=df_y,
    random_state=0,
)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, log_loss

# Baseline: logistic regression without a penalty term, trained only on the
# observed (noisy) feature columns.
lr_feature_cols = ['x1_err', 'x2_err']

model = LogisticRegression(penalty=None)
model.fit(X_train[lr_feature_cols], y_train)

# Score on the held-out rows using the same two columns.
lr_test_features = X_test[lr_feature_cols]
lr_accuracy = accuracy_score(y_test, model.predict(lr_test_features))
lr_log_loss = log_loss(y_test, model.predict_proba(lr_test_features))

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import accuracy_score, log_loss

# Baseline: ordinary LDA on the observed (noisy) feature columns.
lda_feature_cols = ['x1_err', 'x2_err']

# The 'eigen' solver is chosen because it computes the covariance matrix
# explicitly, which is printed below.
model = LinearDiscriminantAnalysis(solver='eigen')
model.fit(X_train[lda_feature_cols], y_train)

# Each call prints a heading, the fitted value, and a trailing blank line.
print("Estimated means:", model.means_, "", sep="\n")
print("Estimated covariance:", model.covariance_, "", sep="\n")

lda_test_features = X_test[lda_feature_cols]
lda_accuracy = accuracy_score(y_test, model.predict(lda_test_features))
lda_log_loss = log_loss(y_test, model.predict_proba(lda_test_features))

In [None]:
import errorlda
import importlib

# Re-import errorlda so edits made to the module since it was first imported take effect.
importlib.reload(errorlda)

errorlda_feature_cols = ['x1_err', 'x2_err']

# Fit on the observed features together with the per-sample error matrices.
# NOTE(review): error_scaling=True presumably asks ErrorLDA to estimate the
# error scaler itself -- confirm against errorlda.
model = errorlda.ErrorLDA()
model.fit(
    X_train[errorlda_feature_cols],
    y_train,
    X_train_errors=X_train_errors,
    error_scaling=True,
)

print()
print("Estimated means:")
for c in model.outcomes:
    print(model.means[c])

y_prob = model.predict_proba(X_test[errorlda_feature_cols], X_error=X_test_errors)
# Predict class 1 whenever its probability reaches 0.5.
# NOTE(review): assumes y_prob can be indexed by the class label 1.0 -- confirm.
class_one_is_likelier = y_prob[1.0] >= 0.5
y_pred = class_one_is_likelier.astype(float)

errorlda_accuracy = accuracy_score(y_test, y_pred)
errorlda_log_loss = log_loss(y_test, y_prob)

In [None]:
# array([0.88411331, 0.16207463])
# NOTE(review): the line above is presumably the scaler from an earlier fit -- confirm.
#
# Replace the model's scaler with the one the data were simulated with. The
# simulation constant `error_scaler` is reused (as a copy) instead of repeating
# its literal values, so this check cannot drift out of sync if the simulation
# settings are changed, and the model cannot modify the constant.
model.error_scaler = error_scaler.copy()

y_prob = model.predict_proba(X_test[['x1_err', 'x2_err']], X_error=X_test_errors)
# Predict class 1 whenever its probability reaches 0.5.
y_pred = (y_prob[1.0] >= 0.5).astype(float)

# Displayed as (accuracy, log loss) for the model with the overridden scaler.
accuracy_score(y_test, y_pred), log_loss(y_test, y_prob)

In [None]:
# Summary table: one column per model, one row per test-set metric.
results_df = pd.DataFrame(
    {
        'LR': [lr_accuracy, lr_log_loss],
        'LDA': [lda_accuracy, lda_log_loss],
        'ErrorLDA': [errorlda_accuracy, errorlda_log_loss],
    },
    index=['accuracy', 'log_loss'],
)

results_df