### Proof of concept: we should be able to build an almost perfect prime model on modular features 
**modular feature**: is a number divisible by a given prime? <br>
The target (whether an integer is or isn't prime) is almost a simple linear combination of these features

In [None]:
# imports
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd


# sklearn imports
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegressionCV

from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import ConfusionMatrixDisplay

from sklearn.preprocessing import MinMaxScaler


In [None]:
# params
# exclusive upper bound of the natural numbers studied; also selects the prime file that is loaded
prime_lim = 500_000

#### Build Core Dataset

In [None]:
# read in prime numbers from the artifact file keyed by `prime_lim`
# (presumably all primes below prime_lim -- verify against the script that wrote the file)
primes_path = f'../../artifacts/primes/prime_{prime_lim}.npy'
primes = np.load(primes_path)
primes[:100]

In [None]:
# convert to natural numbers with binary target:
# target[k] is True exactly when k appears in `primes`
natural_numbers = np.arange(prime_lim)
target = np.full(prime_lim, False)
target[primes] = True

In [None]:
# core dataset: one row per natural number n >= 2 (0 and 1 are skipped) with its prime flag y
from_two = slice(2, None)
data = pd.DataFrame({'n': natural_numbers[from_two], 'y': target[from_two]})
data.head()

### A: try out small data sets with modular features
--> if we can actually almost guarantee a modular signal per prime in training, the models should show almost perfect performance

models: 
- prime cutoff 10000, and the lower 100 primes are used for features --> converges
- prime cutoff 100000, and the lower 100 primes are used for features --> converges
- prime cutoff 500000, and the lower 100 primes are used for features --> converges

In [None]:
model_dict = {
    10000: {},
    100000: {},
    prime_lim: {},
} # prime_cutoff as key for models

n_modular_features = 100 # not all features
target_col = 'y'

# the same leading primes define the modular features for every cutoff
feature_primes = primes[:n_modular_features]
feature_col = [f"mod_{str(prime)}" for prime in feature_primes]

# guard against target leakage instead of only printing the check
assert target_col not in feature_col, "target column must not be used as a feature"


for prime_cutoff in model_dict.keys():
    print(prime_cutoff,'\n')

    data_a = data[data['n']<prime_cutoff].copy()
    print(data_a.shape)

    # create modular features: mod_p is 1 when p divides n and n is not p itself.
    # One vectorised numpy expression per prime replaces a Python lambda call
    # per (number, prime) pair.
    n_values = data_a['n'].to_numpy()
    features = np.column_stack([
        ((n_values % prime == 0) & (n_values != prime)).astype(int)
        for prime in feature_primes
    ])

    # build the feature frame on data_a's own index so that the column-wise
    # concat aligns row by row even if `data` ever gets a non-default index
    feature_frame = pd.DataFrame(features, columns=feature_col, index=data_a.index)
    data_a = pd.concat([data_a, feature_frame], axis=1)

    print(data_a.head())

    # split in train & test
    X, y = data_a[feature_col], data_a[target_col]

    X_train, X_test, y_train, y_test  = train_test_split(X, y, test_size=0.33, random_state=42)

    # train logistic regression as start
    # lbfgs solver, l2 penalty
    clf = LogisticRegressionCV(cv=10, random_state=0, max_iter=500).fit(X_train, y_train)

    # store models and data (copies, so later cells cannot alter what was used for training)
    model_dict[prime_cutoff]['data'] = data_a.copy()
    model_dict[prime_cutoff]['model'] = clf

    model_dict[prime_cutoff]['X_train'] = X_train.copy()
    model_dict[prime_cutoff]['X_test'] = X_test.copy()
    model_dict[prime_cutoff]['y_train'] = y_train.copy()
    model_dict[prime_cutoff]['y_test'] = y_test.copy()

    print('Training completed')


In [None]:
# create predictions for evaluation of models:
# each model predicts on its own held-out test split and the result is stored next to it
for curmod in model_dict.values():
    fitted_model = curmod['model']
    curmod['y_pred'] = fitted_model.predict(curmod['X_test'])


#### Check overall performance of models

In [None]:
# print confusion matrix and per-class metrics for every cutoff
for prime_cutoff, curmod in model_dict.items():
    print(f'model with prime cutoff {prime_cutoff}')
    y_true, y_hat = curmod['y_test'], curmod['y_pred']
    print('confusion matrix \n', confusion_matrix(y_true, y_hat), '\n')

    print(classification_report(y_true, y_hat))


very few misclassifications in every model

#### Confusion matrix depending on signal in modular features for models
**When do we have misclassifications?** <br>
-> all false positives (not prime, but predicted as prime) should have no signal in modular features (i.e. not a single modular feature = 1) <br>
-> all false negatives (prime, but not predicted as prime) cannot have any modular signal (as they are prime) -> so how does this misclassification happen? <br>

<br> 
- ideally, the model would perfectly learn that no modular signal = prime -> that would eliminate all false negatives <br>
- introducing other features than just modular features would help to reduce the false positives

In [None]:
def plot_confusion_by_modular_signal(curmod, prime_cutoff):
    """Plot two confusion matrices for one model, split by modular signal.

    The left panel covers test rows where at least one modular feature is 1,
    the right panel covers test rows where all modular features are 0.

    Parameters
    ----------
    curmod : dict
        Entry of ``model_dict``; reads the keys 'model', 'X_test', 'y_test'
        and 'y_pred'. The mask is stored under the new key 'any_mod'.
    prime_cutoff : int
        Cutoff used as key in ``model_dict``; only used for the figure title.
    """
    model = curmod['model']
    mod_features = model.feature_names_in_

    # superposition of modular features, kept as a separate entry: writing it
    # into X_test as a column would make X_test disagree with the features the
    # model was fitted on and break a re-run of the prediction cell
    any_mod_series = curmod['X_test'][mod_features].sum(axis=1) > 0
    curmod['any_mod'] = any_mod_series
    any_mod = any_mod_series.to_numpy()

    # plain arrays so that both masks are applied positionally
    y_true = curmod['y_test'].to_numpy()
    y_hat = np.asarray(curmod['y_pred'])

    fig, ax = plt.subplots(1,2, figsize=(9, 3.5))

    panels = [
        (any_mod, 'Modular features = 1'),   # confusion matrix with any modular features
        (~any_mod, 'Modular features = 0'),  # confusion matrix without any modular features
    ]
    for panel_ax, (mask, title) in zip(ax, panels):
        # labels= keeps the matrix 2x2 even if a class is missing in the subset
        cm_subset = confusion_matrix(y_true[mask], y_hat[mask], labels=model.classes_)
        disp = ConfusionMatrixDisplay(confusion_matrix=cm_subset,
                                      display_labels=model.classes_)
        disp.plot(ax=panel_ax)
        panel_ax.set_title(title, size=10)

    plt.suptitle(f"Model with prime cutoff {prime_cutoff}")
    plt.subplots_adjust(wspace=0.3, hspace=0.3)
    plt.show()


for prime_cutoff in model_dict.keys():
    plot_confusion_by_modular_signal(model_dict[prime_cutoff], prime_cutoff)


- first model has perfect classification
- second model classifies some primes with instead of modular features, but no modular feature = prime (and is true in test set)
- third model correctly classifies any number with any modular signal as "not prime" but misclassifies all which are prime although there is no modular signal
  --> we now have to find other features which might help with this false positive group

### B: lets try to move away from modular features
- modular features are trivial, because if we provide them all, the recognition of "prime / no prime" is a simple linear superposition <br>
- let's try to find other features and reduce modular features

models:
- prime cutoff 500000, and the lower 50 primes are used for features, no other features -> converges
- prime cutoff 500000, and the lower 50 primes are used for features, some other normalized features added
  -> converges
  -> do the new features help AT ALL in reducing false positives?


In [None]:
n_modular_features = 50 # not all features
target_col = 'y'

# filter on the configured limit explicitly; the original relied on the loop
# variable `prime_cutoff` still holding its last value from the section-A cells
data_b = data[data['n']<prime_lim].copy()

In [None]:
# create modular features: mod_p is 1 when p divides n and n is not p itself.
# One vectorised numpy expression per prime replaces a Python lambda call
# per (number, prime) pair.
feature_primes = primes[:n_modular_features]
n_values = data_b['n'].to_numpy()
features = np.column_stack([
    ((n_values % prime == 0) & (n_values != prime)).astype(int)
    for prime in feature_primes
])
feature_col = [f"mod_{str(prime)}" for prime in feature_primes]

# build the feature frame on data_b's own index so that the column-wise concat
# aligns row by row even if `data` ever gets a non-default index
data_b = pd.concat([data_b, pd.DataFrame(features, columns=feature_col, index=data_b.index)], axis=1)

In [None]:
# non-modular features

# int64 so that n**2 cannot overflow a 32-bit platform integer
n_values = data_b['n'].to_numpy(dtype=np.int64)

data_b['n+1'] = n_values + 1
data_b['n-1'] = n_values - 1
data_b['2n'] = n_values * 2
data_b['n**2'] = n_values ** 2
data_b['n%2'] = n_values % 2 # this might be too strong as an indicator?

# distance to last prime?
# number of primes before this number
# dividing current number by last prime?
# what is last prime?

# One binary search per row replaces filtering the whole prime array per row.
# With side='left' the insertion position equals the number of primes strictly
# below n, and the element just before that position is the last prime below n.
sorted_primes = np.sort(primes)
n_smaller_primes = np.searchsorted(sorted_primes, n_values, side='left')
has_smaller_prime = n_smaller_primes > 0

# rows without a smaller prime (n == 2) keep the sentinels used before:
# -1 for last_prime / ratio / distance and 0 for the count
last_prime = np.where(
    has_smaller_prime,
    sorted_primes[np.maximum(n_smaller_primes - 1, 0)],  # clamp so the lookup is valid for n == 2
    -1,
)

data_b['last_prime'] = last_prime
data_b['primes_lower_n'] = n_smaller_primes
data_b['n_div_last_prime'] = np.where(has_smaller_prime, n_values / last_prime, -1)
data_b['n_minus_last_prime'] = np.where(has_smaller_prime, n_values - last_prime, -1)


In [None]:
# every column except the target is used as a model input
all_columns = data_b.columns
feature_col = all_columns.drop(target_col)

# sanity output: shows whether the target is still among the features
print(target_col in feature_col)


In [None]:
# MinMaxScaler is imported in the import cell at the top of the notebook

# rescale every feature column to [0, 1]; data_b is overwritten with the scaled values
# NOTE(review): the scaler is fitted on all rows before the train/test split below,
# so the test rows influence the min/max used for scaling -- consider fitting it on
# the training split only
scaler = MinMaxScaler()
data_b[feature_col] = scaler.fit_transform(data_b[feature_col])

In [None]:
# split in train & test
X = data_b[feature_col]
y = data_b[target_col]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# train logistic regression as start
# lbfgs solver, l2 penalty
clf = LogisticRegressionCV(cv=10, random_state=0, max_iter=500)
clf.fit(X_train, y_train)

In [None]:
# predict on the held-out split and tabulate true vs. predicted classes
y_pred = clf.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

In [None]:
# coefs != feature importance...
# the coefficients live on the fitted classifier `clf`; `cm` is the confusion-matrix
# array and has no `coef_` attribute. Shown per feature name for readability.
pd.Series(clf.coef_.ravel(), index=clf.feature_names_in_, name='coef')