### Is it possible to build a classification model for detecting prime numbers?

In [None]:
# imports
import numpy as np
#import matplotlib as plt
import pandas as pd

In [None]:
# params
# upper bound of the number range; also selects which prime file is loaded below
prime_lim = 500_000

In [None]:
# read in prime numbers
# the .npy file name is derived from the configured limit `prime_lim`
primes_file = f'../../artifacts/primes/prime_{prime_lim}.npy'
primes = np.load(primes_file)
primes[:100]  # peek at the first 100 entries

In [None]:
# convert to natural numbers with binary target
# target[i] is True exactly for the indices i listed in `primes`;
# natural_numbers holds 0 .. prime_lim - 1 so both arrays line up by position
target = np.zeros(shape=prime_lim, dtype=bool)
target[primes] = True
natural_numbers = np.arange(prime_lim)

In [None]:
# one row per natural number n >= 2 (0 and 1 are sliced off), with its prime flag y
data = pd.DataFrame({
    'n': natural_numbers[2:],
    'y': target[2:],
})

### Features

What kind of features could we have?
- `n+1`, `n-1`, ... (this can be extended a lot)
- `2n`
- `n**2`
- `n/2` (or the remainder `n%2`)

Right now, the idea is to keep every row independent of the others
-> the model is not supposed to actually compute the prime numbers (although it would be interesting to know whether it could)

In [None]:
# Simple arithmetic features derived from n.
# Computed with vectorized column arithmetic instead of a per-element Python
# lambda: same values, but a single array operation per column.
# The explicit int64 cast keeps n**2 from overflowing in case `n` is stored
# as a 32-bit integer (the largest square here is about 2.5e11).
n_values = data['n'].astype('int64')
data['n+1'] = n_values + 1
data['n-1'] = n_values - 1
data['2n'] = n_values * 2
data['n**2'] = n_values ** 2
data['n%2'] = n_values % 2 # this might be too strong as an indicator?

In [None]:
# distance to last prime?
# number of primes before this number
# dividing current number by last prime?
# what is last prime?

# All four features are derived from a single binary search of n in the sorted
# prime array, instead of re-filtering `primes` once per row and then running
# two row-wise DataFrame.apply passes.
n_values = data['n'].to_numpy(dtype=np.int64)
# sort defensively so searchsorted is valid regardless of the on-disk order;
# int64 so the -1 sentinel below cannot clash with an unsigned dtype
primes_sorted = np.sort(np.asarray(primes, dtype=np.int64))

# side='left' -> insertion index == number of primes strictly smaller than n
n_lower = np.searchsorted(primes_sorted, n_values, side='left')
# False only where no smaller prime exists (n == 2 in this data set);
# those rows get the sentinel values -1 / 0 / -1 / -1, as before
has_lower = n_lower > 0

# np.maximum(..., 0) only keeps the index valid for rows that are masked out anyway
last_prime = np.where(has_lower, primes_sorted[np.maximum(n_lower - 1, 0)], -1)

data['last_prime'] = last_prime
data['primes_lower_n'] = n_lower
# last_prime is never 0 (either a prime or -1), so the division is always defined
data['n_div_last_prime'] = np.where(has_lower, n_values / last_prime, -1)
data['n_minus_last_prime'] = np.where(has_lower, n_values - last_prime, -1)


# I could also try out different mod, like n/int((n/3)), is this different from n/3?

In [None]:
# preview the first 15 rows including all engineered feature columns
data.head(n=15)

### First Model: Logistic Regression

In [None]:
# every column except the target is used as a model feature
target_col = 'y'
feature_col = data.columns.drop(target_col)


In [None]:
# thats obviously crucial if you want the model to converge...
# Min-max rescaling of every feature column; the scaled values overwrite the
# originals in `data`.
# NOTE(review): the scaler is fit on all rows, i.e. before the train/test split below.

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler().fit(data[feature_col])
data[feature_col] = scaler.transform(data[feature_col])
           

In [None]:
# feature matrix and label vector for the model
X = data[feature_col]
y = data[target_col]


In [None]:
# divide in train and test randomly
# (33% of the rows are held out for testing; the seed is fixed so the split is reproducible)

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [None]:
# train logistic regression as start

from sklearn.linear_model import LogisticRegressionCV

# lbfgs solver, l2 penalty
# regularisation strength is picked by 10-fold cross-validation on the training split
clf = LogisticRegressionCV(
    cv=10,
    random_state=0,
    max_iter=500,
).fit(X_train, y_train)

In [None]:
from sklearn.metrics import confusion_matrix

# evaluate on the held-out split; rows are true classes, columns are predicted classes
y_pred = clf.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm # no primes predicted at all

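---> The features are probably not strong enough to help with the identification of primes. Note also that primes are a small minority class here (roughly 8% of the numbers below 500,000), so an unweighted classifier can reach high accuracy by always predicting "not prime".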