In [1]:
import numpy as np
import pandas as pd
from pandas_profiling import ProfileReport
import lux
import matplotlib.pyplot as plt
%matplotlib inline

from sklearnex import patch_sklearn
patch_sklearn()

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB, MultinomialNB, CategoricalNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

np.set_printoptions(suppress=True)  # print small floats in fixed-point, not scientific notation

# Matplotlib style parameters.
# The seaborn styles bundled with Matplotlib were renamed to 'seaborn-v0_8*' and the
# old names no longer exist in recent releases, so use whichever name is installed.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)
plt.rcParams.update({'figure.facecolor': 'white',
                     'figure.edgecolor': 'white',
                     'axes.grid': True,
                     'figure.autolayout': True # tight_layout
                    })

# Pandas options
pd.set_option('display.max_columns', None, # show x columns (None shows all)
              # 'display.max_rows', None,  # show x rows (None shows all)
              'compute.use_numba', True)   # turn off for Lux

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [2]:
# Reproducibility: this one seed feeds the NumPy generator below and is also passed
# as `random_state` to the scikit-learn splitters and estimators further down.
seed = 42
rng = np.random.default_rng(seed)

# Problem dimensions used throughout the notebook.
secure_bits = 25        # rows of the binary matrix `mat` (length of each output vector)
corrected_key_bits = 50 # columns of `mat` (length of each sampled input vector)
N_vec = 10000           # number of corrected key vectors to sample

## Functions

### Detect operations

In [3]:
def Count_1times1(mat, vec):
    """Count the 1*1 products that occur when multiplying `mat` by `vec`.

    An entry (i, j) is counted when both ``mat[i, j]`` and ``vec[j]`` are non-zero.
    The dimensions are taken from the arguments themselves, so the function does
    not depend on notebook-level size constants.

    Parameters
    ----------
    mat : array-like, shape (rows, cols)
        Binary matrix.
    vec : array-like, shape (cols,)
        Binary vector, broadcast against every row of `mat`.

    Returns
    -------
    int
        Number of (i, j) positions where matrix and vector entries are both 1.
    """
    mat_bits = np.asarray(mat).astype(bool)
    vec_bits = np.asarray(vec).astype(bool)
    # Broadcasting compares `vec` with every row at once.
    return int(np.count_nonzero(mat_bits & vec_bits))

In [4]:
def Count_0times1(mat, vec):
    """Count the mixed products (0*1 or 1*0) when multiplying `mat` by `vec`.

    An entry (i, j) is counted when exactly one of ``mat[i, j]`` and ``vec[j]``
    is non-zero. The dimensions are taken from the arguments themselves.

    Parameters
    ----------
    mat : array-like, shape (rows, cols)
        Binary matrix.
    vec : array-like, shape (cols,)
        Binary vector, broadcast against every row of `mat`.

    Returns
    -------
    int
        Number of (i, j) positions where exactly one operand is 1.
    """
    mat_bits = np.asarray(mat).astype(bool)
    vec_bits = np.asarray(vec).astype(bool)
    # XOR is true exactly when the two operands differ.
    return int(np.count_nonzero(mat_bits ^ vec_bits))

In [5]:
def Count_0times0(mat, vec):
    """Count the 0*0 products that occur when multiplying `mat` by `vec`.

    An entry (i, j) is counted when both ``mat[i, j]`` and ``vec[j]`` are zero.
    The dimensions are taken from the arguments themselves.

    Parameters
    ----------
    mat : array-like, shape (rows, cols)
        Binary matrix.
    vec : array-like, shape (cols,)
        Binary vector, broadcast against every row of `mat`.

    Returns
    -------
    int
        Number of (i, j) positions where matrix and vector entries are both 0.
    """
    mat_bits = np.asarray(mat).astype(bool)
    vec_bits = np.asarray(vec).astype(bool)
    # "neither operand is set" == not (a or b)
    return int(np.count_nonzero(~(mat_bits | vec_bits)))

In [6]:
# Some experiments...

# # Count all +1 (same as Count_1times1...)
# def Count_plus1(mat, vec):
#     count = 0
#     for i in range(secure_bits):
#         for j in range(corrected_key_bits):
#             if mat[i, j] * vec[j]:
#                 count += 1
#     return count

# Only count additions to initial 1 in each row (as if the first 1 was already in the memory address of the result)
def Count_plus1(mat, vec):
    """Count the "+1" additions made while accumulating each row of ``mat @ vec``.

    For every row, the first 1*1 product is treated as already being stored in the
    result, so a row containing k such products contributes ``k - 1`` additions
    (and 0 when it contains none). The dimensions are taken from the arguments.

    Parameters
    ----------
    mat : array-like, shape (rows, cols)
        Binary matrix.
    vec : array-like, shape (cols,)
        Binary vector, broadcast against every row of `mat`.

    Returns
    -------
    int
        Sum over rows of ``max(k - 1, 0)``.
    """
    mat_bits = np.asarray(mat).astype(bool)
    vec_bits = np.asarray(vec).astype(bool)
    ones_per_row = np.count_nonzero(mat_bits & vec_bits, axis=1)
    # Rows without any 1*1 product must contribute 0, not -1.
    return int(np.maximum(ones_per_row - 1, 0).sum())

# Count 1+1 sequentially in each row (works even if operations are not sequential)
    # (counting 1+0 or 0+0 gives different results if operations are not sequential,
    # but counting (1+0)+(0+0) still gives consistent results)
def Count_1plus1(mat, vec):
    """Count the 1+1 additions made while accumulating each row of ``mat @ vec`` mod 2.

    Each row is accumulated from left to right: the running result starts as the
    first product and every further product is added modulo 2. An addition is
    counted when both the running result and the incoming product are 1.
    For binary inputs a row with k 1*1 products therefore contributes ``k // 2``.
    The dimensions are taken from the arguments themselves.

    Parameters
    ----------
    mat : array-like, shape (rows, cols)
        Binary matrix.
    vec : array-like, shape (cols,)
        Binary vector, broadcast against every row of `mat`.

    Returns
    -------
    int
        Total number of 1+1 additions over all rows.
    """
    mat_bits = np.asarray(mat).astype(bool)
    vec_bits = np.asarray(vec).astype(bool)
    products = mat_bits & vec_bits
    # running_parity[:, j] is the mod-2 result after columns 0..j have been added.
    running_parity = np.logical_xor.accumulate(products, axis=1)
    # A 1+1 happens at column j >= 1 when the result so far and product j are both 1.
    carries = running_parity[:, :-1] & products[:, 1:]
    return int(np.count_nonzero(carries))

### Class

In [7]:
# Detect if there are more 1s than 0s
def More_1_than_0(mat, vec):
    """Tell whether the output vector ``mat @ vec % 2`` holds more 1s than 0s.

    The threshold is half the number of rows of `mat` (the output length), read
    from the matrix itself rather than from a notebook-level constant.

    Parameters
    ----------
    mat : ndarray, shape (rows, cols)
        Binary matrix.
    vec : ndarray, shape (cols,)
        Binary vector.

    Returns
    -------
    bool
        True when strictly more than half of the output bits are 1.
    """
    output_bits = mat @ vec % 2
    return np.sum(output_bits) > np.shape(mat)[0] / 2

### Other

In [8]:
# Check if output vectors from sample are unique
def Check_uniqueness(mat, vec):
    """Report whether every input vector maps to a distinct output ``mat @ v % 2``.

    Returns a pair ``(all_unique, n_unique)``: `all_unique` is True when the number
    of distinct output vectors equals the number of input vectors, and `n_unique`
    is that number of distinct outputs.
    """
    outputs = [mat @ sample % 2 for sample in vec]
    _, occurrences = np.unique(outputs, axis=0, return_counts=True)
    n_unique = occurrences.shape[0]
    return n_unique == np.shape(vec)[0], n_unique

## Generate data

In [9]:
# Random binary matrix: one row per secure bit, one column per corrected key bit.
mat = rng.integers(low=0, high=2,
                   size=(secure_bits, corrected_key_bits),
                   dtype=np.int8)

mat.shape, mat

((25, 50),
 array([[1, 0, 1, ..., 1, 0, 0],
        [0, 1, 0, ..., 1, 0, 1],
        [0, 0, 1, ..., 1, 1, 1],
        ...,
        [1, 0, 0, ..., 0, 0, 1],
        [0, 1, 1, ..., 0, 1, 0],
        [1, 0, 0, ..., 1, 0, 1]], dtype=int8))

In [10]:
# Sample N_vec random binary vectors, one generator call per vector.
vec = [
    rng.integers(low=0, high=2, size=corrected_key_bits, dtype=np.int8)
    for _ in range(N_vec)
]

np.shape(vec), vec[0]

((10000, 50),
 array([1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1,
        0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1,
        0, 1, 1, 1, 1, 0], dtype=int8))

In [11]:
# Summarise how many of the sampled vectors produce distinct outputs.
is_unique, n_unique = Check_uniqueness(mat, vec)
print(f'Are solutions from this sample unique? {is_unique} ---> {n_unique} unique sols out of {N_vec}')

Are solutions from this sample unique? False ---> 9999 unique sols out of 10000


In [12]:
# Preview the five operation counts for the first ten sampled vectors.
count_funcs = (Count_1times1, Count_0times1, Count_0times0, Count_plus1, Count_1plus1)

print("1*1 \t 0*1 \t 0*0 \t +1 \t 1+1")
print("------------------------------------")
for sample in vec[:10]:
    print(*(count(mat, sample) for count in count_funcs), sep=' \t ')

1*1 	 0*1 	 0*0 	 +1 	 1+1
------------------------------------
337 	 639 	 274 	 312 	 163
342 	 654 	 254 	 317 	 163
320 	 623 	 307 	 295 	 154
274 	 590 	 386 	 249 	 134
350 	 613 	 287 	 325 	 170
266 	 606 	 378 	 241 	 126
254 	 630 	 366 	 229 	 120
299 	 615 	 336 	 274 	 143
309 	 645 	 296 	 284 	 148
228 	 607 	 415 	 203 	 108


In [13]:
# Preview the class label next to the output vector it is derived from.
print("#1 > #0?")
print("--------")
for sample in vec[:10]:
    output_bits = mat @ sample % 2
    print(More_1_than_0(mat, sample), output_bits, sep=' \t ')

#1 > #0?
--------
False 	 [0 1 0 0 1 0 1 0 1 0 0 1 1 1 1 0 1 0 0 0 1 1 0 0 0]
True 	 [1 1 0 0 1 1 0 0 1 0 1 1 1 1 1 0 1 1 1 1 0 1 0 1 0]
False 	 [1 1 1 1 0 0 0 0 1 0 1 1 0 0 0 1 1 1 0 0 1 0 1 0 0]
False 	 [0 1 0 0 1 0 0 0 0 0 0 1 0 0 1 1 0 0 0 0 0 0 1 0 0]
False 	 [1 1 0 1 1 1 0 0 0 0 0 1 0 0 1 1 0 0 0 0 1 0 1 0 0]
True 	 [1 0 0 1 1 0 1 0 0 1 0 1 1 1 1 0 0 0 0 1 1 1 0 1 1]
True 	 [0 1 1 0 0 1 0 1 0 1 1 1 1 1 1 1 0 0 0 0 1 1 0 0 1]
True 	 [1 1 0 1 0 1 0 1 1 0 1 1 0 0 1 1 0 0 1 1 0 1 0 0 0]
True 	 [0 1 1 0 1 1 1 0 0 0 1 0 0 0 1 1 0 1 1 1 0 0 0 1 1]
False 	 [1 1 0 0 0 0 1 1 0 0 0 0 1 1 0 1 1 1 0 0 1 1 0 1 0]


In [14]:
# One row per sampled vector: four operation counts followed by the class label.
# (Count_plus1 is deliberately left out of the feature set.)
feature_funcs = (Count_1times1,
                 Count_0times1,
                 Count_0times0,
                 Count_1plus1,
                 More_1_than_0)
data = [[func(mat, vec[i]) for func in feature_funcs] for i in range(N_vec)]
np.shape(data)

(10000, 5)

In [15]:
# Build the dataframe: operation counts as features, 'More_1_than_0' as target
feature_columns = ['1*1', '0*1', '0*0', '1+1', 'More_1_than_0']
df = pd.DataFrame(data, columns=feature_columns)

# Optional parity features (disabled)
    # with the new Count_1plus1, the models do not need any parities to predict 100% correctly
# df['1_parity'] = np.where(df['1*1'] % 2 == 0, 0, 1)
# df['0*1_parity'] = np.where(df['0*1'] % 2 == 0, 0, 1)
# df['0*0_parity'] = np.where(df['0*0'] % 2 == 0, 0, 1)
# df['+1_parity'] = np.where(df['+1'] % 2 == 0, 0, 1)
# df['1+1_parity'] = np.where(df['1+1'] % 2 == 0, 0, 1)
df

Button(description='Toggle Pandas/Lux', layout=Layout(top='5px', width='140px'), style=ButtonStyle())

Output()

In [18]:
# Check some trends in data with Lux.
# The numba engine is switched off only for the duration of this block;
# option_context restores the previous setting even if the display raises.
with pd.option_context('compute.use_numba', False):
    df2 = df.copy()
    df2.intent = ["More_1_than_0", "1*1"]
    display(df2)

Button(description='Toggle Pandas/Lux', layout=Layout(top='5px', width='140px'), style=ButtonStyle())

Output()

In [19]:
# Build the pandas-profiling report for the dataset and export it as HTML
report_title = "More_1s_or_0s N_Total_Ops Dataset Report"
report_path = "Docs/More_1s_or_0s_-_N_Total_Ops_report.html"

profile = ProfileReport(df, title=report_title, explorative=True)
# profile.to_widgets()
profile.to_file(report_path)

Summarize dataset:   0%|          | 0/18 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

### Split data

In [16]:
# Separate features from the target and hold out 20% of the rows for testing
target_column = 'More_1_than_0'
X = df.drop(columns=target_column).to_numpy()
y = df[target_column].to_numpy()

# hold-out split, used before cross-val
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=seed)

## Classifiers

Models able to predict with ~100% accuracy:
- Logistic regression
- Linear SVM (and polynomial-kernel SVM by extension)

Models that perform comparatively poorly (roughly 50–60% accuracy):
- RBF-kernel SVM
- Sigmoid-kernel SVM
- All Naïve Bayes

In [17]:
# Train model with stratified k-fold cross-validation & get statistics
def Train_stratCV(model, X, y, n_splits=5, seed=seed):
    """Evaluate `model` with shuffled, stratified k-fold cross-validation.

    The estimator is re-fitted on the training part of every fold and scored on
    the held-out part. A short summary of the fold accuracies is printed.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit``, ``score`` and ``predict``.
    X : ndarray
        Feature matrix, indexed row-wise with the fold index arrays.
    y : ndarray
        Target values, indexed with the same fold index arrays.
    n_splits : int, default 5
        Number of folds.
    seed : int
        ``random_state`` handed to ``StratifiedKFold``.

    Returns
    -------
    lst_acc : list
        ``model.score`` on the test part of each fold, in fold order.
    lst_y_pred : list
        ``model.predict`` output for the test part of each fold, in fold order.
    """
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    lst_acc    = []
    lst_y_pred = [] # returned for the caller; not used inside this function

    # Train & test each fold
    for train_idx, test_idx in skf.split(X, y):
        x_train_fold, x_test_fold = X[train_idx], X[test_idx]
        y_train_fold, y_test_fold = y[train_idx], y[test_idx]
        model.fit(x_train_fold, y_train_fold)
        lst_acc.append(model.score(x_test_fold, y_test_fold))
        lst_y_pred.append(model.predict(x_test_fold))

    print(f'\nList of accuracies: {lst_acc}')
    print(f'Min/max accuracy:   {min(lst_acc):.2%} <---> {max(lst_acc):.2%}')
    # The '%' presentation type already appends a percent sign; no literal '%' needed.
    print(f'Mean accuracy:      {np.mean(lst_acc):.2%} +- {np.std(lst_acc):.3%}')

    # add more stats

    return lst_acc, lst_y_pred

# accuracy_score(y_test, y_pred) # how often is the classifier correct?
# confusion_matrix(y_test, y_pred, normalize='pred') # precision
# confusion_matrix(y_test, y_pred, normalize='true') # recall
# classification_report(y_test, y_pred, output_dict=True)

### Logistic regression

In [18]:
# Logistic regression trained with stochastic gradient descent.
# NOTE(review): newer scikit-learn releases name this loss 'log_loss' — confirm
# against the installed version.
model = SGDClassifier(
    loss='log',
    learning_rate='adaptive',
    eta0=1e5,
    tol=1e-9,
    n_jobs=-1,
    random_state=seed,
    verbose=-1,
)
# model.fit(X_train, y_train) # used before cross-val
acc_strat, y_pred = Train_stratCV(model, X, y, n_splits=5, seed=seed)

Convergence after 126 epochs took 0.12 seconds
Convergence after 135 epochs took 0.13 seconds
Convergence after 141 epochs took 0.13 seconds
Convergence after 123 epochs took 0.12 seconds
Convergence after 131 epochs took 0.14 seconds

List of accuracies: [1.0, 1.0, 1.0, 1.0, 1.0]
Min/max accuracy:   100.00% <---> 100.00%
Mean accuracy:      100.00% +- 0.000%%


In [182]:
# # Get prediction statistics (used before cross-val)
# y_pred = model.predict(X_test)
# print('Confusion matrix: \n', confusion_matrix(y_test, y_pred))
# print('\nReport: \n', classification_report(y_test, y_pred, digits=3))

# print(f'Mean accuracy on test set: {model.score(X_test, y_test):.2%}')

# y_prob = model.predict_proba(X_test)
# print(f'Mean certainty (even if wrong): {np.mean(np.max(y_prob, axis=1)):.2%}')

Confusion matrix: 
 [[ 997    0]
 [   0 1003]]

Report: 
               precision    recall  f1-score   support

       False      1.000     1.000     1.000       997
        True      1.000     1.000     1.000      1003

    accuracy                          1.000      2000
   macro avg      1.000     1.000     1.000      2000
weighted avg      1.000     1.000     1.000      2000

Mean accuracy on test set: 100.00%
Mean certainty (even if wrong): 100.00%


### SVM

#### Linear kernel

In [195]:
# Kernel SVM with a linear kernel
model = SVC(
    kernel='linear',
    probability=True,
    random_state=seed,
    verbose=True,
)
# model.fit(X_train, y_train) # used before cross-val
acc_strat, y_pred = Train_stratCV(model, X, y, n_splits=5, seed=seed)

[LibSVM][LibSVM][LibSVM][LibSVM][LibSVM]
List of accuracies: [1.0, 1.0, 1.0, 1.0, 1.0]
Min/max accuracy:   100.00% <---> 100.00%
Mean accuracy:      100.00% +- 0.000%


In [646]:
# # Get prediction statistics (used before cross-val)
# y_pred = model.predict(X_test)
# print('Confusion matrix: \n', confusion_matrix(y_test, y_pred))
# print('\nReport: \n', classification_report(y_test, y_pred, digits=3))

# print(f'Mean accuracy on test set: {model.score(X_test, y_test):.2%}')

# y_prob = model.predict_proba(X_test)
# print(f'Mean certainty (even if wrong): {np.mean(np.max(y_prob, axis=1)):.2%}')

In [185]:
# Linear SVM (hinge loss) trained with stochastic gradient descent
model = SGDClassifier(
    loss='hinge',
    learning_rate='adaptive',
    eta0=1e3,
    tol=1e-9,
    n_jobs=-1,
    random_state=seed,
    verbose=-1,
)
# model.fit(X_train, y_train) # used before cross-val
acc_strat, y_pred = Train_stratCV(model, X, y, n_splits=5, seed=seed)

Convergence after 98 epochs took 0.04 seconds
Convergence after 113 epochs took 0.04 seconds
Convergence after 128 epochs took 0.10 seconds
Convergence after 103 epochs took 0.08 seconds
Convergence after 103 epochs took 0.09 seconds

List of accuracies: [1.0, 1.0, 1.0, 1.0, 1.0]
Min/max accuracy:   100.00% <---> 100.00%
Mean accuracy:      100.00% +- 0.000%


In [186]:
# # Get prediction statistics (used before cross-val)
# y_pred = model.predict(X_test)
# print('Confusion matrix: \n', confusion_matrix(y_test, y_pred))
# print('\nReport: \n', classification_report(y_test, y_pred, digits=3))

# print(f'Mean accuracy on test set: {model.score(X_test, y_test):.2%}')

# y_prob = model.predict_proba(X_test)
# print(f'Mean certainty (even if wrong): {np.mean(np.max(y_prob, axis=1)):.2%}')

In [187]:
# Linear SVM with squared hinge loss, solved in the primal (dual=False)
model = LinearSVC(
    loss='squared_hinge',
    dual=False,
    tol=1e-9,
    random_state=seed,
    verbose=1,
)
# model.fit(X_train, y_train) # used before cross-val
acc_strat, y_pred = Train_stratCV(model, X, y, n_splits=5, seed=seed)

[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear]
List of accuracies: [1.0, 1.0, 1.0, 1.0, 1.0]
Min/max accuracy:   100.00% <---> 100.00%
Mean accuracy:      100.00% +- 0.000%


In [188]:
# # Get prediction statistics (used before cross-val)
# y_pred = model.predict(X_test)
# print('Confusion matrix: \n', confusion_matrix(y_test, y_pred))
# print('\nReport: \n', classification_report(y_test, y_pred, digits=3))

# print(f'Mean accuracy on test set: {model.score(X_test, y_test):.2%}')

# y_prob = model.predict_proba(X_test)
# print(f'Mean certainty (even if wrong): {np.mean(np.max(y_prob, axis=1)):.2%}')

#### Gaussian kernel

In [194]:
# Kernel SVM with a Gaussian (RBF) kernel
model = SVC(
    kernel='rbf',
    tol=1e-9,
    probability=True,
    random_state=seed,
    verbose=True,
)
# model.fit(X_train, y_train) # used before cross-val
acc_strat, y_pred = Train_stratCV(model, X, y, n_splits=5, seed=seed)

[LibSVM][LibSVM][LibSVM][LibSVM][LibSVM]
List of accuracies: [0.5885, 0.5955, 0.61, 0.6235, 0.5655]
Min/max accuracy:   56.55% <---> 62.35%
Mean accuracy:      59.66% +- 1.968%


In [189]:
# # Get prediction statistics (used before cross-val)
# y_pred = model.predict(X_test)
# print('Confusion matrix: \n', confusion_matrix(y_test, y_pred))
# print('\nReport: \n', classification_report(y_test, y_pred, digits=3))

# print(f'Mean accuracy on test set: {model.score(X_test, y_test):.2%}')

# y_prob = model.predict_proba(X_test)
# print(f'Mean certainty (even if wrong): {np.mean(np.max(y_prob, axis=1)):.2%}')

#### Polynomial kernel

In [193]:
# Kernel SVM with a degree-6 polynomial kernel
model = SVC(
    kernel='poly',
    degree=6,
    tol=1e-9,
    probability=True,
    random_state=seed,
    verbose=True,
)
# model.fit(X_train, y_train) # used before cross-val
acc_strat, y_pred = Train_stratCV(model, X, y, n_splits=5, seed=seed)

[LibSVM][LibSVM][LibSVM][LibSVM][LibSVM]
List of accuracies: [1.0, 1.0, 1.0, 1.0, 1.0]
Min/max accuracy:   100.00% <---> 100.00%
Mean accuracy:      100.00% +- 0.000%


In [657]:
# # Get prediction statistics (used before cross-val)
# y_pred = model.predict(X_test)
# print('Confusion matrix: \n', confusion_matrix(y_test, y_pred))
# print('\nReport: \n', classification_report(y_test, y_pred, digits=3))

# print(f'Mean accuracy on test set: {model.score(X_test, y_test):.2%}')

# y_prob = model.predict_proba(X_test)
# print(f'Mean certainty (even if wrong): {np.mean(np.max(y_prob, axis=1)):.2%}')

#### Sigmoid kernel

In [192]:
# Kernel SVM with a sigmoid kernel
model = SVC(
    kernel='sigmoid',
    tol=1e-9,
    probability=True,
    random_state=seed,
    verbose=True,
)
# model.fit(X_train, y_train) # used before cross-val
acc_strat, y_pred = Train_stratCV(model, X, y, n_splits=5, seed=seed)

[LibSVM][LibSVM][LibSVM][LibSVM][LibSVM]
List of accuracies: [0.5015, 0.5015, 0.5015, 0.5015, 0.502]
Min/max accuracy:   50.15% <---> 50.20%
Mean accuracy:      50.16% +- 0.020%


In [658]:
# # Get prediction statistics (used before cross-val)
# y_pred = model.predict(X_test)
# print('Confusion matrix: \n', confusion_matrix(y_test, y_pred))
# print('\nReport: \n', classification_report(y_test, y_pred, digits=3))

# print(f'Mean accuracy on test set: {model.score(X_test, y_test):.2%}')

# y_prob = model.predict_proba(X_test)
# print(f'Mean certainty (even if wrong): {np.mean(np.max(y_prob, axis=1)):.2%}')

### Naïve Bayes

#### Gaussian NB

In [190]:
# Gaussian Naive Bayes with default settings, evaluated with stratified 5-fold CV
model = GaussianNB()
# model.fit(X_train, y_train) # used before cross-val
acc_strat, y_pred = Train_stratCV(model, X, y, n_splits=5, seed=seed)


List of accuracies: [0.514, 0.518, 0.504, 0.5285, 0.497]
Min/max accuracy:   49.70% <---> 52.85%
Mean accuracy:      51.23% +- 1.096%


In [72]:
# # Get prediction statistics (used before cross-val)
# y_pred = model.predict(X_test)
# print('Confusion matrix: \n', confusion_matrix(y_test, y_pred))
# print('\nReport: \n', classification_report(y_test, y_pred, digits=3))

# print(f'Mean accuracy on test set: {model.score(X_test, y_test):.2%}')

# y_prob = model.predict_proba(X_test)
# print(f'Mean certainty (even if wrong): {np.mean(np.max(y_prob, axis=1)):.2%}')

#### Multinomial NB

In [191]:
# Multinomial Naive Bayes with default settings, evaluated with stratified 5-fold CV
model = MultinomialNB()
# model.fit(X_train, y_train) # used before cross-val
acc_strat, y_pred = Train_stratCV(model, X, y, n_splits=5, seed=seed)


List of accuracies: [0.509, 0.521, 0.528, 0.5345, 0.51]
Min/max accuracy:   50.90% <---> 53.45%
Mean accuracy:      52.05% +- 0.995%


In [43]:
# # Get prediction statistics (used before cross-val)
# y_pred = model.predict(X_test)
# print('Confusion matrix: \n', confusion_matrix(y_test, y_pred))
# print('\nReport: \n', classification_report(y_test, y_pred, digits=3))

# print(f'Mean accuracy on test set: {model.score(X_test, y_test):.2%}')

# y_prob = model.predict_proba(X_test)
# print(f'Mean certainty (even if wrong): {np.mean(np.max(y_prob, axis=1)):.2%}')

#### Categorical NB

In [183]:
# Categorical Naive Bayes, fitted once on the hold-out training split instead of
# going through Train_stratCV like the other models.
model = CategoricalNB()
model.fit(X_train, y_train) # used before cross-val
# acc_strat, y_pred = Train_stratCV(model, X, y, 5, seed) # not working
# NOTE(review): presumably the cross-validated call fails because a test fold can
# contain feature values that never appear in its training fold; CategoricalNB's
# `min_categories` parameter may address this — TODO confirm.

CategoricalNB()

In [184]:
# Hold-out evaluation of the fitted model (used before cross-val)
y_pred = model.predict(X_test)
conf_mat = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred, digits=3)
print('Confusion matrix: \n', conf_mat)
print('\nReport: \n', report)

test_accuracy = model.score(X_test, y_test)
print(f'Mean accuracy on test set: {test_accuracy:.2%}')

# Average probability assigned to the predicted class, right or wrong
y_prob = model.predict_proba(X_test)
mean_certainty = np.mean(np.max(y_prob, axis=1))
print(f'Mean certainty (even if wrong): {mean_certainty:.2%}')

Confusion matrix: 
 [[495 502]
 [431 572]]

Report: 
               precision    recall  f1-score   support

       False      0.535     0.496     0.515       997
        True      0.533     0.570     0.551      1003

    accuracy                          0.533      2000
   macro avg      0.534     0.533     0.533      2000
weighted avg      0.534     0.533     0.533      2000

Mean accuracy on test set: 53.35%
Mean certainty (even if wrong): 64.17%
