In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math

from sklearn import naive_bayes


In [2]:
# Read the MNIST train and test splits from their CSV exports.
train_df, test_df = (
    pd.read_csv("data/mnist_train.csv"),
    pd.read_csv("data/mnist_test.csv"),
)

In [3]:
# Separate the pixel columns from the "label" column and restore the 28x28 image layout.
# The number of samples is inferred from the data instead of being hard-coded
# (60000 / 10000), so the cell also works on a subsample or a differently sized split.
X_train_2d = train_df.drop("label", axis=1).values.reshape(-1, 28, 28)
X_test_2d = test_df.drop("label", axis=1).values.reshape(-1, 28, 28)

# Flattened form: one row of 28 * 28 = 784 pixel values per image
X_train_1d = X_train_2d.reshape(len(X_train_2d), -1)
X_test_1d = X_test_2d.reshape(len(X_test_2d), -1)

# Target labels
y_train = train_df["label"]
y_test = test_df["label"]

In [4]:
# Count how often each of the 10 classes occurs in the training labels.
# np.bincount replaces the per-label Python loop; the cast to float keeps the
# printed representation identical to the former np.zeros-based counter.
y_count = np.bincount(y_train, minlength=10).astype(float)

print("y_counts: ", y_count, y_count.sum(), "\n")

# Relative frequency of every class, expressed in percent
label_probs = y_count/y_count.sum()*100
for i in range(10):
    print(f"Probability for Class {i}: {label_probs[i]:.2f} %")

y_counts:  [5923. 6742. 5958. 6131. 5842. 5421. 5918. 6265. 5851. 5949.] 60000.0 

Probability for Class 0: 9.87 %
Probability for Class 1: 11.24 %
Probability for Class 2: 9.93 %
Probability for Class 3: 10.22 %
Probability for Class 4: 9.74 %
Probability for Class 5: 9.04 %
Probability for Class 6: 9.86 %
Probability for Class 7: 10.44 %
Probability for Class 8: 9.75 %
Probability for Class 9: 9.92 %


In [5]:
# Midpoint of the two image axes in 1-based pixel coordinates: (extent + 1) / 2 per axis
x_mid, y_mid = ((extent + 1) / 2 for extent in X_train_2d.shape[1:3])
center = np.array([x_mid, y_mid])
print(center)

[14.5 14.5]


In [6]:
def get_euclidian_distance(arr, center=None):
    """
    Replace every lit pixel by its Euclidean distance to the image center.

    Pixels are addressed with 1-based (row, column) coordinates. A pixel whose
    value is greater than 0 becomes its distance to ``center``, rounded to two
    decimals; every other pixel becomes 0.

    Parameters
    ----------
    arr : array-like of shape (n_images, n_rows, n_cols)
        Stack (or list) of equally sized 2D images.
    center : sequence of two numbers, optional
        (row, column) reference point in 1-based coordinates. When omitted, the
        geometric center ``((n_rows + 1) / 2, (n_cols + 1) / 2)`` of the images
        is used, so the function does not rely on a notebook-level variable.

    Returns
    -------
    list of list of float
        One flat, row-major list with ``n_rows * n_cols`` entries per image.

    Raises
    ------
    ValueError
        If ``arr`` is non-empty but is not a stack of 2D images.
    """
    images = np.asarray(arr)
    if images.ndim != 3:
        if images.size == 0:
            # e.g. an empty list: nothing to transform
            return []
        raise ValueError(
            f"expected a stack of 2D images (3 dimensions), got {images.ndim} dimension(s)"
        )

    n_images, n_rows, n_cols = images.shape
    if center is None:
        center = ((n_rows + 1) / 2, (n_cols + 1) / 2)
    center_row, center_col = center

    # The distance depends only on the pixel position, never on the image, so the
    # lookup table is built once (same math.sqrt / round arithmetic as a per-pixel
    # computation) instead of being recomputed for every pixel of every image.
    distance_grid = np.array(
        [
            [
                round(math.sqrt((center_row - row) ** 2 + (center_col - col) ** 2), 2)
                for col in range(1, n_cols + 1)
            ]
            for row in range(1, n_rows + 1)
        ],
        dtype=float,
    ).reshape(n_rows, n_cols)

    # Keep the distance where the pixel is lit (> 0) and 0 everywhere else
    distances = np.where(images > 0, distance_grid, 0.0)
    return distances.reshape(n_images, n_rows * n_cols).tolist()


In [20]:
from sklearn.metrics import classification_report

def train_test_data(X_train, y_train, X_test, y_test):
    """
    Fit a Gaussian Naive Bayes classifier and print its evaluation on the test data.

    Parameters
    ----------
    X_train, y_train
        Feature rows and labels the classifier is fitted on.
    X_test, y_test
        Feature rows to predict and the true labels to compare against.

    Returns
    -------
    None
        The results are only printed: first the set of labels that occur in
        ``y_test`` but were never predicted, then the classification report.
    """
    nb = naive_bayes.GaussianNB()
    nb.fit(X_train, y_train)
    pred = nb.predict(X_test)
    print("Never predicted values: ", set(y_test) - set(pred), "\n")

    # A class that is never predicted has an undefined precision. It is reported
    # as 0 either way; zero_division=0 only silences the warning, because the
    # missing classes are already listed explicitly above.
    print(classification_report(y_test, pred, zero_division=0))

In [8]:
import scipy as sp
import scipy.ndimage  # explicit submodule import so that ``sp.ndimage`` is guaranteed to resolve

def shift(cpytest, offset=(0, 5)):
    """
    Translate every image by a fixed pixel offset.

    Parameters
    ----------
    cpytest : iterable of 2D arrays
        The images to shift, e.g. an array of shape (n_images, n_rows, n_cols).
    offset : sequence of two numbers, optional
        Shift along (axis 0, axis 1) of each image. The default ``(0, 5)``
        moves the content 5 pixels towards higher indices along axis 1.

    Returns
    -------
    list
        One shifted 2D image per input image. Pixels shifted in from outside
        the image are filled with 0.
    """
    shift_rows, shift_cols = offset
    # order=0 copies pixel values without interpolating between them;
    # mode='constant' with cval=0 pads the vacated area with zeros.
    return [
        sp.ndimage.shift(img, [shift_rows, shift_cols], cval=0, mode='constant', order=0)
        for img in cpytest
    ]



In [9]:
from scipy.ndimage import rotate

def rotation(cpytest, angle=90):
    """
    Rotate every image by the same angle.

    Parameters
    ----------
    cpytest : iterable of 2D arrays
        The images to rotate, e.g. an array of shape (n_images, n_rows, n_cols).
    angle : float, optional
        Rotation angle in degrees, passed on to ``scipy.ndimage.rotate``.
        Defaults to 90.

    Returns
    -------
    list
        One rotated 2D image per input image.
    """
    return [rotate(img, angle) for img in cpytest]

## Default Accuracy

In [10]:
# Baseline: evaluate on the flattened pixel values without any manipulation
train_test_data(X_train=X_train_1d, y_train=y_train, X_test=X_test_1d, y_test=y_test)


              precision    recall  f1-score   support

           0       0.79      0.89      0.84       980
           1       0.85      0.95      0.90      1135
           2       0.90      0.26      0.40      1032
           3       0.71      0.35      0.47      1010
           4       0.88      0.17      0.29       982
           5       0.55      0.05      0.09       892
           6       0.65      0.93      0.77       958
           7       0.88      0.27      0.42      1028
           8       0.28      0.67      0.40       974
           9       0.37      0.95      0.53      1009

    accuracy                           0.56     10000
   macro avg       0.69      0.55      0.51     10000
weighted avg       0.69      0.56      0.52     10000



## Transform Pixel Values to Euclidean Distance

In [11]:
# Feature transformation: pixel value (> 0) -> Euclidean distance to the image center

# Distance features of the unmodified train and test images
train_distances = np.asarray(get_euclidian_distance(X_train_2d))
test_distances = np.asarray(get_euclidian_distance(X_test_2d))

# Evaluate on the distance features
train_test_data(X_train=train_distances, y_train=y_train, X_test=test_distances, y_test=y_test)


              precision    recall  f1-score   support

           0       0.66      0.90      0.76       980
           1       0.93      0.92      0.93      1135
           2       0.90      0.20      0.33      1032
           3       0.66      0.31      0.42      1010
           4       0.89      0.13      0.22       982
           5       0.48      0.03      0.05       892
           6       0.64      0.94      0.76       958
           7       0.87      0.23      0.37      1028
           8       0.29      0.72      0.42       974
           9       0.37      0.94      0.53      1009

    accuracy                           0.54     10000
   macro avg       0.67      0.53      0.48     10000
weighted avg       0.68      0.54      0.49     10000



## Rotation

In [12]:
# Only the test images are rotated; the training images stay unmodified
test_rotated = rotation(X_test_2d)

# Distance features: unmodified training images vs. rotated test images
train_distances = np.asarray(get_euclidian_distance(X_train_2d))
test_rotated_distances = np.asarray(get_euclidian_distance(test_rotated))

# Evaluate on the distance features
train_test_data(X_train=train_distances, y_train=y_train, X_test=test_rotated_distances, y_test=y_test)


              precision    recall  f1-score   support

           0       0.25      0.56      0.35       980
           1       0.00      0.00      0.00      1135
           2       0.08      0.09      0.09      1032
           3       0.02      0.01      0.02      1010
           4       0.02      0.02      0.02       982
           5       0.00      0.00      0.00       892
           6       0.00      0.01      0.01       958
           7       0.00      0.00      0.00      1028
           8       0.02      0.02      0.02       974
           9       0.01      0.01      0.01      1009

    accuracy                           0.07     10000
   macro avg       0.04      0.07      0.05     10000
weighted avg       0.04      0.07      0.05     10000



In [18]:
# Rotation performance on raw pixel values (no Euclidean distance transform)

# Rotate the test images, then flatten each image into one feature row.
# The number of rows is taken from the data instead of a hard-coded 10000.
test_rotated = np.array(rotation(X_test_2d))
test_rotated = test_rotated.reshape(len(test_rotated), -1)

# Test accuracy
train_test_data(X_train_1d, y_train, test_rotated, y_test)

set()
              precision    recall  f1-score   support

           0       0.28      0.52      0.36       980
           1       0.00      0.00      0.00      1135
           2       0.06      0.08      0.07      1032
           3       0.01      0.01      0.01      1010
           4       0.02      0.03      0.03       982
           5       0.02      0.00      0.00       892
           6       0.01      0.01      0.01       958
           7       0.00      0.00      0.00      1028
           8       0.02      0.03      0.02       974
           9       0.01      0.01      0.01      1009

    accuracy                           0.07     10000
   macro avg       0.04      0.07      0.05     10000
weighted avg       0.04      0.07      0.05     10000



## Shift

In [19]:
# Shift the test images only
test_shift = shift(X_test_2d)

# Distance features: unmodified training images vs. shifted test images.
# Wrapped in np.array like the other distance experiments, so train_distances
# is an ndarray no matter which experiment cell was executed last.
train_distances = np.array(get_euclidian_distance(X_train_2d))
test_shift_distances = np.array(get_euclidian_distance(test_shift))

# Test accuracy
train_test_data(train_distances, y_train, test_shift_distances, y_test)


{3}
              precision    recall  f1-score   support

           0       0.04      0.03      0.04       980
           1       0.00      0.00      0.00      1135
           2       0.13      0.31      0.18      1032
           3       0.00      0.00      0.00      1010
           4       0.30      0.14      0.19       982
           5       0.25      0.57      0.35       892
           6       0.55      0.31      0.40       958
           7       0.76      0.12      0.21      1028
           8       0.17      0.44      0.24       974
           9       0.37      0.25      0.29      1009

    accuracy                           0.21     10000
   macro avg       0.26      0.22      0.19     10000
weighted avg       0.25      0.21      0.19     10000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [15]:
# Shift performance on raw pixel values (no Euclidean distance transform)

# Shift the test images, then flatten each image into one feature row.
# The number of rows is taken from the data instead of a hard-coded 10000.
test_shift = np.array(shift(X_test_2d))
test_shift = test_shift.reshape(len(test_shift), -1)

# Test accuracy
train_test_data(X_train_1d, y_train, test_shift, y_test)


              precision    recall  f1-score   support

           0       0.07      0.04      0.05       980
           1       0.00      0.00      0.00      1135
           2       0.15      0.34      0.20      1032
           3       0.00      0.00      0.00      1010
           4       0.30      0.18      0.23       982
           5       0.25      0.55      0.34       892
           6       0.43      0.32      0.37       958
           7       0.75      0.11      0.19      1028
           8       0.18      0.46      0.25       974
           9       0.36      0.25      0.30      1009

    accuracy                           0.22     10000
   macro avg       0.25      0.23      0.19     10000
weighted avg       0.25      0.22      0.19     10000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
