## Import Libraries

In [1]:
from pprint import pprint
from sklearn import datasets
from keras.datasets import mnist
from matplotlib import pyplot as plt
from nltk.metrics.distance import edit_distance
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics.pairwise import manhattan_distances
from sklearn.feature_extraction.text import TfidfVectorizer

import numpy as np

In [2]:
# mnist_global = datasets.load_digits()

def plot(data):
    """Render ``data`` as a grayscale image and display the figure.

    Args:
        data: Image-like array accepted by ``plt.imshow``.
    """
    grayscale = plt.get_cmap('gray')
    plt.imshow(data, cmap=grayscale)
    plt.show()

def normalized(matrix):
    """Standardize ``matrix`` using its global mean and standard deviation.

    The mean and standard deviation are computed over *all* elements (not
    per row or per column), so the result as a whole has mean 0 and
    standard deviation 1.

    Args:
        matrix: Array-like of numbers.

    Returns:
        Array of the same shape holding ``(matrix - mean) / std``. When every
        element is identical (standard deviation of 0) an all-zero array is
        returned instead of dividing by zero and producing NaN values.
    """
    values = np.asanyarray(matrix)
    centered = values - np.mean(values)
    spread = np.std(values)
    if spread == 0:
        # Constant input: every centered value is already exactly 0.
        return np.zeros_like(centered)
    return centered / spread

## MNIST

In [3]:
def mnist_edit_distance(image1_data, image2_data, threshold):
    """Count the positions at which two images differ by more than ``threshold``.

    Despite the name, this is a thresholded position-by-position comparison,
    not a Levenshtein edit distance: elements are compared pairwise and a
    position is counted when the absolute difference exceeds ``threshold``.

    Args:
        image1_data: Iterable or array of pixel values for the first image.
        image2_data: Iterable or array of pixel values for the second image.
        threshold: A position counts as different only when the absolute
            difference is strictly greater than this value.

    Returns:
        int: Number of differing positions. If the inputs have different
        lengths, only the leading elements they have in common are compared.
    """
    def _as_flat_floats(data):
        # Materialize generic iterables (e.g. generators) before conversion.
        if not isinstance(data, (np.ndarray, list, tuple)):
            data = list(data)
        # Cast to float so unsigned pixel types (e.g. uint8) cannot wrap
        # around when subtracted.
        return np.asarray(data, dtype=np.float64).ravel()

    first = _as_flat_floats(image1_data)
    second = _as_flat_floats(image2_data)

    # Compare only the overlapping prefix when the lengths differ.
    common = min(first.size, second.size)
    exceeds = np.abs(first[:common] - second[:common]) > threshold
    return int(np.count_nonzero(exceeds))

In [4]:
# Load MNIST and flatten every training image into a single row vector.
(train_X, train_y), (test_X, test_y) = mnist.load_data()
# Derive the shape from the data instead of hard-coding (60000, 784);
# -1 lets NumPy infer the number of pixels per image.
train_X = train_X.reshape(len(train_X), -1)

# Standardize over all pixels of all images, then print summary statistics
# to confirm the result has mean ~0 and std/var ~1.
mnist_normalized = normalized(train_X)
print('mean:', np.mean(mnist_normalized))
print('std:', np.std(mnist_normalized))
print('var:', np.var(mnist_normalized))
print('matrix shape:', mnist_normalized.shape)

mean: -3.064638490070051e-17
std: 0.9999999999999998
var: 0.9999999999999997
matrix shape: (60000, 784)


In [5]:
# Sample sizes: the pure-Python pairwise comparison is quadratic in the number
# of images, so it uses a smaller sample than the sklearn pairwise metrics.
EDIT_DISTANCE_SAMPLE_SIZE = 50
PAIRWISE_SAMPLE_SIZE = 500
# Minimum absolute pixel difference for a position to count as "different".
EDIT_DISTANCE_THRESHOLD = 1

test_mnist = mnist_normalized[:EDIT_DISTANCE_SAMPLE_SIZE]
pairwise_sample = mnist_normalized[:PAIRWISE_SAMPLE_SIZE]

# Flat list of thresholded difference counts for every ordered pair of
# sample images (row-major: image1 varies slowest).
edit_distances = [
    mnist_edit_distance(image1_data, image2_data, EDIT_DISTANCE_THRESHOLD)
    for image1_data in test_mnist
    for image2_data in test_mnist
]

# Pairwise metric matrices, each computed once on the same sample.
cosine_similarity_matrix = cosine_similarity(pairwise_sample)
euclidean_distance_matrix = euclidean_distances(pairwise_sample)
manhattan_distance_matrix = manhattan_distances(pairwise_sample)

pprint(cosine_similarity_matrix)
pprint(euclidean_distance_matrix)
pprint(manhattan_distance_matrix)
print(edit_distances)

array([[1.        , 0.459261  , 0.08648314, ..., 0.23172054, 0.35535879,
        0.40716007],
       [0.459261  , 1.        , 0.06573546, ..., 0.25461644, 0.20794138,
        0.37826232],
       [0.08648314, 0.06573546, 1.        , ..., 0.19068928, 0.20659651,
        0.03201346],
       ...,
       [0.23172054, 0.25461644, 0.19068928, ..., 1.        , 0.25095736,
        0.31486826],
       [0.35535879, 0.20794138, 0.20659651, ..., 0.25095736, 1.        ,
        0.42663946],
       [0.40716007, 0.37826232, 0.03201346, ..., 0.31486826, 0.42663946,
        1.        ]])
array([[ 0.        , 30.38294394, 35.29639439, ..., 33.31114554,
        29.63740411, 30.71395453],
       [30.38294394,  0.        , 36.8973593 , ..., 33.90007526,
        33.92085403, 32.37877928],
       [35.29639439, 36.8973593 ,  0.        , ..., 31.04875933,
        29.50874907, 36.03002442],
       ...,
       [33.31114554, 33.90007526, 31.04875933, ...,  0.        ,
        29.7095584 , 31.23227272],
       [29.

## 20 Newsgroups

In [6]:
# Fetch only the 'sci.space' category from the training subset of 20 Newsgroups.
newsgroups_space = datasets.fetch_20newsgroups(
    subset='train',
    categories=['sci.space'],
)

In [7]:
# NOTE: dead code — superseded by the live TF-IDF vectorization in the next cell; safe to delete.
# vectorizer = TfidfVectorizer()
# vectors_space = vectorizer.fit_transform(newsgroups_space.data)
# vectors_space = normalized(vectors_space.toarray())

In [8]:
# Number of leading documents compared pairwise with the character-level
# edit distance (kept small because each comparison runs on full post texts).
EDIT_DISTANCE_TEXT_SAMPLE_SIZE = 5

# TF-IDF encode every post, densify, and standardize in a single step so that
# `vectors_space` is only ever bound to the final dense, normalized array.
vectorizer = TfidfVectorizer()
vectors_space = normalized(
    vectorizer.fit_transform(newsgroups_space.data).toarray()
)

# Edit distance between every ordered pair of sample posts.
# The third argument of nltk's edit_distance is the substitution cost
# (not a threshold); it is passed by keyword to make that explicit.
sample_texts = newsgroups_space.data[:EDIT_DISTANCE_TEXT_SAMPLE_SIZE]
edit_distances = [
    edit_distance(text1, text2, substitution_cost=1)
    for text1 in sample_texts
    for text2 in sample_texts
]

print(vectors_space.shape)

(593, 15893)


In [9]:
# Pretty-print each pairwise matrix over `vectors_space`, in the order:
# cosine similarity, Euclidean distance, Manhattan distance.
for pairwise_metric in (cosine_similarity, euclidean_distances, manhattan_distances):
    pprint(pairwise_metric(vectors_space))

print(edit_distances)

array([[1.        , 0.24315652, 0.17906923, ..., 0.10564172, 0.06759409,
        0.14592553],
       [0.24315652, 1.        , 0.10132218, ..., 0.04580929, 0.03166615,
        0.05470007],
       [0.17906923, 0.10132218, 1.        , ..., 0.08025553, 0.05861567,
        0.14656795],
       ...,
       [0.10564172, 0.04580929, 0.08025553, ..., 1.        , 0.05818513,
        0.08782462],
       [0.06759409, 0.03166615, 0.05861567, ..., 0.05818513, 1.        ,
        0.07486767],
       [0.14592553, 0.05470007, 0.14656795, ..., 0.08782462, 0.07486767,
        1.        ]])
array([[  0.        , 155.229679  , 161.51484278, ..., 168.5419579 ,
        172.24800082, 164.75566255],
       [155.229679  ,   0.        , 169.10499392, ..., 174.20685193,
        175.65440659, 173.44927036],
       [161.51484278, 169.10499392,   0.        , ..., 170.8709713 ,
        173.02857183, 164.6491396 ],
       ...,
       [168.5419579 , 174.20685193, 170.8709713 , ...,   0.        ,
        173.02557374, 17