# Linear algebra: the similarity of texts and approximation of functions

In [1]:
import numpy as np

#### Read data from a file

In [2]:
# Open the corpus in the default text read mode. The handle is kept open on
# purpose: the next cell reads the lines from it and then closes it.
file = open('sentences.txt')
# Show the file object's repr.
print(file)

<_io.TextIOWrapper name='sentences.txt' mode='r' encoding='UTF-8'>


In [3]:
# Lowercase every line so the later [a-z] tokenization sees all letters.
# try/finally guarantees the handle is released even if reading or decoding
# fails part-way through the file.
try:
    sentences = np.array([sentence.lower() for sentence in file])
finally:
    file.close()

In [4]:
# Sanity check: how many sentences were loaded, then a peek at the first three.
print('Count: ', sentences.shape[0])
print(sentences[0:3])

Count:  22
[ 'in comparison to dogs, cats have not undergone major changes during the domestication process.\n'
 'as cat simply catenates streams of bytes, it can be also used to concatenate binary files, where it will just concatenate sequence of bytes.\n'
 'a common interactive use of cat for a single file is to output the content of a file to standard output.\n']


#### Map the list of sentences to a list of sentence words

In [5]:
import re

In [6]:
def to_words(sentences):
    """Split each sentence into its lowercase alphabetic words.

    A word is a maximal run of the characters ``a``-``z``. Every other
    character (spaces, punctuation, digits, newlines, uppercase letters)
    acts as a separator, so callers should lowercase the text beforehand.

    Parameters
    ----------
    sentences : iterable of str
        Sentences to tokenize.

    Returns
    -------
    numpy.ndarray
        One-dimensional object array holding one ``list`` of words per
        sentence, in input order.
    """
    # Collecting runs of letters yields the same tokens as splitting on every
    # non-letter character and then discarding the empty strings.
    tokenized = [re.findall(r'[a-z]+', sentence) for sentence in sentences]

    # Fill an object array element by element: np.array() on lists of unequal
    # length is rejected by recent NumPy releases, and on lists of equal
    # length it would silently build a 2-D string array instead.
    result = np.empty(len(tokenized), dtype=object)
    for position, tokens in enumerate(tokenized):
        result[position] = tokens

    return result

In [7]:
# Tokenize every lowercased sentence into its list of words.
sentence_words = to_words(sentences)

In [8]:
# Peek at the tokens of the first two sentences.
print(sentence_words[:2])

[ list(['in', 'comparison', 'to', 'dogs', 'cats', 'have', 'not', 'undergone', 'major', 'changes', 'during', 'the', 'domestication', 'process'])
 list(['as', 'cat', 'simply', 'catenates', 'streams', 'of', 'bytes', 'it', 'can', 'be', 'also', 'used', 'to', 'concatenate', 'binary', 'files', 'where', 'it', 'will', 'just', 'concatenate', 'sequence', 'of', 'bytes'])]


#### Map word to index

In [9]:
# Vocabulary lookup: word -> column index in the count matrix.
word_to_index = {}

In [10]:
# Give every distinct word the next free index, in order of first appearance.
index = 0
for words in sentence_words:
    for word in words:
        if word in word_to_index:
            continue  # this word already has an index
        word_to_index[word] = index
        index += 1

In [11]:
# Vocabulary size: the number of distinct words across all sentences.
print('Unique words: ', len(word_to_index))

Unique words:  254


#### Create a matrix of word occurrence counts for each sentence

In [12]:
# One row per sentence, one column per distinct word; starts out all zeros.
matrix = np.zeros(shape=(len(sentences), len(word_to_index)))
print(matrix.shape)
print(matrix)

(22, 254)
[[ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 ..., 
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]]


In [13]:
from collections import Counter

In [14]:
def count_words(words):
    """Count how many times each word occurs in ``words``.

    Parameters
    ----------
    words : iterable of hashable
        The words (tokens) of a single sentence.

    Returns
    -------
    collections.Counter
        Maps each distinct word to its number of occurrences; words that
        never occurred report a count of 0 when looked up.
    """
    # Counter already tallies an iterable. Wrapping in iter() keeps the
    # element-by-element semantics even if a mapping is passed, because
    # Counter would otherwise copy a mapping's values as counts.
    return Counter(iter(words))

In [15]:
# Fill row i with the occurrence counts of the words found in sentence i.
for i, words in enumerate(sentence_words):
    counts = count_words(words)

    # Visiting each distinct word once writes the same cells as visiting
    # every token, since repeated tokens would just rewrite the same count.
    for word, word_count in counts.items():
        word_index = word_to_index[word]
        matrix[i, word_index] = word_count

In [17]:
# Inspect the populated sentence-by-word count matrix.
print(matrix)

[[ 1.  1.  1. ...,  0.  0.  0.]
 [ 0.  0.  1. ...,  0.  0.  0.]
 [ 0.  0.  2. ...,  0.  0.  0.]
 ..., 
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 1.  0.  1. ...,  0.  0.  0.]
 [ 0.  0.  1. ...,  1.  1.  1.]]


In [18]:
# Look the column up instead of hard-coding it, so this check keeps pointing
# at the word 'in' even if the vocabulary order changes.
in_index = word_to_index['in']
print('Index of in: ', in_index)

print('Counts of in:')
# Column `in_index` holds the per-sentence counts of the word 'in'.
for count in matrix[:, in_index]:
    print(count)

Index of in:  0
Counts of in:
1.0
0.0
0.0
1.0
2.0
0.0
1.0
0.0
0.0
1.0
1.0
0.0
1.0
1.0
0.0
0.0
1.0
0.0
0.0
0.0
1.0
0.0


#### Evaluate cosine distance

In [19]:
from scipy.spatial.distance import cosine

In [20]:
# The first row is the reference sentence; all remaining rows are compared to it.
key_sentence, input_sentences = matrix[0], matrix[1:]

In [22]:
# Pair every candidate sentence with its cosine distance to the key sentence.
# Numbering starts at 1 because row 0 of the matrix is the key sentence itself.
distances = [
    (sentence_number, cosine(key_sentence, sentence_vector))
    for sentence_number, sentence_vector in enumerate(input_sentences, start=1)
]
# Closest sentences first; the sort is stable, so ties keep their original order.
distances.sort(key=lambda pair: pair[1])

distances

[(6, 0.7327387580875756),
 (4, 0.77708871496985887),
 (21, 0.82503644694405864),
 (10, 0.83281653622739416),
 (12, 0.83964325485254543),
 (16, 0.84063618542208085),
 (20, 0.84275727449171223),
 (2, 0.86447381456421235),
 (13, 0.87035925528956715),
 (14, 0.87401184233025764),
 (11, 0.88047713906656067),
 (8, 0.88427248752843102),
 (19, 0.88854435748492944),
 (3, 0.89517151632780823),
 (9, 0.90550888174769317),
 (7, 0.92587506833388988),
 (5, 0.94023856953328033),
 (15, 0.94427217874246472),
 (18, 0.94427217874246472),
 (1, 0.95275444087384664),
 (17, 0.95664450152379399)]