In [None]:
## KUC
# NLP
# Task: define a custom way to vectorize a list of strings, use the Hamming distance to measure their similarity, and then find the most similar vectors.

In [12]:
# 1. compute similarity
# 2. find most similar vectors.

import numpy as np

In [13]:
def vectorize_terms(terms):
    """Convert each term into a vector of lowercase character code points.

    Parameters
    ----------
    terms : iterable of str
        Terms to vectorize. A bare string is itself an iterable of
        one-character strings, so passing ``'abc'`` yields three one-element
        vectors; wrap a single term in a list to get one vector for it.

    Returns
    -------
    list of numpy.ndarray
        One 1-D integer array per term, holding ``ord(char)`` for every
        character of the lowercased term, in order. Arrays have different
        lengths when the terms do.
    """
    # dtype=int keeps an integer dtype even for an empty term; without it
    # np.array([]) silently falls back to float64.
    return [np.array([ord(char) for char in term.lower()], dtype=int)
            for term in terms]

In [14]:
# Reference term followed by the candidate phrases it is compared against.
root, term1, term2, term3 = (
    'NLP ',
    'natural language processing',
    'nature and science',
    'image processing',
)

# The root comes first, so it is element 0 of every derived collection.
terms = [root, term1, term2, term3]
terms


['NLP ',
 'natural language processing',
 'nature and science',
 'image processing']

In [15]:
# Character vectorization: map every term to an array holding the code
# points (ord values) of its lowercased characters. The arrays differ in
# length because the terms do.
term_vectors = vectorize_terms(terms)
print(term_vectors)

[array([110, 108, 112,  32]), array([110,  97, 116, 117, 114,  97, 108,  32, 108,  97, 110, 103, 117,
        97, 103, 101,  32, 112, 114, 111,  99, 101, 115, 115, 105, 110,
       103]), array([110,  97, 116, 117, 114, 101,  32,  97, 110, 100,  32, 115,  99,
       105, 101, 110,  99, 101]), array([105, 109,  97, 103, 101,  32, 112, 114, 111,  99, 101, 115, 115,
       105, 110, 103])]


In [16]:
# Passing the bare string (not a one-element list) makes vectorize_terms
# iterate over its characters, so every character becomes its own
# one-element vector instead of the whole term becoming a single vector.
print(root)
print(vectorize_terms(root))


NLP 
[array([110]), array([108]), array([112]), array([32])]


In [30]:
# NOTE(review): consider moving this import to the import cell at the top of
# the notebook so all dependencies are visible in one place.
import pandas as pd
# Show the vector representations: first the raw terms and arrays, then the
# same data as a table.
print(terms)
print(term_vectors)
# One row per term, labelled by the term itself. The vectors have different
# lengths, so the shorter rows are padded with NaN on the right.
vec_df = pd.DataFrame(term_vectors, index=terms)
print(vec_df)

['NLP ', 'natural language processing', 'nature and science', 'image processing']
[array([110, 108, 112,  32]), array([110,  97, 116, 117, 114,  97, 108,  32, 108,  97, 110, 103, 117,
        97, 103, 101,  32, 112, 114, 111,  99, 101, 115, 115, 105, 110,
       103]), array([110,  97, 116, 117, 114, 101,  32,  97, 110, 100,  32, 115,  99,
       105, 101, 110,  99, 101]), array([105, 109,  97, 103, 101,  32, 112, 114, 111,  99, 101, 115, 115,
       105, 110, 103])]
                              0    1    2    3      4      5      6      7   \
NLP                          110  108  112   32    NaN    NaN    NaN    NaN   
natural language processing  110   97  116  117  114.0   97.0  108.0   32.0   
nature and science           110   97  116  117  114.0  101.0   32.0   97.0   
image processing             105  109   97  103  101.0   32.0  112.0  114.0   

                                8      9   ...     17     18     19    20  \
NLP                            NaN    NaN  ...    NaN  

In [25]:
def _unpadded_row(label):
    """Return the ``vec_df`` row labelled ``label`` with NaN columns removed."""
    matching_rows = vec_df[vec_df.index == label]
    # Drop every column that contains NaN in the selected row(s), then take
    # the first remaining row as a plain NumPy array.
    return matching_rows.dropna(axis=1).values[0]


root_term = root
other_terms = [term1, term2, term3]

root_term_vec = _unpadded_row(root_term)
other_term_vecs = list(map(_unpadded_row, other_terms))

In [31]:
def hamming_distance(u, v, norm=False):
    """Count the positions at which two equal-shaped vectors differ.

    Parameters
    ----------
    u, v : array-like
        Vectors to compare; anything ``np.asarray`` accepts (NumPy arrays,
        pandas Series, lists, tuples). Both must have the same shape and are
        compared position by position.
    norm : bool, default False
        If True, return the fraction of differing positions instead of the
        raw count.

    Returns
    -------
    int or float
        Number of differing positions, or their fraction when ``norm`` is
        True (0.0 for empty vectors).

    Raises
    ------
    ValueError
        If ``u`` and ``v`` do not have the same shape.

    Notes
    -----
    A position where *both* vectors hold NaN (e.g. padding past the end of
    two short strings) is treated as equal. NaN against a real value still
    counts as a mismatch.
    """
    u = np.asarray(u)
    v = np.asarray(v)
    if u.shape != v.shape:
        raise ValueError('The vectors must have equal lengths.')

    # NaN is the only value that differs from itself, so (x != x) marks NaN
    # positions for any dtype without requiring a float array.
    both_missing = (u != u) & (v != v)
    mismatches = (u != v) & ~both_missing

    if not norm:
        return mismatches.sum()
    # Avoid the NaN (and RuntimeWarning) that mean() produces on empty input.
    return mismatches.mean() if mismatches.size else 0.0

In [35]:
# Inspect the root term's row: only the first four positions hold character
# codes; every later position is NaN padding, and the row is shown as float64.
print(vec_df.iloc[0])

0     110.0
1     108.0
2     112.0
3      32.0
4       NaN
5       NaN
6       NaN
7       NaN
8       NaN
9       NaN
10      NaN
11      NaN
12      NaN
13      NaN
14      NaN
15      NaN
16      NaN
17      NaN
18      NaN
19      NaN
20      NaN
21      NaN
22      NaN
23      NaN
24      NaN
25      NaN
26      NaN
Name: NLP , dtype: float64


In [36]:
# Compare the root row with the first candidate row across all NaN-padded
# columns of the table.
i0 = vec_df.iloc[0]
i1 = vec_df.iloc[1]
# Only position 0 ('n' in both terms) matches; each NaN-padded position of the
# shorter root row is counted as a mismatch against a real character.
d = hamming_distance(i0,i1)
print(d)


26
