# String distances




In [1]:
import nltk
import numpy as np
import pandas as pd
#import editdistance

In [3]:
!pip install nltk



In [15]:
import nltk

nltk.download('words')

[nltk_data] Downloading package words to /Users/qijunjin/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [16]:
words = nltk.corpus.words.words()

In [17]:
pd.DataFrame(words).to_csv("words_nltk.csv")

# Jaccard similarity


Jaccard Distance is a measure of dissimilarity for sets. The lower the dissimilarity, the more similar are two objects.

$$
\text{jaccard_similarity}(A,B) = \frac{| A \cap B |} {| A \cup B |} 
$$


$$
\text{jaccard_distance}(A,B) = 1 - \frac{| A \cap B |} {| A \cup B |} 
$$

In [18]:
def jaccard_similarity(s1,s2):
    return len(s1.intersection(s2)) / len(s1.union(s2))

def jaccard_distance(s1,s2):
    return 1. - jaccard_similarity(s1,s2)

In [19]:
jaccard_similarity(set("exponential"),set("polynomial"))

0.5454545454545454

In [20]:
jaccard_similarity(set("exponential"),set("exponentia"))

0.8888888888888888

In [21]:
len(set("exponential").union(set("polynomial"))), len(set("exponential").intersection(set("polynomial")))

(11, 6)

In [22]:
set1 = set('mapping')
set2 = set('mappings')
jaccard_similarity(set1,set2), jaccard_distance(set1,set2), jaccard_distance(set1,set1)

(0.8571428571428571, 0.1428571428571429, 0.0)

In [23]:
set1 = set('mapping')
set2 = set('mappings')
nltk.jaccard_distance(set1, set2)

0.14285714285714285

What is the issue with the similarity? Order of the characters not matter

In [24]:
set1 = set('panmpi')
set2 = set('mapping')
jaccard_distance(set1,set2)

0.16666666666666663

In [25]:
set1 = set('mapping')
set2 = set('mappin')
jaccard_distance(set1,set2)

0.16666666666666663

In [27]:
set1 = set('mapping')
set2 = set('mappin')
nltk.jaccard_distance(set1, set2) == jaccard_distance(set1, set2)

False

In [28]:
words     =  nltk.corpus.words.words()
words_set = [set(w) for w in words]
len(words)

236736

In [29]:
def compute_distances(query_word, words):
    distances = []

    for word in words:
        dist = jaccard_distance(query_word, word)
        distances.append(dist)
    return distances

def find_k_nearest_neighbors(query_word, words, K):
    distances = compute_distances(query_word, words)
    idx = np.argsort(distances)[::-1] 
    return [words[i] for i in idx[0:K]]

def find_k_nearest_indices(query_word, words, K, return_distances=False):
    distances = compute_distances(query_word, words)
    idx = np.argsort(distances) 
    
    if return_distances:
        return [i for i in idx[0:K]], [distances[i] for i in idx[0:K]]
    else:
        return [i for i in idx[0:K]]

In [30]:
query = set('guardin')
distances = compute_distances(query,words)
print(f"the closest word to query={query} is {words[np.argmin(distances)]}")

the closest word to query={'g', 'r', 'd', 'n', 'i', 'a', 'u'} is guardian


In [31]:
closest_words = [words[d] for d in np.argsort(distances)]
closest_words[0:10]

['unniggard',
 'gurniad',
 'guarding',
 'undaring',
 'guardian',
 'indiguria',
 'ungrained',
 'antidrug',
 'unarraigned',
 'underguardian']

In [32]:
query = set('compuler')
distances = compute_distances(query,words)
print(f"the closest word to query={query} is {words[np.argmin(distances)]}")

the closest word to query={'o', 'r', 'e', 'l', 'p', 'u', 'm', 'c'} is operculum


In [33]:
closest_words = [words[d] for d in np.argsort(distances)]
closest_words[0:10]

['operculum',
 'preoperculum',
 'scruplesome',
 'macropleural',
 'praeoperculum',
 'leucospermous',
 'coelospermous',
 'crumple',
 'pleurocele',
 'compeller']

#### Improving set feature descriptor 
Let us improve the returned closest words using the jaccard distance.

We can compute the jaccard distance using substrings of n characters from a string, if we do this we introduce local information about the order of substrings.

In [34]:
def build_n_chars(w, n_chars=2):
    if len(w)<n_chars:
        return set([w])
    else:
        return set([w[i:i+n_chars] for i in range(len(w)-n_chars+1)])

In [46]:
build_n_chars('compuler',3)

{'com', 'ler', 'mpu', 'omp', 'pul', 'ule'}

In [47]:
words_set_2chars = [build_n_chars(w,2) for w in words]

##### closest word to 'compuler'


In [48]:
query = 'compuler'
query_transformed = build_n_chars(query,2)

In [49]:
distances = compute_distances(query_transformed, words_set_2chars)
idx =np.argsort(distances)[::-1]
[words[i] for i in idx[-10:]]

['compulsion',
 'compulsory',
 'computable',
 'puller',
 'compulsed',
 'compeller',
 'completer',
 'compiler',
 'computer',
 'puler']

In [50]:
indices, distances = find_k_nearest_indices(query_transformed, words_set_2chars ,10, return_distances=True)
top_k_words = [words[i] for i in indices]
top_k_dists = distances
df_top_k =pd.DataFrame({'words':top_k_words,
                        'distances':top_k_dists})
df_top_k

Unnamed: 0,words,distances
0,puler,0.428571
1,computer,0.444444
2,compiler,0.444444
3,completer,0.5
4,compeller,0.5
5,compulsed,0.5
6,puller,0.5
7,computable,0.545455
8,compulsory,0.545455
9,compulsion,0.545455


##### feature vector with single letters and pairs

In [51]:
query

'compuler'

In [52]:
words_feature_set = [ set(w) | build_n_chars(w,2) for w in words]
query_transformed = set(query) | build_n_chars(query,2)

In [53]:
indices, distances = find_k_nearest_indices(query_transformed, words_feature_set ,10, return_distances=True)
top_k_words = [words[i] for i in indices]
top_k_dists = distances
df_top_k = pd.DataFrame({'words':top_k_words,
                        'distances':top_k_dists})
df_top_k

Unnamed: 0,words,distances
0,compiler,0.333333
1,compeller,0.333333
2,computer,0.333333
3,completer,0.368421
4,compulsed,0.4
5,puler,0.4
6,compole,0.411765
7,coruler,0.411765
8,compeer,0.411765
9,compoer,0.411765


# Edit distance

Given two strings `x` and `y` the edit distance is the cheapest possible sequence of **character edits** to transform `x` to `y`.

Character edits (or character transformations) are:

- Insert a character `c`
- Delete `c`
- Replace `c` by `c'`


In [55]:
mistake = "drauing" 
words = ['cat', 'draw', 'drawing', 'drought', 'linking',
        'living', 'dragon', 'handemore', 'eliot', 'queen']

distances = []
for word in words:
    ed = nltk.edit_distance(mistake, word)
    print("d({:10},{}) = {}".format(word,mistake,ed))
    distances.append(ed)

print("\nthe closest word is", words[np.argmin(distances)])

d(cat       ,drauing) = 6
d(draw      ,drauing) = 4
d(drawing   ,drauing) = 1
d(drought   ,drauing) = 4
d(linking   ,drauing) = 4
d(living    ,drauing) = 4
d(dragon    ,drauing) = 3
d(handemore ,drauing) = 9
d(eliot     ,drauing) = 6
d(queen     ,drauing) = 6

the closest word is drawing


In [56]:
def edit_distance_recursive(x,y):
    if len(x) ==0:
        return len(y)
    if len(y) == 0:
        return len(x)
    
    delta = 0 if x[-1] == y[-1] else 1
    return min(edit_distance_recursive(x[:-1],y[:-1]) + delta,
               edit_distance_recursive(x[:-1],y)      + 1,
               edit_distance_recursive(x,y[:-1])      + 1)
     

In [57]:
edit_distance_recursive("ab", "ac")

1

In [31]:
%%time
edit_distance_recursive("house", "hause")

CPU times: user 888 µs, sys: 0 ns, total: 888 µs
Wall time: 897 µs


1

In [40]:
%%time
dist_recursive = edit_distance_recursive("superman", "supermaniac")
print(f"distance by edit_distance_recursive {dist_recursive}\n")

distance by edit_distance_recursive 3

CPU times: user 1.13 s, sys: 9.25 ms, total: 1.14 s
Wall time: 1.14 s


In [41]:
t =  len(words)/(1.13*60*60)
print(f"time needed to compute the distance between a w and all vocab {t} hours")

time needed to compute the distance between a w and all vocab 0.0024582104228121925 hours


How come nltk needs less than 100 µs?

In [42]:
%%time
dist_nltk = nltk.edit_distance("superman", "supermaniac")
print(f"distance by nltk {dist_nltk}\n")

distance by nltk 3

CPU times: user 174 µs, sys: 42 µs, total: 216 µs
Wall time: 196 µs


Notice that the `edit_dinstance_recursive` calls itself with the same input values many times, making it very slow.

We can know the number of times it call itself using a global variable counter `n`. 

In [43]:
n = 0
def edit_distance_recursive(x,y):
    global n
    if len(x) ==0:
        return len(y)
    if len(y) == 0:
        return len(x)
    
    if x =="super" and y=="sup":
        n += 1
    
    delta = 0 if x[-1] == y[-1] else 1
    return min(edit_distance_recursive(x[:-1],y[:-1]) + delta,
               edit_distance_recursive(x[:-1],y)      + 1,
               edit_distance_recursive(x,y[:-1])      + 1)
     

In [44]:
edit_distance_recursive("superman", "supermaniac")
n

833

###### Other types of edit distance


You can put a diferent cost to the different operations performed when computing an edit distance

In [53]:
w1 = 'mapping'
w2 = 'mappings'
nltk.edit_distance(w1, w2,substitution_cost=1)

1

In [46]:
w1 = 'mapping'
w2 = 'mappink'
nltk.edit_distance(w1, w2,substitution_cost=2)

2

## Using a big list of words


Fast implementations are important when computing edit distances between lots of candidates

In [47]:
import editdistance

If we have a lot of words it's important to make sure we use a fast distance immplementation.

- `editdistance` provides a much faster implementation than `nltk`


In [48]:
words = nltk.corpus.words.words()
len(words)

236736

In [49]:
#%%time
mistake = "drauing" 
distances = []
for word in words:
    ed = nltk.edit_distance(mistake, word)
    distances.append(ed)
    
print("\nthe closest word is", words[np.argmin(distances)])


the closest word is drawing


In [50]:
import editdistance

In [51]:
#%%time
mistake = "drauing" 
distances2 = []
for word in words:
    ed = editdistance.eval(mistake, word)
    distances2.append(ed)
    
print("\nthe closest word is", words[np.argmin(distances2)])


the closest word is drawing


In [52]:
# both implementations provide the same results
distances2 == distances

True

## Implementing an edit distance using dynamic programming


The basic idea to implement the edit distance in an efficient way relies on understanding that if we have the edit distances between two prefixes of two strings we can use it to compute the edit distance between the prefixes appending them with a character.



###### Example

Compute `d("hi","ho")`

```
    ""    h      i
""  0     1      2 
h   1     d,i,s  d,i,s
o   2     d,i,s  d,i,s
```

Which ends up with 

```
    ""    h    i
""  0     1    2 
h   1     0    1 
o   2     1    1
```

Compute `d("hi","hill")`


```
    ""    h        i
""  0     1        2 
h   1     d,i,s    d,i,s
i   2     d,i,s    d,i,s
l   3     d,i,s    d,i,s
l   4     d,i,s    d,i,s
```

```
    ""    h        i
""  0     1        2 
h   1     1,1,0    d,i,s
i   2     d,i,s    d,i,s
l   3     d,i,s    d,i,s
l   4     d,i,s    d,i,s
```







```
    ""    h    i
""  0     1    2 
h   1     0    1 
i   2     1    0
l   3     2    1
l   4     3    2
```



##### Dynamic programming

1) Define subproblems

- $x[i:]$ and $y[j:]$ for all i and j
- #subproblems = $O(|x| |y|)$

2) Guess part of the overall solution

we want to map the first character of x to the first character of y.


```
x = [x_1 x_2 ...]
y = [y_1 y_2 ...]
```



3) Define a recurrence

4) Recurrence + memoization

5) Solve original problem

## Edit distance full algorithm

Initialization
```
D(i,0) = i
D(0,j) = j
```

That is:

```
    ""    ------------- x --------------- 
""  0      1      2               lx-1 lx
|   1      0      0               0    0
|   2      0      0               0    0
    .      .      .               .    .
y   .      .      .               .    .
    .      .      .               .    .
|   ly-1   0      0               0    0
|   ly     0      0               0    0
```



Recurrence
```
For each i in 1...M
    For each j in 1...N
        del_char = D(i-1,j) + 1
        ins_char = D(i,j-1) + 1
     
        if X[i] != Y[j]:
            Z = 2
        if X[i] = Y[j]
            Z = 0
         
        sub_char = D(i-1,j-1) + Z
        D(i,j) = min(del_char, ins_char, sub_char)
```

Termination

The result will be the last position of the array `D` once it is filled.

```
return D(N,M) 
```



In [53]:
def initialize_table(s1,s2):
    len_x = len(s1)
    len_y = len(s2)
    D = np.zeros((len_x+1,len_y+1))

In [54]:
s1 = "hill"
s2 = "hello"
len_x = len(s1)
len_y = len(s2)
D = np.zeros((len_x+1,len_y+1))
D[:,0] = range(len_x+1)
D[0,:] = range(len_y+1)
D

array([[0., 1., 2., 3., 4., 5.],
       [1., 0., 0., 0., 0., 0.],
       [2., 0., 0., 0., 0., 0.],
       [3., 0., 0., 0., 0., 0.],
       [4., 0., 0., 0., 0., 0.]])

In [55]:
def memoization_table(s1, s2, D):
    colnames = ["empty"] + [x for x in s2]
    rownames = ["empty"] + [x for x in s1]
    print(pd.DataFrame(D, rownames, columns=colnames))

In [56]:
memoization_table(s1,s2,D)

       empty    h    e    l    l    o
empty    0.0  1.0  2.0  3.0  4.0  5.0
h        1.0  0.0  0.0  0.0  0.0  0.0
i        2.0  0.0  0.0  0.0  0.0  0.0
l        3.0  0.0  0.0  0.0  0.0  0.0
l        4.0  0.0  0.0  0.0  0.0  0.0


In [57]:
D

array([[0., 1., 2., 3., 4., 5.],
       [1., 0., 0., 0., 0., 0.],
       [2., 0., 0., 0., 0., 0.],
       [3., 0., 0., 0., 0., 0.],
       [4., 0., 0., 0., 0., 0.]])

In [58]:
s1 = "EXPONENTIAL"
s2 = "POLYNOMIAL"
len_x = len(s1)
len_y = len(s2)
D = np.zeros((len_x + 1, len_y + 1))
D[:,0] = range(len_x + 1)
D[0,:] = range(len_y + 1)
D

array([[ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10.],
       [ 1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 2.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 3.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 4.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 5.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 6.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 7.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 8.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 9.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [10.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [11.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.]])

In [59]:
s1 = "EXPONENTIAL"
s2 = "POLYNOMIAL"
len_x = len(s1)
len_y = len(s2)
D = np.zeros((len_x + 1,len_y + 1))
D[:,0] = range(len_x + 1)
D[0,:] = range(len_y + 1)
D

X = s1
Y = s2

w_sub = 1
w_del = 1
w_ins = 1

for i in range(1, len(X) + 1):
    for j in range(1, len(Y) + 1):
        
        del_char = D[i-1,j] + w_del
        ins_char = D[i,j-1] + w_ins
        
        if X[i-1] == Y[j-1]:
            Z = 0
        else:
            Z = w_sub
        sub_char = D[i-1,j-1] + Z

        D[i,j] = min(del_char, ins_char, sub_char)



In [60]:
memoization_table(X,Y,D)

       empty     P    O    L    Y    N    O    M    I    A     L
empty    0.0   1.0  2.0  3.0  4.0  5.0  6.0  7.0  8.0  9.0  10.0
E        1.0   1.0  2.0  3.0  4.0  5.0  6.0  7.0  8.0  9.0  10.0
X        2.0   2.0  2.0  3.0  4.0  5.0  6.0  7.0  8.0  9.0  10.0
P        3.0   2.0  3.0  3.0  4.0  5.0  6.0  7.0  8.0  9.0  10.0
O        4.0   3.0  2.0  3.0  4.0  5.0  5.0  6.0  7.0  8.0   9.0
N        5.0   4.0  3.0  3.0  4.0  4.0  5.0  6.0  7.0  8.0   9.0
E        6.0   5.0  4.0  4.0  4.0  5.0  5.0  6.0  7.0  8.0   9.0
N        7.0   6.0  5.0  5.0  5.0  4.0  5.0  6.0  7.0  8.0   9.0
T        8.0   7.0  6.0  6.0  6.0  5.0  5.0  6.0  7.0  8.0   9.0
I        9.0   8.0  7.0  7.0  7.0  6.0  6.0  6.0  6.0  7.0   8.0
A       10.0   9.0  8.0  8.0  8.0  7.0  7.0  7.0  7.0  6.0   7.0
L       11.0  10.0  9.0  8.0  9.0  8.0  8.0  8.0  8.0  7.0   6.0


In [61]:
# edit distance
D[-1,-1]

6.0

## Assuming same cost for all edits

In [62]:
def create_memoization_table(X,Y):

    len_x = len(X)
    len_y = len(Y)
    D = np.zeros((len_x+1,len_y+1))
    
    for i in range(len(X)+1):
        for j in range(len(Y)+1):

            if i == 0:
                D[i][j] = j    

            elif j == 0:
                D[i][j] = i  

            elif X[i-1] == Y[j-1]: 
                D[i][j] = D[i-1][j-1]

            else:
                D[i][j] = 1+min(D[i][j-1],      # Insert 
                                D[i-1][j],      # Remove 
                                D[i-1][j-1])    # Replace 
    return D

In [63]:
x = "EXPONENTIAL"
y = "POLYNOMIAL"
D = create_memoization_table(x,y)
memoization_table(x,y,D)
D[-1,-1]

       empty     P    O    L    Y    N    O    M    I    A     L
empty    0.0   1.0  2.0  3.0  4.0  5.0  6.0  7.0  8.0  9.0  10.0
E        1.0   1.0  2.0  3.0  4.0  5.0  6.0  7.0  8.0  9.0  10.0
X        2.0   2.0  2.0  3.0  4.0  5.0  6.0  7.0  8.0  9.0  10.0
P        3.0   2.0  3.0  3.0  4.0  5.0  6.0  7.0  8.0  9.0  10.0
O        4.0   3.0  2.0  3.0  4.0  5.0  5.0  6.0  7.0  8.0   9.0
N        5.0   4.0  3.0  3.0  4.0  4.0  5.0  6.0  7.0  8.0   9.0
E        6.0   5.0  4.0  4.0  4.0  5.0  5.0  6.0  7.0  8.0   9.0
N        7.0   6.0  5.0  5.0  5.0  4.0  5.0  6.0  7.0  8.0   9.0
T        8.0   7.0  6.0  6.0  6.0  5.0  5.0  6.0  7.0  8.0   9.0
I        9.0   8.0  7.0  7.0  7.0  6.0  6.0  6.0  6.0  7.0   8.0
A       10.0   9.0  8.0  8.0  8.0  7.0  7.0  7.0  7.0  6.0   7.0
L       11.0  10.0  9.0  8.0  9.0  8.0  8.0  8.0  8.0  7.0   6.0


6.0

In [64]:
x = "Elliot"
y = "Elia"
D = create_memoization_table(x,y)
memoization_table(x,y,D)
D[-1,-1]

       empty    E    l    i    a
empty    0.0  1.0  2.0  3.0  4.0
E        1.0  0.0  1.0  2.0  3.0
l        2.0  1.0  0.0  1.0  2.0
l        3.0  2.0  1.0  1.0  2.0
i        4.0  3.0  2.0  1.0  2.0
o        5.0  4.0  3.0  2.0  2.0
t        6.0  5.0  4.0  3.0  3.0


3.0

In [65]:
x = "hi"
y = "hill"
D = create_memoization_table(x,y)
memoization_table(x,y,D)
print("\nThe distance between {} and {} is {}".format(x,y,D[-1,-1]))

       empty    h    i    l    l
empty    0.0  1.0  2.0  3.0  4.0
h        1.0  0.0  1.0  2.0  3.0
i        2.0  1.0  0.0  1.0  2.0

The distance between hi and hill is 2.0


##### Timing implementation

In [66]:
x = "EXPONENTIAL"
y = "POLYNOMIAL"

In [67]:
def edit_distance_fast(x,y):
    D = create_memoization_table(x,y)
    return D[-1,-1]

In [68]:
%%timeit
edit_distance_fast(x,y)

164 µs ± 945 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [69]:
%%timeit
nltk.edit_distance(x,y)

72.8 µs ± 762 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [70]:
nltk.edit_distance(x,y) == edit_distance_fast(x,y)

True

### Different costs per operation

In [71]:
def memoization_table_weighted(X,Y):

    len_x = len(X)
    len_y = len(Y)
    D = np.zeros((len_x + 1, len_y + 1))
    D[:,0] = range(len_x + 1)
    D[0,:] = range(len_y + 1)

    w_sub = 1
    w_del = 1
    w_ins = 1

    for i in range(1, len_x + 1):
        for j in range(1, len_y + 1):
            del_char = D[i-1,j] + w_del
            ins_char = D[i,j-1] + w_ins

            if X[i-1] == Y[j-1]:
                Z = 0
            else:
                Z = w_sub
            sub_char = D[i-1,j-1] + Z

            D[i,j] = min(del_char, ins_char, sub_char)

    return D

In [72]:
x = "Elliot"
y = "Elia"
D = memoization_table_weighted(x, y)
memoization_table(x, y, D)
print("\nThe distance between {} and {} is {}".format(x,y,D[-1,-1]))

       empty    E    l    i    a
empty    0.0  1.0  2.0  3.0  4.0
E        1.0  0.0  1.0  2.0  3.0
l        2.0  1.0  0.0  1.0  2.0
l        3.0  2.0  1.0  1.0  2.0
i        4.0  3.0  2.0  1.0  2.0
o        5.0  4.0  3.0  2.0  2.0
t        6.0  5.0  4.0  3.0  3.0

The distance between Elliot and Elia is 3.0


### Timing 

In [73]:
x = "EXPONENTIAL"
y = "POLYNOMIAL"

In [74]:
%%timeit
D = memoization_table_weighted(x,y)

186 µs ± 1.7 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [75]:
%%timeit
nltk.edit_distance(x,y)

72 µs ± 272 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


# Speeding up code

Simple example with cython

In [32]:
%load_ext cython

In [33]:
def fib(n):
    a = 0.
    b = 1.
    for i in range(n):
        a, b = a + b, a
    return a

In [34]:
%%cython --a
cpdef cy_fib(int n):
    cdef int i
    cdef float a=0.0, b=1.0
    for i in range(n):
        a, b = a + b, a
    return a

In [35]:
fib(10)

55.0

In [36]:
cy_fib(10)

55.0

In [14]:
import timeit

n_times = 100000
t_fib = timeit.timeit("fib(10)", setup="from __main__ import fib",number=n_times)
t_cyfib = timeit.timeit("cy_fib(10)", setup="from __main__ import cy_fib",number=n_times)
t_fib_unit = t_fib/n_times

t_cyfib      = timeit.timeit("cy_fib(10)", setup="from __main__ import cy_fib",number=n_times)
t_cyfib_unit = t_cyfib/n_times

print(" Python version took: {} sec\n Cython version took: {} sec\n Cython is {:.0f}x faster"\
      .format(t_fib, t_cyfib, t_fib/t_cyfib))

print("\n Python version 1 run took: {} sec\n Cython version took: {} sec\n Cython is {:.0f}x faster"\
      .format(t_fib_unit, t_cyfib_unit, t_fib_unit/t_cyfib_unit))

 Python version took: 0.059523759000512655 sec
 Cython version took: 0.003585041999940586 sec
 Cython is 17x faster

 Python version 1 run took: 5.952375900051266e-07 sec
 Cython version took: 3.585041999940586e-08 sec
 Cython is 17x faster


### Speeding up edit distance



##### Exercise fill in cy_create_memoization_table so that it returns the matrix filled to compute the edit distance

In [87]:
%%cython --a

import numpy as np
cimport cython

cpdef cy_create_memoization_table(str X, str Y):

    
    # TODO BEGIN ---------------------------
    
    
    
    # TODO END ---------------------------
    
    #return D

In [83]:
D1 = create_memoization_table(x,y)

In [84]:
D2 = cy_create_memoization_table(x,y)
D2 = np.asarray(D2)

In [85]:
t_create_memoization_table = timeit.timeit("x='exponential'; y='polynomial'; create_memoization_table(x,y)",
                                           setup="import numpy as np; from __main__ import create_memoization_table",
                                          number=5000)

In [86]:
t_cy_create_memoization_table = timeit.timeit("x='exponential'; y='polynomial'; cy_create_memoization_table(x,y)",
                                              setup="from __main__ import cy_create_memoization_table",
                                              number=5000)

In [87]:
t_nltk = timeit.timeit("x='exponential'; y='polynomial'; nltk.edit_distance(x,y)",
                        setup="import nltk ",
                        number=5000)

In [88]:
print(""" 
      Python version took: {} sec
      Cython version took: {} sec
      nltk   version took: {} sec
      Cython is {:.0f}x faster than python
      Cython is {:.0f}x faster than nltk
      """\
      .format(t_create_memoization_table, 
              t_cy_create_memoization_table,
              t_nltk, 
              t_create_memoization_table/t_cy_create_memoization_table,
              t_nltk/t_cy_create_memoization_table))


 
      Python version took: 0.8201289099997666 sec
      Cython version took: 0.010222451001027366 sec
      nltk   version took: 0.34801899699959904 sec
      Cython is 80x faster than python
      Cython is 34x faster than nltk
      


### Return to the experiment where we computed closest word


##### Exercise: Return the last component of the DynamicProgramming matrix containing the edit distance

In [91]:
%%cython --a

import numpy as np
cimport cython

cpdef edit_distance(str X, str Y):

    ### TODO BEGIN ----------------------------------
    
    
    ### TODO END ------------------------------------
    return dist


In [90]:
edit_distance("lik", "cat")

3

In [91]:
words = nltk.corpus.words.words()
len(words)

236736

In [92]:
%%time
mistake = "drauing" 
distances = []
for word in words:
    ed = nltk.edit_distance(mistake, word)
    distances.append(ed)
    
print("\nthe closest word is", words[np.argmin(distances)])


the closest word is drawing
CPU times: user 10.3 s, sys: 28.2 ms, total: 10.3 s
Wall time: 10.3 s


In [109]:
%%time
mistake = "drauing" 
cy_distances = []
for word in words:
    ed = editdistance.eval(mistake, word)
    cy_distances.append(ed)
    
print("\nthe closest word is", words[np.argmin(cy_distances)])


the closest word is drawing
CPU times: user 451 ms, sys: 1.81 ms, total: 453 ms
Wall time: 452 ms


In [14]:
%%time
mistake = "drauing" 
cy_distances = []
for word in words:
    ed = edit_distance(mistake, word)
    cy_distances.append(ed)
    
print("\nthe closest word is", words[np.argmin(cy_distances)])

NameError: name 'edit_distance' is not defined

In [116]:
distances == cy_distances

True

In [112]:
editdistance.eval("hi", "hi"), edit_distance("hi","hi")

(0, 0)

In [113]:
editdistance.eval("hi", "ho"), edit_distance("hi","ho")

(1, 1)



##### Interesting material on string similarities

Approximate string matching:

https://medium.com/@wolfgarbe/fast-approximate-string-matching-with-large-edit-distances-in-big-data-2015-9174a0968c0b

Levenshtein distance using a trie:

http://stevehanov.ca/blog/?id=114

About jaccard distance:

https://python.gotrained.com/nltk-edit-distance-jaccard-distance/


Nice work on string similarities:

http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.19.7158&rep=rep1&type=pdf

