# Correspondencia en direcciones postales

In [2]:
import random
import string
import numpy as np
import tensorflow as tf

In [3]:
# Synthetic-data configuration for the address-matching demo.
SEED = 42  # fixed seed so a fresh-kernel "Run All" regenerates the same data
random.seed(SEED)

n = 10  # number of addresses to generate
street_names = ["diagon", "elm", "abbey", "gran", "python"]
street_type = ["callejon", "calle", "carrera", "via", "avenida"]
# Pool of five candidate zip codes in [20000, 29999].
street_zips = [random.randint(20000, 29999) for _ in range(5)]
# One house number in [1, 999] per generated address.
numbers = [random.randint(1, 999) for _ in range(n)]

In [35]:
# Draw n random street names, street types and zip codes.
streets = [random.choice(street_names) for _ in range(n)]
street_prefs = [random.choice(street_type) for _ in range(n)]
zips = [random.choice(street_zips) for _ in range(n)]
# A full street string reads "<type> <name> <number>".
full_streets = [
    " ".join((pref, name, str(number)))
    for pref, name, number in zip(street_prefs, streets, numbers)
]
# Each reference record is a two-element list: [full street string, zip code].
reference_data = [[street, zip_code] for street, zip_code in zip(full_streets, zips)]

In [36]:
# Display the reference records; each entry is [full street string, zip code].
reference_data

[['callejon abbey 2308', 27827],
 ['avenida canal 709', 29404],
 ['carrera baker 2273', 27460],
 ['callejon elm 1843', 27460],
 ['via elm 8769', 25953],
 ['carrera elm 3798', 24746],
 ['carrera canal 2288', 25953],
 ['calle abbey 2416', 29404],
 ['callejon baker 543', 25953],
 ['via baker 994', 25953]]

In [37]:
def create_typo(s, prob=0.75):
    """Return ``s`` with at most one character randomly replaced.

    With probability ``prob`` one position of ``s`` is picked uniformly at
    random and overwritten with a random lowercase ASCII letter. The
    replacement letter may coincide with the original character, in which
    case the string comes back unchanged.

    Args:
        s: String to corrupt.
        prob: Probability, in [0, 1], of attempting a replacement.

    Returns:
        The possibly modified string, always the same length as ``s``.
    """
    # An empty string has no position to corrupt; return it untouched instead
    # of trying to choose from an empty range.
    if not s:
        return s
    # Honor the caller-supplied probability rather than a hard-coded 0.75.
    if random.uniform(0, 1) < prob:
        rand_idx = random.randrange(len(s))
        s_list = list(s)
        s_list[rand_idx] = random.choice(string.ascii_lowercase)
        s = ''.join(s_list)
    return s

In [38]:
# Apply the typo generator to every street name, in order.
typo_streets = list(map(create_typo, streets))

In [39]:
# Rebuild the full street strings using the (possibly) misspelled street names.
typo_full_streets = [
    " ".join((pref, name, str(number)))
    for pref, name, number in zip(street_prefs, typo_streets, numbers)
]
# Test records mirror the reference layout: [full street string, zip code].
test_data = [[street, zip_code] for street, zip_code in zip(typo_full_streets, zips)]

In [40]:
# Display the test records; each entry is [full street string with typos, zip code].
test_data

[['callejon abbey 2308', 27827],
 ['avenida cwnal 709', 29404],
 ['carrera baker 2273', 27460],
 ['callejon plm 1843', 27460],
 ['via eli 8769', 25953],
 ['carrera elm 3798', 24746],
 ['carrera canal 2288', 25953],
 ['calle abbeb 2416', 29404],
 ['callejon baker 543', 25953],
 ['via bakew 994', 25953]]

In [41]:
# Session used later to evaluate the matching graph via session.run(...).
# NOTE(review): tf.Session is the TF 1.x graph-mode API; under TF 2.x this
# would presumably need tf.compat.v1 — confirm the target version.
session = tf.Session()

In [42]:
# Inputs for the address being corrected: a sparse string tensor for the
# street text and a one-column float matrix for the zip code.
test_address = tf.sparse_placeholder(dtype=tf.string)
test_zip = tf.placeholder(dtype=tf.float32, shape=[None, 1])

# Inputs for the reference set: sparse string tensor plus a float matrix
# with n zip-code columns.
ref_address = tf.sparse_placeholder(dtype=tf.string)
ref_zip = tf.placeholder(dtype=tf.float32, shape=[None, n])

In [43]:
# Squared difference between every reference zip and the test zip.
zip_dist = tf.square(ref_zip - test_zip)
# Edit distance between test and reference addresses, normalized by the
# reference (truth) length.
address_dist = tf.edit_distance(
    hypothesis=test_address,
    truth=ref_address,
    normalize=True,
)

- $S(x,y) = 0$ si $x$ e $y$ son totalmente diferentes (no se parecen en nada)
- $S(x,x) = 1$, ya que todo objeto es similar (si no igual) a sí mismo.
- $S(x,y) = \frac{D - d(x,y)}{D-d}$ donde $D$ es la mayor distancia entre dos objetos posibles, y $d$ es la menor.

In [44]:
# Largest and smallest zip distance, looked up through their argmax/argmin
# positions in the flattened distance vector.
zip_dist_flat = tf.squeeze(zip_dist)
zip_max = tf.gather(zip_dist_flat, tf.argmax(zip_dist, 1))
zip_min = tf.gather(zip_dist_flat, tf.argmin(zip_dist, 1))
# Min-max similarity: (D - d(x, y)) / (D - d).
# NOTE(review): when zip_max equals zip_min the denominator is zero.
zip_sim = (zip_max - zip_dist) / (zip_max - zip_min)

In [45]:
# Turn the normalized edit distance into a similarity score.
address_sim = 1.0 - address_dist

$$S(x,y) = \sum_{i=1}^k w_i S_i(x,y):\quad \sum_{i=1}^k w_i = 1$$

In [46]:
# Weights of the two similarity terms; they are complementary and sum to 1.
address_wi = 0.5
zip_wi = 1.0 - address_wi

In [47]:
# Weighted sum of both similarities; the address term is transposed before
# being added to the zip term.
weighted_sim = (
    tf.transpose(address_wi * address_sim)
    + zip_wi * zip_sim
)

In [48]:
# Index of the highest combined similarity along axis 1.
top_match_idx = tf.argmax(weighted_sim, axis=1)

In [49]:
def sparse_from_word_vector(word_vector):
    """Encode a list of strings as a character-level sparse tensor value.

    Character ``j`` of word ``i`` is stored at index ``[i, 0, j]``, so every
    word becomes a sequence of single-character strings along the last axis.

    Args:
        word_vector: Sequence of strings to encode.

    Returns:
        A ``tf.SparseTensorValue`` whose values are the individual characters
        and whose dense shape is ``[len(word_vector), 1, max_len]``, where
        ``max_len`` is the length of the longest word (at least 1).
    """
    num_words = len(word_vector)
    # The last dimension must be large enough for the longest word, otherwise
    # the declared shape contradicts the indices emitted below. Keep it at
    # least 1 so empty input still yields a well-formed shape.
    max_len = max([len(word) for word in word_vector] + [1])
    idx = [[word_pos, 0, char_pos]
           for word_pos, word in enumerate(word_vector)
           for char_pos in range(len(word))]
    chars = list(''.join(word_vector))
    return tf.SparseTensorValue(idx, chars, [num_words, 1, max_len])

In [53]:
# Split the reference records into a list of address strings and a
# single-row array of zip codes.
reference_address = [address for address, _ in reference_data]
reference_zips = np.array([[zip_code for _, zip_code in reference_data]])

In [54]:
# Sparse character-level encoding of the reference addresses; built once
# and fed as ref_address on every iteration of the matching loop.
sparse_ref_set = sparse_from_word_vector(reference_address)

In [56]:
# Look up the best-matching reference record for every test address.
for i in range(n):
    test_address_entry, test_zip_aux = test_data[i]
    test_zip_entry = [[test_zip_aux]]

    # Repeat the query n times before encoding it as a sparse tensor value.
    test_address_rep = [test_address_entry] * n
    sparse_test_set = sparse_from_word_vector(test_address_rep)

    feed_dict = {
        test_address: sparse_test_set,
        test_zip: test_zip_entry,
        ref_address: sparse_ref_set,
        ref_zip: reference_zips,
    }

    # The first element of the result is the index of the best reference.
    best_match = session.run(top_match_idx, feed_dict=feed_dict)
    best_idx = best_match[0]
    best_address = reference_address[best_idx]
    best_zip = reference_zips[0][best_idx]

    print("Dirección original: " + str(test_address_entry) + ", " + str(test_zip_aux))
    print("Dirección corregida: " + str(best_address) + ", " + str(best_zip) + "\n")

Dirección original: callejon abbey 2308, 27827
Dirección corregida: callejon abbey 2308, 27827

Dirección original: avenida cwnal 709, 29404
Dirección corregida: avenida canal 709, 29404

Dirección original: carrera baker 2273, 27460
Dirección corregida: carrera baker 2273, 27460

Dirección original: callejon plm 1843, 27460
Dirección corregida: callejon elm 1843, 27460

Dirección original: via eli 8769, 25953
Dirección corregida: via elm 8769, 25953

Dirección original: carrera elm 3798, 24746
Dirección corregida: carrera elm 3798, 24746

Dirección original: carrera canal 2288, 25953
Dirección corregida: carrera canal 2288, 25953

Dirección original: calle abbeb 2416, 29404
Dirección corregida: calle abbey 2416, 29404

Dirección original: callejon baker 543, 25953
Dirección corregida: callejon baker 543, 25953

Dirección original: via bakew 994, 25953
Dirección corregida: via baker 994, 25953

