## Load Data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import functools

# Fixed seed so the shuffle below -- and therefore the row-order-based
# test/train split built later in the notebook -- is identical on every
# Restart & Run All.
SHUFFLE_SEED = 42

# Postcode is read as text so that a leading zero, if present, is not lost and
# so the .strip() call applied to it when building examples always works.
df = pd.read_csv("addresses.csv", encoding="ISO-8859-1", dtype={"Postcode": str})
df = df.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)  # Shuffle data
df.head(10)

Unnamed: 0,Name,Street,City,Postcode,State
0,EASY WAY CONSTRUCTIONS PTY LTD,23 SANDPIPER DR,MIDWAY POINT,7171,TAS
1,DAWSON CONSTRUCTION QLD PTY LTD,7 NELSON CRT,BENOWA,4217,QLD
2,VICTORIA POLICE,LEVEL 11/313 SPENCER ST,DOCKLANDS,3008,VIC
3,MICK ROSE PLUMBING,UNIT 2-25 BAYVISTA RISE,SOMERVILLE,3912,VIC
4,RAYMOND C POBJOY & ASSOCIATES,4/868 MALVERN RD,ARMADALE,3143,VIC
5,PKC ELECTRICAL,33 MADELINE ST,MUDGEERABA,4213,QLD
6,SEATONS ALUMINIUM,91 MAITLAND RD,SANDGATE,2304,NSW
7,JACARANDA MOTOR LODGE,PO BOX 230,GRAFTON,2460,NSW
8,DTS BUILDERS,PO BOX 1649 MILTON,MILTON,4064,QLD
9,DARYL ANSELL,7 COOLUM CL,KEWARRA BEACH,4879,QLD


## Unique States

In [None]:
# Distinct values found in the State column.
df["State"].unique()

## Unique Cities

In [None]:
# Strip BEFORE de-duplicating so names that differ only by surrounding
# whitespace are counted once; missing values are dropped instead of crashing
# on a float NaN that has no .strip().
city_list = df.City.dropna().astype(str).str.strip().unique().tolist()

In [None]:
# Single-column frame of the distinct city names; the displayed value is the
# number of distinct cities.
df_cities = pd.DataFrame(data=city_list)
df_cities.shape[0]

## Unique streets

In [None]:
# Strip BEFORE de-duplicating so streets that differ only by surrounding
# whitespace are counted once; missing values are dropped instead of crashing
# on a float NaN that has no .strip().
street_list = df.Street.dropna().astype(str).str.strip().unique().tolist()

In [None]:
# Frame of distinct street lines, then a spot-check of one arbitrary entry
# (position 5000) to eyeball what a typical value looks like.
df_streets = pd.DataFrame({"street": street_list})
df_streets.iloc[5000]

## Unique buildings

In [None]:
# Missing names are dropped first: str(NaN) would otherwise add a bogus
# building called "nan". Stripping happens before unique() so names that differ
# only by surrounding whitespace are counted once.
building_list = df.Name.dropna().astype(str).str.strip().unique().tolist()
# Column name fixed from the misspelt "bulding".
df_buildings = pd.DataFrame(building_list, columns=["building"])
df_buildings.head(100)

## Helper functions to encode/decode vocabs

In [2]:
import string

# Allowable characters for the encoded representation
chars = list(string.digits + string.ascii_lowercase + string.punctuation + string.whitespace)

# Reverse lookup (character -> vocab index) so encoding costs one dict access
# per character instead of a linear scan of `chars`.
_char_to_index = {}
for _idx, _ch in enumerate(chars):
    _char_to_index.setdefault(_ch, _idx)  # keep the first position, like list.index

def chars_encode(characters: str) -> np.ndarray:
    """
    Converts a string into an array of vocab indices.

    The string is lower-cased before lookup. Characters that are not in
    ``chars`` are encoded as index 0.
    NOTE(review): index 0 is also the index of the digit "0", so an unknown
    character and "0" are indistinguishable after encoding -- confirm this is
    intended.

    :param characters: the string to convert
    :return: an int64 array holding one vocab index per (lower-cased) character
    """
    return np.array([_char_to_index.get(c, 0) for c in characters.lower()],
                    dtype=np.int64)

def chars_decode(vocab_indices: np.ndarray) -> str:
    """
    Converts vocab indices back into a string.

    Because encoding lower-cases its input and maps unknown characters to 0,
    decoding does not necessarily reproduce the original text.

    :param vocab_indices: iterable of integer indices into ``chars``
    :return: the string formed by the corresponding characters
    :raises IndexError: if an index lies beyond the end of ``chars``
    """
    return "".join(chars[n] for n in vocab_indices)

print(len(chars))

74


## Label Definitions

In [3]:
# Per-character classes. The position of a name in this list is its class index
# (the column that is set to 1 in the one-hot label rows).
labels_list = ["building", "street", "city", "postcode", "state", "blank"]
n_labels = len(labels_list)

## Label Generation Function

In [103]:
# TODO: Label generation function
# Add a bit of randomness (typos, commas, random sentences before and after the address, etc.)
# y is one-hot over {building, street, city, postcode, state, blank}, e.g. y = [0,1,0,0,0,0] -> street
import random

def generate_label(text: str, field_name: str) -> (str, np.ndarray):
    """
    Builds the per-character one-hot label matrix for a single address field.

    :param text: the field text; one label row is produced per character
    :param field_name: an entry of ``labels_list`` naming the field
    :return: ``(text, labels)`` where ``labels`` is a float32 array of shape
             ``(len(text), n_labels)`` with a 1 in the column of ``field_name``
    :raises ValueError: if ``field_name`` is not in ``labels_list``
    """
    one_hot = np.zeros((len(text), n_labels), dtype=np.float32)
    field_column = labels_list.index(field_name)
    one_hot[:, field_column] = 1
    return text, one_hot

def generate_address(building:str, 
                     street:str, 
                     city:str, 
                     postcode:str, 
                     state: str) -> (str, np.ndarray):
    """
    Assembles one synthetic address string together with per-character labels.

    The layout is ``[building] street`` followed by city, postcode and state in
    random order. A single separator (newline, comma or space), chosen at
    random per call, is placed between consecutive parts and labelled
    ``blank``.

    :param building: building/business name; when empty, the building part and
                     its separator are left out entirely
    :param street: street line of the address
    :param city: city / suburb name
    :param postcode: postcode as text
    :param state: state abbreviation
    :return: ``(text, labels)`` where ``labels`` has one one-hot row (as built
             by ``generate_label``) per character of ``text``
    """
    # Chance of inserting separator between text: one style per address.
    sep_text, sep_labels = generate_label(random.choice(["\n", ",", " "]), 'blank')

    suburb_state_code = [
        generate_label(city, "city"),
        generate_label(postcode, "postcode"),
        generate_label(state, "state"),
    ]
    random.shuffle(suburb_state_code)

    parts = [generate_label(street, "street")] + suburb_state_code
    # Test the text itself. The (text, labels) tuple returned by generate_label
    # is always truthy, so testing it would keep an empty building and produce
    # an address that starts with a stray separator.
    if building:
        parts.insert(0, generate_label(building, "building"))

    texts = []
    label_blocks = []
    for position, (part_text, part_labels) in enumerate(parts):
        if position > 0:
            texts.append(sep_text)
            label_blocks.append(sep_labels)
        texts.append(part_text)
        label_blocks.append(part_labels)

    return "".join(texts), np.concatenate(label_blocks, axis=0)

def to_category(labels: np.ndarray):
    """
    Converts one-hot label rows into label indices.

    :param labels: 2-D array with one one-hot row per character
    :return: list holding, for each row, the position of its first entry equal to 1
    :raises ValueError: if a row contains no entry equal to 1
    """
    # 1.0 == True in Python, so list.index(True) locates the hot column.
    return [row.tolist().index(True) for row in labels]
                     
# Smoke test: build one labelled address and display its label matrix.
address = generate_address(
    building="test",
    street="16 colville cresc",
    city="keysborough",
    postcode="3173",
    state="vic",
)
address[1]
# Optional extra checks:
#print(to_category(address[1]))
#print(chars_encode(address[0]))

array([[1., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1.],
       [0., 1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 1.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0.

## Generate Data (Dev, Test, Train)

In [182]:
# Build the test and train sets from the shuffled rows (first 3K rows -> test, next 30K rows -> train)
# TODO: also carve out a separate dev set
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences

N_TEST = 3000    # the first N_TEST shuffled rows become the test set
N_TRAIN = 30000  # the next N_TRAIN shuffled rows become the training set

def _field_text(value) -> str:
    """Returns a CSV cell as stripped text; missing values (NaN/None) become ""."""
    return "" if pd.isna(value) else str(value).strip()

x_test, y_test = [], []
x_train, y_train = [], []
# Rows beyond N_TEST + N_TRAIN are never used, so only the needed ones are
# iterated. The split is by row position, not by index label.
for position, (_, row) in enumerate(df.head(N_TEST + N_TRAIN).iterrows()):
    # A missing Name yields "" (no building) instead of the literal text "nan".
    building = _field_text(row.Name)
    street = _field_text(row.Street)
    city = _field_text(row.City)
    postcode = _field_text(row.Postcode)
    state = _field_text(row.State)
    # Drop the building name from roughly half of the examples.
    if random.random() < 0.5:
        building = ""
    address = generate_address(building, street, city, postcode, state)
    address_encoded = chars_encode(address[0])
    labels = address[1]
    if position < N_TEST:
        x_test.append(address_encoded)
        y_test.append(labels)
    else:
        x_train.append(address_encoded)
        y_train.append(labels)


print("Size of x test:", len(x_test))
print("Size of y test:", len(y_test))
print("Size of x train:", len(x_train))
print("Size of y train:", len(y_train))

# Right-pad every sequence to the longest one in its own set.
# NOTE(review): the pad value is pad_sequences' default (0), which is also the
# code chars_encode emits for the digit "0" and for unknown characters --
# confirm padding and real input should share that value.
x_train = pad_sequences(x_train, padding='post')
y_train = pad_sequences(y_train, padding='post')
x_test = pad_sequences(x_test, padding='post')
y_test = pad_sequences(y_test, padding='post')







Size of x test: 3000
Size of y test: 3000
Size of x train: 30000
Size of y train: 30000


## Global Parameters, Model Definition and Training

In [197]:
import numpy as np
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional, TimeDistributed
from keras.models import Sequential
from keras.models import load_model, Model
from keras.utils import to_categorical

# Training hyper-parameters and model dimensions
batch_size = 32
data_dim = len(chars)   # vocabulary size for the Embedding layer (74 with the current `chars`)
nb_classes = n_labels   # one output class per entry of labels_list (6)

model = Sequential()
# NOTE(review): mask_zero=True makes the downstream layers skip every timestep
# whose input index is 0. chars_encode also emits 0 for the digit "0" and for
# unknown characters, so those positions are masked together with the padding
# -- confirm, or reserve index 0 for padding and enlarge the vocabulary by one.
model.add(Embedding(data_dim, 64, input_length=None, mask_zero=True))
model.add(Bidirectional(LSTM(64, return_sequences=True)))
model.add(Bidirectional(LSTM(64, return_sequences=True)))
# One softmax over the label classes for every character position.
model.add(TimeDistributed(Dense(nb_classes, activation='softmax')))
model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

print('Train...')
# validation_data is passed as an (x, y) tuple, the form Keras documents;
# a list is not accepted as an (x, y) pair by every Keras version.
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=4,
          validation_data=(x_test, y_test))

Train...
Train on 30000 samples, validate on 3000 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.callbacks.History at 0x64b1d3b50>

In [227]:
address = generate_address("Plumbing kenny","16 colville cresc", "keysborough", "3173", "VIC")
x_text = address[0]
print(x_text)
# Shape (1, n_chars): a batch holding the single encoded address.
x_encoded = chars_encode(x_text).reshape(1, -1)
# Class per character = arg-max over the softmax outputs. This is what
# Sequential.predict_classes returned; that helper no longer exists in newer
# Keras releases, so the arg-max is taken explicitly.
predict = np.argmax(model.predict(x_encoded, verbose=0), axis=-1)
# Pair each original character with the label predicted for its position.
for ch, label_idx in zip(x_text, predict[0]):
    print(ch, '->', labels_list[label_idx])

Plumbing kenny 16 colville cresc keysborough VIC 3173
P -> building
l -> building
u -> building
m -> building
b -> building
i -> building
n -> building
g -> building
  -> building
k -> building
e -> building
n -> building
n -> building
y -> building
  -> blank
1 -> street
6 -> street
  -> street
c -> street
o -> street
l -> street
v -> street
i -> street
l -> street
l -> street
e -> street
  -> street
c -> street
r -> street
e -> street
s -> street
c -> street
  -> blank
k -> city
e -> city
y -> city
s -> city
b -> city
o -> city
r -> city
o -> city
u -> city
g -> city
h -> city
  -> blank
V -> state
I -> state
C -> state
  -> blank
3 -> postcode
1 -> postcode
7 -> postcode
3 -> postcode


## Save model and weights

In [225]:
# Persist the trained model as two files in the working directory:
# model.json (from model.to_json()) and model.h5 (from model.save_weights()).
# serialize model to JSON
model_json = model.to_json()
with open("model.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5 (kept in a separate file from the JSON above)
model.save_weights("model.h5")
print("Saved model to disk")

Saved model to disk


In [42]:

# model_from_json is not imported anywhere else in the notebook, so import it
# here; without it this cell fails with NameError on a fresh kernel.
from keras.models import model_from_json

# load json and create model
# (the with-block closes the file even if reading fails)
with open('model.json', 'r') as json_file:
    loaded_model_json = json_file.read()
loaded_model = model_from_json(loaded_model_json)
# load weights into new model
loaded_model.load_weights("model.h5")
print("Loaded model from disk")

[[[ 1]
  [ 2]
  [ 3]
  [ 4]
  [ 5]
  [ 6]
  [ 7]
  [ 8]
  [ 9]
  [10]]

 [[ 2]
  [ 3]
  [ 4]
  [ 5]
  [ 6]
  [ 7]
  [ 8]
  [ 9]
  [10]
  [11]]]
