In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np 
import pandas as pd 
import os
import re
import random
from tqdm import tqdm, trange

from keras import backend as K
from keras.models import Model
from keras.layers import Input,LSTM,Dense,Bidirectional 

# Print the entries found in the ../input directory (the data file read below lives under it)
print(os.listdir("../input"))


In [None]:
# File loading
# Read the first sheet (index 0) of the last-name statistics workbook.
df = pd.read_excel('../input/finnish-last-names-20200206/sukunimitilasto-2020-02-06.xlsx', 0)
print(df.shape)
# dropna() returns a new frame instead of modifying df, so the result has to
# be assigned; otherwise the shape printed next can never differ from the
# one printed above.
df = df.dropna(axis=0, how='any')
print(df.shape)
print(df.columns)


In [None]:
# Collect the entries of the 'Sukunimi' column that are plain strings;
# anything of another type is skipped.
names = []
for entry in df['Sukunimi']:
    if type(entry) is str:
        names.append(entry)
print("Name Count:",len(names))
print(names[:4])

In [None]:
# Preprocessing
def process(sent):
    """Normalise a raw name so it only contains characters listed in char_set.

    The text is lower-cased, 'ç' and 'ê' are folded to 'c' and 'e', and every
    character that is not a space, a hyphen, a digit, a-z or one of
    á é ó ü å ä ö is removed.

    Args:
        sent: Raw name text (str).

    Returns:
        The cleaned, lower-cased string. It may be empty.
    """
    sent = sent.lower()
    # Fold these two accented letters first so they survive the whitelist.
    sent = sent.replace('ç', 'c').replace('ê', 'e')
    # Explicit whitelist. The previous pattern used the ranges 'a-ö' and
    # 'A-Ö', which are code-point ranges rather than alphabet ranges: they
    # also kept punctuation such as '[', '_' or '{', control characters and
    # accented letters that have no entry in the character index.
    # Newlines are outside the whitelist, so they are removed here as well.
    return re.sub(r'[^ a-z0-9áéóüåäö-]', '', sent)

# In process(), the negated character class removes every character it does not list
# Test preprocessing on a name that mixes upper case, a digit, a space, a hyphen and accented letters
a = "Von K0rhönúûünå-Virtánéênç"
b = process(a)
print(b)

In [None]:
# Normalise every name, then split names made of several
# whitespace-separated words into their individual parts.
names = [process(x) for x in names]
temp = [part for name in names for part in name.split()]
# Remove duplicates (a part of a multi-word name can also exist as an entry
# of its own). The result is sorted because plain set iteration order for
# strings changes between interpreter sessions, which would make names[i]
# and the order of the generated dataset differ from run to run.
names = sorted(set(temp))
print("\n".join(names[:4]))
print("Number of items:",len(names))

In [None]:
# CHAR INDEXING
# Every character the model can see; a character's position in this list
# is its integer code.
char_set = list(" aábcdéefghijklmnóopqrstüuvwxyzåäö0123456789-")
# character -> code
char2int = dict(zip(char_set, range(len(char_set))))
# code -> character (the inverse lookup)
int2char = dict(zip(char2int.values(), char2int.keys()))
print(char2int)
print(int2char)

In [None]:
# Append the special tokens directly after the regular characters, so they
# receive the next free integer codes in the order listed here.
codes = ["\t","\n",'#']
count = len(char_set)
for code in codes:
    char2int[code] = count
    int2char[count] = code
    count += 1
print(char2int)
print(int2char)

In [None]:
#thresh - 0 to 1
# Create false last name 
def gen_gibberish(name,thresh=0.5):
    """Return a randomly corrupted ("misspelled") copy of ``name``.

    The number of edit attempts is ``int(random.randrange(1, len(name)) * thresh)``.
    Each attempt performs one of:
        1. Delete a character           (6/10 * 7/10 of the attempts)
        2. Insert a random character    (6/10 * 3/10 of the attempts)
        3. Replace a character          (4/10 of the attempts)
    Inserted and replacement characters are drawn from the module-level
    ``char_set``; a replacement may pick the character that is already there.
    Edit positions are drawn from ``range(2, len(name))``, so the first two
    characters are never deleted or replaced and nothing is appended at the
    very end.

    Args:
        name: The correct name (str).
        thresh: Scale factor for the number of edit attempts, intended to be
            between 0 and 1.

    Returns:
        The corrupted name. Names shorter than two characters are returned
        unchanged, and so is any name for which zero attempts were drawn.
    """
    # randrange(1, len(name)) needs at least two characters; shorter names
    # used to raise ValueError, so hand them back untouched.
    if len(name) < 2:
        return name
    times = int(random.randrange(1,len(name)) * thresh)
    # '> 0' rather than '!= 0' so a negative thresh cannot loop forever.
    while times > 0:
        # try to gen noise length times...
        times -= 1
        # Positions 0 and 1 are never edited, so nothing is left to corrupt
        # once deletions have shrunk the name to two characters. This cannot
        # happen with the default thresh, only with larger values.
        if len(name) <= 2:
            break
        if random.randrange(0,10) <= 5:
            choice = random.randrange(0,10)
            #get random index
            index = random.randrange(2,len(name))
            if choice <= 6:
                #delete character
                name = name[:index] + name[index+1:]
            else:
                #add character
                new_char = char_set[random.randrange(0,len(char_set))]
                name = name[:index] + new_char + name[index:]
        else:
            #replace character
            new_char = char_set[random.randrange(0,len(char_set))]
            replace_index = random.randrange(2,len(name))
            name = name[:replace_index] + new_char + name[replace_index+1:]
    return name

# Show one example: a name from the list next to a corrupted copy of it
sample = names[6]
gib = gen_gibberish(sample)
print("Original:",sample)
print("Gibberish:",gib)
        
    

In [None]:
# create dataset
# Number of corrupted training copies generated for every name
REPEAT_FACTOR = 33
input_texts, target_texts = [], []
validation_texts, val_target_texts = [], []

for name in names:
    # The target is the correct name wrapped in the '\t' / '\n' markers
    output_text = '\t' + name + '\n'
    # Training instances: REPEAT_FACTOR corrupted copies, all sharing one target
    input_texts.extend(gen_gibberish(name) for _ in range(REPEAT_FACTOR))
    target_texts.extend([output_text] * REPEAT_FACTOR)
    # Testing instance: one further corrupted copy of the same name
    validation_texts.append(gen_gibberish(name))
    val_target_texts.append(output_text)

print("LEN OF SAMPLES:",len(input_texts))
print("LEN OF VALIDATION SAMPLES:", len(validation_texts))

In [None]:
# Longest sequences in the validation split
val_enc_len = max(len(x) for x in validation_texts)
val_dec_len = max(len(x) for x in val_target_texts)

# max_enc_len / max_dec_len also size the validation arrays, so they must
# cover BOTH splits. Corruption can insert characters, and a validation
# sample longer than every training sample would otherwise be written past
# the end of its array when it is one-hot encoded.
max_enc_len = max(max(len(x) for x in input_texts), val_enc_len)
max_dec_len = max(max(len(x) for x in target_texts), val_dec_len)
print("Max Enc Len:",max_enc_len)
print("Max Dec Len:",max_dec_len)

# For the validation
print("Max VAL Enc Len:",val_enc_len)
print("Max VAL Dec Len:",val_dec_len)


In [None]:
# Allocate the all-zero one-hot tensors, shaped (sample, time step, token).
num_samples = len(input_texts)
num_val_samples = len(validation_texts)

# Width of the token axis: encoder side uses the plain character set,
# decoder side has two extra slots.
enc_width = len(char_set)
dec_width = len(char_set)+2

encoder_input_data = np.zeros((num_samples, max_enc_len, enc_width), dtype='float32')
decoder_input_data = np.zeros((num_samples, max_dec_len, dec_width), dtype='float32')
decoder_target_data = np.zeros((num_samples, max_dec_len, dec_width), dtype='float32')

# For the validation
val_encoder_input_data = np.zeros((num_val_samples, max_enc_len, enc_width), dtype='float32')
val_decoder_input_data = np.zeros((num_val_samples, max_dec_len, dec_width), dtype='float32')
val_decoder_target_data = np.zeros((num_val_samples, max_dec_len, dec_width), dtype='float32')

print("CREATED ZERO VECTORS")

In [None]:
#filling in the enc,dec datas
def _fill_one_hot(sources, targets, enc_data, dec_in_data, dec_target_data):
    """One-hot encode paired texts into the pre-allocated arrays, in place.

    Each character is looked up in char2int and the matching cell is set to 1.
    The decoder target is the decoder input shifted one step to the left,
    i.e. it omits the first character of the target text.
    """
    for row, (source, target) in enumerate(zip(sources, targets)):
        for step, char in enumerate(source):
            enc_data[row, step, char2int[char]] = 1
        for step, char in enumerate(target):
            dec_in_data[row, step, char2int[char]] = 1
            if step > 0:
                dec_target_data[row, step-1, char2int[char]] = 1

# Training set
_fill_one_hot(input_texts, target_texts,
              encoder_input_data, decoder_input_data, decoder_target_data)

# For the validation set
_fill_one_hot(validation_texts, val_target_texts,
              val_encoder_input_data, val_decoder_input_data, val_decoder_target_data)

print(decoder_target_data.shape)
print(val_decoder_target_data.shape)
print("COMPLETED...")

In [None]:
# Training settings
latent_dim = 256
epochs = 20
batch_size = 128
#batch_size = 1024

# Size of the one-hot token axis on each side of the model
num_enc_tokens = len(char_set)
num_dec_tokens = num_enc_tokens + 2 # includes \n \t

# THE ENCODER
# Only the final hidden and cell states are kept; they seed the decoder.
encoder_inputs = Input(shape=(None, num_enc_tokens))
encoder = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
encoder_states = [state_h, state_c]


In [None]:
# THE DECODER
# The decoder LSTM starts from the encoder's final states and returns the
# full output sequence; its returned states are not used during training.
decoder_inputs = Input(shape=(None, num_dec_tokens))
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
lstm_sequence = decoder_lstm(decoder_inputs, initial_state=encoder_states)[0]

# Softmax over the decoder tokens at every time step
decoder_dense = Dense(num_dec_tokens, activation='softmax')
decoder_ouputs = decoder_dense(lstm_sequence)

model = Model([encoder_inputs, decoder_inputs], decoder_ouputs)

model.compile(optimizer='rmsprop', loss='categorical_crossentropy')
model.summary()

In [None]:
# Train on the corrupted -> correct pairs and keep the history object in h
train_inputs = [encoder_input_data, decoder_input_data]
val_inputs = [val_encoder_input_data, val_decoder_input_data]
h = model.fit(
    train_inputs,
    decoder_target_data,
    epochs=epochs,
    batch_size=batch_size,
    validation_data=(val_inputs, val_decoder_target_data),
)
model.save('s2s.h5')


In [None]:
# Training and validation loss per epoch, drawn in the order used by the legend
for series in ('loss', 'val_loss'):
    plt.plot(h.history[series])
plt.title('model loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

In [None]:
# Inference-time encoder: input sequence -> [hidden state, cell state]
encoder_model = Model(encoder_inputs, encoder_states)

# Inference-time decoder: reuses the trained decoder_lstm and decoder_dense
# layers, but takes its initial states as explicit inputs and also returns
# the updated states.
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model(
    [decoder_inputs, *decoder_states_inputs],
    [decoder_outputs, *decoder_states],
)
encoder_model.save('encoder.h5')
decoder_model.save('decoder.h5')


def decode_sequence(input_seq):
    """Decode one encoded input into text, one greedy character at a time.

    input_seq is handed to encoder_model.predict; the resulting states seed
    decoder_model, which is then called once per generated character. As in
    the original implementation, a batch of size 1 is assumed.

    Returns:
        The generated string. It ends with '\n' when the stop character was
        produced; otherwise generation stops as soon as the string is longer
        than max_dec_len.
    """
    def one_hot_step(token_index):
        # A (1, 1, num_dec_tokens) decoder input holding a single token.
        step = np.zeros((1, 1, num_dec_tokens))
        step[0, 0, token_index] = 1.
        return step

    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)

    # The first decoder input is the start character.
    target_seq = one_hot_step(char2int['\t'])

    decoded_sentence = ''
    while True:
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value)

        # Take the most probable token of the last time step.
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = int2char[sampled_token_index]
        decoded_sentence += sampled_char

        # Stop on the end character or once the maximum length is exceeded.
        if sampled_char == '\n' or len(decoded_sentence) > max_dec_len:
            break

        # Feed the sampled token and the new states into the next step.
        target_seq = one_hot_step(sampled_token_index)
        states_value = [h, c]

    return decoded_sentence

# Loop and wait for the user to input last names
while True:
    # Ask the user for a name.
    new_name = input("Input a last name or enter 'quit': ").lower()
    if new_name == 'quit':
        # Leave before decoding: previously the word 'quit' itself was run
        # through the model and "corrected" before the loop ended.
        break

    # The encoder input only has one slot per character in char_set, so any
    # other character cannot be encoded (unknown ones raise KeyError, the
    # special tokens index past the end of the one-hot axis).
    unknown = sorted(set(new_name) - set(char_set))
    if unknown:
        print("Unsupported characters: ", " ".join(repr(c) for c in unknown))
        print("-")
        continue
    # The input buffer holds max_enc_len time steps; longer text would be
    # written out of bounds, and empty text leaves nothing to correct.
    if not new_name or len(new_name) > max_enc_len:
        print("Please enter between 1 and", max_enc_len, "characters.")
        print("-")
        continue

    # One-hot encode the typed name, zero-padded up to max_enc_len.
    input_name = np.zeros( (1 , max_enc_len , len(char_set)),dtype='float32' )
    for t,char in enumerate(new_name):
        input_name[ 0 , t , char2int[char] ] = 1

    decoded_name = decode_sequence(input_name)

    # Drop the end-of-sequence marker (and anything after it).
    decoded_name = decoded_name.split('\n')[0]

    if new_name == decoded_name:
        print("The name is correct!")
    else:
        print("You wrote: ",new_name, ", Did you mean: ", decoded_name)

    print("-")


In [None]:
# Exact-match accuracy on the validation set: a sample counts as correct
# only when the whole decoded string equals its target text.
amount = val_encoder_input_data.shape[0]
true = 0

for idx in range(amount):
    # Slice instead of index so the batch axis (size 1) is kept
    name = val_encoder_input_data[idx:idx+1]
    # Targets start with '\t', which the decoder output does not contain
    decoded_n = "\t" + decode_sequence(name)

    # A bool adds as 0 or 1
    true += decoded_n == val_target_texts[idx]

    if idx % 1000 == 0:
        print(idx, " validation names processed...")

acc = true/amount
print("The binary accuracy of the model: ", acc)