In [1]:
import json
import os
import utils
import torch
import torch.nn.functional as F

model_dir = "experiments/"
json_path = os.path.join(model_dir, 'params.json')
params = utils.Params(json_path)
params.device = torch.device(params.device)

In [3]:
from model.data_loader import DataLoader

data_dir = "data/full_version/"
data_loader = DataLoader(data_dir, params)
params.vocab_size = len(data_loader.BABYNAME.vocab)

vocab built


In [4]:
import pandas as pd

_train = pd.read_csv(os.path.join(data_dir, "train/train_dataset.csv"))
_val = pd.read_csv(os.path.join(data_dir, "val/val_dataset.csv"))
total_df = pd.concat([_train, _val], 0)

In [5]:
import model.charRNN as net
model = net.Net(params).to(params.device)

In [6]:
weight_path = os.path.join(model_dir, "best.pth.tar")
checkpoint = utils.load_checkpoint(weight_path, model)

In [7]:
from collections import OrderedDict, defaultdict

In [8]:
data_loader.SEX.process(['boy'])

tensor([[ 0,  1]])

In [9]:
data_loader.SEX.process(['girl'])

tensor([[ 1,  0]])

In [26]:
def sample(net, prime, category):
    
    category_tensor = data_loader.SEX.process([category]).float()
    
    prime = prime.lower()
    prime_tensor = data_loader.BABYNAME.process([prime])[:, :-1]
    bsz, prime_tensor_length = prime_tensor.size()
    
    net.eval()
    hidden = net.init_hidden(1)
    
    for step in range(prime_tensor_length):
        with torch.no_grad():
            outputs, hidden = net(category_tensor, prime_tensor[:, step], hidden)
        probabilities = F.softmax(outputs, 1)
    
    return torch.log(probabilities.squeeze()), hidden

In [27]:
def clean_beam_basket(basket, beam_width):
    _tmp_basket = basket.copy()
    to_remove = sorted(_tmp_basket.items(), key=lambda x: x[1], reverse=True)[beam_width:]
    for item in to_remove:
        _tmp_basket.pop(item[0])
        
    return _tmp_basket

In [28]:
import numpy as np

In [29]:
def evaluate_creativity(name_list, dataset):
    for a_name in name_list:
        if dataset.babyname.str.contains(a_name).sum() > 0:
            verdict = "is already in the dataset"
        else:
            verdict = "is a new name!"
        print("name: {} ".format(a_name) + verdict)

In [40]:
def beam_search(net, beam_width, alpha, prime="A", category="boy"):
    print("Sampling a {} name beginning with {}..".format(category, prime))
        
    beam_basket = OrderedDict()
    beam_basket[prime] = 0.0
    
    
    counter = 0
    while True:
        counter += 1
        
        # 바스켓을 청소한다.
        beam_basket = clean_beam_basket(beam_basket, beam_width)
        
        # 만약 바스켓에 모든 아이템이 <eos>가 있으면 루프를 멈춘다.
        eos_cnt = 0
        for k in beam_basket.keys():
            if "<eos>" in k:
                eos_cnt += 1
        if eos_cnt == beam_width:
            print("all items have <eos>")
            break
            
        # 모든 key를 돌면서
        ## <eos>가 없는 경우 inference를 한다.
        new_entries = {}
        to_remove = []
        for k in beam_basket.keys():
            if "<eos>" not in k:
                probabilities, hidden = sample(net, prime=k, category=category)
                for ix, prob in enumerate(probabilities):
                    new_k = k + data_loader.BABYNAME.vocab.itos[ix]
                    added_probability = beam_basket[k] + prob.item()
                    len_k = len(k.replace("<eos>", ""))
                    normalized_probability = (1 / (len(k) ** alpha)) * added_probability 
                    new_entries[new_k] = normalized_probability
                to_remove.append(k)
        # 그리고 기존 key를 beam_basket에서 지운다.
        for k in to_remove:
            beam_basket.pop(k)
        
        for k, v in new_entries.items():
            beam_basket[k] = v
            
    final_list = []
    for k, v in beam_basket.items():
        refined_k = k.replace("<eos>", "").capitalize()
        final_list.append(refined_k)
        final_prob = np.exp(v)
                
    return final_list
            
        
        
    
# final_list = beam_search(model, prime='A', category='boy')

In [41]:
def generate_names(net, prime, category, beam_width, dataset, alpha=0.7):
    final_list = beam_search(net, prime=prime, category=category, beam_width=beam_width, alpha=alpha)
    
    evaluate_creativity(final_list, dataset)
    
generate_names(model, prime="A", category="girl", beam_width=3, dataset=total_df)
generate_names(model, prime="A", category="boy", beam_width=3, dataset=total_df)

Sampling a girl name beginning with A..
all items have <eos>
name: Annessa is a new name!
name: Allissa is already in the dataset
name: Allissandre is a new name!
Sampling a boy name beginning with A..
all items have <eos>
name: Andrio is a new name!
name: Arnell is already in the dataset
name: Andrick is already in the dataset


In [42]:
import string

In [43]:
for sex in ['boy', 'girl']:
    for char in string.ascii_uppercase:
        generate_names(model, prime=char, category=sex, beam_width=3, dataset=total_df)
        print(" ")

Sampling a boy name beginning with A..
all items have <eos>
name: Andrio is a new name!
name: Arnell is already in the dataset
name: Andrick is already in the dataset
 
Sampling a boy name beginning with B..
all items have <eos>
name: Branly is already in the dataset
name: Branley is already in the dataset
name: Branleigh is a new name!
 
Sampling a boy name beginning with C..
all items have <eos>
name: Carles is a new name!
name: Carlis is already in the dataset
name: Carlison is a new name!
 
Sampling a boy name beginning with D..
all items have <eos>
name: Derick is already in the dataset
name: Derrik is already in the dataset
name: Derrick is already in the dataset
 
Sampling a boy name beginning with E..
all items have <eos>
name: Ellfred is already in the dataset
name: Everley is already in the dataset
name: Ellfredge is a new name!
 
Sampling a boy name beginning with F..
all items have <eos>
name: Farren is a new name!
name: Farris is already in the dataset
name: Fabiano is alr

all items have <eos>
name: Winora is a new name!
name: Willee is already in the dataset
name: Willie is already in the dataset
 
Sampling a girl name beginning with X..
all items have <eos>
name: Xandria is already in the dataset
name: Xandrina is a new name!
name: Xandrine is a new name!
 
Sampling a girl name beginning with Y..
all items have <eos>
name: Yannee is a new name!
name: Yolanda is already in the dataset
name: Yolande is already in the dataset
 
Sampling a girl name beginning with Z..
all items have <eos>
name: Zandra is already in the dataset
name: Zorina is already in the dataset
name: Zandria is a new name!
 


In [44]:
for sex in ['boy', 'girl']:
    for char in string.ascii_uppercase:
        generate_names(model, prime=char, category=sex, beam_width=3, dataset=total_df, alpha=1.0)
        print(" ")

Sampling a boy name beginning with A..
all items have <eos>
name: Andrio is a new name!
name: Arnell is already in the dataset
name: Andrick is already in the dataset
 
Sampling a boy name beginning with B..
all items have <eos>
name: Branly is already in the dataset
name: Bartham is a new name!
name: Branley is already in the dataset
 
Sampling a boy name beginning with C..
all items have <eos>
name: Carles is a new name!
name: Carlis is already in the dataset
name: Carlison is a new name!
 
Sampling a boy name beginning with D..
all items have <eos>
name: Derick is already in the dataset
name: Derrik is already in the dataset
name: Derrick is already in the dataset
 
Sampling a boy name beginning with E..
all items have <eos>
name: Ellfred is already in the dataset
name: Everley is already in the dataset
name: Ellfredge is a new name!
 
Sampling a boy name beginning with F..
all items have <eos>
name: Fabio is already in the dataset
name: Fabion is already in the dataset
name: Fabian

all items have <eos>
name: Xandria is already in the dataset
name: Xandrina is a new name!
name: Xandrine is a new name!
 
Sampling a girl name beginning with Y..
all items have <eos>
name: Yannee is a new name!
name: Yolanda is already in the dataset
name: Yolande is already in the dataset
 
Sampling a girl name beginning with Z..
all items have <eos>
name: Zandra is already in the dataset
name: Zorina is already in the dataset
name: Zandria is a new name!
 


In [47]:
for sex in ['boy', 'girl']:
    for char in string.ascii_uppercase:
        generate_names(model, prime=char, category=sex, beam_width=10, dataset=total_df, alpha=0.0)
        print(" ")

Sampling a boy name beginning with A..
all items have <eos>
name: Andro is already in the dataset
name: Andrio is a new name!
name: Alfino is a new name!
name: Arnald is already in the dataset
name: Arnell is already in the dataset
name: Andrick is already in the dataset
name: Allford is already in the dataset
name: Aldrich is already in the dataset
name: Aldrick is a new name!
name: Alfonzio is a new name!
 
Sampling a boy name beginning with B..
all items have <eos>
name: Branly is already in the dataset
name: Branley is already in the dataset
name: Branden is already in the dataset
name: Brentin is already in the dataset
name: Brently is already in the dataset
name: Briston is a new name!
name: Brandell is already in the dataset
name: Brandley is a new name!
name: Brentley is already in the dataset
name: Branleigh is a new name!
 
Sampling a boy name beginning with C..
all items have <eos>
name: Colly is already in the dataset
name: Cristo is already in the dataset
name: Colley is a

name: Theod is already in the dataset
name: Terrel is already in the dataset
name: Trevan is already in the dataset
name: Terrell is already in the dataset
name: Timorio is a new name!
name: Torrell is a new name!
name: Theodor is already in the dataset
name: Thorston is a new name!
name: Timotheus is already in the dataset
 
Sampling a boy name beginning with U..
all items have <eos>
name: Urie is already in the dataset
name: Ursh is a new name!
name: Ullis is a new name!
name: Urian is a new name!
name: Uriel is already in the dataset
name: Urrie is a new name!
name: Urson is already in the dataset
name: Uriano is a new name!
name: Uriell is already in the dataset
name: Ursell is already in the dataset
 
Sampling a boy name beginning with V..
all items have <eos>
name: Vitor is already in the dataset
name: Valler is already in the dataset
name: Vicker is a new name!
name: Valerio is already in the dataset
name: Vitorio is already in the dataset
name: Valentio is a new name!
name: Val

all items have <eos>
name: Marcee is already in the dataset
name: Marcie is already in the dataset
name: Melissa is already in the dataset
name: Michell is already in the dataset
name: Marielle is already in the dataset
name: Marcelle is already in the dataset
name: Marcelyn is already in the dataset
name: Michelle is already in the dataset
name: Mariselle is a new name!
name: Marcelynn is a new name!
 
Sampling a girl name beginning with N..
all items have <eos>
name: Nora is already in the dataset
name: Nicola is already in the dataset
name: Nikola is already in the dataset
name: Nicolia is a new name!
name: Nicolle is already in the dataset
name: Nickola is already in the dataset
name: Nicoleta is a new name!
name: Nicoline is already in the dataset
name: Nicoletta is already in the dataset
name: Nicolette is already in the dataset
 
Sampling a girl name beginning with O..
all items have <eos>
name: Oria is already in the dataset
name: Ondra is already in the dataset
name: Orelia is

In [None]:
generate_names(model, prime="A", category="boy", beam_width=3, dataset=total_df)

In [None]:
generate_names(model, prime="Z", category="boy", beam_width=3, dataset=total_df)

In [None]:
generate_names(model, prime="Z", category="girl", beam_width=3, dataset=total_df)

In [None]:
total_df.babyname.str.contains("Andrick").sum()

In [None]:
total_df[lambda x: x.babyname == "Andrian"]

In [None]:
a = defaultdict()

In [None]:
print(a.get('a'))

In [None]:
beam_basket = OrderedDict()
beam_basket['A'] = 1.0
beam_basket['B'] = 0.8
beam_basket['C'] = 0.7
beam_basket['D'] = 1.2

In [None]:
sorted(beam_basket.items(), key=lambda x: x[1], reverse=True)

In [None]:
data_loader.BABYNAME.vocab.stoi