In [30]:
import torch
from tqdm import tqdm
from pytorch_pretrained_bert import GPT2Tokenizer, GPT2Model, GPT2LMHeadModel

import pandas as pd
import numpy as np

# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
import logging
from keras.preprocessing.sequence import pad_sequences
logging.basicConfig(level=logging.INFO)

device = "cuda:0" if torch.cuda.is_available() else "cpu"

In [31]:
# Load pre-trained model tokenizer (vocabulary)
t = GPT2Tokenizer.from_pretrained('gpt2')

book = pd.read_csv('../data/raw/BookRestaurant/BookRestaurant.csv')
data = pd.DataFrame()

with_names = book[ [False if x else True for x in pd.isna(book['restaurant_name'])] ]
data['tokens'] = with_names['text'].apply(t.encode).apply(np.array)
data['t_tokens'] = with_names['restaurant_name'].apply(t.encode).apply(np.array)
data['names'] = with_names['restaurant_name']
data.head()

INFO:pytorch_pretrained_bert.tokenization_gpt2:loading vocabulary file https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json from cache at C:\Users\dhruv\.pytorch_pretrained_bert\f2808208f9bec2320371a9f5f891c184ae0b674ef866b79c58177067d15732dd.1512018be4ba4e8726e41b9145129dc30651ea4fec86aa61f4b9f40bf94eac71
INFO:pytorch_pretrained_bert.tokenization_gpt2:loading merges file https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt from cache at C:\Users\dhruv\.pytorch_pretrained_bert\d629f792e430b3c76a1291bb2766b0a047e36fae0588f9dbc1ae51decdff691b.70bec105b4158ed9a1747fea67a43f5dee97855c64d62b6ec3742f4cfdb5feda


Unnamed: 0,tokens,t_tokens,names
0,"[2070, 383, 6046, 3687, 220, 7072, 287, 3268, ...","[464, 6046, 3687]",The Middle East
1,"[10482, 257, 3084, 379, 309, 12, 47389, 220, 1...","[51, 12, 47389]",T-Rex
3,"[40, 423, 257, 2151, 286, 1440, 287, 2869, 290...","[49, 12078, 2584, 12, 42, 669, 461, 2364, 1453...",Rimsky-Korsakoffee House
11,"[29688, 1492, 257, 2119, 287, 1338, 35812, 459...","[4561, 35812, 45927]",Spaghetti Warehouse
18,"[10482, 257, 3084, 329, 838, 661, 379, 5648, 7...","[30128, 7957, 9892, 12946, 2097, 12696]",Dunbrody Country House Hotel


In [3]:
# data['tokens'] = pad_sequences(data['tokens'].values)
# data['t_tokens'] = pad_sequences(data['t_tokens'].values)
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
_filter = lambda x: " ".join([word for word in x.lower().split() if word not in stopwords.words('english')])
data['names'] = data['names'].apply(_filter)
data['names']

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dhruv\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


0                                             middle east
1                                                   t-rex
3                                rimsky-korsakoffee house
11                                    spaghetti warehouse
18                           dunbrody country house hotel
23      gus stevens seafood restaurant & buccaneer lounge
24                                       boon brick store
34                                                  oriel
51                                                ledbury
55                                                 ad hoc
60                                    great house sonning
62                               eighth step coffee house
77                                       theme restaurant
80                                          fashion cafã©
82                                                  t-rex
92                                    hurley mountain inn
96                                 coffee bean & tea leaf
98            

In [32]:
# Load pre-trained model (weights)
model = GPT2LMHeadModel.from_pretrained('gpt2')
model.eval()


# If you have a GPU, put everything on cuda
tokens = [torch.tensor([x]).long().to(device) for x in (data['tokens'].values)]
question = torch.tensor([t.encode("Which restaurant?")]).long().to(device)
preds = []
model.to(device)

for token in tqdm(tokens):
    with torch.no_grad():
        predictions_1, past = model(token)
        predictions_2, past = model(question, past=past)
        pred = torch.argmax(predictions_1, dim=2)[0].cpu().numpy().tolist()
        preds.append(t.decode(pred))        

INFO:pytorch_pretrained_bert.modeling_gpt2:loading weights file https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-pytorch_model.bin from cache at C:\Users\dhruv\.pytorch_pretrained_bert\4295d67f022061768f4adc386234dbdb781c814c39662dd1662221c309962c55.778cf36f5c4e5d94c8cd9cefcf2a580c8643570eb327f0d4a1f007fab2acbdf1
INFO:pytorch_pretrained_bert.modeling_gpt2:loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-config.json from cache at C:\Users\dhruv\.pytorch_pretrained_bert\4be02c5697d91738003fb1685c9872f284166aa32e061576bbe6aaeb95649fcf.085d5f6a8e7812ea05ff0e6ed0645ab2e75d80387ad55c1ad9806ee70d272f80
INFO:pytorch_pretrained_bert.modeling_gpt2:Model config {
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_layer": 12,
  "n_positions": 1024,
  "vocab_size": 50257
}

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████

In [6]:
preds = [_filter(x) for x in preds]
preds

['. new east: . newg $',
 ". the-mobile's the-y's.",
 '. lot friends the, party. theokayokurakuch... the. 1 15rd.',
 '. room theald junction. foodep. ori, othery.ugsza. the:.',
 ". minutes. abaroke's club. dunath.,",
 ". book fr bookood store baraneer's new guinea. $ of.",
 '. few-ie,. $ kidskids. theville.',
 ". bookatmealian the's $ friends. .",
 ". new newger. theth'clock.",
 '. favor. arienoc, the. week friends-',
 '. thegrade, table. friends end hall theoma.',
 'drink and... village, a.',
 '. get flight park week. the.',
 '. bookablees©s years,',
 '. new the-mobile. months thei.',
 ". the. theaffles, othertt. theon's ranch. theuary 1 sameth.",
 '. years the. university shop. tea club',
 '. thelab. londonisbury, myiked',
 '. the.',
 ". seat your-. houseicle's, theer,",
 ". months newist's, theor.",
 '. new. theiys. weekend. town . theraweather need',
 '.ing tea the,.',
 ". new the's€�'s�� houseic. the.",
 ". the's's. theham, bucks.",
 '.. project-, . hours. $.i..',
 "know flight at

In [33]:
def get_acc(preds, actual):
    acc = []
    for x, y in zip(preds, actual):
        found = 0
        for word in y:
            if word in x:
                found = found + 1
        acc.append(found / len(y))
    return np.array(acc)

In [34]:
np.mean(get_acc(preds, data['names'].values))

0.6860644550778606

In [35]:
np.mean(get_acc(data['names'].values, preds))

0.5474567079737621

In [None]:
indexed_tokens_1

In [None]:
book['restaurant_name']

In [None]:
with_names['restaurant_name']

In [None]:
book.columns

In [None]:
torch.tensor([torch.tensor(x).long() for x in (data['tokens'].values)])

In [None]:
t.encode("Which Restaurant?")

In [55]:
nltk.download('stopwords')
device = "cuda:0" if torch.cuda.is_available() else "cpu"
_filter = lambda x: " ".join([word for word in x.lower().split() if word not in stopwords.words('english')])

t = GPT2Tokenizer.from_pretrained('gpt2')
path_to_intents = os.path.join('..', 'data', 'raw')
intents = os.listdir(path_to_intents)

data = {}
for intent in intents:
    data[intent] = {}
    data[intent]['df'] = pd.read_csv(os.path.join(path_to_intents, intent, intent + '.csv')) 

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dhruv\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
INFO:pytorch_pretrained_bert.tokenization_gpt2:loading vocabulary file https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json from cache at C:\Users\dhruv\.pytorch_pretrained_bert\f2808208f9bec2320371a9f5f891c184ae0b674ef866b79c58177067d15732dd.1512018be4ba4e8726e41b9145129dc30651ea4fec86aa61f4b9f40bf94eac71
INFO:pytorch_pretrained_bert.tokenization_gpt2:loading merges file https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt from cache at C:\Users\dhruv\.pytorch_pretrained_bert\d629f792e430b3c76a1291bb2766b0a047e36fae0588f9dbc1ae51decdff691b.70bec105b4158ed9a1747fea67a43f5dee97855c64d62b6ec3742f4cfdb5feda
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs

In [49]:
x = data[intent]['df'][ ['geographic_poi', 'city', 'state', 'country'] ]
x[[not all(x) for x in pd.isna(data[intent]['df'][ ['geographic_poi', 'city', 'state', 'country'] ].values)]]

Unnamed: 0,geographic_poi,city,state,country
0,Horseshoe Lake State Fish and Wildlife Area,,,
1,Monterey Bay National Marine Sanctuary,,,
3,,,AK,
4,,Princeton Junction,,
5,Nationalpark Nevado Tres Cruces,,,
6,,Tiplersville,,South Sudan
7,,,GA,
9,,Haigler,,Bosnia and Herzegovina
10,,,,Ãland
11,Rio Grande Wild and Scenic River,,,


[True,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 False,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 False,
 True,
 True,
 True,
 False,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 False,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 T

In [58]:
data['GetWeather']['df']

Unnamed: 0.1,Unnamed: 0,text,timeRange,spatial_relation,geographic_poi,city,country,condition_description,current_location,condition_temperature,state
0,0,What will the weather be this year in Horsesho...,this year,,Horseshoe Lake State Fish and Wildlife Area,,,,,,
1,1,Will it be sunny one hundred thirty five days...,one hundred thirty five days from now,,Monterey Bay National Marine Sanctuary,,,sunny,,,
2,2,Is it supposed to rain nearby my current loca...,0 o'clock,nearby,,,,rain,current location,,
3,3,"what is the forecast starting on september 1, ...","september 1, 2039",,,,,,,chillier,AK
4,4,how cold is it in Princeton Junction,,,,Princeton Junction,,,,cold,
5,5,weather in Nationalpark Nevado Tres Cruces on ...,"mar. 4th, 2020",,Nationalpark Nevado Tres Cruces,,,,,,
6,6,"What will be wind speed in Tiplersville , Sout...",,,,Tiplersville,South Sudan,wind,,,
7,7,whats the weather in GA,,,,,,,,,GA
8,8,what is the weather at my current location,,,,,,,current location,,
9,9,Will it snow in Haigler Bosnia and Herzegovina,,,,Haigler,Bosnia and Herzegovina,snow,,,
