In [90]:
import torch
import numpy as np
from transformers import GPT2LMHeadModel, GPT2Tokenizer

In [61]:
# Load the tokenizer for the "gpt2-xl" checkpoint; the model cell below loads
# the same checkpoint name so token ids line up.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2-xl")

In [62]:
# Load the "gpt2-xl" language model. The tokenizer's EOS id is passed as the
# padding id — presumably because GPT-2 defines no dedicated pad token and
# generate() needs one when returning several sequences; confirm against the
# transformers documentation.
model = GPT2LMHeadModel.from_pretrained("gpt2-xl", pad_token_id=tokenizer.eos_token_id)

In [68]:
# Read both exported Slack logs into lists with one entry per line. Each entry
# keeps its trailing newline, which the prompt builder relies on when it
# concatenates messages.
with open('rand_messages.txt', 'r') as f:
    rand_messages = f.readlines()

with open('gen_messages.txt', 'r') as f:
    gen_messages = f.readlines()

In [182]:
def make_random_input_str(msgs, n_msgs=10, speaker='pfaendtner'):
    """Build a generation prompt from a random run of consecutive messages.

    A start index is drawn with ``np.random.randint`` and messages are
    concatenated from there in list order, skipping every entry that contains
    the ``'<missing_message>'`` placeholder, until ``n_msgs`` messages have
    been collected. The prompt is terminated with ``'<speaker>:'`` so the
    model continues the conversation as that user.

    Parameters
    ----------
    msgs : sequence of str
        Candidate messages. Each one is appended verbatim, so it should carry
        its own trailing newline.
    n_msgs : int, optional
        Number of non-placeholder messages to include (default 10).
    speaker : str, optional
        User name used for the final cue line (default ``'pfaendtner'``).

    Returns
    -------
    str
        The concatenated messages followed by ``'<speaker>:'``. Fewer than
        ``n_msgs`` messages are included when the list runs out first.

    Notes
    -----
    Two failure modes of the earlier version are handled here: a list with
    ``n_msgs`` or fewer entries made ``np.random.randint`` raise
    ``ValueError``, and placeholder entries near the end of the list made the
    collection loop index past the last element (``IndexError``).
    """
    # randint(high) needs high > 0; with too few messages start at the top.
    span = len(msgs) - n_msgs
    start = np.random.randint(span) if span > 0 else 0

    picked = []
    idx = start
    # Bound the scan by the list length so skipped placeholders near the tail
    # cannot push the index out of range.
    while len(picked) < n_msgs and idx < len(msgs):
        msg = msgs[idx]
        if '<missing_message>' not in msg:
            picked.append(msg)
        idx += 1

    return ''.join(picked) + '{}:'.format(speaker)

# Draw one random prompt from the loaded message list and display it.
# NOTE(review): no random seed is set in the visible cells, so the prompt
# differs from run to run.
inputs = make_random_input_str(rand_messages)
inputs

'coco: I am wondering if you guys can understand this..\ncoco: Believe it or not.. it\'s english\nmaneki_neko: Happy New Year everyone!\npfaendtner: Was the PRG new years resolution to abandon Slack?! :wink:\nluizoliveira: Funny enough, that’s one of my resolutions, haha. I’m gonna use it only during the mornings when I’m commuting to work and in the night, at home. If I don’t promptly reply, that’s the reason.\npfaendtner: this makes me unreasonably happy\npfaendtner: Can anyone subscribe this channel to Logan Paul’s vlogs so they appear here automatically?\ncnyambr: It is the third day since Oregonians have started pumping their own gas  Lolololol\nanotherjoshsmith: wesleybeckner A quantitative description of the nontrivial difference between friends and colleagues. \narushi: "We’re getting dumber while our phones, cars, and homes are getting smarter."\npfaendtner:'

In [183]:
# input_ids = tokenizer.encode("[Scene: Central Perk, Ross, Chandler, and Phoebe are there. Joey is working.]\nRoss: Hey, remember when I had a monkey?\nChandler: Yeah.\nRoss: Yeah, what, what was I thinking?",
#                              return_tensors='pt')
# input_ids = tokenizer.encode("The special problem we tried to get at with these lectures was to maintain the interest of the very enthusiastic and rather smart students coming out of the high schools and into Caltech. They have heard a lot about how interesting and exciting physics is—the theory of relativity, quantum mechanics, and other modern ideas. By the end of two years of our previous course, many would be very discouraged because there were really very few grand, new, modern ideas presented to them. They were made to study inclined planes, electrostatics, and so forth, and after two years it was quite stultifying. The problem was whether or not we could make a course which would save the more advanced and excited student by maintaining his enthusiasm.",
#                              return_tensors='pt')
# input_ids = tokenizer.encode("I have a friend who's an artist and has sometimes taken a view which I don't agree with very well. He'll hold up a flower and say 'look how beautiful it is', and I'll agree. Then he says 'I as an artist can see how beautiful this is but you as a scientist take this all apart and it becomes a dull thing', and I think that he's kind of nutty. First of all, the beauty that he sees is available to other people and to me too, I believe. Although I may not be quite as refined aesthetically as he is ... I can appreciate the beauty of a flower. At the same time, I see much more about the flower than he sees. I could imagine the cells in there, the complicated actions inside, which also have a beauty.",
#                              return_tensors='pt')
# Tokenize the Slack prompt into a PyTorch tensor; later cells read the prompt
# length from input_ids[0].
input_ids = tokenizer.encode(inputs, return_tensors='pt')

In [184]:
# open-ended
# sample_outputs = model.generate(
#                     input_ids,
#                     do_sample=True,
#                     max_length=500,
#                     top_k=75,
#                     top_p=0.95,
#                     num_return_sequences=5
#                     )

# Single-response mode: sample five candidate continuations of the prompt,
# each limited to 50 tokens beyond the prompt length.
sample_outputs = model.generate(
                    input_ids,
                    do_sample=True,
                    # NOTE(review): 198 is presumably the GPT-2 token id for
                    # "\n", so each sample stops at the end of one chat line —
                    # TODO confirm with tokenizer.encode("\n").
                    eos_token_id=198,
                    # max_length counts the prompt tokens too, hence the offset.
                    max_length=len(input_ids[0])+50,
                    top_k=75,
                    top_p=0.95,
                    num_return_sequences=5
                    )

In [203]:
# The final 7 prompt tokens are printed with the responses rather than with
# the context — presumably they are the trailing "pfaendtner:" speaker cue;
# TODO confirm the token count against the tokenizer.
split_at = len(input_ids[0]) - 7

print("Context:\n" + 100 * '-')
context_text = tokenizer.decode(sample_outputs[0][:split_at], skip_special_tokens=True)
print(context_text)

print("\nResponses\n" + 100 * '-')
for sample_output in sample_outputs:
    reply = tokenizer.decode(sample_output[split_at:], skip_special_tokens=True)
    # Relabel the generated turn so it is not mistaken for the real user.
    print(reply.replace('pfaendtner', 'gpt-jim'))

Context:
----------------------------------------------------------------------------------------------------
coco: I am wondering if you guys can understand this..
coco: Believe it or not.. it's english
maneki_neko: Happy New Year everyone!
pfaendtner: Was the PRG new years resolution to abandon Slack?! :wink:
luizoliveira: Funny enough, that’s one of my resolutions, haha. I’m gonna use it only during the mornings when I’m commuting to work and in the night, at home. If I don’t promptly reply, that’s the reason.
pfaendtner: this makes me unreasonably happy
pfaendtner: Can anyone subscribe this channel to Logan Paul’s vlogs so they appear here automatically?
cnyambr: It is the third day since Oregonians have started pumping their own gas  Lolololol
anotherjoshsmith: wesleybeckner A quantitative description of the nontrivial difference between friends and colleagues. 
arushi: "We’re getting dumber while our phones, cars, and homes are getting smarter."

Responses
-----------------------

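### Slack Dataset

Run this section first: it exports `rand_messages.txt` and `gen_messages.txt` from `prg_msg.db`, and the generation cells above read those two files.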

In [1]:
import sqlite3

In [22]:
# Read every row of the three Slack tables from the SQLite dump into `data`
# (one list of row tuples per table, in the order given by `tables`).
tables = ['users', 'rand_messages', 'gen_messages']
table_cols = []
data = []
conn = sqlite3.connect('prg_msg.db')
try:
    c = conn.cursor()
    for table in tables:
        # Table names come from the fixed list above, not from user input, so
        # formatting them into the statement is safe here. `select *` returns
        # the columns in table order, which is what the former
        # name-by-name column list produced.
        c.execute("select * from {}".format(table))
        # Column names of the table just read (kept for inspection).
        table_cols = [d[0] for d in c.description]
        data.append(c.fetchall())
finally:
    # Release the database file even if one of the queries fails.
    conn.close()

In [66]:
# Split the query results by table, in the order the tables were read.
# NOTE(review): `rand_messages` / `gen_messages` hold database row tuples
# here, while the generation section uses the same names for lines of text
# read from the exported files.
users, rand_messages, gen_messages = data

In [25]:
# Lookup table from the first column of each user row to the second column
# (used by decode_message as user id -> display name).
users_dict = {row[0]: row[1] for row in users}

In [55]:
def decode_message(msg, user_dict):
    """Render one message row as a ``'<user>: <text>'`` line.

    Parameters
    ----------
    msg : sequence
        Message row; ``msg[1]`` is the key looked up in ``user_dict`` and
        ``msg[2]`` is the message text.
    user_dict : dict
        Mapping from the key stored in ``msg[1]`` to the name to print.

    Returns
    -------
    str
        ``'<name>: <text>'``, with ``'<missing_message>'`` substituted for an
        empty text.

    Raises
    ------
    KeyError
        If ``msg[1]`` is not present in ``user_dict``.
    """
    author = user_dict[msg[1]]
    # Empty texts get an explicit placeholder so they can be filtered later.
    body = '<missing_message>' if msg[2] == '' else msg[2]
    return '{}: {}'.format(author, body)

In [67]:
# Export both message tables as text, one decoded message per line, iterating
# the rows in reverse of their query order.
for out_path, rows in (('rand_messages.txt', rand_messages),
                       ('gen_messages.txt', gen_messages)):
    with open(out_path, 'w') as f:
        f.writelines(
            '{}\n'.format(decode_message(msg, users_dict))
            for msg in reversed(rows)
        )

In [58]:
# Sizes of the two message collections, displayed as a quick sanity check.
len(rand_messages), len(gen_messages)

(7297, 5498)