### Pull in generated text files

In [1]:
import glob
from itertools import chain

In [2]:
file_names = glob.glob('data/generated_text/*.txt')
file_names

['data/generated_text/gpt2_gentext_20200912_202521.txt',
 'data/generated_text/gpt2_gentext_20200912_201909.txt',
 'data/generated_text/gpt2_gentext_20200912_202807.txt',
 'data/generated_text/gpt2_gentext_20200912_203040.txt',
 'data/generated_text/gpt2_gentext_20200912_201420.txt']

In [3]:
open_files = [open(file_name) for file_name in file_names]

### Clean and split generator output


In [4]:
def clean_generated_file(text):
    """
    take in a generated text file (single string containing multiple samples in mutliple batches)
    split into individual strings and remove generator artifact tokens

    input format (single string):
    
    'texttexttext<|endoftext|>
    <|startoftext|>textexttext<|endoftext|>
    ...
    \n====================\n
    texttexttext<|endoftext|>
    <|startoftext|>textexttext<|endoftext|>
    ...'
    
    input: string (file.read())
    returns: list of strings (separated and cleaned)
    """
 
    # there's some messiness in the endoftext/startoftext tokens that makes splitting on them inconsistent
    # so after some experimenting this approach seems to work better
    
    # split on < and > from '<|startoftext|>' and '<|endoftext|>' tokens
    text = text.split('<') # str -> ['str', 'str',...]
    text = [x.split('>') for x in text] # [['str'], ['str', 'str'], ...]
    text = list(chain(*text)) # flatten list ['str', 'str',...]
    # split elements on batch separator token
    text = [x.split('\n====================\n') for x in text] 
    text = list(chain(*text)) # flatten list
    # remove generator token artifacts
    rem_tokens = ['|startoftext|', '|endoftext|', '\n', '']
    text = [x for x in text if x not in rem_tokens]
    text = [x for x in text if '|' not in x] # catch tokens that somehow got split
    text = [x[1:] if x[0]=='\n' else x for x in text] # remove initial newlines

    # return list of cleaned texts
    return text

### Import scoring model and transformers

In [7]:
import pickle as pkl
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from score_text import score

In [8]:
with open('models/rfr_final.pkl', 'rb') as f:
    model = pkl.load(f)

with open('transformers/final_nmf.pkl', 'rb') as f:
    fac = pkl.load(f)

with open('transformers/final_vec.pkl', 'rb') as f:
    vec = pkl.load(f)

In [9]:
# take a test sample
test = open_files[3].read()
clean_test = clean_generated_file(test)
clean_test[0]

'High school is about to get a little scary when a group of fireflies is attacked by a giant blob of black specks, breaking her nose. At school, a few girls get their hands on a special toy, and they join forces with the blob warriors to fight...'

In [10]:
# test the scorer
score(clean_test[0], vec, fac, model)

6.39

### Combined clean and scoring function

In [27]:
import numpy as np
def clean_and_score_file(gen_file_string):
    # clean and split file into list of synopses
    synopsis_list = clean_generated_file(gen_file_string)
    
    scores = []
    for synopsis in synopsis_list:
        scores.append(score(synopsis, vec, fac, model))

    # zip scores and synopses, sort by score and reverse to have highest score first
    zipped = [list(zip(scores, synopsis_list))[idx] for idx in np.argsort(scores)]
    zipped.reverse()
    
    return zipped

In [29]:
# Test the combined function
test_scores = clean_and_score_file(test)
test_scores[:5]

[(7.46,
  'A god is born in the Wind World, where man and machine are made. Miko Sakamaki is the sole survivor of all of the creations, and as such, she does not have any friends. While attending freshman year, she meets Sayaka Hayase, an aspiring manga editor who lives in the Wind World. Hayase is particularly interested in the subject of money, and encourages her to start working at a "recycled jacket shop" to learn more about the topic. As she gradually grows interested in the daily grind of going from job to job, she begins to find that the path she has chosen is one that will prove to be a challenge. Thus, Miko attempts to support other people living within the wind world as well as reach out to her own kind. In the end, Miko finds herself being pursued by a strange entity who has her enslaved by the evil goddess, as she must not let her captors sustain her own life as she makes her way toward a new existence.'),
 (7.39,
  "While having trouble fitting in, Yasumi is asked by her o

### Score and store all of our files

In [40]:
# reset read cursor
open_files = [open(file_name) for file_name in file_names]

scored_files = []

# clean and score each file and store the lists of scored synopses
for gen_file in open_files:
    text = gen_file.read()
    cleaned_file = clean_and_score_file(text)
    scored_files.append(cleaned_file)

# flatten the lists and re-sort
scored_files = list(chain(*scored_files))
scored_files.sort(reverse=True, key=lambda x: x[0])


In [43]:
# Inspect the top 5 scoring samples
scored_files[:5]

[(7.46,
  "As a year progresses, the galaxy has begun to build up more and more the beautiful galaxy, and the 26th year of the Fourth Ledger is just around the corner. The leaders of the galaxy declare war on each other, and it seems they have something that their enemies doesn't... After years of hard work they finally defeat the galactic clothed ones and establish a peaceful galaxy together. That's right, friends Mikazuchi and Reiber are now fighting each other. Will they be able to settle a long standing dispute between the two? As the war goes on Mikazuchi, Reiber and Haibara discover the truth behind the war and their true motives. They even find yet another way to merge. Who is this mysterious entity that is attacking Mikazuchi? Is it really a new game or just the latest step of the battle royales for Mikazuchi and Reiber?"),
 (7.46,
  'A god is born in the Wind World, where man and machine are made. Miko Sakamaki is the sole survivor of all of the creations, and as such, she doe

### Save the scored samples to a CSV

In [50]:
import csv

with open('output/scored_gens_v1.csv', 'w') as f:
    writer = csv.writer(f)
    writer.writerow(['', 'score', 'text'])
    for idx, syn in enumerate(scored_files):
        writer.writerow([idx, syn[0], syn[1]])

In [1]:
# Double check the CSV
import pandas as pd
syns = pd.read_csv('output/scored_gens_v1.csv')

### Explore the generated samples

In [32]:
# Play with the number and find some interesting generations
syns['text'][170]

'For the second consecutive year, the Mahou Shoujo Japan Drama Association has commissioned a manga adaptation of a chapter from the manga of the same name. The manga features the three main characters: the new heroines: Chairman Tachibana, the smart and charismatic president Kanna, and the handsome and charming yet reserved, "Wasp," as well as the shy and indifferent Kazuko. The story follows the Hinamizawa sisters, who are the first and only members of the Japan Drama Association. Hinamizawa is a junior high student who is the most handsome in the Japan, with a figure of over 300 cm. She is a very smart girl, and also the most popular in Japan, but she has her limits. In the story, Hinamizawa and Kanna are classmates, and they are also the only ones who can see the scenery of Hinamizawa\'s father\'s hometown—the place he once lived. The story also reveals that Kanna is twins, just like Kanna. The title character is also based on the anime, "The World," in Japan, and the title charact

In [33]:
syns['score'][170]

6.88

In [3]:
# score stats for our generations
syns['score'].describe()

count    1917.000000
mean        6.170391
std         0.563632
min         4.550000
25%         5.830000
50%         6.250000
75%         6.570000
max         7.460000
Name: score, dtype: float64