In [1]:
# data input file is an ndjson [newline delimited JSON format](http://ndjson.org/)
# this means there's a json object on each line so it can recognise line breaks

# read the file and create a list `all_lines` that contains all of these json objects

import json

all_lines = []
for line in open("blankmeanderings.ndjson"):
    all_lines.append(json.loads(line.strip()))

In [6]:
# just to see what those lines look like, let's pick a handful at random

import random

# each object has a key 'column_1' that contains the text of the line of poetry

random.sample(all_lines, 8)

[{'column_1': 'dense shell of oblivion. '},
 {'column_1': 'and the desire to please cling on to what was known, to become something that can no '},
 {'column_1': 'bromine exists; and hydrogen, hydrogen'},
 {'column_1': 'and she waits for their laughter to warm her'},
 {'column_1': 'longer be made sense of, and to end up just always searching for something to attempt to '},
 {'column_1': 'he pulls my knickers down. they fall between my ankles.'},
 {'column_1': 'we have flower particles inside our body now working as inner amulets.'},
 {'column_1': 'growths, shed sin displaced by other decayed life. ancestral matter - face mask?'}]

In [5]:
random.sample(all_lines, 8)

[{'column_1': 'we rise from our slumber ... yet what lies yonder? '},
 {'column_1': 'wildflowers caressing my legs being no less material than the desire to prove it all '},
 {'column_1': 'their broken bone arms and torn up flesh'},
 {'column_1': 'and the longing will overcome her'},
 {'column_1': "again by the ocean's dark belly. this is a different kind of 'hydrological cycle'."},
 {'column_1': 'pond life, sea monkey, primordial soup, amphibious egg, the moist soil that '},
 {'column_1': 'the soil beneath my viscera breeds glyphs of ancient symbiotic harmonies, casting a copious '},
 {'column_1': 'mate. and you pretend that the days of the excruciating sweet fruit have passed, because '}]

## concordances and counts

the corpus could be useful for collecting, counting and comparing lines of poetry with certain characteristics (dependant on your input file data)

In [8]:
# here's our first experiment: find every line of poetry in the corpus with the word "love"
# we do this using a regular expression that finds the string 'love' between two word boundaries, without respect to case:

import re
love_lines = [line['column_1'] for line in all_lines if re.search(r'\blove\b', line['column_1'], re.I)]

In [45]:
# just to see what we have, we'll take a random sample:
# (the number of lines you sample can be higher or lower depending on how many expressions of 'love' your file input contains)

random.sample(love_lines, 6)

['she scratches her love against the slime of the wall',
 'she scratches her love against the slime of the wall',
 'it is love that brings the sun out',
 'it is rather the mesmerizing complications of sentimentality, artificial jealousies, words that inebriate and deceive, the rhetoric of parting and eternal fidelities, literary nostalgiaâ€”all the histrionics of love.',
 "you love what i've done with the place? fossilised passages in a fine sausage casing gravel ",
 'to long for a perennial love']

In [17]:
# as a cut-up method poem, that's not bad all on its own... but let's do a little bit of Digital Humanities and make an aligned concordance of these lines, with the lines sorted alphabetically by the word following "love," using a bit of regular expression trickery:
# this will only work if you have enough 

longest = max([len(x) for x in love_lines]) # find the length of the longest line
center = longest - len("love") # and use it to create a "center" offset that will work for all lines

sorted_love_lines = sorted(
    [line for line in love_lines if re.search(r"\blove\b\s\w", line)], # only lines with word following
    key=lambda line: line[re.search(r"\blove\b\s", line).end():]) # sort on the substring following the match

for line in sorted_love_lines: # add [interger:interger] if you wish to sample only a specific slice of your line selection
    offset = center - re.search(r'\blove\b', line, re.I).start()
    print((" "*offset)+line) # left-pad the string with spaces to align on "love"

                                                                                                                                                                                                  she scratches her love against the slime of the wall
                                                                                                                                                                                                  she scratches her love against the slime of the wall
                                                                                                                                                                                                              it is love that brings the sun out
                                                                                                                                                                                                                you love what i've done with the place? fossilised passag

In [19]:
# as another experiment, let's find all of the words that occur between either "the" or "a" and the word "love." English being the way it is, these words are pretty much guaranteed to be adjectives, so this is an ersatz but effective way of getting a (non-exhaustive) list of adjectives that are used to describe love in the corpus.

found_adj = []
for line in love_lines:
    matches = re.findall(r"(the|a)\s(\b\w+\b)\s(\blove\b)", line, re.I)
    for match in matches: 
        found_adj.append(match[1])

In [28]:
# some adjectives at random:

# there was only 1 adjective to describe love in my input, being 'perennial' otherwise make number higher to perform next steps


random.sample(found_adj, 1)

['perennial']

In [16]:
# using the `Counter` object, we can easily count these up and find the (12) most common adjectives (used in the type of noun phrase we've identified) used to describe love:

from collections import Counter

Counter(found_adj).most_common(12)

## rhymes

stretches of language identified as poetry characteristically exhibit some variety of rhyming

let's set ourselves a task of finding random rhyming lines in the corpus

to do this, we need to know how words are pronounced... the way that words are spelled in English doesn't really tell us anything helpful about how the word is pronounced, so we need some alternate method to get that information

the [CMU Pronouncing Dictionary](http://www.speech.cs.cmu.edu/cgi-bin/cmudict) is one such method: it's a big database of phonetic transcriptions for many thousands of English words

there is a Python library called [pronouncing] (https://pypi.org/project/pronouncing/) made by the amazing Allison Parrish (the person who inspired me to experiment with digital poetry) to make it very easy to work with the CMU Pronouncing Dictionary in Python

you can install it in with pip install pronouncing

In [71]:
!pip install pronouncing



In [35]:
# import pronouncing library like so

import pronouncing

In [66]:
# we'll consider two lines to rhyme with each other if the last words in the lines rhyme...
# to test this out, we'll pick a source word, say, "kiss," and find all of the words that rhyme with it:

source_word = "kiss"
source_word_rhymes = pronouncing.rhymes(source_word)

In [67]:
source_word_rhymes

['abyss',
 'alusuisse',
 'amiss',
 'bis',
 'biss',
 'bliss',
 "bliss'",
 'brys',
 'buice',
 'chris',
 "chris'",
 'chriss',
 'cris',
 'criss',
 'dis',
 'dismiss',
 'diss',
 'fariss',
 'fis',
 'fiss',
 'flis',
 'fliss',
 'furniss',
 'hiss',
 'kis',
 'kniss',
 'kris',
 'kriss',
 'lis',
 'liss',
 'mis',
 'miss',
 'phariss',
 'piss',
 'pris',
 'reminisce',
 'remiss',
 'resists',
 'resists',
 'riss',
 'risse',
 'rys',
 'sis',
 'stys',
 'suess',
 'suisse',
 'swiss',
 'this',
 "this'",
 'twiss',
 'vis',
 'wis',
 'wiss',
 'wyss']

In [68]:
# and then look through the lines of poetry in the corpus for lines that end with any of these rhyming words:

for line in all_lines:
    text = line['column_1']
    match = re.search(r'(\b\w+\b)\W*$', text)
    if match:
        last_word = match.group()
        if last_word in source_word_rhymes:
            print(text)

abyss
abyss


In [69]:
# depending on the size of your data input, looking through all the lines of poetry to find rhyming lines one-by-one might be pretty slow
# another approach is to use the `phones_for_word()` and `rhyming_part()` functions in the `pronouncing` library to pre-build a data structure with all of the lines in the corpus grouped with their rhymes
# the `phones_for_word()` function gives you the "phones" (sounds) of how a word is pronounced

phones = pronouncing.phones_for_word(source_word)[0] # words may have multiple pronunciations, so this returns a list
phones

'K IH1 S'

In [70]:
# the `rhyming_part()` function gives you just the portion of a string of phones that another word must share in order for them to be considered "rhyming"

pronouncing.rhyming_part(phones)

'IH1 S'

In [52]:
# next we build the data structure proposed above: a dictionary that maps rhyming parts to a dictionary that maps words with that rhyming part to the lines of poetry that they're found at the end of

from collections import defaultdict
by_rhyming_part = defaultdict(lambda: defaultdict(list))
for line in all_lines:
    text = line['column_1']
    if not(32 < len(text) < 48): # only use lines of uniform lengths
        continue
    match = re.search(r'(\b\w+\b)\W*$', text)
    if match:
        last_word = match.group()
        pronunciations = pronouncing.phones_for_word(last_word)
        if len(pronunciations) > 0:
            rhyming_part = pronouncing.rhyming_part(pronunciations[0])
            # group by rhyming phones (for rhymes) and words (to avoid duplicate words)
            by_rhyming_part[rhyming_part][last_word.lower()].append(text)

In [58]:
# a random key/value pair from this dictionary, so you can see its structure:

random_rhyming_part = random.choice(list(by_rhyming_part.keys()))
random_rhyming_part, by_rhyming_part[random_rhyming_part]

('IY1 S IH0 Z',
 defaultdict(list,
             {'creases': ['holding vulnerability in its creases',
               'holding vulnerability in its creases']}))

In [72]:
# many rhyming parts are found in multiple lines, but only with one unique word
# while it's true that identical words "rhyme," it's a little disingenuous to claim that we've made a computer program that finds rhyming lines of poetry if it's mostly just finding lines that end in the same word
# so we'll just find the groups from the `by_rhyming_part` dictionary that have at least two different line-ending words:

rhyme_groups = [group for group in by_rhyming_part.values() if len(group) >= 2]

In [75]:
# now, find (7) rhyming couplets by selecting a random rhyming group, sampling (2) keys (words) from that group, and printing a random line from both groups:

for i in range(7):
    group = random.choice(rhyme_groups)
    words = random.sample(list(group.keys()), 2)
    print(random.choice(group[words[0]]))
    print(random.choice(group[words[1]]))

IndexError: list index out of range

## markov chain text generation

markov chain text generation uses statistical information about word co-occurrence to build a model that allows you to generate text that looks similar to your source text

[Markovify](https://github.com/jsvine/markovify) is a great library for Python that makes it easy to build and generate from Markov chain models

install it by pip install markovify

In [77]:
!pip install markovify

Collecting markovify
  Downloading markovify-0.9.4.tar.gz (27 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting unidecode (from markovify)
  Using cached Unidecode-1.3.6-py3-none-any.whl (235 kB)
Building wheels for collected packages: markovify
  Building wheel for markovify (setup.py): started
  Building wheel for markovify (setup.py): finished with status 'done'
  Created wheel for markovify: filename=markovify-0.9.4-py3-none-any.whl size=18649 sha256=e9dd29291a3abee5c1f87b1cd266713fc52bc803cb855d6a5791c9b8afa12797
  Stored in directory: c:\users\jasmi\appdata\local\pip\cache\wheels\ca\8c\c5\41413e24c484f883a100c63ca7b3b0362b7c6f6eb6d7c9cc7f
Successfully built markovify
Installing collected packages: unidecode, markovify
Successfully installed markovify-0.9.4 unidecode-1.3.6


In [80]:
import markovify

In [117]:
# our goal is to use a Markov chain to generate new lines of poetry from the corpus
# markovify requires you to pass in your source text as a string, so first off we'll create a big string with a sample of the corpus, separated by newlines:
# (you can change the number as needed; I had to keep it low at 300 as I did not have anymore lines to work with)

big_poem = "\n".join([line['column_1'] for line in random.sample(all_lines, 300)])

In [118]:
# build the model

model = markovify.NewlineText(big_poem)

In [119]:
# and then generate some lines:

for i in range(14):
    print(model.make_sentence())

rituals stir within the metaverse serves as the womb,
mate. and you pretend that the days of the cyber in an unending lore.
she scratches her love against the slime of the metaverse womb, my life is a mere drop in the mind, and you find yourself moving against
the face of a more general aqueous facilitative capacity: pond life, sea monkey, primordial soup, amphibious egg, the moist soil that
patient, the sun refused to rise, and the ampersand, a languid swamp,
fur, and his head is full of light
citrus trees; cicadas exist; chicory, chromium,
i traverse the metaverse shapeshifts, bound solely by the sun
human bodies ingest reservoir bodies, while reservoir bodies are consumed by whale bodies - which then sink to the natural state, before this supernatural thing clouded
with one gulp then another, their bodies fill the length of her limbs like a mist-clung spider's web or the caress of an
here, in this fugue of flux, imbued with the fruit and
the wyrm will dance to the beat of the inevit

In [121]:
# this is okay but the lines don't make a lot of sense, and are sometimes too long
# you can constrain the length using Markovify's `.make_short_sentence()` method:

model.make_short_sentence(60)

'bromine exists; and blackberries, blackberries;'

In [130]:
# I find that Markov-generated text is best when you keep it short and force juxtapositions, otherwise the reader's attention will wander
# lets generate a series of short, haiku-esque poems of two to five Markov-generated lines, and ensure that the last line of each poem ends with a period:

for i in range(6):
    print()
    for i in range(random.randrange(1, 5)):
        print(model.make_short_sentence(40))
    # ensure last line has a period at the end, for closure
    print(re.sub(r"(\w)[^\w.]?$", r"\1.", model.make_short_sentence(40)))
    print()
    print("～ ❀ ～")


with it in the back of her limbs
unseen other than in the freedom of the
rock chalk and the eternal dance
bracken exists; and hydrogen, hydrogen.

～ ❀ ～

patient, the sun out
patient, the sun out
a pool of data this.

～ ❀ ～

None
i sat on something that can no
exist, and the desire to prove it all.

～ ❀ ～

the face of a mouth
and the eternal dance
and only the stars that
fur, and his mind is made of matted.

～ ❀ ～

shiver in the freedom of the moon
None
erasure poetry of the wall
porous flesh of the caves entrance
in the caves entrance.

～ ❀ ～

bracken exists; and hydrogen, hydrogen


TypeError: expected string or bytes-like object