In [1]:
def load_unigrams(file, min_count=5):
    """Read a 2-gram count file and build word and bigram tables.

    Each useful line is expected to look like ``<count> <word1> <word2>``.
    Lines with fewer than three whitespace-separated fields (for example
    blank lines) are skipped instead of raising ``IndexError``.

    Parameters
    ----------
    file : str
        Path to the 2-gram file; it is read as UTF-8.
    min_count : int, optional
        Bigrams whose count is below this threshold are ignored.

    Returns
    -------
    (unigrams, bigrams) : tuple of dict
        ``bigrams[w1][w2]`` is the count read from the file.
        ``unigrams[w]`` starts at 1 and is incremented once for every
        retained bigram slot the word occupies, so it is *not* a corpus
        frequency.
        NOTE(review): confirm this add-one / type-count scheme is intended.

    Raises
    ------
    ValueError
        If the first field of a line with at least three fields is not an
        integer.
    """
    unigrams = {}
    bigrams = {}
    with open(file, 'r', encoding='utf-8') as f:
        for line in f:
            fields = line.split()  # split once instead of three times
            if len(fields) < 3:
                continue
            count = int(fields[0])
            if count < min_count:
                continue
            w1, w2 = fields[1], fields[2]
            bigrams.setdefault(w1, {})[w2] = count
            for w in (w1, w2):
                # First sighting initialises the counter to 1, then adds 1.
                unigrams[w] = unigrams.get(w, 1) + 1
    return unigrams, bigrams

In [2]:
# Build the word-count and bigram tables from the 2-gram file; later cells
# read both names as module-level globals.
unigrams,bigrams = load_unigrams("poleval_2grams.txt")

In [3]:
def load_supertags(file, unigram_counts=None):
    """Read a ``<word> <tag>`` file and build the tag lookup tables.

    Lines with fewer than two whitespace-separated fields are skipped.

    Parameters
    ----------
    file : str
        Path to the supertag file; it is read as UTF-8.
    unigram_counts : dict, optional
        Mapping word -> weight. Defaults to the module-level ``unigrams``
        table. Words missing from it get weight 1.

    Returns
    -------
    (d_suf_words, d_tags, d_words) : tuple of dict
        ``d_suf_words[suffix][tag]`` counts how many words with that suffix
        carry the tag, where the suffix is the last three characters of
        ``"^" + word``.
        ``d_tags[tag][word]`` is the word's weight.
        ``d_words[word]`` is the tag from the last line mentioning the word.
    """
    if unigram_counts is None:
        unigram_counts = unigrams
    d_suf_words = {}
    d_tags = {}
    d_words = {}
    with open(file, 'r', encoding='utf-8') as f:
        for line in f:
            fields = line.split()
            if len(fields) < 2:
                continue
            w, t = fields[0], fields[1]
            ws = ("^" + w)[-3:]
            d_words[w] = t
            # Count per (suffix, tag); the membership test must look inside
            # the suffix's own dict, otherwise every count is reset.
            tag_counts = d_suf_words.setdefault(ws, {})
            tag_counts[t] = tag_counts.get(t, 0) + 1
            d_tags.setdefault(t, {})[w] = unigram_counts.get(w, 1)
    return (d_suf_words, d_tags, d_words)
def gen_tag(d_t):
    """Draw one key of ``d_t`` at random, weighted by its value.

    ``d_t`` maps tag -> non-negative weight; the weights are normalised to
    probabilities before sampling with ``np.random.choice``.
    """
    candidates = list(d_t)
    weights = list(d_t.values())
    total = sum(weights)
    probabilities = [weight / total for weight in weights]
    picked = np.random.choice(len(candidates), p=probabilities)
    return candidates[picked]

In [4]:
# Build the suffix -> tag counts, tag -> word weights and word -> tag lookup
# that the following cells read as module-level globals.
d_suf_words, d_tags, d_words = load_supertags("supertags.txt")

In [5]:
import re
import numpy as np
# Lower-case vowels used for syllable counting.
sylabs = ['a', 'ą', 'e', 'ę', 'i', 'o', 'ó', 'u', 'y']
# Two-letter groups "i" + another vowel; built before the upper-case vowels
# are added, so these groups are lower-case only.
d_sylabs = ['i' + vowel for vowel in sylabs if vowel != 'i']
# Append the upper-case variants of every vowel.
sylabs.extend([vowel.upper() for vowel in sylabs])

def count_sylabs(s):
    """Return the syllable count of every word in ``s`` that has any.

    A syllable is counted for each character found in ``sylabs``; a pair
    listed in ``d_sylabs`` ("i" followed by another vowel) takes one count
    back so the pair contributes a single syllable. Words whose count is
    zero are left out of the result.
    """
    counts = []
    for word in s:
        # The last character has no successor, so it is handled on its own.
        total = 1 if word[-1] in sylabs else 0
        for current, following in zip(word, word[1:]):
            if current in sylabs:
                total += 1
            if current + following in d_sylabs:
                total -= 1
        if total:
            counts.append(total)
    return counts
          
def load_poem(file, syllables=13):
    """Extract tag and syllable templates from the lines of a poem.

    Only lines whose syllable counts (from ``count_sylabs``) add up to
    ``syllables`` are kept.

    Parameters
    ----------
    file : str
        Path to the poem text; it is read as UTF-8.
    syllables : int, optional
        Required syllable total of a line.

    Returns
    -------
    (poem_tags, poem_sylabs) : tuple of list
        One entry per kept line. ``poem_sylabs[i]`` is the list of per-word
        syllable counts. ``poem_tags[i]`` is the list of tags: the known tag
        from ``d_words``, otherwise a tag drawn with ``gen_tag`` from the
        word's suffix entry in ``d_suf_words``. Words with neither are
        skipped, so a tag list can be shorter than its syllable list.
    """
    # Raw-string character class: splits on any run of whitespace or of the
    # punctuation . , ! : ? ; | ( ) " without consuming neighbouring letters.
    splitter = re.compile(r'[\s.,!:?;|()"]+')
    poem_tags = []
    poem_sylabs = []
    with open(file, 'r', encoding='utf-8') as f:
        for line in f:
            c = count_sylabs(line.split())
            if sum(c) != syllables:
                continue
            poem_sylabs.append(c)
            tags = []
            for w in splitter.split(line.lower()):
                if not w:
                    continue
                if w in d_words:
                    tags.append(d_words[w])
                    continue
                suffix = ("^" + w)[-3:]
                if suffix in d_suf_words:
                    tags.append(gen_tag(d_suf_words[suffix]))
            poem_tags.append(tags)
    return poem_tags, poem_sylabs

In [6]:
# Extract the per-line tag and syllable templates from the poem file; the
# generator cells below read both names as module-level globals.
poem_tags, poem_sylabs = load_poem("pan-tadeusz.txt")

In [7]:
# Total of all unigram counts; used as the normaliser in PPMI.
N = sum(unigrams.values())
print(N)

14058114


In [8]:
from math import log as ln
def gen_word2(tag,syl,rhym,k):
    """Sample up to ``k`` distinct candidate words for one slot of a line.

    Parameters
    ----------
    tag : str
        Key into ``d_tags``; only words carrying this tag are considered.
    syl : int
        Required syllable count, as computed by ``count_sylabs``.
    rhym : str or False
        When truthy, a word is kept only if ``get_rhym(word)`` equals it.
    k : int
        Maximum number of words to return.

    Returns
    -------
    list of str, or False
        Distinct words drawn without replacement, weighted by their value in
        ``d_tags[tag]``. ``False`` when no word satisfies the constraints.
    """
    words = []
    probs = []
    for word, weight in d_tags[tag].items():
        if sum(count_sylabs(word.split())) != syl:
            continue
        if rhym and get_rhym(word) != rhym:
            continue
        words.append(word)
        probs.append(weight)
    if len(words) == 0:
        return False

    # Normalise once, not on every draw.
    sum_probs = sum(probs)
    normalised = [p_r / sum_probs for p_r in probs]
    # Sampling without replacement cannot return more items than there are
    # entries with non-zero probability.
    drawable = sum(1 for p_r in probs if p_r > 0)
    n_draw = max(0, min(k, drawable))
    picked = np.random.choice(len(words), size=n_draw, replace=False, p=normalised)
    return [words[r] for r in picked]
def PPMI(w1, w2):
    """Score the bigram ``(w1, w2)`` with a pointwise-mutual-information value.

    Returns 0 when the bigram is absent from ``bigrams``; otherwise the
    natural log of ``bigrams[w1][w2] * N / (unigrams[w1] * unigrams[w2])``.
    NOTE(review): the result is not clamped at 0, so it can be negative
    despite the name -- confirm whether that is intended.
    """
    followers = bigrams.get(w1)
    if followers is None or w2 not in followers:
        return 0
    return ln(followers[w2] * N / (unigrams[w1] * unigrams[w2]))
def rate(words):
    """Return the sum of ``PPMI`` over every adjacent word pair in ``words``."""
    total = 0
    # Explicit accumulation keeps the left-to-right float addition order.
    for left, right in zip(words, words[1:]):
        total += PPMI(left, right)
    return total

def gen_lines(k, gen_sylabs, gen_tags):
    """Generate two lines with a beam search, the second rhyming with the first.

    Parameters
    ----------
    k : int
        Beam width: candidates requested per slot from ``gen_word2`` and the
        number of partial lines kept after each step.
    gen_sylabs : sequence of two lists of int
        Per-word syllable counts for the first and second line.
    gen_tags : sequence of two lists of str
        Per-word tags for the two lines. When a tag list is shorter than its
        syllable list, the remaining slots use ``random_tag()``.

    Returns
    -------
    (str, str), or False
        The best-rated first and second line, or ``False`` when either line
        could not be completed.
    """

    def pick_tag(tags):
        # Use the template tag when there is one, otherwise a random tag.
        if len(tags) > 0:
            return tags[0]
        return random_tag()

    def local_search(states, syl_counts, tags, rhym):
        # Extend every partial line by one word per remaining slot and keep
        # the k best-rated results after each step.
        while states and len(syl_counts) > 0:
            # Only the last word of the line has to carry the rhyme.
            wanted_rhym = rhym if (rhym and len(syl_counts) == 1) else False
            new_states = []
            for s in states:
                beam = gen_word2(pick_tag(tags), syl_counts[0], wanted_rhym, k)
                if not beam:
                    # This state cannot be extended; other states may still
                    # succeed when their tag is drawn at random.
                    continue
                new_states.extend(s + " " + b for b in beam)
            new_states.sort(key=lambda s: rate(s.split()), reverse=True)
            states, syl_counts, tags = new_states[:k], syl_counts[1:], tags[1:]
        if not states:
            return False
        return states[0]

    def build_line(syl_counts, tags, rhym):
        # Seed the beam with the first word, then search the remaining slots.
        if len(syl_counts) == 0:
            return False
        first_rhym = rhym if (rhym and len(syl_counts) == 1) else False
        beam = gen_word2(pick_tag(tags), syl_counts[0], first_rhym, k)
        return local_search(beam, syl_counts[1:], tags[1:], rhym)

    l1 = build_line(gen_sylabs[0], gen_tags[0], False)
    if not l1:
        return False
    l2 = build_line(gen_sylabs[1], gen_tags[1], get_rhym(l1.split()[-1]))
    if not l2:
        return False
    return l1, l2

In [12]:
import random
def get_rhym(w):
    """Return the ending of ``w`` used for rhyme comparison.

    Scans from the end of the word, counting characters found in ``sylabs``
    and taking one back for each pair listed in ``d_sylabs``. The slice
    starting where the count reaches two is returned; if that never happens
    for indices ``len(w) - 2`` down to 1, the whole word is returned.
    """
    seen = 1 if w[-1] in sylabs else 0
    pos = len(w) - 2
    while pos > 0:
        if w[pos] in sylabs:
            seen += 1
        if w[pos:pos + 2] in d_sylabs:
            seen -= 1
        if seen == 2:
            return w[pos:]
        pos -= 1
    return w

def random_tag():
    """Return one key of ``d_tags`` chosen uniformly at random."""
    all_tags = list(d_tags)
    return random.choice(all_tags)

def gen_poem():
    """Generate one rhyming two-line poem from a random template pair.

    A template pair is the entries at an even index and the following index
    of ``poem_sylabs`` / ``poem_tags``. Up to 100 random pairs are tried.

    Returns
    -------
    (str, str), or None
        The two generated lines, or ``None`` when there are fewer than two
        templates or every attempt failed.
    """
    last_start = len(poem_sylabs) - 2
    if last_start < 0:
        return None
    for _ in range(100):
        # Even start index, chosen so that start + 1 is always a valid index.
        start = random.randrange(0, last_start + 1, 2)
        gen_sylabs = [poem_sylabs[start], poem_sylabs[start + 1]]
        gen_tags = [poem_tags[start], poem_tags[start + 1]]

        ans = gen_lines(10, gen_sylabs, gen_tags)
        if ans:
            return ans[0],ans[1]
    return None

In [14]:
# Print five generated couplets, separated by blank lines.
for j in range(5):
    couplet = gen_poem()
    if couplet is None:
        # gen_poem gives up after its attempts are exhausted; iterating over
        # None would raise TypeError.
        print("Could not generate a rhyming couplet")
    else:
        for i in couplet:
            print(i)
    print()

płyty zajęli wszyscy mieć na prowadzeniu
minister mówił wówczas kościoła dereniu

tadeusz mości nie miał gdy tworzono ściany
upłynął sierżant prawny i strzelił organy

jeśli ją szukał tu nie więc pewnie pokonał
chałturowcze bodaj mnie wielki talent skonał

wiesz co na całej polsce wsiołkowskich powodu
uchwałą niezależną od wielkiego głodu

raczej jak na budowę trzeba jeszcze było
oraz po zakończeniu skąd się wydarzyło

