In [1]:
import re
import numpy as np
import os
os.sys.path.append('../1/')
from z2 import loader
from math import log
import sys
import heapq
import collections


# Polish vowels, lower case followed by upper case; count_syllable counts these.
_LOWERCASE_VOWELS = 'aeioóuyąę'
vowels = [*_LOWERCASE_VOWELS, *_LOWERCASE_VOWELS.upper()]
# Two-letter sequences "i" + vowel (every vowel except lower-case "i").
# count_syllable subtracts one for each occurrence, so such a pair counts once.
compacted_vovels = [f'i{vowel}' for vowel in vowels if vowel != 'i']
# Lookup tables: word -> tag, and tag -> list of words carrying that tag.
word2tag = {}
tag2word = {}


# Characters removed by stringNorm, written as the inside of a regex character
# class. A raw string keeps the backslashes literal, so the interpreter no
# longer reports invalid escape sequences such as "\." or "\(".
_NORM_PUNCTUATION = r",\.!?:;'\*\-“…\(\)„”—»«–––=\[\]’"
# Compiled once instead of on every call.
_NORM_STRIP_PUNCT_AND_DIGITS = re.compile(f"[{_NORM_PUNCTUATION}0-9]")
_NORM_STRIP_PUNCT = re.compile(f"[{_NORM_PUNCTUATION}]")


def stringNorm(sent, num=False):
    """Lower-case ``sent`` and delete punctuation characters from it.

    Args:
        sent: text to normalise.
        num: when False (default) ASCII digits are deleted as well; when True
            digits are kept.

    Returns:
        The lower-cased text with the listed characters removed. Whitespace is
        left untouched, so removing a token can leave consecutive spaces.
    """
    pattern = _NORM_STRIP_PUNCT if num else _NORM_STRIP_PUNCT_AND_DIGITS
    return pattern.sub('', sent.lower())

def bigrams2unigrams(bigrams):
    """Collapse nested bigram counts into one number per first word.

    For every key ``w1`` of ``bigrams`` the values ``bigrams[w1][w2]`` are
    converted to float, summed, and the sum is divided by two.

    Returns:
        A dict mapping each first word to that halved total.
    """
    totals = {}
    for first_word in bigrams:
        followers = bigrams[first_word]
        count_sum = sum(float(followers[second_word]) for second_word in followers)
        totals[first_word] = count_sum / 2
    return totals

def count_syllable(phrase, verose=False):
    """Estimate the number of syllables in ``phrase``.

    Every character found in ``vowels`` adds one; every position where the
    two-character slice starting there is in ``compacted_vovels`` ("i" followed
    by a vowel) subtracts one, so such a pair contributes a single syllable.

    Args:
        phrase: the text to scan.
        verose: when true, print each counted vowel and each subtracted pair.

    Returns:
        The resulting count as an int.
    """
    total = 0
    for position in range(len(phrase)):
        char = phrase[position]
        pair = phrase[position:position + 2]
        if char in vowels:
            total += 1
            if verose:
                print(char)
        if pair in compacted_vovels:
            total -= 1
            if verose:
                print(pair)
    return total


# Fill word2tag / tag2word from the supertag file. Each non-blank line is
# expected to normalise to exactly two whitespace-separated fields: word, tag.
# The encoding is explicit so that Polish characters are decoded the same way
# regardless of the platform's default locale.
with open("data/supertags.txt", encoding="utf-8") as tags:
    for line in tags:
        fields = stringNorm(line, num=True).split()
        if not fields:
            # Blank (or punctuation-only) line: nothing to record.
            continue
        # Any other field count is a malformed line and still fails loudly here.
        word, tag = fields
        word2tag[word] = tag
        tag2word.setdefault(tag, []).append(word)
            

# Bigram table read from the corpus file.
# NOTE(review): `loader` lives in ../1/z2.py and is not visible here; the code
# below uses its result as a dict of dicts, bigrams[w1][w2] -> count.
bigrams = loader('../1/poleval_2grams.txt', cut = 10)

# Reverse index: rev_bigram[w2][w1] -> value stored for the bigram (w1, w2).
# The previous dict comprehension replaced the inner dict every time w2
# re-appeared, so each word kept only one predecessor. Accumulating keeps them
# all, at the cost of holding roughly a second copy of the table in memory.
rev_bigram = {}
for w1 in bigrams:
    for w2 in bigrams[w1]:
        rev_bigram.setdefault(w2, {})[w1] = bigrams[w1][w2]

unigrams = bigrams2unigrams(bigrams)
# Words seen both as the first and as the second element of some bigram.
safeGrams = set(rev_bigram).intersection(bigrams)

# Verses of the poem, each stored as a list of normalised words. Only lines
# for which count_syllable gives exactly 13 are kept.
# Each line is normalised once (previously twice), and the file encoding is
# explicit so Polish characters do not depend on the platform default.
pt = []
with open('data/pan-tadeusz.txt', encoding='utf-8') as f:
    for raw_line in f:
        normalised = stringNorm(raw_line)
        if count_syllable(normalised) == 13:
            pt.append(normalised.split())
    
# How often each per-word syllable pattern occurs among the kept verses.
# A pattern is the tuple of syllable counts of a verse's words, e.g.
# (4, 1, 2, 1, 2, 3) for a six-word verse.
acents = collections.Counter(
    tuple(map(count_syllable, verse)) for verse in pt
)


0
1000000
2000000
3000000
4000000
5000000
6000000
7000000
8000000
9000000
10000000
11000000
12000000
13000000
14000000
15000000
16000000
17000000
18000000
19000000
20000000
21000000
22000000
23000000
24000000
25000000
26000000
27000000
28000000
29000000
30000000
31000000
32000000
33000000
34000000
35000000
36000000
37000000
38000000
39000000
40000000
41000000
42000000
43000000
44000000
45000000
46000000
47000000
48000000
49000000
50000000
51000000
52000000
53000000
54000000
55000000
56000000
57000000
58000000
59000000


In [2]:

def get_rym(w):
    """Return the rhyming tail of ``w``.

    The tail is the shortest suffix of ``w`` for which count_syllable gives
    exactly 2. Returns None when no suffix qualifies.
    """
    # Scanning start positions from the right, the first hit is the shortest
    # qualifying suffix, i.e. the last hit of a left-to-right scan.
    for start in reversed(range(len(w))):
        tail = w[start:]
        if count_syllable(tail) == 2:
            return tail
    return None

def sample_verset(max_tries=100000):
    """Draw a random verse from ``pt`` together with a neighbouring verse that rhymes with it.

    A verse index is drawn uniformly. If the following verse ends with the same
    rhyme (see get_rym) the pair ``(verse, next verse)`` is returned; otherwise,
    if the preceding verse rhymes, ``(verse, previous verse)`` is returned.
    When neither neighbour rhymes a new index is drawn.

    Differences from the earlier recursive version:
      * retries are a bounded loop instead of unbounded recursion behind a
        bare ``except``;
      * index 0 no longer wraps around to the last verse when looking at the
        "previous" line, and the last verse is still checked against its
        predecessor instead of being discarded by an IndexError;
      * a verse whose last word has no rhyme (get_rym returns None) is never
        treated as rhyming with another such verse.

    Args:
        max_tries: upper bound on the number of random draws.

    Returns:
        A tuple of two verses, each a list of words.

    Raises:
        ValueError: ``pt`` holds fewer than two verses.
        RuntimeError: no rhyming neighbours were found within ``max_tries`` draws.
    """
    verse_count = len(pt)
    if verse_count < 2:
        raise ValueError("sample_verset needs at least two verses in pt")

    for _ in range(max_tries):
        index = np.random.choice(verse_count)
        rym = get_rym(pt[index][-1])
        if rym is None:
            continue
        if index + 1 < verse_count and get_rym(pt[index + 1][-1]) == rym:
            return pt[index], pt[index + 1]
        if index > 0 and get_rym(pt[index - 1][-1]) == rym:
            return pt[index], pt[index - 1]

    raise RuntimeError(f"no rhyming neighbouring verses found in {max_tries} draws")
    

def get_accents(phrase):
    """Return a list with the count_syllable result of each item of ``phrase``, in order."""
    return list(map(count_syllable, phrase))


In [59]:
def sameTags(w):
    """Return the list of words that share a tag with ``w``.

    The tag is looked up under ``w`` itself first. If ``w`` is unknown, the
    last three characters of ``'^' + w`` are used as a fallback key (for words
    shorter than three characters that key keeps the leading ``'^'``).

    Returns:
        The list stored in ``tag2word`` for the tag found (the shared list,
        not a copy).

    Raises:
        Exception: neither ``w`` nor the fallback key is present in ``word2tag``.
    """
    if w in word2tag:
        return tag2word[word2tag[w]]

    suffix_key = ('^' + w)[-3:]
    if suffix_key in word2tag:
        return tag2word[word2tag[suffix_key]]

    # The random-tag fallback that used to follow this raise was unreachable
    # and has been removed; callers rely on the exception to retry.
    raise Exception(f"***niestety nie ma {w} i {suffix_key}***\n")
    
def createAltWords(accent, verse, rime=None):
    """Build, for every word of ``verse``, the list of words that may replace it.

    A replacement for the word at position ``i`` must
      * share a tag with the original word (see sameTags),
      * have ``accent[i]`` syllables according to count_syllable, and
      * belong to ``safeGrams``.

    Args:
        accent: syllable count required at each position of ``verse``.
        verse: list of words used as the template.
        rime: unused; kept so existing calls remain valid.

    Returns:
        A list with one list of distinct candidate words per position. The
        order inside each inner list is arbitrary.

    Raises:
        Exception: propagated from sameTags when a word has no known tag.
        IndexError: ``accent`` is shorter than ``verse``.
    """
    alternatives = []
    for i, w in enumerate(verse):
        tagged = sameTags(w)
        wanted = accent[i]
        # Membership is tested against safeGrams directly; the earlier version
        # copied the whole set once per word. The cheap set lookup runs before
        # the syllable count.
        alternatives.append(list({
            candidate
            for candidate in tagged
            if candidate in safeGrams and count_syllable(candidate) == wanted
        }))
    return alternatives

def getProb2(word, prev):
    """Return the value stored for the bigram (``prev``, ``word``).

    When ``word`` is not recorded after ``prev`` the constant 0.001 is returned
    instead. ``prev`` itself must be a key of ``bigrams``.
    """
    followers = bigrams[prev]
    return followers[word] if word in followers else 0.001

def getRandWord(w):
    """Draw one word from ``w`` with probability proportional to its ``unigrams`` value.

    Args:
        w: sequence of candidate words; each must be a key of ``unigrams``.

    Returns:
        The chosen word as a plain ``str``, matching what getRandWord2Rev returns.

    Raises:
        Exception: ``w`` is empty.
    """
    # Reject empty input before any arithmetic on the weights.
    if len(w) == 0:
        raise Exception("Pusto")

    weights = np.array([unigrams[x] for x in w], dtype=float)
    chosen = np.random.choice(w, 1, p=weights / np.sum(weights))
    return str(chosen[0])

def getRandWord2Rev(words, _next):
    """Draw the word that precedes ``_next``.

    Each candidate in ``words`` is weighted by ``getProb2(_next, candidate)``;
    the weights are normalised to sum to one and a single word is sampled.

    Returns:
        The sampled word as a ``str``.
    """
    raw_scores = [getProb2(_next, candidate) for candidate in words]
    weights = np.asarray(raw_scores).astype(float)
    distribution = weights / weights.sum()
    picked = np.random.choice(words, 1, p=distribution)
    return str(picked[0])

def generateAccent(k=1):
    """Sample ``k`` syllable patterns from ``acents``, weighted by their counts.

    Args:
        k: number of patterns to draw (with replacement).

    Returns:
        A generator yielding ``k`` pattern tuples, so the result can be
        unpacked directly: ``a, b = generateAccent(2)``.
    """
    # Materialise the keys once; the earlier version rebuilt list(acents) for
    # every sampled index.
    patterns = list(acents)
    counts = np.array([acents[pattern] for pattern in patterns])
    drawn = np.random.choice(len(patterns), k, p=counts / np.sum(counts))
    return (patterns[i] for i in drawn)

In [65]:
def _fill_verse_backwards(alternatives, last_word):
    """Complete a verse right-to-left, starting from its already chosen last word.

    Every earlier position is drawn with getRandWord2Rev from that position's
    candidates, conditioned on the word just placed to its right.

    Returns:
        The words of the verse in reading order.
    """
    words = [last_word]
    for candidates in reversed(alternatives[:-1]):
        words.append(getRandWord2Rev(candidates, words[-1]))
    words.reverse()
    return words


# Keep trying until one rhyming couplet is generated: sample a template couplet
# from the poem, then replace every word by a random word with the same tag and
# syllable count.
licznik = 0  # number of failed attempts so far
while True:
    try:
        v1, v2 = sample_verset()
        # Syllable patterns are copied from the template couplet; drawing them
        # with generateAccent(2) is the alternative the notebook experimented with.
        a1, a2 = get_accents(v1), get_accents(v2)

        alt1 = createAltWords(a1, v1)
        sent1 = _fill_verse_backwards(alt1, getRandWord(alt1[-1]))

        alt2 = createAltWords(a2, v2)
        rhyme = get_rym(sent1[-1])
        # The second verse must end with a word that rhymes with, but differs
        # from, the last word of the first verse. Excluding that word up front
        # replaces the earlier "sample until different" loop.
        alt2[-1] = [
            x for x in alt2[-1]
            if get_rym(x) == rhyme and x != sent1[-1]
        ]
        if not alt2[-1]:
            raise Exception("Ten sam wyraz")
        sent2 = _fill_verse_backwards(alt2, getRandWord(alt2[-1]))

        print(' '.join(v1))
        print(' '.join(v2))
        print()
        print(' '.join(sent1))
        print(' '.join(sent2))
        break
    except Exception:
        # Any failure (unknown tag, no candidates, no rhyme) means this
        # template is abandoned and another one is tried. Catching Exception
        # rather than everything lets a kernel interrupt stop the loop.
        licznik += 1
        print(f"Próba nr: {licznik}")

poważnie palec wielki ku ziemi naginał
a potem machnął ręką jak gdyby przecinał

obecnie projekt ważny do świecy prowadził
nad wapniem zerwał strugą sęp cudnie gromadził


In [62]:
# Draw two syllable patterns at random and show them as the cell's result.
a, b = generateAccent(k=2)
(a, b)

((4, 1, 2, 1, 2, 3), (2, 2, 0, 1, 2, 0, 2, 1, 3))