In [None]:
import re
import math
import random
import morfeusz2
from wordcloud import WordCloud
import matplotlib.pyplot as plt

In [3]:
def divide_into_chapters(book_lines):
    chapter_regex = re.compile(r'^[MDCLXVI]+\.(?=\s|$)', re.I)

    chapters = []
    current_chapter = []

    for line in book_lines:
        if line.strip() == "-----":
            break

        if chapter_regex.match(line.strip()):
            if current_chapter and "".join(current_chapter).strip():
                chapters.append("".join(current_chapter).strip())
                current_chapter = []
            current_chapter.append(line)
        else:
            current_chapter.append(line)

    if current_chapter:
        chapters.append("".join(current_chapter).strip())

    return chapters


def read_book():
    book_volume_one_path = "lalka/lalka-tom-pierwszy.txt"
    with open(book_volume_one_path, "r") as f:
        book_volume_one = f.readlines()

    start_index = next((i for i, line in enumerate(book_volume_one) if "Tom I" in line or line.strip() == "Tom I"), 0)
    book_volume_one = book_volume_one[start_index + 1:]

    book_volume_one_chapters = divide_into_chapters(book_volume_one)
    print(f"Read {len(book_volume_one_chapters)} chapters from volume one.")
    for chapter in book_volume_one_chapters:
        with open(f"lalka/chapters/volume_one/chapter_{book_volume_one_chapters.index(chapter) + 1}.txt", "w") as f:
            f.write(chapter)

    book_volume_two_path = "lalka/lalka-tom-drugi.txt"
    with open(book_volume_two_path, "r") as f:
        book_volume_two = f.readlines()

    start_index = next((i for i, line in enumerate(book_volume_two) if "Tom II" in line or line.strip() == "Tom II"), 0)
    book_volume_two = book_volume_two[start_index + 1:]

    book_volume_two_chapters = divide_into_chapters(book_volume_two)
    print(f"Read {len(book_volume_two_chapters)} chapters from volume two.")
    for chapter in book_volume_two_chapters:
        with open(f"lalka/chapters/volume_two/chapter_{book_volume_two_chapters.index(chapter) + 1}.txt", "w") as f:
            f.write(chapter)

    return book_volume_one_chapters + book_volume_two_chapters

book = read_book()

Read 20 chapters from volume one.
Read 18 chapters from volume two.


In [4]:
polish_stopwords = set()
with open("polish_stopwords.txt", "r") as f:
    for line in f:
        polish_stopwords.add(line.strip())
        

def text_to_base_words(text):
    polish_morph = morfeusz2.Morfeusz()

    all_words = [word.lower() for word in text.split() if word.isalpha() and len(word) > 3]
    analyzed_base_words = [polish_morph.analyse(word)[0][2][1].lower() for word in all_words]
    analyzed_base_words = [w.split(":")[0].strip() if ":" in w else w for w in analyzed_base_words]
    
    return [word for word in analyzed_base_words if word not in polish_stopwords and len(word) >= 3]

words_in_book = text_to_base_words(" ".join(book))
words_in_chapters = [text_to_base_words(chapter) for chapter in book]

In [79]:
word_freq_in_book = {}
for word in words_in_book:
    word_freq_in_book[word] = word_freq_in_book.get(word, 0) + 1
sorted_word_freq_in_book = sorted(word_freq_in_book.items(), key=lambda x: x[1], reverse=True)

print("Top 20 words in the book:")
for word, freq in sorted_word_freq_in_book[:20]:
    print(f"{word}: {freq}")

WordCloud = WordCloud(width=800, height=400, background_color='white')
WordCloud.generate_from_frequencies(word_freq_in_book)
WordCloud.to_file("lalka/wordclouds/wordcloud_book.png")

word_freq_in_chapters = [{} for _ in range(len(words_in_chapters))]
sorted_word_freq_in_chapters = [{} for _ in range(len(words_in_chapters))]
for i, chapter_words in enumerate(words_in_chapters):
    for word in chapter_words:
        word_freq_in_chapters[i][word] = word_freq_in_chapters[i].get(word, 0) + 1
    sorted_word_freq_in_chapters[i] = sorted(word_freq_in_chapters[i].items(), key=lambda x: x[1], reverse=True)

for i, chapter_freq in enumerate(sorted_word_freq_in_chapters):
    print(f"\nTop 10 words in chapter {i + 1}:")
    for word, freq in chapter_freq[:10]:
        print(f"{word}: {freq}")

Top 20 words in the book:
wokulski: 1441
panna: 1096
mówić: 750
mieć: 626
móc: 518
swój: 514
ażeby: 512
chcieć: 507
rzec: 472
izabela: 452
chwila: 437
człowiek: 417
soba: 411
odeprzeć: 386
zrobić: 347
myśleć: 314
wielki: 302
czas: 296
ręka: 291
zacząć: 282

Top 10 words in chapter 1:
wokulski: 11
sklep: 10
radca: 10
szkoła: 6
rok: 5
służyć: 5
ajent: 5
firma: 4
mincel: 4
kupiec: 4

Top 10 words in chapter 2:
ignacy: 13
sklep: 10
szanowny: 10
swój: 9
rzecki: 9
mraczewski: 7
subiekt: 6
czas: 6
chwila: 6
kalosz: 6

Top 10 words in chapter 3:
sklep: 19
ojciec: 18
napoleon: 14
ciotka: 14
raczek: 14
stary: 13
mincel: 13
swój: 10
ażeby: 10
czas: 10

Top 10 words in chapter 4:
wokulski: 24
ignacy: 20
mieć: 12
mówić: 11
rzec: 9
tysiąc: 8
odeprzeć: 8
swój: 7
chcieć: 7
rzecki: 7

Top 10 words in chapter 5:
panna: 41
izabela: 21
tomasz: 18
swój: 18
świat: 14
wielki: 13
ażeby: 12
czas: 12
mieć: 11
piękny: 10

Top 10 words in chapter 6:
panna: 92
izabela: 35
ręka: 26
ojciec: 24
ażeby: 20
florentyna: 

In [78]:
def calculate_tf_idf_weights(word_freq_in_chapters):
    chapters_number = len(word_freq_in_chapters)

    weights = {}
    for chapter_index, chapter_words in enumerate(word_freq_in_chapters):
        weights[chapter_index] = {}
        for word_in_chapter, count_in_chapter in chapter_words.items():
            chapters_with_word = sum([1 if word_in_chapter in chapter else 0 for chapter in word_freq_in_chapters])
            weights[chapter_index][word_in_chapter] = count_in_chapter * math.log(chapters_number / (1 + chapters_with_word))

    return weights

tf_idf_weights = calculate_tf_idf_weights(word_freq_in_chapters)

tf_idf_weights

{0: {'wyglądać': 0.1112256351102244,
  'firma': 6.76670404268429,
  'mincel': 6.2325784721862,
  'wokulski': 0.293350717903774,
  'szkło': 1.8458266904983307,
  'początki': 1.55814461804655,
  'rok': 0.27033610635137895,
  'świat': 0.05406722127027579,
  'polityczny': 1.072636802264849,
  'zajmować': 0.3794896217049037,
  'pokój': 0.1410785982599056,
  'wybór': 1.55814461804655,
  'nowy': 0.1112256351102244,
  'papież': 2.9444389791664403,
  'szansa': 2.028148247292285,
  'europejski': 1.8458266904983307,
  'warszawski': 1.6916760106710724,
  'kupiec': 1.2215265982047276,
  'tudzież': 1.8590719172483514,
  'inteligencja': 2.9444389791664403,
  'pewny': 1.3862943611198906,
  'okolica': 1.072636802264849,
  'krakowski': 1.55814461804655,
  'przedmieście': 1.8458266904983307,
  'niemniej': 1.6916760106710724,
  'gorąco': 0.8649974374866045,
  'interesować': 0.9295359586241757,
  'przyszłość': 1.4403615823901663,
  'galanteryjny': 3.2179104067945468,
  'sklep': 1.112256351102244,
  'renomo

In [62]:
for chapter_index, chapter_weights in tf_idf_weights.items():
    wc = WordCloud(width=800, height=400, background_color='white')
    wc.generate_from_frequencies(chapter_weights)
    wc.to_file(f"lalka/wordclouds/chapter_{chapter_index}.png")

In [61]:
def best_fitting_chapter(tf_idf_weights, word, topk):
    polish_morph = morfeusz2.Morfeusz()
    core_word = polish_morph.analyse(word)[0][2][1].lower()

    weights = {}
    for chapter_index, chapter_weights in tf_idf_weights.items():
        if core_word in chapter_weights:
            weights[chapter_index] = chapter_weights[core_word]
        else:
            weights[chapter_index] = 0

    top_weights = sorted(weights.items(), key=lambda x: x[1], reverse=True)
    
    return top_weights[:topk]

best_fitting_chapter(tf_idf_weights, "sklep", 5)
best_fitting_chapter(tf_idf_weights, "geist", 5)

[(22, 81.04650474983383),
 (35, 15.759042590245468),
 (37, 2.2512917986064953),
 (0, 0),
 (1, 0)]

In [64]:
def find_top_successors(all_words, target_word, topk):
    counts = {}
    for index, word in enumerate(all_words):
        if index + 1 < len(all_words):
            if word != target_word:
                continue

            successor = all_words[index + 1]
            counts[successor] = counts.get(successor, 0) + 1
    
    top_count = sorted(counts.items(), key=lambda x: x[1], reverse=True)

    return top_count[:topk]

top_successor = {}
for word in set(words_in_book):
    top_successor[word] = find_top_successors(words_in_book, word, 5)

top_successor

{'odwracać': [('głowa', 2),
  ('matka', 1),
  ('nałożyć', 1),
  ('tyć', 1),
  ('ażeby', 1)],
 'zasługiwać': [('dawać', 1),
  ('wysoki', 1),
  ('myśleć', 1),
  ('odeprzeć', 1),
  ('nazywać', 1)],
 'prezencik': [('dostać', 1)],
 'swój': [('mieszkanie', 8),
  ('wokulski', 6),
  ('gabinet', 5),
  ('towarzysz', 5),
  ('mieć', 5)],
 'żyć': [('znaczyć', 1),
  ('łaska', 1),
  ('wyssać', 1),
  ('chcieć', 1),
  ('zawołać', 1)],
 'wczesny': [('plecy', 1), ('poradzić', 1), ('motyl', 1)],
 'zawiązywać': [('towarzystwo', 1),
  ('plac', 1),
  ('spółka', 1),
  ('stosunki', 1)],
 'preparować': [('pamiętać', 1)],
 'rześki': [('zwrócić', 1), ('wiatr', 1)],
 'kręcić': [('głowa', 6),
  ('mieszkać', 1),
  ('łasić', 1),
  ('myślić', 1),
  ('sklep', 1)],
 'staś': [('pisać', 1),
  ('własny', 1),
  ('dopiero', 1),
  ('osiem', 1),
  ('zamknąć', 1)],
 'zwiędły': [('powtórzyć', 1)],
 'twierdza': [('dzień', 1)],
 'obniżyć': [('skalić', 1)],
 'siać': [('niepochwytny', 1)],
 'pora': [('robić', 1),
  ('wytknąć', 1),
 

In [77]:
def random_chapter(top_successor, word, length):
    chapter = [word]
    current = word
    for _ in range(length):
        successors = top_successor.get(current, [])
        words, weights = zip(*successors)
        next_word = random.choices(words, weights=weights, k=1)[0]
        current = next_word

        chapter.append(next_word)

    return " ".join(chapter)

random_chapter(top_successor, "sklep", 50)
random_chapter(top_successor, "izabela", 50)
random_chapter(top_successor, "geist", 50)

'geist przeciągnąć mieć zamiar wyjście chętnie pójść sklep wokulski spojrzeć panna izabela pójść swój gabinet czytać nareszcie wejść drugie wokulski móc zrobić wielki świat mieć zamiar kupić soba sprawa trzymać ręka noga oprzeć broda ksiądz trzymać ręka noga brudny budynek mieć zamiar kupić ażeby móc miecić szczęście dodawać cichy westchnienie drzwi'