In [1]:
import os
import re
import html
from bs4 import BeautifulSoup
from bs4 import NavigableString
from tqdm import tqdm_notebook as tqdm
from collections import Counter
import codecs
from nltk import word_tokenize
from nltk import pos_tag
import pandas as pd

In [2]:
import json
import numpy as np

In [63]:
import nltk
nltk.download('averaged_perceptron_tagger_ru')

[nltk_data] Downloading package averaged_perceptron_tagger_ru to
[nltk_data]     /Users/ika/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_ru.zip.


True

In [3]:
rhyme_files = []
corpus = []
poems_with_path = {}
start_path = '/Users/ika/Poetry_corpus/'
for root, dirs, files in os.walk(start_path):
    for fname in files:
        if not fname.startswith('.'):
            file_name = os.path.join(root, os.path.normpath(fname))
            short_name = re.sub(r'/Users/ika/Poetry_corpus/', '', file_name)
            rhyme_files.append(short_name)
            with codecs.open(os.path.join(root, fname), encoding='windows-1251') as f:
                raw_text = f.read()
                t = BeautifulSoup(raw_text, "lxml")
                corpus.append(t)
                poems_with_path.update({t: short_name})
                

In [4]:
STROPHE = "strophe"
EXTRA = "extra"
VERSES = "verses"
RHYMING_PAIRS_WITH_PATH = {}
EXCEPTIONS = []
FIND_RHYME_EXCEPTIONS = []
MAKE_RHYMING_PAIRS_EXCEPTIONS = []
MAKE_ENCIRCLING_PAIRS_EXCEPTIONS = []

In [5]:
RHYMING_PAIRS_ONLY_WITH_PATH = {}
ORDERED_RHYMING_PAIRS = []

In [6]:
def number_of_verses(poem):
    verses = get_attr(poem, VERSES)
    if verses == "unknown":
        return verses
    else:
        num = re.compile(r'^\d+')
        nmbr_of_verses = re.findall(num, verses)[0]
    return nmbr_of_verses



In [7]:
def determine_rhyming_type(poem):
    if poem.find(attrs={"name": "rhyme"}):
        rhyming_type = poem.find(attrs={"name": "rhyme"})["content"]
    elif poem.find(attrs={"name": "rime"}):
        rhyming_type = poem.find(attrs={"name": "rime"})["content"]
    else:
        rhyming_type = "unknown"
    return rhyming_type

In [8]:
def get_attr(poem, attr):
    if poem.find(attrs={"name": attr}):
        param_type = poem.find(attrs={"name": attr})["content"]
    else:
        param_type = "unknown"
    return param_type

In [9]:
def find_rhymes(poem):
    rhymes = []
    rhyme_tag = poem.find_all("rhyme-zone")
    for item in rhyme_tag:
        try:
            if isinstance(item.next_element, NavigableString):
                rhyme = item.next_element.strip(".,!?;:«»…-–—−() ").lower()
                rhymes.append(rhyme)
            else:
                rhyme = str(item.next_element.contents[0])
                rhymes.append(rhyme)
        except Exception:
            FIND_RHYME_EXCEPTIONS.append([poem, poems_with_path[poem]])
    return rhymes

In [10]:
def make_strophes(poem, n):
    lst = find_rhymes(poem)
    strophes = [lst[i:i + n] for i in range(0, len(lst), n)]
    return strophes

In [11]:
def make_rhyming_pairs_crossed_4(poem):
    n = 4
    pairs = []
    try:
        for strophe in make_strophes(poem, n):
            for i in range(2):
                pair = (strophe[i], strophe[i + 2])
                pairs.append(pair)
    except Exception:
        MAKE_RHYMING_PAIRS_EXCEPTIONS.append(poem)
    return pairs
        

In [12]:
def make_rhyming_pairs_encircling(poem):
    pairs = []
    try:
        for strophe in make_strophes(poem, 4):
            pair1 = (strophe[0], strophe[3])
            pair2 = (strophe[1], strophe[2])
            pairs.append(pair1)
            pairs.append(pair2)
    except Exception:
        MAKE_ENCIRCLING_PAIRS_EXCEPTIONS.append(poem)
    return pairs
        

In [13]:
def make_rhyming_pairs_paired(poem):
    pairs = []
    try:
        
        #if get_attr(poem, VERSES) != "unknown":
           # verses = number_of_verses(poem)
            #if not verses % 2:
        for i in range(0, len(find_rhymes(poem)), 2):
            pair = (find_rhymes(poem)[i], find_rhymes(poem)[i + 1])
            pairs.append(pair)
    except Exception:
        MAKE_RHYMING_PAIRS_EXCEPTIONS.append(poem)
    return pairs

In [14]:
def make_rhyming_pairs_even(poem):
    pairs = []
    for strophe in make_strophes(poem):
        pair = [strophe[1], strophe[3]]
        pairs.append(pair)
    return pairs

In [15]:
def make_rhyming_pairs_odd(poem):
    pairs = []
    for strophe in make_strophes(poem):
        pair = (strophe[0], strophe[2])
        pairs.append(pair)
    return pairs

In [16]:
def order_pair(pair):
    sorted_pair = sorted(pair)
    if sorted_pair == pair:
        sorted_pair.append(0)
    else:
        sorted_pair.append(1)
    return tuple(sorted_pair)

In [17]:
def get_complex_rhyming_code(poem):
    rhyming_type = determine_rhyming_type(poem)
    code_pattern = re.compile(r'\|[а-яё ]+')
    m = re.search(code_pattern, rhyming_type)
    code = m.group(0)
    code = re.sub(" ", "", code)
    code = re.sub("\|", "", code)
    return code

In [18]:
def get_indexes_from_code(rhyming_code):
    unique_letters = list(set(rhyming_code))
    list_of_index_lists = []
    for l in unique_letters:
        index_list = [index for index, char in enumerate(rhyming_code) if char == l and char != "х"]
        if index_list:
            list_of_index_lists.append(index_list)
    return list_of_index_lists

In [19]:
def make_pairs_6(poem):
    rhyming_code = get_complex_rhyming_code(poem)
    indexes = get_indexes_from_code(rhyming_code)
    pairs = []
    for strophe in make_strophes(poem, 6):
        pairs_in_strophe = []
        for lst in indexes:
            pair = []
            for i in lst:
                pair.append(strophe[i])
            pairs_in_strophe.append(pair)
        pairs.extend(pairs_in_strophe)
    return pairs

In [20]:
no_rhyme = []
unknown_rhyme = []
complex_rhyme = []
free_rhyme = []
sporadic_rhyme = []
weird_strophes = []
paired_rhyme = []
further_analysis = []

RHYMING_PAIRS = []

In [21]:
for poem in corpus:
    if determine_rhyming_type(poem) == "0":
        no_rhyme.append(poem)
    elif determine_rhyming_type(poem) == "unknown":
        unknown_rhyme.append(poem)
    elif "#" in determine_rhyming_type(poem):
        complex_rhyme.append(poem)
    elif determine_rhyming_type(poem) == "вольная":
        free_rhyme.append(poem)
    elif determine_rhyming_type(poem) == "спорадическая":
        sporadic_rhyme.append(poem)
    elif "нарушения строфики" in get_attr(poem, EXTRA):
        weird_strophes.append(poem)
    elif determine_rhyming_type(poem).lower() in "парная | аа ":
        paired_rhyme.append(poem)
    else:
        further_analysis.append(poem)
        
        
        
    
    

In [22]:
print("Нет рифмы: ", len(no_rhyme))
print("Неизвестно: ", len(unknown_rhyme))
print("Сложная рифма: ", len(complex_rhyme))
print("Вольная рифма: ", len(free_rhyme))
print("Спорадическая рифма: ", len(sporadic_rhyme))
print("Нарушения строфики: ", len(weird_strophes))
print("Парная рифма: ", len(paired_rhyme))
print("Анализируем дальше: ", len(further_analysis))

Нет рифмы:  4371
Неизвестно:  44
Сложная рифма:  1188
Вольная рифма:  11300
Спорадическая рифма:  754
Нарушения строфики:  5432
Парная рифма:  5284
Анализируем дальше:  48092


In [23]:
strophe_4 = []
further_analysis_2 = []

for poem in further_analysis:
    if get_attr(poem, STROPHE) == "4":
        strophe_4.append(poem)
    else:
        further_analysis_2.append(poem)


In [24]:
print("Строфа-4: ", len(strophe_4))
print("Едем дальше: ", len(further_analysis_2))

Строфа-4:  39937
Едем дальше:  8155


In [70]:
count = Counter()

for poem in strophe_4:
    strophe_type = determine_rhyming_type(poem)
    count[strophe_type] += 1
    

In [82]:
str4 = str(count).split(",")
stro_4 = "\n".join(str4)
with open('rhyme_in_strophe_4.txt', 'w') as f:
    f.write(stro_4)

In [25]:
strophe_4_wrong_number_of_verses = []
strophe_4_good_verses = []

for poem in strophe_4:
    if number_of_verses(poem) != "unknown":
        if int(number_of_verses(poem)) % 4:
            strophe_4_wrong_number_of_verses.append(poem)
        else:
            strophe_4_good_verses.append(poem)
    else:
        strophe_4_good_verses.append(poem)
            
                

In [26]:
print(len(strophe_4_wrong_number_of_verses))
print(len(strophe_4_good_verses))

587
39350


In [27]:
CROSSED_4_PAIRS_WITH_PATH = {}

strophe_4_continue = []
strophe_4_crossed_rhyme_types = ["перекрестная | абаб", "перекрестная",
                                   "перекрестная | амам", "перекрестная | мама", "перекрестная | тата",
                                   "перекрестная | атат", "перекрестная | арар", "перекрсетная | абаб",
                                   "перекрестная | рара", "переркестная | абаб", "перекерстная | абаб",
                                   "регулярная | абаб", "перекерстная", "перекрестный", "перекрестная | абба",
                                   "перекрестая | абаб", "перекрестные", "перкерестная", "перекретсная",
                                   "перекрестая", "перкрестная | абаб", "переркрестная | абаб",
                                   "перекрестная | ртрт", "перекрестная | рифма", "перекрестьная | абаб",
                                   "прекрестная | абаб", "перекрестная | абаб абаб", "перекрестная | рара рбрб"]

In [28]:
crossed_4_pairs = []
crossed_4_unordered_pairs = []

In [29]:
for poem in strophe_4_good_verses:
    if determine_rhyming_type(poem).strip() in strophe_4_crossed_rhyme_types:
        pairs = make_rhyming_pairs_crossed_4(poem)
        ordered_pairs = [order_pair(pair) for pair in pairs]
        CROSSED_4_PAIRS_WITH_PATH.update({poems_with_path[poem]: [poem, ordered_pairs]})
        crossed_4_pairs.extend(ordered_pairs)
        crossed_4_unordered_pairs.extend(pairs)
    else:
        strophe_4_continue.append(poem)
        
                
            
        

In [30]:
print(len(CROSSED_4_PAIRS_WITH_PATH))
print(len(crossed_4_pairs))
print(len(crossed_4_unordered_pairs))
print(len(strophe_4_continue))

35842
386180
386180
3497


In [36]:
print(crossed_4_unordered_pairs[100])

('каза̀лся', 'загора̀лся')


In [37]:
print(crossed_4_pairs[100])

('загора̀лся', 'каза̀лся', 1)


In [32]:
also_encircling_rhyme_types = ["цепная | абба ваав...", "цепная | абба бввб вггв..."]

In [165]:
encircling = []
strophe_4_further_analysis = []
for poem in strophe_4_continue:
    if "охватная" in determine_rhyming_type(poem).strip() or \
        determine_rhyming_type(poem).strip() in also_encircling_rhyme_types:
            encircling.append(poem)
    else:
        strophe_4_further_analysis.append(poem)
        
        
print(len(encircling))
print(len(strophe_4_further_analysis))
        
    
    

1728
1769


In [120]:
encircling_pairs = []
encircling_unordered_pairs = []
ENCIRCLING_PAIRS_WITH_PATH = {}

In [121]:
for poem in encircling:
    pairs = make_rhyming_pairs_encircling(poem)
    ordered_pairs = [order_pair(pair) for pair in pairs]
    ENCIRCLING_PAIRS_WITH_PATH.update({poems_with_path[poem]: [poem, ordered_pairs]})
    encircling_pairs.extend(ordered_pairs)
    encircling_unordered_pairs.extend(pairs)
   

In [122]:
print(len(encircling_pairs))
print(len(encircling_unordered_pairs))
print(len(ENCIRCLING_PAIRS_WITH_PATH))

12702
12702
1728


In [123]:
print(encircling_unordered_pairs[0])

('своѐй', 'людѐй')


In [23]:
def get_metainfo(poem):
    meta = poem.find_all('meta')
    all_names = []
    contents = []
    for item in meta:
        all_names.append(item['name'])
        contents.append([item['name'], item['content']])
    names = list(set(all_names))
    return names, contents
        

In [166]:
PAIRED_RHYME_PAIRS_WITH_PATH = {}

In [167]:
paired_rhyme_odd_number_of_verses = []
paired_rhyme_unknown_number_of_verses = []
paired_rhyme_pairs = []
paired_rhyme_unordered_pairs = []
for poem in paired_rhyme:
    if number_of_verses(poem) != "unknown":
        if int(number_of_verses(poem)) % 2:
            paired_rhyme_odd_number_of_verses.append(poem)
        else:
            pairs = make_rhyming_pairs_paired(poem)
            ordered_pairs = [order_pair(pair) for pair in pairs]
            PAIRED_RHYME_PAIRS_WITH_PATH.update({poems_with_path[poem]: [poem, ordered_pairs]})
            paired_rhyme_pairs.extend(ordered_pairs)
            paired_rhyme_unordered_pairs.extend(pairs)
    else:
        paired_rhyme_unknown_number_of_verses.append(poem)
        


In [168]:
print(len(paired_rhyme))
print(len(paired_rhyme_unknown_number_of_verses))
print(len(paired_rhyme_odd_number_of_verses))
print(len(PAIRED_RHYME_PAIRS_WITH_PATH))

5284
339
134
4800


In [169]:
len(paired_rhyme_pairs)
len(paired_rhyme_unordered_pairs)

56479

In [186]:
print(paired_rhyme_pairs[:10])

[('густы̀м', 'томѝм', 1), ('криво̀й', 'мно̀й', 1), ('волна̀', 'ясна̀', 1), ('гу̀л', 'сомкну̀л', 1), ('ѝзбрала̀', 'светла̀', 1), ('ѐй', 'кудрѐй', 1), ('свѐт', 'согрѐт', 1), ('прѝтая̀', 'я̀', 1), ('больно̀й', 'руко̀й', 1), ('еѐ', 'неѐ', 1)]


In [187]:
print(paired_rhyme_unordered_pairs[:10])

[('томѝм', 'густы̀м'), ('криво̀й', 'мно̀й'), ('ясна̀', 'волна̀'), ('гу̀л', 'сомкну̀л'), ('светла̀', 'ѝзбрала̀'), ('ѐй', 'кудрѐй'), ('согрѐт', 'свѐт'), ('я̀', 'прѝтая̀'), ('руко̀й', 'больно̀й'), ('еѐ', 'неѐ')]


In [171]:
ORDERED_RHYMING_PAIRS = paired_rhyme_pairs + crossed_4_pairs + encircling_pairs

In [172]:
len(ORDERED_RHYMING_PAIRS)

455361

In [173]:
unordered_pairs = crossed_4_unordered_pairs + paired_rhyme_unordered_pairs + encircling_unordered_pairs

In [174]:
UNORDERED_RHYMING_PAIRS = []
for pair in unordered_pairs:
    UNORDERED_RHYMING_PAIRS.append(tuple(sorted(pair)))
    
    

# put all dictionaries here!

парная рифма: PAIRED_RHYME_PAIRS_WITH_PATH: 4800 poems, 6.28 % <br>
перекрестная рифма: CROSSED_4_PAIRS_WITH_PATH: 35842 poems, 46.87% <br>
охватная рифма: ENCIRCLING_PAIRS_WITH_PATH: 1728 poems, 2.26% of the corpus

In [55]:
round(35842 / len(corpus) * 100, 2) + 6.28

53.15

In [175]:
counts = Counter(ORDERED_RHYMING_PAIRS)


In [176]:
len(counts)

277474

In [177]:
a = str(counts)


In [178]:
new_a = re.sub("Counter\(\{", "", a)
new_a = re.sub("\}\)$", "", new_a)

In [179]:
exp = re.compile("\(.+?\): \d+")
lst_a = re.findall(exp, new_a)
len(lst_a)

277474

In [180]:
rh_sorted = "\n".join(lst_a)
rh_sorted = re.sub("[)(]", "", rh_sorted)
print(rh_sorted[:200])

'дня̀', 'меня̀', 1: 622
'дѐнь', 'тѐнь', 1: 477
'нѐт', 'свѐт', 1: 474
'меня̀', 'огня̀', 1: 450
'но̀чи', 'о̀чи', 1: 410
'кро̀вь', 'любо̀вь', 1: 407
'вно̀вь', 'любо̀вь', 1: 384
'на̀с', 'ча̀с', 1: 360


In [181]:
with open('ordered_rhyming_pairs_by_frequency_55.txt', 'w') as f:
    f.write(rh_sorted)

In [182]:
ordered_counter = Counter(ORDERED_RHYMING_PAIRS)
with open("ordered_rhyming_pairs_by_freq_55_2.txt", 'w') as f:
    for k, v in  ordered_counter.most_common():
        f.write( "{} {}\n".format(v, k) )

In [183]:
ordered_counter.most_common()[0]

(('дня̀', 'меня̀', 1), 622)

In [184]:
unordered_counter = Counter(UNORDERED_RHYMING_PAIRS)
with open("unordered_rhyming_pairs_by_freq_55.txt", "w") as f:
    for k, v in unordered_counter.most_common():
        f.write("{} {}\n".format(v, k))

In [185]:
unordered_counter.most_common()[0]

(('дня̀', 'меня̀'), 622)

In [80]:
pairs_jdict = json.dumps(RHYMING_PAIRS_ONLY_WITH_PATH)

In [83]:
with open("rhyming_pairs_with_path.json", 'w') as f:
    f.write(pairs_jdict)

In [40]:
#print(RHYMING_PAIRS_WITH_PATH[poems_with_path[corpus[5588]]])

In [23]:
rhymes_tuples_list = []# рифмованные кортежи
paths_list = []# пути до файлов
poems_list = []# сами стихотворения
for key in RHYMING_PAIRS_WITH_PATH:
    paths_list.append(key)
    rhymes_tuples_list.append(RHYMING_PAIRS_WITH_PATH[key][1])
    poems_list.append(RHYMING_PAIRS_WITH_PATH[key][0])
    

In [24]:
print(rhymes_tuples_list[0])


[('густы̀м', 'томѝм', 1), ('криво̀й', 'мно̀й', 0), ('волна̀', 'ясна̀', 1), ('гу̀л', 'сомкну̀л', 0), ('ѝзбрала̀', 'светла̀', 1), ('ѐй', 'кудрѐй', 0), ('свѐт', 'согрѐт', 1), ('прѝтая̀', 'я̀', 1), ('больно̀й', 'руко̀й', 1), ('еѐ', 'неѐ', 0), ('гру̀дь', 'пу̀ть', 0), ('ждѐт', 'наро̀д', 0), ('су̀еты̀', 'ты̀', 1), ('погря̀з', 'ча̀с', 1), ('зако̀н', 'сто̀н', 1), ('го̀лово̀й', 'тобо̀й', 1), ('вѐрь', 'двѐрь', 0), ('левѝт', 'про̀звучѝт', 0), ('за̀падѐт', 'пло̀д', 0), ('жда̀ть', 'страда̀ть', 0), ('лу̀ч', 'ту̀ч', 0), ('моѐй', 'скорбѐй', 0), ('она̀', 'сна̀', 0), ('сѝл', 'служѝл', 0), ('вно̀вь', 'любо̀вь', 0)]


5267


In [27]:
def normal_letter(word):
    word = word.replace(chr(768), '')
    word = word.replace(chr(180), '')
    word = word.replace("ѝ", "и")
    word = word.replace("ѐ", "е")
    word = word.replace("Ѐ", "Е")
    word = word.replace("Ѝ", "И")
    return word

In [69]:
#print(pos_tag([normal_letter(word) for word in word_tokenize(poems_list[0].get_text())], lang='rus'))

In [28]:
rhymes_lists_list = []
number_of_rhymes_list = []
for r in rhymes_tuples_list:
    rhyming_words = []
    for tupl in r:
        words = tupl[:2]
        rhyming_words.extend(words)
    rhymes_lists_list.append(rhyming_words)
    number_of_rhymes_list.append(len(rhyming_words))
    
        



['густы̀м', 'томѝм', 'криво̀й', 'мно̀й', 'волна̀', 'ясна̀', 'гу̀л', 'сомкну̀л', 'ѝзбрала̀', 'светла̀', 'ѐй', 'кудрѐй', 'свѐт', 'согрѐт', 'прѝтая̀', 'я̀', 'больно̀й', 'руко̀й', 'еѐ', 'неѐ', 'гру̀дь', 'пу̀ть', 'ждѐт', 'наро̀д', 'су̀еты̀', 'ты̀', 'погря̀з', 'ча̀с', 'зако̀н', 'сто̀н', 'го̀лово̀й', 'тобо̀й', 'вѐрь', 'двѐрь', 'левѝт', 'про̀звучѝт', 'за̀падѐт', 'пло̀д', 'жда̀ть', 'страда̀ть', 'лу̀ч', 'ту̀ч', 'моѐй', 'скорбѐй', 'она̀', 'сна̀', 'сѝл', 'служѝл', 'вно̀вь', 'любо̀вь']


In [29]:
print(rhymes_lists_list[6])
print(number_of_rhymes_list[6])
print(len(rhymes_lists_list))
print(len(number_of_rhymes_list))

['зарѐю', 'сохо̀ю', 'нѐбеса̀м', 'та̀м', 'о̀роша̀ет', 'страда̀ет', 'весна̀', 'она̀', 'весѐлой', 'тяжѐлой', 'ла̀д', 'лежа̀т', 'но̀чи', 'о̀чи', 'бѐдняка̀', 'рука̀', 'могѝлы', 'сѝлы']
18
5267
5267


In [70]:

number_of_nouns_list = []
number_of_verbs_list = []
number_of_adjectives_list = []
number_of_words_list = []
for poem in poems_list:
    noun_counter = 0
    verb_counter = 0
    adj_counter = 0
    normalized_letters = [normal_letter(word) for word in word_tokenize(poem.get_text())]
    for word in normalized_letters:
        if not word:
            normalized_letters.remove(word)
    pos_tagged = pos_tag(normalized_letters, lang='rus')
    for word in pos_tagged:
        if word[1] == "S":
            noun_counter += 1
        elif word[1] == "V":
            verb_counter += 1
        elif word[1].startswith("A="):
            adj_counter += 1
        elif word[1] == "NONLEX":
            pos_tagged.remove(word)
        else:
            pass
    
    number_of_nouns_list.append(noun_counter)
    number_of_verbs_list.append(verb_counter)
    number_of_adjectives_list.append(adj_counter)
    number_of_words_list.append(len(pos_tagged))
print(len(number_of_nouns_list))
print(len(number_of_verbs_list))
print(len(number_of_adjectives_list))
print(len(number_of_words_list))

5267
5267
5267
5267


In [76]:
print(rhymes_lists_list[0])

['густы̀м', 'томѝм', 'криво̀й', 'мно̀й', 'волна̀', 'ясна̀', 'гу̀л', 'сомкну̀л', 'ѝзбрала̀', 'светла̀', 'ѐй', 'кудрѐй', 'свѐт', 'согрѐт', 'прѝтая̀', 'я̀', 'больно̀й', 'руко̀й', 'еѐ', 'неѐ', 'гру̀дь', 'пу̀ть', 'ждѐт', 'наро̀д', 'су̀еты̀', 'ты̀', 'погря̀з', 'ча̀с', 'зако̀н', 'сто̀н', 'го̀лово̀й', 'тобо̀й', 'вѐрь', 'двѐрь', 'левѝт', 'про̀звучѝт', 'за̀падѐт', 'пло̀д', 'жда̀ть', 'страда̀ть', 'лу̀ч', 'ту̀ч', 'моѐй', 'скорбѐй', 'она̀', 'сна̀', 'сѝл', 'служѝл', 'вно̀вь', 'любо̀вь']


In [77]:
number_of_rhyming_nouns_list = []
number_of_rhyming_verbs_list = []
number_of_rhyming_adjectives_list = []

for sublist in rhymes_lists_list:
    rhyming_noun_counter = 0
    rhyming_verb_counter = 0
    rhyming_adj_counter = 0
    normalized = [normal_letter(word) for word in sublist]
    for word in normalized:
        if not word:
            normalized.remove(word)
    pos_tagged = pos_tag(normalized, lang='rus')
    for w in pos_tagged:
        if w[1] == "S":
            rhyming_noun_counter += 1
        elif w[1] == "V":
            rhyming_verb_counter += 1
        elif w[1].startswith("A="):
            rhyming_adj_counter += 1
        else:
            pass
    number_of_rhyming_nouns_list.append(rhyming_noun_counter)
    number_of_rhyming_verbs_list.append(rhyming_verb_counter)
    number_of_rhyming_adjectives_list.append(rhyming_adj_counter)
    
        

mydf = pd.DataFrame({'Column1': range(1, 4), 'Column2': ['one', 'two', 'three']})<br>

Dataframe:<br>
Column_1 -  'poem_id': range(len(paths_list))<br>
Column_2 - 'century': century_list <br>
Column_3 - 'words': number_of_words_list <br>
Column_4 - 'rhymes': number_of_rhymes_list <br>
Column_5 - 'nouns': number_of_nouns_list <br>
Column_6 - 'verbs': number_of_verbs_list <br>
Column_7 - 'adjectives': number_of_adjectives_list <br>
Column_8 - 'rhyming_nouns': number_of_rhyming_nouns_list <br>
ДАЛЬШЕ: etc

In [81]:
century_list = []# век
for path in paths_list:
    if path.startswith('xix'):
        century = 'xix'
    else:
        century = 'xx'
    century_list.append(century)

        


In [None]:
print(len(century_list))

In [84]:
project_data = pd.DataFrame({'poem_id': range(len(paths_list)),
                            'century': century_list,
                            'words': number_of_words_list,
                            'rhymes': number_of_rhymes_list,
                            'nouns': number_of_nouns_list,
                            'verbs': number_of_verbs_list,
                            'adjectives': number_of_adjectives_list,
                            'rhyming_nouns': number_of_rhyming_nouns_list,
                            'rhyming_verbs': number_of_rhyming_verbs_list,
                            'rhyming_adjectives': number_of_rhyming_adjectives_list})

In [86]:
project_data.to_csv("poetic.csv")

In [20]:
complete_rhyme_types_list = []
for poem in corpus:
    complete_rhyme_types_list.append(determine_rhyming_type(poem))

count = Counter()
for t in complete_rhyme_types_list:
    count[t] +=1
    
rh = str(count).split(",")
complete_rh = "\n".join(rh)
with open('complete_rhyming_types.txt', 'w') as f:
    f.write(complete_rh)

# Здесь всё, что мы пока обработать не можем

## paired_rhyme_odd_number_of_verses: 134
## Неизвестно:  44
## Вольная рифма:  11300
## Спорадическая рифма:  754
## Нарушения строфики:  5698
## paired_rhyme_unknown_number_of_verses: 339
## paired_rhyme_odd_number_of_verses: 134
## strophe_4_wrong_number_of_verses: 587
## EXCEPTIONS
