# Primary Text Processing

## Import

### Libraries

In [1]:
import os 
import codecs
from lxml import etree

from backend import *

### Definitions

In [2]:
# NLP pipeline components. The classes are pulled in by `from backend import *`
# (presumably re-exported from the Natasha library -- TODO confirm in backend).

# One embedding instance is created and passed to all three taggers/parsers below.
emb = NewsEmbedding()
morph_tagger, syntax_parser, ner_tagger = (
    NewsMorphTagger(emb),
    NewsSyntaxParser(emb),
    NewsNERTagger(emb),
)

# Components that do not take the embedding.
segmenter = Segmenter()
morph_vocab = MorphVocab()
names_extractor = NamesExtractor(morph_vocab)

In [3]:
# Directory that is scanned for the primary-text .txt files.
texts = '../texts/fiction/'

# Columns of the library table (one row per work).
libCols = [
    'author',
    'pub_year',
    'title',
    'text',
]

# Index levels of the token table, outermost to innermost:
# work -> part -> paragraph -> sentence -> token.
tokenOHCO = [
    'w_id',
    'part_num',
    'para_num',
    'sent_num',
    'token_num',
]

# Column names for an annotated token table: positional/syntactic fields first,
# then morphological features. (Not referenced elsewhere in this section.)
tokenCols = [
    'p_id', 'start', 'stop', 'text', 'token_id', 'head_id', 'rel', 'pos', 'lemma',
] + [
    'anim', 'aspect', 'case', 'degree', 'gender', 'mood',
    'number', 'person', 'tense', 'verb_form', 'voice',
]

## Primary Texts

### Library

In [4]:
# Build the library table: one row per `<author>-<year>-<title>.txt` file in `texts`.
#
# Rows are collected in a plain list and turned into a DataFrame once;
# `DataFrame.append` in a loop is quadratic and no longer exists in pandas >= 2.0.
fileNamePattern = re.compile(r'(\w+)-(\d{4})-(.+)\.txt')  # the dot before "txt" is literal

libRecords = []
for fileName in os.listdir(texts):
    if not fileName.endswith('.txt'):
        continue
    info = fileNamePattern.fullmatch(fileName)
    if info is None:
        # Fail with a readable message instead of an AttributeError on `None.group`.
        raise ValueError(
            f"Unexpected file name {fileName!r} in {texts!r}; "
            "expected '<author>-<year>-<title>.txt'"
        )
    # The source files are read as windows-1251 (Cyrillic) text.
    with codecs.open(os.path.join(texts, fileName), 'r', encoding='windows-1251') as f:
        libRecords.append({
            'author': info.group(1),
            'pub_year': int(info.group(2)),
            'title': info.group(3),
            'text': f.read(),
        })

libDf = pd.DataFrame(libRecords, columns=libCols)

# Order works by publication year, then title; the resulting position is the work id.
libDf = libDf.sort_values(libCols[1:3]).reset_index(drop=True)
libDf.index.name = 'w_id'

# Keep the (large) raw texts in their own frame and the metadata in `libDf`.
textDf = libDf[[libCols[3]]]
libDf = libDf.drop(columns=[libCols[3]])
libDf

Unnamed: 0_level_0,author,pub_year,title
w_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,gorkii,1900,troe
1,andreev,1903,zhizn-vasiliia-fiveiskogo
2,andreev,1904,gubernator
3,andreev,1905,k-zvezdam
4,andreev,1905,khristiane
5,andreev,1905,tak-bylo
6,gorkii,1906,mat
7,andreev,1906,savva-ignis-sanat
8,andreev,1907,iuda-iskariot
9,andreev,1907,zhizn-cheloveka


In [5]:
# Split Gorky's "Mother" (author 'gorkii', title 'mat') into chapters.
#
# The work is looked up by its metadata rather than by a hard-coded row position,
# so adding or removing other texts in the library does not silently select
# a different work.
motherId = libDf.index[(libDf['author'] == 'gorkii') & (libDf['title'] == 'mat')][0]

# The raw text alternates "chapter heading", "chapter body", separated by blank lines.
motherPieces = textDf.loc[motherId, 'text'].split('\n\n')
motherDf = pd.DataFrame(data={'chap': motherPieces[::2], 'text': motherPieces[1::2]})

# Number chapters from 1, however many were found.
motherDf.index = range(1, len(motherDf) + 1)

# Keep only the word characters of the heading (e.g. the roman numeral).
motherDf['chap'] = motherDf['chap'].str.replace(r'\W', '', regex=True)
motherDf

Unnamed: 0,chap,text
1,I,"Каждый день над рабочей слободкой, в ды..."
2,II,"Так жил и Михаил Власов, слесарь, волос..."
3,III,"Спустя недели две после смерти отца, в ..."
4,IV,Однажды после ужина Павел опустил занав...
5,V,"И снова они стали жить молча, далекие и..."
6,VI,"Самовар вскипел, мать внесла его в комн..."
7,VII,"Дни скользили один за другим, как бусы ..."
8,VIII,Маленький дом на окраине слободки будил...
9,IX,"В слободке говорили о социалистах, кото..."
10,X,Они явились почти через месяц после тре...


In [28]:
# Explode each chapter of `motherDf` into paragraphs: one row per paragraph.

# Chapters with running number below this belong to part 1, the rest to part 2.
partTwoFirstChap = 29

motherXmlDf = (
    motherDf['text']
    .str.split(' \n', expand=True)   # paragraphs are separated by "space + newline"
    .stack()                         # (chapter, paragraph position) -> text
    .to_frame()
    .reset_index()
    .rename(columns={'level_0': 'chapID', 'level_1': 'para', 0: 'text'})
)

# Drop rows that contain no word characters at all; `.copy()` so the column
# assignments below act on an independent frame, not on a view of the filtered one.
isBlank = motherXmlDf['text'].str.contains(r'^\W*$', regex=True)
motherXmlDf = motherXmlDf.loc[~isBlank].copy()

# Remove newlines and runs of whitespace. `regex=True` is required: without it
# pandas >= 2.0 treats the pattern as a literal string and nothing is removed.
motherXmlDf['text'] = motherXmlDf['text'].str.replace(r'\n|\s{2,}', '', regex=True)

motherXmlDf['part'] = motherXmlDf['chapID'].apply(lambda c: 1 if c < partTwoFirstChap else 2)
motherXmlDf['chap'] = motherXmlDf['chapID'].map(motherDf['chap'].to_dict())
motherXmlDf['para'] = motherXmlDf['para'] + 1              # 1-based position inside the chapter
motherXmlDf['paraID'] = range(1, len(motherXmlDf) + 1)     # 1-based running id over the whole work
motherXmlDf = motherXmlDf.reset_index(drop=True)

motherXmlDf

  motherXmlDf['text'] = motherXmlDf.text.str.replace('\n|\s{2,}', '')


Unnamed: 0,chapID,para,text,part,chap,paraID
0,1,1,"Каждый день над рабочей слободкой, в дымном, м...",1,I,1
1,1,2,"Вечером, когда садилось солнце, и на стеклах д...",1,I,2
2,1,3,"День проглочен фабрикой, машины высосали из му...",1,I,3
3,1,4,"По праздникам спали часов до десяти, потом люд...",1,I,4
4,1,5,"Усталость, накопленная годами, лишала людей ап...",1,I,5
...,...,...,...,...,...,...
4431,58,90,Ударили по руке.,2,XXIX,4432
4432,58,91,"-- Только злобы накопите, безумные! На вас она...",2,XXIX,4433
4433,58,92,Жандарм схватил ее за горло и стал душить. Она...,2,XXIX,4434
4434,58,93,-- Несчастные...,2,XXIX,4435


### Make XML

In [29]:
# Serialise `motherXmlDf` as <prose> / <part> / <chapter> / <paragraph> and write it to disk.
root = etree.Element("prose")
print(root.tag)
nameDict = motherDf.chap.to_dict()   # running chapter number -> chapter heading

for partNo in motherXmlDf.part.unique():
    print(f"Part {partNo}")
    partRows = motherXmlDf.loc[motherXmlDf['part'] == partNo]
    partEl = etree.SubElement(root, "part", n=str(partNo))

    # Chapter numbering (`n`) restarts at 1 inside every part.
    for chapNo, chapId in enumerate(partRows.chapID.unique(), start=1):
        print(f"Chap {chapId}")
        chapEl = etree.SubElement(partEl, "chapter", n=str(chapNo), name=nameDict.get(chapId))
        chapRows = partRows.loc[partRows['chapID'] == chapId]

        # Paragraph numbering restarts at 1 inside every chapter.
        for paraNo, paraText in enumerate(chapRows.text, start=1):
            paraEl = etree.SubElement(chapEl, "paragraph", n=str(paraNo), name=str(paraNo))
            paraEl.text = paraText

etree.ElementTree(root).write(
    '../site/texts/mother.xml',
    pretty_print=True,
    xml_declaration=True,
    encoding='windows-1251',
)

prose
Part 1
[ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 25 26 27 28]
Chap 1
Int64Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13], dtype='int64')
Paragraph 0
pt = 0; ch = 0; paragraph = 0
Paragraph 1
pt = 0; ch = 0; paragraph = 1
Paragraph 2
pt = 0; ch = 0; paragraph = 2
Paragraph 3
pt = 0; ch = 0; paragraph = 3
Paragraph 4
pt = 0; ch = 0; paragraph = 4
Paragraph 5
pt = 0; ch = 0; paragraph = 5
Paragraph 6
pt = 0; ch = 0; paragraph = 6
Paragraph 7
pt = 0; ch = 0; paragraph = 7
Paragraph 8
pt = 0; ch = 0; paragraph = 8
Paragraph 9
pt = 0; ch = 0; paragraph = 9
Paragraph 10
pt = 0; ch = 0; paragraph = 10
Paragraph 11
pt = 0; ch = 0; paragraph = 11
Paragraph 12
pt = 0; ch = 0; paragraph = 12
Paragraph 13
pt = 0; ch = 0; paragraph = 13
Chap 2
Int64Index([14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
            31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42],
           dtype='int64')
Paragraph 14
pt = 0; ch = 1; paragraph = 14
Parag

In [60]:
# NOTE(review): draft of the XML builder. It rebinds `root`, never writes a file
# and never assigns paragraph text to an element; the cell under "Make XML" that
# writes '../site/texts/mother.xml' looks like the finished version -- confirm and
# consider removing this cell.
# NOTE(review): the output saved with this cell ("Chap 0" ... "Chap 121") does not
# match what this code would do against the `motherXmlDf` built above; it was
# probably produced from an earlier kernel state -- TODO confirm.
root = etree.Element("prose")
print(root.tag)
# The counters start at 1 and are never incremented anywhere in this cell.
pt = ch = pa = 1
# Leftover from the notebook this cell was adapted from (`bibliiaDf` is not defined here).
#TestList = [x for x in bibliiaDf.test.unique()]
#for test in range(len(TestList)): 
#    root.append(etree.Element("t", n=str(t), name=TestList[test]))

for part in motherXmlDf.part.unique(): 
    print(f"Part {part}")
    # `part` values are ints (see where `motherXmlDf['part']` is assigned), but the
    # comparison here is against str(part); the list also holds row labels, not chapter ids.
    ChapList = list(motherXmlDf.loc[motherXmlDf['part'] == str(part)].index)
    for chap in ChapList:
        print(f"Chap {chap}")
        # No <part> child has been appended to `root` in this cell, so `root[...]`
        # indexes an element without children.
        root[int(part)-1].append(etree.Element("chap", n=str(ch), name=str(chap)))
        ParaList = list(motherXmlDf.loc[(motherXmlDf['part'] == part) & (motherXmlDf['chap'] == chap)].index)
        for para in ParaList:
            print(f"Para {para}")
            # `part` and `chap` are data values, used here as positional child indexes.
            root[part][chap].append(etree.Element("para", n=str(pa), name=str(para)))
            # Looks up row `pa` (constant 1 in this cell), not the current row `para`.
            para_text = motherXmlDf.loc[pa].text
            print(f"Paratext: {para_text}")
            

prose
Part 1
Chap 0
Chap 1
Chap 2
Chap 3
Chap 4
Chap 5
Chap 6
Chap 7
Chap 8
Chap 9
Chap 10
Chap 11
Chap 12
Chap 13
Chap 14
Chap 15
Chap 16
Chap 17
Chap 18
Chap 19
Chap 20
Chap 21
Chap 22
Chap 23
Chap 24
Chap 25
Chap 26
Chap 27
Chap 28
Chap 29
Chap 30
Chap 31
Chap 32
Chap 33
Chap 34
Chap 35
Chap 36
Chap 37
Chap 38
Chap 39
Chap 40
Chap 41
Chap 42
Chap 43
Chap 44
Chap 45
Chap 46
Chap 47
Chap 48
Chap 49
Chap 50
Chap 51
Chap 52
Chap 53
Chap 54
Chap 55
Chap 56
Chap 57
Chap 58
Chap 59
Chap 60
Chap 61
Chap 62
Chap 63
Chap 64
Chap 65
Chap 66
Chap 67
Chap 68
Chap 69
Chap 70
Chap 71
Chap 72
Chap 73
Chap 74
Chap 75
Chap 76
Chap 77
Chap 78
Chap 79
Chap 80
Chap 81
Chap 82
Chap 83
Chap 84
Chap 85
Chap 86
Chap 87
Chap 88
Chap 89
Chap 90
Chap 91
Chap 92
Chap 93
Chap 94
Chap 95
Chap 96
Chap 97
Chap 98
Chap 99
Chap 100
Chap 101
Chap 102
Chap 103
Chap 104
Chap 105
Chap 106
Chap 107
Chap 108
Chap 109
Chap 110
Chap 111
Chap 112
Chap 113
Chap 114
Chap 115
Chap 116
Chap 117
Chap 118
Chap 119
Chap 120
Chap 121

### Tokens

In [None]:
def _split_level(series, pattern, column, level_names):
    """Split every string in `series` on `pattern` and stack the pieces into rows.

    Parameters
    ----------
    series : pd.Series of str
        Text units of the current level (indexed by the levels above them).
    pattern : str
        Separator, interpreted by `Series.str.split` (a regex when longer than one char).
    column : str
        Name of the single column in the returned frame.
    level_names : list of str
        Names for the index levels of the result; the last one is the new,
        0-based position of each piece inside its parent.

    Returns
    -------
    pd.DataFrame
        One row per piece, one column named `column`.
    """
    frame = series.str.split(pattern, expand=True).stack().to_frame(column)
    frame.index.names = level_names
    return frame


# work -> parts -> paragraphs -> sentences -> tokens, one index level per step.
partDf = _split_level(textDf['text'], '\n\n\n', 'parts', tokenOHCO[:2])
paraDf = _split_level(partDf['parts'], r'\n\s*\n', 'paras', tokenOHCO[:3])
sentDf = _split_level(paraDf['paras'], r'!|\?|\.|;', 'sents', tokenOHCO[:4])
tokenDf = _split_level(sentDf['sents'], r'\n|\s+', 'token', tokenOHCO)

# Lower-case and trim non-word characters and digits from both ends of each token.
# A regex is needed here: `str.strip(chars)` treats its argument as a set of literal
# characters, so a "\W" inside it never meant "any non-word character" and
# punctuation such as guillemets, parentheses or dashes was left attached.
tokenDf['token'] = (
    tokenDf['token']
    .str.lower()
    .str.replace(r'^[\W\d]+|[\W\d]+$', '', regex=True)
)

# Drop tokens that have no word character left after trimming.
tokenDf = tokenDf[tokenDf['token'].str.contains(r'\w')]
tokenDf

In [None]:
# The 60 most frequent tokens with their counts.
tokenDf['token'].value_counts().head(60).to_frame().reset_index()

In [None]:
# Sentences with leading/trailing '-' characters removed.
sentDf['sents'].str.strip('-').to_frame()