# MAPRR Textual Analytics

## Intro

### Import

In [1]:
import os 
import pandas as pd 
import re
import json
#from natasha import Segmenter, MorphVocab, NewsEmbedding, NewsMorphTagger, NewsSyntaxParser, NewsNERTagger, PER, NamesExtractor, Doc

In [2]:
repo = '/home/xtra/Documents/MPGRR/repos/xml/'
pathList = [x for x in os.listdir(repo)]

## Pre-processing Function

In [3]:
def processXML(file): 
    
    repo = '/home/xtra/Documents/MPGRR/repos/xml/'
    file_path = repo + file

    with open(file_path,'r') as f: rawXML = f.read() 

    linePattern = re.compile(r"<l>(([А-Яа-я]).*)</l>")
    paraPattern = re.compile(r"<p>(.*)</p>*?", re.DOTALL)
    titlPattern = re.compile(r"<title>([А-Яа-яA-Za-z].*)</title>")
    authPattern = re.compile(r"<author>([А-ЯA-Z].*)</author>")
    datePattern = re.compile(r"<publicationStmt>(19\d{2})</publicationStmt>")
    tagsPattern = re.compile(r"<.*>")
    wkidPattern = re.compile(r'xml:id="W(\d{4})">')
    stufPattern = re.compile(r'-|_|<|>|—')
    pathPattern = re.compile(r"^W(\d{4})_(.*)_(.*)_(\d{4})")

    """work_id = wkidPattern.findall(rawXML)[0]
    title = titlPattern.findall(rawXML)
    author = authPattern.findall(rawXML)
    year = datePattern.findall(rawXML)"""
    
    work_id = pathPattern.match(file).group(1)
    title = pathPattern.match(file).group(2)
    author = pathPattern.match(file).group(3)
    year = pathPattern.match(file).group(4)
    
    p = len(paraPattern.findall(rawXML))
    #print('paras: '+str(p))
    l = len(linePattern.findall(rawXML)) 
    #print('lines: '+str(l))

    if p > l: 
        lines = [] 
        text = ""
        genre = 'prose'
        
        lines = paraPattern.findall(rawXML)
        #print(lines)
        lines = lines[0].split('</p>\n')
        #print(lines)
        lines = [re.sub(tagsPattern, '', i) for i in lines]
        #print(lines)
        lines = [re.sub(stufPattern, '', i) for i in lines]
        #print(lines)
        lines = [i.replace('  ','') for i in lines]
        #print(lines)
        lines = [i.replace('\n','') for i in lines] 
        #print(lines)
        
        text = ''.join(lines)
        
    else: 
        poemIter = linePattern.finditer(rawXML)
        lines = []
        text = ""
        
        genre = 'poetry'
        for line in poemIter: 
            #print(line)
            txt = str(line.group(1))
            #print(text)
            txt = re.sub(tagsPattern, '', txt)
            #print(text)
            lines.append(re.sub(tagsPattern, '', txt))
            #print(text)
            text += txt + ' '
            
    
    subDict = {work_id: {'title': title,
                         'year': year,
                         'author': author,
                         'num_lps': len(lines), 
                         'num_words': len(text),
                         'genre': genre,
                         'text': text}
              }   
    
    return subDict

In [4]:
XMLdict = {}
for file in pathList: 
    #print(file)
    XMLdict.update(processXML(file=file))

#XMLdict

## Dataframe Split

In [5]:
metaDf = pd.DataFrame.from_dict(XMLdict, orient='index').rename_axis('work_num').sort_index(inplace=False)
worksDf = metaDf[['title','year','author','genre']]
textsDf = metaDf[['text','num_lps','num_words']]
authorsDf = metaDf.reset_index().groupby('author').size().to_frame().rename(columns={0:'num_works'})
authorsDf[['num_lps','num_words']] = metaDf.reset_index().groupby('author').sum()

In [6]:
authorsDf

Unnamed: 0_level_0,num_works,num_lps,num_words
author,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Akhmatova,30,421,12201
Aleksandrovskii,2,190,4804
Artamonov,9,108,2950
Aseev,11,416,9536
Averchenko,9,527,58660
...,...,...,...
Virganskii,6,69,1751
Vlasov-Okskii,5,85,2345
godinu,8,233,5862
kii,1,21,6655


In [7]:
worksDf

Unnamed: 0_level_0,title,year,author,genre
work_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0001,Srazu_stalo_tikho,1921,Akhmatova,poetry
0002,Ty_otstupnik,1921,Akhmatova,poetry
0003,Prosypatsia_na_rassvete,1921,Akhmatova,poetry
0004,I_v_tainuiu_druzhbu,1921,Akhmatova,poetry
0005,Slovno_angel,1921,Akhmatova,poetry
...,...,...,...,...
0599,Liubov_raspiali_na_kreste,1919,Viatkin,poetry
0600,Na_slovakh,1917,Gorkii,prose
0601,Posledniaia_prosba,1915,Kolchin,poetry
0602,I_rek_Sidiashchii,1916,Did,poetry


In [8]:
textsDf

Unnamed: 0_level_0,text,num_lps,num_words
work_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0001,"Сразу стало тихо в , Облетел последний мак, ...",12,296
0002,"Ты — отступник: за остров зелёный Отдал, отда...",20,573
0003,"Просыпаться на рассвете Оттого, что радость д...",12,309
0004,"И в тайную дружбу с высоким, Как юный орёл те...",7,190
0005,"Словно ангел, возмутивший воду, Ты взглянул т...",7,218
...,...,...,...
0599,"Любовь распяли на кресте, Но в третий день она...",12,311
0600,"На словахвсе согласны, что российское государс...",30,6264
0601,"Сестра!.. Сестрица, на минутку подойдите Ко мн...",14,482
0602,"В борьбе с врагом, в борьбе кровавой, геройско...",4,211
