# Primary Text Processing

## Import

### Libraries

In [1]:
import os 
import codecs
from lxml import etree

from backend import *

### Definitions

In [2]:
# Directory that holds the primary texts; a relative path, so it is resolved
# against the notebook's working directory.
texts = '../texts/fiction/'

# Columns of the library table (one row per work).
libCols = [
    'author',
    'pub_year',
    'title',
    'text',
]

# NOTE(review): presumably the hierarchical key of a token table
# (work -> part -> paragraph -> sentence -> token) — confirm against its users.
tokenOHCO = [
    'w_id',
    'part_num',
    'para_num',
    'sent_num',
    'token_num',
]

# Column names of a token table; the same names appear in the motherTokenDf
# output further down in this notebook.
tokenCols = [
    'p_id',
    'start',
    'stop',
    'text',
    'token_id',
    'head_id',
    'rel',
    'pos',
    'lemma',
    'anim',
    'aspect',
    'case',
    'degree',
    'gender',
    'mood',
    'number',
    'person',
    'tense',
    'verb_form',
    'voice',
]

## Primary Texts

### Library

In [3]:
# Build the library table: one row per '<author>-<year>-<title>.txt' file found
# in `texts`.
#
# Rows are collected in a plain list and turned into a DataFrame once:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every iteration.
libRecords = []

# os.listdir returns names in arbitrary order; sorting them makes the w_id
# assignment below reproducible even when two works tie on the sort keys.
for t in sorted(os.listdir(texts)):
    if not t.endswith('.txt'):
        continue

    # The dot before 'txt' is escaped so it only matches a literal '.'.
    info = re.fullmatch(r'(\w+)-(\d{4})-(.+)\.txt', t)
    if info is None:
        # Fail with the offending name instead of an AttributeError on None.
        raise ValueError(
            "Unexpected file name %r in %r; expected '<author>-<year>-<title>.txt'"
            % (t, texts)
        )

    # codecs.open is kept on purpose: it performs no newline translation, and
    # later cells split the text on literal '\n\n'.
    with codecs.open(os.path.join(texts, t), 'r', encoding='windows-1251') as f:
        libRecords.append({
            'author': info.group(1),
            'pub_year': int(info.group(2)),
            'title': info.group(3),
            'text': f.read(),
        })

# Order works by publication year, then title; the fresh 0..N-1 index becomes
# the work id that the rest of the notebook refers to.
libDf = (
    pd.DataFrame(libRecords, columns=libCols)
    .sort_values(libCols[1:3])
    .reset_index(drop=True)
)
libDf.index.name = 'w_id'

# Keep the full texts in their own frame so the metadata table stays small
# enough to display.
libTextsDf = libDf[[libCols[3]]]
libDf = libDf.drop(columns=[libCols[3]])
libDf

Unnamed: 0_level_0,author,pub_year,title
w_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,gorkii,1900,troe
1,andreev,1903,zhizn-vasiliia-fiveiskogo
2,andreev,1904,gubernator
3,andreev,1905,k-zvezdam
4,andreev,1905,khristiane
5,andreev,1905,tak-bylo
6,gorkii,1906,mat
7,andreev,1906,savva-ignis-sanat
8,andreev,1907,iuda-iskariot
9,andreev,1907,zhizn-cheloveka


In [4]:
# Show the text-only frame (index 'w_id', single column 'text') that was split
# off from the library table.
libTextsDf

Unnamed: 0_level_0,text
w_id,Unnamed: 1_level_1
0,Среди лесов Керженца рассеяно много оди...
1,*I*\n\n Над всей жизнью Василия ...
2,I \n\n \n ...
3,1\n \n Обсерват...
4,За окнами падал мокрый нояб...
5,1\n\n Стояла на площади огромная чер...
6,*I*\n\n Каждый день над рабочей слоб...
7,1\n \t \n В...
8,1\n \n Иисуса Христа много раз...
9,"0\n\n/*Некто в сером*, именуемый *Он*, говорит..."


In [5]:
# Run textRegularize on the library texts (the helper is not defined in this
# notebook; presumably it comes from `from backend import *`).
# NOTE(review): the second argument (10) looks like a w_id row label of
# libTextsDf — confirm against the helper's signature and the library table.
confessionDf = textRegularize(libTextsDf, 10)
#confessionDf

  textDf['text'] = textDf.text.str.replace('\n|\s{2,}', '')


In [6]:
# Same textRegularize call as for the other works, with a different second
# argument.
# NOTE(review): 14 is presumably a w_id row label of libTextsDf — confirm which
# work it selects.
dpDf = textRegularize(libTextsDf, 14)
#dpDf

In [7]:
# Same textRegularize call as for the other works. In the library table output
# above, w_id 6 is gorkii / 1906 / mat.
# NOTE(review): assumes the second argument is that w_id — confirm against the
# helper's signature.
motherTextDf = textRegularize(libTextsDf, 6)
#motherTextDf

In [8]:
# Split the text stored under row label 22 of libTextsDf into chapters, then
# into paragraphs.
# A chapter heading is '*<word>*' followed by a blank line; splitting on it
# gives the text before the first heading plus one piece per heading.
chapterPieces = libTextsDf.loc[[22]].text.str.split(r'\*\w+\*\n\n').to_list()[0]

# Discard piece 0 (everything before the first heading); the remaining index
# labels start at 1.
chapterTexts = pd.Series(chapterPieces, name='text').iloc[1:]

# Split every chapter on blank lines and stack the wide result into a long,
# one-column frame indexed by (chapter label, paragraph position).
detstvoTextDf = chapterTexts.str.split('\n\n', expand=True).stack().to_frame()
#detstvoTextDf

### Tokens

In [9]:
# Load a previously saved token table from a pickle file (path is relative to
# the working directory).
# NOTE(review): unpickling can execute arbitrary code — only load pickles
# produced by this project.
# NOTE(review): no cell visible here writes this file; presumably it comes from
# an earlier processing run — confirm.
motherTokenDf = pd.read_pickle('./proc/MotherTokendf.pkl')

In [10]:
# Display the token table keyed by (p_id, token_id). The re-indexed frame is
# not assigned, so motherTokenDf itself keeps its original index.
motherTokenDf.set_index(['p_id','token_id'])

Unnamed: 0_level_0,Unnamed: 1_level_0,start,stop,text,head_id,rel,pos,lemma,anim,aspect,case,degree,gender,mood,number,person,tense,verb_form,voice
p_id,token_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1,1_1,0,6,Каждый,1_2,det,DET,каждый,,,Acc,,Masc,,Sing,,,,
1,1_2,7,11,день,1_27,obl,NOUN,день,Inan,,Acc,,Masc,,Sing,,,,
1,1_3,12,15,над,1_5,case,ADP,над,,,,,,,,,,,
1,1_4,16,23,рабочей,1_5,amod,ADJ,рабочий,,,Ins,Pos,Fem,,Sing,,,,
1,1_5,24,33,слободкой,1_27,obl,NOUN,слободка,Inan,,Ins,,Fem,,Sing,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4436,1_1,0,6,Кто-то,1_2,nsubj,PRON,кто-то,,,Nom,,,,,,,,
4436,1_2,7,14,ответил,1_0,root,VERB,ответить,,Perf,,,Masc,Ind,Sing,,Past,Fin,Act
4436,1_3,15,17,ей,1_2,iobj,PRON,она,,,Dat,,Fem,,Sing,3,,,
4436,1_4,18,25,громким,1_5,amod,ADJ,громкий,,,Dat,Pos,,,Plur,,,,


In [11]:
# Show the working directory that the relative paths in this notebook
# ('../texts/fiction/', './proc/...', './models/...') are resolved against.
# os.getcwd() is used instead of the bare `pwd` automagic, which IPython only
# honours in single-line cells and which is a NameError in plain Python.
os.getcwd()

'/home/xtra/code/diss/codebase'

In [13]:
# NOTE(review): this import sits outside the notebook's import cell; consider
# moving it there so a fresh "Run All" surfaces the dependency up front.
from navec import Navec

# Load the navec model archive; the path is relative to the working directory.
path = './models/navec_hudlit_v1_12B_500K_300d_100q.tar'
navec = Navec.load(path)