# MAPRR Textual Analytics

## Intro

### Import

In [2]:
import os 
import time
import logging
import pandas as pd 
import re
import threading
import requests
from bs4 import BeautifulSoup
from natasha import (
    Segmenter, 
    MorphVocab, 
    NewsEmbedding, 
    NewsMorphTagger, 
    NewsSyntaxParser, 
    NewsNERTagger, 
    PER, 
    NamesExtractor, 
    Doc)
from razdel import tokenize

In [3]:
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.INFO)

## Pre-processing Function

In [4]:
domain = 'https://mpgrr.herokuapp.com/'

In [5]:
tables = {'agents/': 304,
          'works/': 603, 
          'place_based_concepts/': 315, 
          'locations/': 366, 
          'multivalent_markers/': 433}

In [90]:
class maprr: 
    
    def __init__(self): 
        self.Wsoup = {} 
        self.Asoup = {}
        self.Ws = {}
        self.As = {}
    
    def get_htmlA(self): 
        # list(tables.values())[0]
        for i in range(1, list(tables.values())[0]):
            url = domain+list(tables.keys())[0]+str(i)
            with requests.get(url) as r: 
                if r.status_code == 200: 
                    s = BeautifulSoup(r.content, 'html.parser')
                    self.Asoup.update({i:s})
                    time.sleep(.1)
                else: 
                    logging.info(f"A{i} status code: {r.status_code}")
                    time.sleep(.1)
                    pass

    def get_htmlW(self): 
        # list(tables.values())[1]
        for i in range(1, list(tables.values())[1]):
            url = domain+list(tables.keys())[1]+str(i)
            with requests.get(url) as r: 
                if r.status_code == 200: 
                    s = BeautifulSoup(r.content, 'html.parser')
                    self.Wsoup.update({i:s})
                    time.sleep(.1)
                else: 
                    logging.info(f"W{i} status code: {r.status_code}")
                    time.sleep(.1)
                    pass

    def parseWs(self, html): 
        content = html.find('div', {'class':'col-md-9 fixed-height'})
        try: 
            author = content.div.h3.text
        except: 
            author = ""
        try: 
            title = content.div.h4.text
        except: 
            title = ""
        text = content.find_all('p',{'class':'stanza'})
        Wtext = [x.text.replace('\n','').strip() for x in text]
        metaKeys = [x.text[:-1] for x in html.find('div', {'class':'card-body'}).find_all('h4')]
        metaVals = [x.text for x in html.find('div', {'class':'card-body'}).find_all('p')]
        metaDict = dict(zip(metaKeys, metaVals))
        subDict = {'title': title, 
                   'text': Wtext}
        subDict.update(metaDict)
        #self.Ws.update(subdict)
        return subDict

    def parseAs(self, html): 
        name = html.find('div', {'class': 'card scrollable'}).h2.text
        bdate, ddate = html.find('div', {'class': 'card scrollable'}).span.text.split(' - ')
        subDict = {'name': name, 'birth': bdate, 'death': ddate}
        try: 
            typeKeys = [x.h4.text for x in html.find_all('div', {'class': 'col-md-4'})]
            typeVals = [x.p or x.div.span.text for x in html.find_all('div', {'class': 'col-md-4'})]
            typeVals[:2] = [x.text for x in typeVals[:2]]
            typeDict = dict(zip(typeKeys, typeVals))
            subDict.update(typeDict)
        except: 
            pass
        #self.As.update(subdict)
        return subDict

    def run(self): 
        self.get_htmlA() 
        self.get_htmlW() 

        self.As = {k: self.parseAs(v) for k, v in self.Asoup.items()}
        self.Ws = {k: self.parseWs(v) for k, v in self.Wsoup.items()}

        AsDf = pd.DataFrame.from_dict(self.As, orient='index')
        WsDf = pd.DataFrame.from_dict(self.Ws, orient='index')        
        
        WsDf.to_json('WsDf.json')
        AsDf.to_json('AsDf.json')

In [91]:
%%time
if __name__ == '__main__': 
    maprr().run()

INFO:A45 status code: 500
INFO:A74 status code: 404
INFO:A75 status code: 404
INFO:A76 status code: 404
INFO:A77 status code: 404
INFO:A139 status code: 404
INFO:A140 status code: 404
INFO:A192 status code: 404
INFO:A206 status code: 404
INFO:A252 status code: 404
INFO:A271 status code: 500
INFO:A288 status code: 500
INFO:A290 status code: 500
INFO:A295 status code: 500
INFO:W173 status code: 500
INFO:W179 status code: 500
INFO:W261 status code: 500
INFO:W306 status code: 500
INFO:W425 status code: 500
INFO:W433 status code: 500
INFO:W434 status code: 500
INFO:W435 status code: 500
INFO:W439 status code: 500
INFO:W526 status code: 500
INFO:W554 status code: 500
INFO:W576 status code: 500
INFO:W577 status code: 500
INFO:W578 status code: 500
INFO:W579 status code: 500
INFO:W581 status code: 500
INFO:W598 status code: 500


CPU times: user 1min 19s, sys: 1.33 s, total: 1min 20s
Wall time: 15min 13s


In [300]:
libDf = pd.read_json('WsDf.json')
libDf.index.name = 'W#'
libDf['num_words'] = libDf.text.apply(lambda k: len([a for b in [x.split() for y in k for x in y.split('               ')] for a in b if a.isalpha() == True]))
libDf.sample(5)

Unnamed: 0_level_0,title,text,Title,First Line,Author,Composition Date,Composition Location,Source of First Publication,First Publication Publisher,First Publication Year,First Publication Location
W#,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
243,Георгиевские кресты\n \n ...,"[Каждый достоин медали, Каждый д...",Georgievskie kresty,Kazhdyi dostoin medali…,V. Gret,,Vovcha Budka,Gran',Tipografiia T. V. Shulikovoi,1918,
430,Зарождение,"[Рождалась в бурю в тяжких муках, ...",Zarozhdenie,Rozhdalas' v buriu v tiazhkikh mukakh…,E. G. Sokol,1917 to 1918,,Krasnye nabaty: Stikhotvoreniia (N. Leninu),Tret'ia tipografiia Gubsovnarkhoza,1919,Orel
212,Мы победим,"[Мы победим, клокочет сила В нас...",My pobedim,"My pobedim, klokochet sila v nas…",Mikhail Prokof'evich Gerasimov,,,Zheleznye tsvety: Stikhi,Izdatel'stvo Tsentropechat',1919,
431,Марсово\n поле,"[Милый, мой!... Милый, где ты? Н...",Marsovo pole,"Milyi moi!... Milyi, gde ty…",E. G. Sokol,1917,,Pesni o svobode,Svobodnyi narod,1917,Moscow
518,Огни,[Закружит ли ночная метельица И ...,Ogni,Zakruzhit li nochnaia metelitsa…,Georgii Andreevich Viatkin,date unknown,,Ranenaia Rossiia: Stikhi; Vernost': rasskaz; E...,Tipografiia Vremennogo Tsentral’nogo Voenno-P...,1919,Ekaterinburg


In [303]:
textsDf = WsDf[['text']]
textsDf['num_words'] = textsDf.text.apply(lambda k: len([a for b in [x.split() for y in k for x in y.split('               ')] for a in b if a.isalpha() == True]))
textsDf.sort_values('num_words', ascending=False).head(20)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  textsDf['num_words'] = textsDf.text.apply(lambda k: len([a for b in [x.split() for y in k for x in y.split('               ')] for a in b if a.isalpha() == True]))


Unnamed: 0_level_0,text,num_words
W#,Unnamed: 1_level_1,Unnamed: 2_level_1
115,[Каждым рявканьем пушечным ...,1108
109,"[Посвящаю эту повесть двухлетнему Грише, ...",1052
440,"[Как ветер с разбегу парус полощет, ...",826
135,"[Защищал вход в Сербию, словно стая львов, ...",814
253,"[Sic itur ad astra! …, Человек есть мера всех ...",746
174,"[Где втоптаны следы Батыя, ...",725
402,"[Прошу простить меня за слог, Но...",704
194,"[Не устрашуся гибели, Ни коп...",669
106,[Как у деда у Нефеда ...,637
301,"[По рождению Пречистого Спаса, В...",616


In [94]:
AsDf = pd.read_json('AsDf.json')
AsDf.index.name = 'A#'
AsDf

Unnamed: 0_level_0,name,birth,death,Type of Agent,Sex,Occupations,Family Social Strata,Literary Affiliations,Political Affiliations,Type of Corporate Body,Affiliation
A#,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,Anna Akhmatova,"June 23, 1889","March 5, 1966",person,female,poet,nobility,Acmeism,independent,,
2,Vasilii Dmitrievich Aleksandrovskii,"January 15, 1897","November 16, 1934",person,male,soldier,peasant,Kuznitsa,Bolshevik member,,
3,Ivan Nikolaevich Antonov,1878,1936?,person,male,editor,unknown,unknown,independent,,
4,Mikhail Dmitrievich Artamonov,"February 22, 1888","November 22, 1958",person,male,journalist,peasant,Vologda poets,unknown,,
5,Nikolai Aseev,"July 10, 1889","July 16, 1963",person,male,soldier,nobility,Left Front of Art: LEF,Bolshevik member,,
...,...,...,...,...,...,...,...,...,...,...,...
299,Mikhail Ivanovich Kalinin,"November 19, 1875","June 3, 1946",,,,,,,,
300,Moisei Solomonovich Uritskii,"January 14, 1873","August 30, 1918",,,,,,,,
301,Maximilien Marie Isidore de Robespierre,"May 6, 1758","June 28, 1794",,,,,,,,
302,Iurii Mikhailovich Steklov,"August 27, 1873","July 15, 1941",,,,,,,,


## Dataframe Split

### libDf

In [None]:
libDf = pd.DataFrame.from_dict(XMLdict, orient='index').rename_axis('work_num').sort_index(inplace=False)
libDf.index = libDf.index.str.lstrip('0')
libDf

### authorsDf

In [None]:
authorsDf = libDf.reset_index().groupby('author').size().to_frame().rename(columns={0:'num_works'})
authorsDf[['num_lps','num_words']] = libDf.reset_index().groupby('author').sum()
authorsDf.sort_values(by=['num_words','num_works'], ascending=False)
authorsDf

### worksDf

In [None]:
worksDf = libDf[['title','year','author','genre','num_lps','num_words']]
worksDf

### tokenDf

In [None]:
lpDf = libDf[['text']]
lpDf = lpDf.text.apply(lambda x: pd.Series([y for y in x])).stack().to_frame().rename(columns={0:'lp_str'})
lpDf.index.names = OHCO[:2]
lpDf
tokenDf = lpDf.lp_str.apply(lambda x: tokenize(x)).to_frame()#.rename(columns={0:'token'})
#tokenDf = lpDf.lp_str.apply(lambda x: y.text for y in tokenize(x)[1])
tokenDf

In [None]:
for i in tokenize(lpDf.lp_str): 
    print(i)