# MAPRR Textual Analytics

## Intro

### Import

In [29]:
import os 
import time
import logging
import random
import pandas as pd 
import re
import threading
import requests
from bs4 import BeautifulSoup
from natasha import (
    Segmenter, 
    MorphVocab, 
    NewsEmbedding, 
    NewsMorphTagger, 
    NewsSyntaxParser, 
    NewsNERTagger, 
    PER, 
    NamesExtractor, 
    Doc)
from razdel import tokenize

In [2]:
logging.basicConfig(filename='mappr_out.log', encoding='utf-8', format='%(asctime)s %(message)s', level=logging.INFO)

## Pre-processing Function

In [3]:
domain = 'https://mpgrr.herokuapp.com/'

In [4]:
tables = {'agents/': 304,
          'works/': 603, 
          'place_based_concepts/': 315, 
          'locations/': 366, 
          'multivalent_markers/': 433}

In [5]:
class maprr: 
    
    def __init__(self): 
        self.Wsoup = {} 
        self.Asoup = {}
        self.Ws = {}
        self.As = {}
    
    def get_htmlA(self): 
        aberrantAs = []
        for i in range(1, list(tables.values())[0]):
            url = domain+list(tables.keys())[0]+str(i)
            with requests.get(url) as r: 
                logging.info(f"A{i} status code: {r.status_code}")
                if r.status_code == 200: 
                    s = BeautifulSoup(r.content, 'html.parser')
                    self.Asoup.update({i:s})
                else: 
                    aberrantAs.append(i)
                    pass
                time.sleep(.1)
        print(f"Aberrant author pages are #s {aberrantAs}")

    def get_htmlW(self): 
        aberrantWs = []
        for i in range(1, list(tables.values())[1]):
            url = domain+list(tables.keys())[1]+str(i)
            with requests.get(url) as r: 
                logging.info(f"W{i} status code: {r.status_code}")
                if r.status_code == 200: 
                    s = BeautifulSoup(r.content, 'html.parser')
                    self.Wsoup.update({i:s})
                else: 
                    aberrantWs.append(i)
                    pass
                time.sleep(.1)
        print(f"Aberrant work pages are #s {aberrantWs}")

    def parseWs(self, html): 
        content = html.find('div', {'class':'col-md-9 fixed-height'})
        try: 
            author = content.div.h3.text
        except: 
            author = ""
        try: 
            title = content.div.h4.text
        except: 
            title = ""
        stanza_text = content.find_all('p',{'class':'stanza'})
        prose_text = content.find_all('p',{'class':'text'})
        if len(stanza_text) > len(prose_text): 
            text = stanza_text
        elif len(stanza_text) < len(prose_text): 
            text = prose_text
        Wtext = [x.text.replace('\n','').strip() for x in text]
        metaKeys = [x.text[:-1] for x in html.find('div', {'class':'card-body'}).find_all('h4')]
        metaVals = [x.text for x in html.find('div', {'class':'card-body'}).find_all('p')]
        metaDict = dict(zip(metaKeys, metaVals))
        subDict = {'title': title, 
                   'text': Wtext}
        subDict.update(metaDict)
        #self.Ws.update(subdict)
        return subDict

    def parseAs(self, html): 
        name = html.find('div', {'class': 'card scrollable'}).h2.text
        bdate, ddate = html.find('div', {'class': 'card scrollable'}).span.text.split(' - ')
        subDict = {'name': name, 'birth': bdate, 'death': ddate}
        try: 
            typeKeys = [x.h4.text for x in html.find_all('div', {'class': 'col-md-4'})]
            typeVals = [x.p or x.div.span.text for x in html.find_all('div', {'class': 'col-md-4'})]
            typeVals[:2] = [x.text for x in typeVals[:2]]
            typeDict = dict(zip(typeKeys, typeVals))
            subDict.update(typeDict)
        except: 
            pass
        #self.As.update(subdict)
        return subDict

    def run(self): 
        logging.info(f"Getting As and Ws")
        print(f"Getting As")
        self.get_htmlA() 
        print(f"Getting Ws")
        self.get_htmlW() 
        logging.info(f"Done getting As and Ws")
        
        logging.info(f"Parsing As and Ws")
        print(f"Parsing As")
        self.As = {k: self.parseAs(v) for k, v in self.Asoup.items()}
        print(f"Parsing Ws")
        self.Ws = {k: self.parseWs(v) for k, v in self.Wsoup.items()}
        logging.info(f"Done parsing As and Ws")
        
        logging.info(f"Making dataframes")
        print(f"Making AsDf")
        AsDf = pd.DataFrame.from_dict(self.As, orient='index')
        print(f"Making WsDf")
        WsDf = pd.DataFrame.from_dict(self.Ws, orient='index')        
        logging.info(f"Done making dataframes")
        
        logging.info(f"Writing to json")
        WsDf.to_json('WsDf.json')
        AsDf.to_json('AsDf.json')
        logging.info(f"Done writing to json")

In [48]:
class ParallelMAPRR: 
    
    global domain, tables
    
    def __init__(self): 
        self.urls_to_visit = []
        self.aberrantAs = []
        self.aberrantWs = []
        self.Wsoup = {} 
        self.Asoup = {}
        self.Ws = {}
        self.As = {}
    
    def get_html(self, url): 
        url_format = 'https://mpgrr.herokuapp.com/(\w+)/(\d{1,3})'
        url_match = re.match(url_format, url)
        fco_type = url_match.group(1)
        id_num = url_match.group(2)
        with requests.get(url) as r: 
            logging.info(f"{fco_type}/{i} status code: {r.status_code}")
            if r.status_code == 200: 
                s = BeautifulSoup(r.content, 'html.parser')
                if fco_type == 'agents': 
                    self.Asoup.update({id_num:s})
                elif fco_type == 'works': 
                    self.Wsoup.update({id_num:s})
            else: 
                if fco_type == 'agents': 
                    self.aberrantAs.append({'A'+str(id_num): r.status_code})
                elif fco_type == 'works': 
                    self.aberrantWs.append({'W'+str(id_num): r.status_code})
                pass

    def parseWs(self, html): 
        content = html.find('div', {'class':'col-md-9 fixed-height'})
        try: 
            author = content.div.h3.text
        except: 
            author = ""
        try: 
            title = content.div.h4.text
        except: 
            title = ""
        stanza_text = content.find_all('p',{'class':'stanza'})
        prose_text = content.find_all('p',{'class':'text'})
        if len(stanza_text) > len(prose_text): 
            text = stanza_text
        elif len(stanza_text) < len(prose_text): 
            text = prose_text
        Wtext = [x.text.replace('\n','').strip() for x in text]
        metaKeys = [x.text[:-1] for x in html.find('div', {'class':'card-body'}).find_all('h4')]
        metaVals = [x.text for x in html.find('div', {'class':'card-body'}).find_all('p')]
        metaDict = dict(zip(metaKeys, metaVals))
        subDict = {'title': title, 
                   'text': Wtext}
        
        self.Ws.update(subdict)
        #return subDict

    def parseAs(self, html): 
        name = html.find('div', {'class': 'card scrollable'}).h2.text
        bdate, ddate = html.find('div', {'class': 'card scrollable'}).span.text.split(' - ')
        subDict = {'name': name, 'birth': bdate, 'death': ddate}
        try: 
            typeKeys = [x.h4.text for x in html.find_all('div', {'class': 'col-md-4'})]
            typeVals = [x.p or x.div.span.text for x in html.find_all('div', {'class': 'col-md-4'})]
            typeVals[:2] = [x.text for x in typeVals[:2]]
            typeDict = dict(zip(typeKeys, typeVals))
            subDict.update(typeDict)
        except: 
            pass
        
        self.As.update(subdict)
        #return subDict
    
    def sort_parsing(html): 
        if

    def run(self): 
        for t, i in list(tables.items())[:2]: 
            for j in range(0,i+1): 
                self.urls_to_visit.append(domain+t+str(j))
        
        logging.info(f"Getting As and Ws")
        print(f"Getting As")
        while self.urls_to_visit: 
            this_url = self.urls_to_visit.pop(0)
            self.get_html(this_url) 
        logging.info(f"Done getting As and Ws")
        
        logging.info(f"Parsing As and Ws")
        print(f"Parsing As")
        while self.Asoup: 
            this_soup = self.Asoup.pop(0)
            logging.info(f"Parsing article #{this_soup}") 
            parseAs(this_soup)
        print(f"Parsing Ws")
        self.Ws = {k: self.parseWs(v) for k, v in self.Wsoup.items()}
        logging.info(f"Done parsing As and Ws")
        
        logging.info(f"Making dataframes")
        print(f"Making AsDf")
        AsDf = pd.DataFrame.from_dict(self.As, orient='index')
        print(f"Making WsDf")
        WsDf = pd.DataFrame.from_dict(self.Ws, orient='index')        
        logging.info(f"Done making dataframes")
        
        logging.info(f"Writing to json")
        WsDf.to_json('WsDf.json')
        AsDf.to_json('AsDf.json')
        logging.info(f"Done writing to json")

SyntaxError: cannot assign to function call (<ipython-input-48-c83b57a74411>, line 77)

In [65]:
urls_to_visit = [] 
for t, i in list(tables.items())[:2]: 
    for j in range(0,i+1): 
        urls_to_visit.append(domain+t+str(j))
print(urls_to_visit)

['https://mpgrr.herokuapp.com/agents/0', 'https://mpgrr.herokuapp.com/agents/1', 'https://mpgrr.herokuapp.com/agents/2', 'https://mpgrr.herokuapp.com/agents/3', 'https://mpgrr.herokuapp.com/agents/4', 'https://mpgrr.herokuapp.com/agents/5', 'https://mpgrr.herokuapp.com/agents/6', 'https://mpgrr.herokuapp.com/agents/7', 'https://mpgrr.herokuapp.com/agents/8', 'https://mpgrr.herokuapp.com/agents/9', 'https://mpgrr.herokuapp.com/agents/10', 'https://mpgrr.herokuapp.com/agents/11', 'https://mpgrr.herokuapp.com/agents/12', 'https://mpgrr.herokuapp.com/agents/13', 'https://mpgrr.herokuapp.com/agents/14', 'https://mpgrr.herokuapp.com/agents/15', 'https://mpgrr.herokuapp.com/agents/16', 'https://mpgrr.herokuapp.com/agents/17', 'https://mpgrr.herokuapp.com/agents/18', 'https://mpgrr.herokuapp.com/agents/19', 'https://mpgrr.herokuapp.com/agents/20', 'https://mpgrr.herokuapp.com/agents/21', 'https://mpgrr.herokuapp.com/agents/22', 'https://mpgrr.herokuapp.com/agents/23', 'https://mpgrr.herokuapp.

In [6]:
%%time
if __name__ == '__main__': 
    maprr().run()

Getting As
Aberrant author pages are #s [45, 74, 75, 76, 77, 139, 140, 192, 206, 252, 271, 288, 290, 295]
Getting Ws
Aberrant work pages are #s [173, 179, 261, 306, 425, 433, 434, 435, 439, 526, 554, 576, 577, 578, 579, 581, 598]
Parsing As
Parsing Ws
Making AsDf
Making WsDf
CPU times: user 1min 21s, sys: 2 s, total: 1min 23s
Wall time: 1h 43min 57s


In [28]:
list(tables.keys())[1]

'works/'

In [46]:
with requests.get(domain+list(tables.keys())[1]+str(random.randint(0,list(tables.values())[1]))) as r: 
    print(r.url)
    if r.status_code == 200: 
        s = BeautifulSoup(r.content, 'html.parser')
        content = s.find('div', {'class':'col-md-9 fixed-height'})
        try: 
            author = content.div.h3.text
        except: 
            author = ""
        try: 
            title = content.div.h4.text
        except: 
            title = ""
        stanza_text = content.find_all('p',{'class':'stanza'})
        prose_text = content.find_all('p',{'class':'text'})
        if len(stanza_text) > len(prose_text): 
            text = stanza_text
        elif len(stanza_text) < len(prose_text): 
            text = prose_text
        Wtext = [x.text.replace('\n','').strip() for x in text]
        metaKeys = [x.text[:-1] for x in s.find('div', {'class':'card-body'}).find_all('h4')]
        metaVals = [x.text for x in s.find('div', {'class':'card-body'}).find_all('p')]
        metaDict = dict(zip(metaKeys, metaVals))
        subDict = {'title': title, 
                   'text': Wtext}
        subDict.update(metaDict)
        print(subDict)

https://mpgrr.herokuapp.com/works/412
{'title': 'Любопытство\n               Эклерезиты', 'text': ['—Мама, милая мамочка,               Скоро-ль будет война?               —что с тобой, моя девочка?               Может быть, ты больна?', '—Все соседи сражаются,               Не воюем лишь мы.               —Но у нас, слава Господу,               Все здоровы умы.', '—Почему нас не трогают?               Не пленят почему?               —Потому что Миррэлия               Не видна никому…', '—Почему ж наша родина               Никому не видна?               —Потому что вселенная               Нам с тобой не нужна…', '—Мама, милая мамочка,               Плачет сердце мое…               —Различай, моя девочка,               От чужого — свое…', '—Ну, а что окружает нас?               Кто ближайший сосед?               —Кроме звезд и Миррэлий               Ничего в мире нет!'], 'Title': 'Liubopytstvo Eklerezity', 'First Line': 'Mama, milaia mamochka…', 'Author': 'Igor Severianin', 'Composition

In [42]:
len([x.text.replace('\n','').strip() for x in content.find_all('p', {'class':'stanza'})])

0

In [43]:
len([x.text.replace('\n','').strip() for x in content.find_all('p', {'class':'text'})])

30

In [11]:
libDf = pd.read_json('WsDf.json')
libDf.index.name = 'W#'
print(libDf.shape)
libDf.sample(5)

(585, 11)


Unnamed: 0_level_0,title,text,Title,First Line,Author,Composition Date,Composition Location,Source of First Publication,First Publication Publisher,First Publication Year,First Publication Location
W#,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
143,Вдруг вскричать,"[Вдруг вскричать, что нагим колосом ...",Vdrug vskrichat',"Vdrug vskrichat', chto nagim kolosom…",Pavel Leonidovich Daletskii,1920,,Belyi khram: Stikhi: (Edistvennaia kniga),Svobodnaia Rossiia,1921,
222,Революция и культура,[],Revoliutsiiia i kul'tura,Esli okinut' odnim vzgliadom…,Maksim Gor'kii,,Petrograd,Novaia zhizn',A. N. Tikhonov,"May 1, 1917",Petrograd
391,Грозная тень,"[Война окончена, но тень Ее виси...",Groznaia ten',"Voina okonchena, no ten'…",Pëtr Vasil'evich Oreshin,date unknown,,My: Stikhi,Gubizdat,1921,
293,Родным песням,"[Что это за звуки глухо раздаются, ...",Rodnym pesniam,Chto eto za zvuki glukho razdaiutsia…,Kondratii Kuz'mich Khudiakov,date unknown,,Sibir': stikhi,Tipografiia Narodnoi Gazety,1916,Kurgan
361,Русь,"[Даль родная, даль дремучая, ...",Rus',"Dal' rodnaia, dal' dremuchaia…",Ivan Ignat'evich Morozov,date unknown,,Krasnyi zvon: Stikhotvoreniia,Tipografiia S. A. Alianchikova,1916,


In [16]:
textsDf = libDf[['text']]
textsDf['num_words'] = textsDf.text.apply(lambda k: len([a for b in [x.split() for y in k for x in y.split('               ')] for a in b if a.isalpha() == True]))
textsDf#.sort_values('num_words', ascending=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  textsDf['num_words'] = textsDf.text.apply(lambda k: len([a for b in [x.split() for y in k for x in y.split('               ')] for a in b if a.isalpha() == True]))


Unnamed: 0_level_0,text,num_words
W#,Unnamed: 1_level_1,Unnamed: 2_level_1
1,"[Сразу стало тихо в доме, Облет...",34
2,[Ты — отступник: за остров зелёный ...,75
3,[Просыпаться на рассвете Оттого...,40
4,"[И в тайную дружбу с высоким, К...",23
5,"[Словно ангел, возмутивший воду, ...",29
...,...,...
597,[],0
599,"[Любовь распяли на кресте, Но в ...",42
600,[],0
601,"[Сестра!.. Сестрица, на минутку подойдите ...",68


In [15]:
libDf

Unnamed: 0_level_0,title,text,Title,First Line,Author,Composition Date,Composition Location,Source of First Publication,First Publication Publisher,First Publication Year,First Publication Location,num_words
W#,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,Untitled,"[Сразу стало тихо в доме, Облет...",no title,Srazu stalo tikho v dome…,Anna Akhmatova,July 1917,Slepnevo,Podorozhnik,Petropolis,1921,,34
2,Untitled,[Ты — отступник: за остров зелёный ...,no title,Ty - otstupnik: za ostrov zelenyi…,Anna Akhmatova,summer 1917,Slepnevo,Podorozhnik,Petropolis,1921,,75
3,Untitled,[Просыпаться на рассвете Оттого...,no title,Prosypat'sia na rassvete…,Anna Akhmatova,July 1917,Slepnevo,Podorozhnik,Petropolis,1921,,40
4,Untitled,"[И в тайную дружбу с высоким, К...",no title,I v tainuiu druzhbu c vysokim…,Anna Akhmatova,1917,Petrograd,Podorozhnik,Petropolis,1921,,23
5,Untitled,"[Словно ангел, возмутивший воду, ...",no title,"Slovno angel, vozmutivshii vodu…",Anna Akhmatova,February 1916,Tsarskoe selo,Podorozhnik,Petropolis,1921,,29
...,...,...,...,...,...,...,...,...,...,...,...,...
597,РЕЧЬ НА МОСКОВСКОМ ПУБЛИЧНОМ СОБРАНИИ ОБЩЕСТВА...,[],Nesvoevremennye mysli,Rech' na moskovskom publichnom sobranii,Maksim Gor'kii,"June 30, 1918",Petrograd,Novaia zhizn',A. N. Tikhonov,"June 30, 1918",Petrograd,0
599,Untitled,"[Любовь распяли на кресте, Но в ...","""Liubov' raspiali na kreste""",Liubov' raspiali na kreste,Georgii Andreevich Viatkin,,Omsk,Ranenaia Rossiia: Stikhi; Vernost': rasskaz; E...,Tipografiia Vremennogo Tsentral’nogo Voenno-P...,1919,Ekaterinburg,42
600,На словах...,[],Na slovakh... (Nesvoevremennye mysli),Na slovakh--vse soglasny...,Maksim Gor'kii,"June 29, 1917",Petrograd,Novaia zhizn',A. N. Tikhonov,"June 29, 1917",Petrograd,0
601,Последняя просьба,"[Сестра!.. Сестрица, на минутку подойдите ...",Posledniaia pros'ba,"Sestra! Sestritsa, na minutku podoidite…",M Kolchin,date unknown,,Pesni voiny: posviashchaetsia doblestnym sibir...,Tipografiia I. M. Poznera,1915,,68


In [9]:
AsDf = pd.read_json('AsDf.json')
AsDf.index.name = 'A#'
AsDf

Unnamed: 0_level_0,name,birth,death,Type of Agent,Sex,Occupations,Family Social Strata,Literary Affiliations,Political Affiliations,Type of Corporate Body,Affiliation
A#,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,Anna Akhmatova,"June 23, 1889","March 5, 1966",person,female,poet,nobility,Acmeism,independent,,
2,Vasilii Dmitrievich Aleksandrovskii,"January 15, 1897","November 16, 1934",person,male,soldier,peasant,Kuznitsa,Bolshevik member,,
3,Ivan Nikolaevich Antonov,1878,1936?,person,male,editor,unknown,unknown,independent,,
4,Mikhail Dmitrievich Artamonov,"February 22, 1888","November 22, 1958",person,male,journalist,peasant,Vologda poets,unknown,,
5,Nikolai Aseev,"July 10, 1889","July 16, 1963",person,male,soldier,nobility,Left Front of Art: LEF,Bolshevik member,,
...,...,...,...,...,...,...,...,...,...,...,...
299,Mikhail Ivanovich Kalinin,"November 19, 1875","June 3, 1946",,,,,,,,
300,Moisei Solomonovich Uritskii,"January 14, 1873","August 30, 1918",,,,,,,,
301,Maximilien Marie Isidore de Robespierre,"May 6, 1758","June 28, 1794",,,,,,,,
302,Iurii Mikhailovich Steklov,"August 27, 1873","July 15, 1941",,,,,,,,


## Dataframe Split

### libDf

In [10]:
libDf = pd.DataFrame.from_dict(XMLdict, orient='index').rename_axis('work_num').sort_index(inplace=False)
libDf.index = libDf.index.str.lstrip('0')
libDf

NameError: name 'XMLdict' is not defined

### authorsDf

In [None]:
authorsDf = libDf.reset_index().groupby('author').size().to_frame().rename(columns={0:'num_works'})
authorsDf[['num_lps','num_words']] = libDf.reset_index().groupby('author').sum()
authorsDf.sort_values(by=['num_words','num_works'], ascending=False)
authorsDf

### worksDf

In [None]:
worksDf = libDf[['title','year','author','genre','num_lps','num_words']]
worksDf

### tokenDf

In [None]:
lpDf = libDf[['text']]
lpDf = lpDf.text.apply(lambda x: pd.Series([y for y in x])).stack().to_frame().rename(columns={0:'lp_str'})
lpDf.index.names = OHCO[:2]
lpDf
tokenDf = lpDf.lp_str.apply(lambda x: tokenize(x)).to_frame()#.rename(columns={0:'token'})
#tokenDf = lpDf.lp_str.apply(lambda x: y.text for y in tokenize(x)[1])
tokenDf

In [None]:
for i in tokenize(lpDf.lp_str): 
    print(i)