# MAPRR Textual Analytics

## Intro

### Import

In [1]:
import os 
import time
import logging
import random
import pandas as pd 
import re
import requests 
from bs4 import BeautifulSoup
from natasha import (
    Segmenter, 
    MorphVocab, 
    NewsEmbedding, 
    NewsMorphTagger, 
    NewsSyntaxParser, 
    NewsNERTagger, 
    PER, 
    NamesExtractor, 
    Doc)
from razdel import tokenize

In [2]:
logging.basicConfig(filename='maprr_out.log', encoding='utf-8', format='%(asctime)s %(message)s', level=logging.INFO)

In [4]:
lib_cols = ['title_ru', 'text', 'title_en', '1st_line', 'author', 'comp_date', 'comp_loc', 'pub_src', '1st_pub', 'pub_year', 'pub_loc']
a_cols = ['name', 'birth', 'death', 'a_type', 'sex', 'occs', 'fam_soc_str', 'lit_affil', 'pol_affil', 'corp_type', 'corp_affil']

## Pre-processing Function

In [3]:
domain = 'https://mpgrr.herokuapp.com/'

In [5]:
tables = {'agents/': 304,
          'works/': 603, 
          'place_based_concepts/': 315, 
          'locations/': 366, 
          'multivalent_markers/': 433}

In [13]:
class maprr: 
    
    def __init__(self): 
        self.Wsoup = {} 
        self.Asoup = {}
        self.Ws = {}
        self.As = {}
    
    def get_htmlA(self): 
        aberrantAs = []
        for i in range(1, (list(tables.values())[0]+1)):
            url = domain+list(tables.keys())[0]+str(i)
            with requests.get(url) as r: 
                logging.info(f"A{i} status code: {r.status_code}")
                if r.status_code == 200: 
                    s = BeautifulSoup(r.content, 'html.parser')
                    self.Asoup.update({i:s})
                else: 
                    aberrantAs.append(i)
                    pass
                time.sleep(.1)
        if len(aberrantAs) > 0: 
            print(f"Aberrant author pages are #s {aberrantAs}")
        else: 
            print(f"There were no aberrant author pages!")

    def get_htmlW(self): 
        aberrantWs = []
        for i in range(1, (list(tables.values())[1]+1)):
            url = domain+list(tables.keys())[1]+str(i)
            with requests.get(url) as r: 
                logging.info(f"W{i} status code: {r.status_code}")
                if r.status_code == 200: 
                    s = BeautifulSoup(r.content, 'html.parser')
                    self.Wsoup.update({i:s})
                else: 
                    aberrantWs.append(i)
                    pass
                time.sleep(.1)
        if len(aberrantWs) > 0: 
            print(f"Aberrant work pages are #s {aberrantWs}")
        else: 
            print(f"There were no aberrant work pages!")

    def parseWs(self, html): 
        content = html.find('div', {'class':'col-md-9 fixed-height'})
        try: 
            author = content.div.h3.text
        except: 
            author = "unknown"
        try: 
            title = content.div.h4.text
        except: 
            title = "untitled"
        stanza_text = content.find_all('p',{'class':'stanza'})
        prose_text = content.find_all('p',{'class':'text'})
        if len(stanza_text) < len(prose_text): 
            text = prose_text
        else: 
            text = stanza_text
        Wtext = [x.text.replace('\n','').strip() for x in text]
        metaKeys = [x.text[:-1] for x in html.find('div', {'class':'card-body'}).find_all('h4')]
        metaVals = [x.text for x in html.find('div', {'class':'card-body'}).find_all('p')]
        metaDict = dict(zip(metaKeys, metaVals))
        subDict = {'title': title, 
                   'text': Wtext}
        subDict.update(metaDict)
        return subDict

    def parseAs(self, html): 
        name = html.find('div', {'class': 'card scrollable'}).h2.text
        bdate, ddate = html.find('div', {'class': 'card scrollable'}).span.text.split(' - ')
        subDict = {'name': name, 'birth': bdate, 'death': ddate}
        typeKeys = [x.h4.text.lower().replace(' ','_') for x in html.find_all('div', {'class': 'col-md-4'})]
        typeVals = []
        for typ in html.find_all('div', {'class': 'col-md-4'}): 
            try: 
                typeVals.append(typ.p or typ.div.span.text)
            except: 
                typeVals.append("unknown")
        typeVals = [x.text if not isinstance(x, str) else x for x in typeVals]
        typeDict = dict(zip(typeKeys, typeVals))
        subDict.update(typeDict)
        return subDict
    
    def get_single(self, cat, id_num, df): 
        with requests.get(url) as r: 
            logging.info(f"{cat+str(id_num)} status code: {r.status_code}")
            if r.status_code == 200: 
                s = BeautifulSoup(r.content, 'html.parser')
        if cat == 'work': 
            newSubDict = {id_num:self.parseWs(s)}
        elif cat == 'agent': 
            newSubDict = {id_num:self.parseAs(s)} 
        else: 
            print("You need a category: 'work' or 'agent'...")
        
        df = df.append(newSubDict) 
        return df


    def run(self): 
        logging.info(f"Getting As and Ws")
        print(f"Getting As")
        at1 = time.time()
        self.get_htmlA() 
        at2 = time.time()
        print(f"Got As in {at2-at1} seconds (that's {(at2-at1)/list(tables.values())[0]} sec/ea.)")
        print(f"Getting Ws")
        wt1 = time.time()
        self.get_htmlW() 
        wt2 = time.time()
        print(f"Got Ws in {wt2-wt1} seconds (that's {(wt2-wt1)/list(tables.values())[1]} sec/ea.)")
        logging.info(f"Done getting As and Ws")
        
        logging.info(f"Parsing As and Ws")
        print(f"Parsing As")
        pa1 = time.time()
        self.As = {k: self.parseAs(v) for k, v in self.Asoup.items()}
        pa2 = time.time()
        print(f"Parsed As in {pa2-pa1} seconds (that's {(pa2-pa1)/len(list(self.Asoup.items()))} sec/ea.)")
        print(f"Parsing Ws")
        pw1 = time.time()
        self.Ws = {k: self.parseWs(v) for k, v in self.Wsoup.items()}
        pw2 = time.time() 
        print(f"Parsed Ws in {pw2-pw1} seconds (that's {(p22-p21)/len(list(self.Wsoup.items()))} sec/ea.)")
        logging.info(f"Done parsing As and Ws")
        
        logging.info(f"Making dataframes")
        print(f"Making AsDf")
        AsDf = pd.DataFrame.from_dict(self.As, orient='index')
        print(f"Making WsDf")
        WsDf = pd.DataFrame.from_dict(self.Ws, orient='index')        
        logging.info(f"Done making dataframes")
        
        logging.info(f"Writing to json")
        WsDf.to_json('WsDf.json')
        AsDf.to_json('AsDf.json')
        logging.info(f"Done writing to json")

In [None]:
%%time
if __name__ == '__main__': 
    maprr().run()

In [8]:
urls_to_visit = [] 
for t, i in list(tables.items())[:2]: 
    for j in range(0,i+1): 
        urls_to_visit.append(domain+t+str(j))

In [98]:
AsDf = pd.read_json('AsDf.json')
AsDf.index.name = 'a_id'
AsDf.columns = a_cols
AsDf['birth'] = pd.to_datetime(AsDf['birth'], errors='coerce', infer_datetime_format=True)
AsDf['death'] = pd.to_datetime(AsDf['death'], errors='coerce', infer_datetime_format=True)
AsDf

Unnamed: 0_level_0,name,birth,death,a_type,sex,occs,fam_soc_str,lit_affil,pol_affil,corp_type,corp_affil
a_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,Anna Akhmatova,1889-06-23,1966-03-05,person,female,poet,nobility,Acmeism,independent,,
2,Vasilii Dmitrievich Aleksandrovskii,1897-01-15,1934-11-16,person,male,soldier,peasant,Kuznitsa,Bolshevik member,,
3,Ivan Nikolaevich Antonov,NaT,NaT,person,male,editor,unknown,unknown,independent,,
4,Mikhail Dmitrievich Artamonov,1888-02-22,1958-11-22,person,male,journalist,peasant,Vologda poets,unknown,,
5,Nikolai Aseev,1889-07-10,1963-07-16,person,male,soldier,nobility,Left Front of Art: LEF,Bolshevik member,,
...,...,...,...,...,...,...,...,...,...,...,...
300,Moisei Solomonovich Uritskii,1873-01-14,1918-08-30,person,male,revolutionary,merchant,unknown,Bolshevik member,,
301,Maximilien Marie Isidore de Robespierre,1758-05-06,1794-06-28,person,male,unknown,professional,unknown,unknown,,
302,Iurii Mikhailovich Steklov,1873-08-27,1941-07-15,person,male,revolutionary,unknown,unknown,Socialist Revolutionary,,
303,Christian August Friedrich Peters,1806-09-07,1880-05-08,person,male,unknown,unknown,unknown,unknown,,


## Dataframe Split

### libDf

In [93]:
libDf = pd.read_json('WsDf.json')
libDf.index.name = 'w_id'
libDf.columns = lib_cols
print(libDf.shape)
libDf

(586, 11)


Unnamed: 0_level_0,title_ru,text,title_en,1st_line,author,comp_date,comp_loc,pub_src,1st_pub,pub_year,pub_loc
w_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,Untitled,"[Сразу стало тихо в доме, Облет...",no title,Srazu stalo tikho v dome…,Anna Akhmatova,July 1917,Slepnevo,Podorozhnik,Petropolis,1921,
2,Untitled,[Ты — отступник: за остров зелёный ...,no title,Ty - otstupnik: za ostrov zelenyi…,Anna Akhmatova,summer 1917,Slepnevo,Podorozhnik,Petropolis,1921,
3,Untitled,[Просыпаться на рассвете Оттого...,no title,Prosypat'sia na rassvete…,Anna Akhmatova,July 1917,Slepnevo,Podorozhnik,Petropolis,1921,
4,Untitled,"[И в тайную дружбу с высоким, К...",no title,I v tainuiu druzhbu c vysokim…,Anna Akhmatova,1917,Petrograd,Podorozhnik,Petropolis,1921,
5,Untitled,"[Словно ангел, возмутивший воду, ...",no title,"Slovno angel, vozmutivshii vodu…",Anna Akhmatova,February 1916,Tsarskoe selo,Podorozhnik,Petropolis,1921,
...,...,...,...,...,...,...,...,...,...,...,...
599,Untitled,"[Любовь распяли на кресте, Но в ...","""Liubov' raspiali na kreste""",Liubov' raspiali na kreste,Georgii Andreevich Viatkin,,Omsk,Ranenaia Rossiia: Stikhi; Vernost': rasskaz; E...,Tipografiia Vremennogo Tsentral’nogo Voenno-P...,1919,Ekaterinburg
600,На словах...,"[На словах - все согласны, что российское госу...",Na slovakh... (Nesvoevremennye mysli),Na slovakh--vse soglasny...,Maksim Gor'kii,"June 29, 1917",Petrograd,Novaia zhizn',A. N. Tikhonov,"June 29, 1917",Petrograd
601,Последняя просьба,"[Сестра!.. Сестрица, на минутку подойдите ...",Posledniaia pros'ba,"Sestra! Sestritsa, na minutku podoidite…",M Kolchin,date unknown,,Pesni voiny: posviashchaetsia doblestnym sibir...,Tipografiia I. M. Poznera,1915,
602,И рек Сидящий на престоле,"[В борьбе с врагом, в борьбе кровавой, геройск...",I rek Sidiashchii na prestole,"V bor'be s vragom, v bor'be krovavom...",M. Did,1914 to 1915,,"Nabat: Stikhotvoreniia, 1914-1915",Tipografiia N. A. Vorob'eva,1916,


In [71]:
textsDf = libDf[['text']]
textsDf['num_words'] = libDf['num_words'] = textsDf.text.apply(lambda k: len([a for b in [x.split() for y in k for x in y.split('               ')] for a in b if a.isalpha() == True]))
textsDf.sort_values('num_words', ascending=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  textsDf['num_words'] = libDf['num_words'] = textsDf.text.apply(lambda k: len([a for b in [x.split() for y in k for x in y.split('               ')] for a in b if a.isalpha() == True]))


Unnamed: 0_level_0,text,num_words
w_id,Unnamed: 1_level_1,Unnamed: 2_level_1
140,[в которой благосклонный читатель знакомится с...,4591
403,"[Случалось, на огонек во время перелета, или в...",3067
551,[Так как Волга была великой исторической дорог...,1616
61,"[Вы, которым шестьдесят лет, или даже вы, кото...",1570
142,[Волга! Одна из значительнейших рек всего земн...,1234
...,...,...
482,"[Под рокот гражданских бурь, В л...",14
479,[Белогвардейцы! Гордиев узел ...,14
400,[Пусть стихи мои развеют Ва...,13
464,"[...Сын казака, казак... Так нач...",12


### authorsDf

In [91]:
authorsDf = libDf.reset_index().groupby('author').size().to_frame().rename(columns={0:'num_works'})
authorsDf['num_words'] = libDf.reset_index().groupby('author').sum().num_words
authorsDf['avg_wpw'] = round(authorsDf.num_words/authorsDf.num_works, 2)
authorsDf = authorsDf.reset_index().sort_values(by=['avg_wpw'], ascending=False).rename(columns={'author':'name'})
authorsDf = pd.merge(AsDf.reset_index(), authorsDf.reset_index(), how='right', on='name').drop(columns=['index'])#.set_index('a_id')
authorsDf.at[39,'a_id'] = 45
authorsDf.a_id = authorsDf.a_id.astype(int)
authorsDf = authorsDf.set_index('a_id')
#authorsDf.index.astype(int)
authorsDf.to_json('authorsDf.json')
authorsDf
#authorsDf[authorsDf.name.str.contains('Osip')]

Unnamed: 0_level_0,name,birth,death,type_of_agent,sex,occupations,family_social_strata,literary_affiliations,political_affiliations,type_of_corporate_body,affiliation,num_works,num_words,avg_wpw
a_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
16,Aleksandr Vasil'evich Chaianov,1888-01-29,1937-10-03,person,male,writer,merchant,unknown,agricultural cooperativist,,,1,4591,4591.00
57,Mikhail Mikhailovich Prishvin,1873-02-04,1954-01-16,person,male,writer,merchant,unknown,nationalist,,,1,3067,3067.00
40,Aleksandra Kollontai,1872-03-31,1952-03-09,person,female,activist,nobility,unknown,Menshevik,,,1,1082,1082.00
65,S. D. Spasskii,1898-12-21,1956-08-24,person,male,writer,professional,Futurism,independent,,,1,826,826.00
17,Evgenii Nikolaevich Chirikov,1864-08-05,1932-01-18,person,male,writer,nobility,Gor'kii circle,Populist,,,4,3064,766.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4,Mikhail Dmitrievich Artamonov,1888-02-22,1958-11-22,person,male,journalist,peasant,Vologda poets,unknown,,,9,387,43.00
66,Sergei Stradnyi,NaT,NaT,person,male,poet,peasant,Smolensk Proletkult,Proletkult Bolshevik,,,7,300,42.86
19,Pavel Leonidovich Daletskii,1898-02-02,1963-07-08,person,male,poet,professional,unknown,unknown,,,3,124,41.33
70,Boris Virganskii,NaT,NaT,person,male,poet,unknown,unknown,Bolshevik-leaning,,,6,207,34.50


### worksDf

In [None]:
worksDf = libDf[['title','year','author','genre','num_lps','num_words']]
worksDf

### tokenDf

In [31]:
lpDf = libDf[['text']]
lpDf = lpDf.text.apply(lambda x: pd.Series([y for y in x])).stack().to_frame().rename(columns={0:'lp_str'})
lpDf.index.names = OHCO[:2]
lpDf
tokenDf = lpDf.lp_str.apply(lambda x: tokenize(x)).to_frame()#.rename(columns={0:'token'})
#tokenDf = lpDf.lp_str.apply(lambda x: y.text for y in tokenize(x)[1])
tokenDf

  lpDf = lpDf.text.apply(lambda x: pd.Series([y for y in x])).stack().to_frame().rename(columns={0:'lp_str'})


NameError: name 'OHCO' is not defined

In [None]:
for i in tokenize(lpDf.lp_str): 
    print(i)