# MAPRR Textual Analytics

## Intro

### Import

In [1]:
import os 
import time
import logging
import random
import pandas as pd 
import re
import requests 
from bs4 import BeautifulSoup
from natasha import (
    Segmenter, 
    MorphVocab, 
    NewsEmbedding, 
    NewsMorphTagger, 
    NewsSyntaxParser, 
    NewsNERTagger, 
    PER, 
    NamesExtractor, 
    Doc)
from razdel import tokenize

In [2]:
logging.basicConfig(filename='maprr_out.log', encoding='utf-8', format='%(asctime)s %(message)s', level=logging.INFO)

In [3]:
lib_cols = ['title_ru', 'genre', 'text', 'title_en', '1st_line', 'author', 'comp_date', 'comp_loc', 'pub_src', '1st_pub', 'pub_year', 'pub_loc']
a_cols = ['name', 'birth', 'death', 'a_type', 'sex', 'occs', 'fam_soc_str', 'lit_affil', 'pol_affil', 'corp_type', 'corp_affil']

## Pre-processing Function

In [4]:
domain = 'https://mpgrr.herokuapp.com/'
tables = {
    'agents/': 304, 
    'works/': 603, 
    'place_based_concepts/': 315, 
    'locations/': 366, 
    'multivalent_markers/': 433
}

In [5]:
allURLs = [domain+item[0]+str(i) for item in tables.items() for i in range(1, item[1]+1)]

In [6]:
urls_to_visit = [] 
for t, i in list(tables.items())[:2]: 
    for j in range(0,i+1): 
        urls_to_visit.append(domain+t+str(j))

In [9]:
class maprr: 
    
    def __init__(self): 
        self.Wsoup = {} 
        self.Asoup = {}
        self.Ws = {}
        self.As = {}
    
    def get_htmlA(self): 
        """This function uses the list of Agent IDs from the tables dict above
        and the grabs it using requests before putting the html reponse in Asoup"""
        
        # initialize list of pages that don't return 200
        aberrantAs = []
        # go through list of Agents from 1 to the number defined in tables
        for i in range(1, (list(tables.values())[0]+1)):
        #for i in range(1, 11):
            # make url
            url = domain+list(tables.keys())[0]+str(i) 
            # initialize connection to url 
            with requests.get(url) as r: 
                # log status code 
                logging.info(f"A{i} status code: {r.status_code}")
                # if connection is successful
                if r.status_code == 200: 
                    # make soup from html
                    s = BeautifulSoup(r.content, 'html.parser') 
                    # add to list of A soups
                    self.Asoup.update({i:s})
                else: 
                    # if connection is not successful, add to list
                    aberrantAs.append((i, r.status_code))
                    pass
                # wait a hot second or the server gets >:( 
                time.sleep(.1)
        # report list of A errors if there is one (there always is)
        if len(aberrantAs) > 0: 
            print(f"Aberrant agent pages are #s {aberrantAs}")
        else: 
            print(f"There were no aberrant agent pages!")

    def get_htmlW(self): 
        """This function uses the list of Work IDs from the tables dict above
        and the grabs it using requests before putting the html reponse in Wsoup"""
        
        # initialize list of pages that don't return 200
        aberrantWs = []
        # go through list of Words from 1 to the number defined in tables
        for i in range(1, (list(tables.values())[1]+1)):
        #for i in range(1, 11):
            # make url
            url = domain+list(tables.keys())[1]+str(i)
            # initialize connection to url 
            with requests.get(url) as r: 
                # log status code
                logging.info(f"W{i} status code: {r.status_code}")
                # if connection is successful
                if r.status_code == 200: 
                    # make soup from html
                    s = BeautifulSoup(r.content, 'html.parser')
                    # add to list of A soups
                    self.Wsoup.update({i:s})
                else: 
                    # if connection is not successful, add to list
                    aberrantWs.append((i, r.status_code))
                    pass
                # wait a hot second or the server gets >:( 
                time.sleep(.1)
        # report list of A errors if there is one (there always is)
        if len(aberrantWs) > 0: 
            print(f"Aberrant work pages are #s {aberrantWs}")
        else: 
            print(f"There were no aberrant work pages!")

    def parseWs(self, html): 
        """This function retrieves Work info from the HTML provided"""
        
        # extract things we will need
        content = html.find('div', {'class':'col-md-9 fixed-height'})
        # try to locate author text of Work
        try: 
            author = content.div.h3.text
        except: 
            author = "unknown"
        # try to locate title text of Work
        try: 
            title = content.div.h4.text
        except: 
            title = "untitled"
        # try for both stanza and para text because they return [] 
        stanza_text = content.find_all('p',{'class':'stanza'})
        prose_text = content.find_all('p',{'class':'text'})
        # decide which text to use based on length 
        if len(prose_text) > len(stanza_text): 
            text = prose_text
            genre = 'prose'
        else: 
            text = stanza_text
            genre = 'poetry'
        # get actual text 
        Wtext = [x.text.replace('\n','').strip() for x in text]
        # make list of keys of Work types
        typeKeys = [x.text[:-1] for x in html.find('div', {'class':'card-body'}).find_all('h4')]
        # make list of values of Work types 
        typeVals = [x.text for x in html.find('div', {'class':'card-body'}).find_all('p')]
        # make dictionary of keys:values from above
        typeDict = dict(zip(typeKeys, typeVals))
        # initialize sub dictionary of Work
        Wdict = {'title': title, 
                   'genre': genre,
                   'text': Wtext} 
        # add type keys and values to complete sub dictionary of Work
        Wdict.update(typeDict) 
        # return Work dict to be made into a DataFrame row
        return Wdict

    def parseAs(self, html): 
        """This function retrieves Agent info from the HTML provided"""
        
        # get Agent's name
        name = html.find('div', {'class': 'card scrollable'}).h2.text 
        # get Agent's birth- and deathdates 
        bdate, ddate = html.find('div', {'class': 'card scrollable'}).span.text.split(' - ') 
        # initialize dictionary of Agent
        Adict = {'name': name, 'birth': bdate, 'death': ddate}
        # make list of type keys
        typeKeys = [x.h4.text.lower().replace(' ','_') for x in html.find_all('div', {'class': 'col-md-4'})]
        # initialize list of type values
        typeVals = []
        # add type values to list if found or default to 'unknown' (though some real values are also 'unknown')
        for typ in html.find_all('div', {'class': 'col-md-4'}): 
            try: 
                typeVals.append(typ.p or typ.div.span.text)
            except: 
                typeVals.append("unknown")
        # for some reason keys are at different levels and require '.text.' attribute but of course some don't
        typeVals = [x.text if not isinstance(x, str) else x for x in typeVals]
        # make dictionary of keys:values from above
        typeDict = dict(zip(typeKeys, typeVals))
        # add type keys and values to complete sub dictionary of Work
        Adict.update(typeDict) 
        # return Agent dict to be made into a DataFrame row 
        return Adict
    
    def get_single(self, cat, id_num): 
        """This function combines the functions above and returns the DataFrame row""" 
        
        # make url from parameters and initialize request 
        with requests.get(domain+cat+'s/'+str(id_num)) as r: 
            # double check the URL is correct
            print(r.url)
            # check status code
            print(f"{cat+str(id_num)} status code: {r.status_code}")
            # if connection is successful
            if r.status_code == 200: 
                # make soup from html content 
                s = BeautifulSoup(r.content, 'html.parser')
                # sort by FOO type 
                if cat.lower() == 'work': 
                    # make dictionary if Work with parseWs function
                    SubDict = {id_num:self.parseWs(s)}
                elif cat.lower() == 'agent': 
                    # make dictionary if Agent with parseAs function
                    SubDict = {id_num:self.parseAs(s)} 
                else: 
                    print("You need a category: 'work' or 'agent'...") 
                # make DataFrame row from dictionary
                singleDf = pd.DataFrame.from_dict(newSubDict, orient='index')
                # return DataFrame row
                return singleDf
            # return status code if connection is unsuccessful
            else: 
                print(f"Error: {r.status_code}")


    def run(self): 
        """This function runs retrieval and parsing using the functions above, creates DataFrames, and persists them (JSON)"""
        
        logging.info(f"Getting As and Ws")
        print(f"Getting As")
        # get starting time for A retrieval
        at1 = time.time()
        # retrieve As
        self.get_htmlA() 
        # get finishing time for A retrieval
        at2 = time.time()
        # display run times for A retrieval 
        print(f"Got As in {round((at2-at1), 3)} sec ({round(((at2-at1)/list(tables.values())[0]), 5)} sec/ea.)")
        logging.info(f"Got As in {round((at2-at1), 3)} sec ({round(((at2-at1)/list(tables.values())[0]), 5)} sec/ea.)")
        
        print(f"Getting Ws")
        # get starting time for W retrieval
        wt1 = time.time()
        # retrieve As
        self.get_htmlW() 
        # get finishing time for W retrieval
        wt2 = time.time()
        # display run times for W retrieval 
        print(f"Got Ws in {round((wt2-wt1), 3)} sec ({round(((wt2-wt1)/list(tables.values())[1]), 5)} sec/ea.)")
        logging.info(f"Got Ws in {round((wt2-wt1), 3)} sec ({round(((wt2-wt1)/list(tables.values())[1]), 5)} sec/ea.)")
        logging.info(f"Done getting As and Ws")
        
        logging.info(f"Parsing As and Ws")
        print(f"Parsing As")
        # get starting time for A parsing
        pa1 = time.time()
        # parse Agent HTML instances
        self.As = {k: self.parseAs(v) for k, v in self.Asoup.items()}
        # get finishing time for A parsing
        pa2 = time.time()
        # display run times for A parsing 
        print(f"Parsed As in {round((pa2-pa1), 3)} sec ({round((pa2-pa1)/len(list(self.Asoup.items())), 5)} sec/ea.)")
        logging.info(f"Parsed As in {round((pa2-pa1), 3)} sec ({round((pa2-pa1)/len(list(self.Asoup.items())), 5)} sec/ea.)")
        
        print(f"Parsing Ws")
        # get starting time for W parsing
        pw1 = time.time()
        # parse Work HTML instances 
        self.Ws = {k: self.parseWs(v) for k, v in self.Wsoup.items()}
        # get finishing time for W parsing
        pw2 = time.time() 
        # display run times for W parsing 
        print(f"Parsed Ws in {round((pw2-pw1), 3)} sec ({round((pw2-pw1)/len(list(self.Wsoup.items())), 5)} sec/ea.)")
        logging.info(f"Parsed Ws in {round((pw2-pw1), 3)} sec ({round((pw2-pw1)/len(list(self.Wsoup.items())), 5)} sec/ea.)")
        logging.info(f"Done parsing As and Ws")
        
        logging.info(f"Making dataframes")
        print(f"Making AsDf")
        # create DataFrame of Agent dictionaries
        AsDf = pd.DataFrame.from_dict(self.As, orient='index')
        print(f"Making WsDf")
        # create DataFrame of Work dictionaries
        WsDf = pd.DataFrame.from_dict(self.Ws, orient='index')  
        logging.info(f"Done making dataframes")
        
        logging.info(f"Writing to json")
        # write Work DataFrame to JSON
        WsDf.to_json('WsDf.json')
        # write Agent DataFrame to JSON
        AsDf.to_json('AsDf.json')
        logging.info(f"Done writing to json")

In [None]:
def check_status(urls): 
    aberrantURLs = []
    logging.info(f"Checking status of URLs")
    for url in urls: 
        logging.info(f"Trying {url}")
        with requests.get(url) as r: 
            if r.status_code == 200: 
                #logging.info(f"{url} successful")
                pass
            else: 
                logging.info(f"{url}: {r.status_code}")
                aberrantURLs.append({url: r.status_code})

In [10]:
%%time
if __name__ == '__main__': 
    maprr().run()

Getting As
Aberrant agent pages are #s [(74, 404), (75, 404), (76, 404), (77, 404), (139, 404), (140, 404), (192, 404), (206, 404), (252, 404)]
Got As in 216.721 sec (0.7129 sec/ea.)
Getting Ws
Aberrant work pages are #s [(173, 500), (179, 500), (261, 500), (306, 500), (425, 500), (433, 500), (434, 500), (435, 500), (439, 500), (526, 500), (554, 500), (576, 500), (577, 500), (578, 500), (579, 500), (581, 500), (598, 500)]
Got Ws in 525.414 sec (0.87133 sec/ea.)
Parsing As
Parsed As in 0.393 sec (0.00133 sec/ea.)
Parsing Ws
Parsed Ws in 0.628 sec (0.00107 sec/ea.)
Making AsDf
Making WsDf
CPU times: user 1min 22s, sys: 1.64 s, total: 1min 24s
Wall time: 12min 23s


In [11]:
AsDf = pd.read_json('AsDf.json')
AsDf['birth'] = pd.to_datetime(AsDf['birth'], errors='coerce', infer_datetime_format=True)
AsDf['death'] = pd.to_datetime(AsDf['death'], errors='coerce', infer_datetime_format=True)
AsDf

Unnamed: 0,name,birth,death,type_of_agent,sex,occupations,family_social_strata,literary_affiliations,political_affiliations,type_of_corporate_body,affiliation
1,Anna Akhmatova,1889-06-23,1966-03-05,person,female,poet,nobility,Acmeism,independent,,
2,Vasilii Dmitrievich Aleksandrovskii,1897-01-15,1934-11-16,person,male,soldier,peasant,Kuznitsa,Bolshevik member,,
3,Ivan Nikolaevich Antonov,NaT,NaT,person,male,editor,unknown,unknown,independent,,
4,Mikhail Dmitrievich Artamonov,1888-02-22,1958-11-22,person,male,journalist,peasant,Vologda poets,unknown,,
5,Nikolai Aseev,1889-07-10,1963-07-16,person,male,soldier,nobility,Left Front of Art: LEF,Bolshevik member,,
...,...,...,...,...,...,...,...,...,...,...,...
300,Moisei Solomonovich Uritskii,1873-01-14,1918-08-30,person,male,revolutionary,merchant,unknown,Bolshevik member,,
301,Maximilien Marie Isidore de Robespierre,1758-05-06,1794-06-28,person,male,unknown,professional,unknown,unknown,,
302,Iurii Mikhailovich Steklov,1873-08-27,1941-07-15,person,male,revolutionary,unknown,unknown,Socialist Revolutionary,,
303,Christian August Friedrich Peters,1806-09-07,1880-05-08,person,male,unknown,unknown,unknown,unknown,,


## Dataframe Split

### libDf

In [20]:
libDf = pd.read_json('WsDf.json')
libDf.index.name = 'w_id'
libDf.columns = lib_cols
libDf['comp_date'] = pd.to_datetime(libDf['comp_date'], errors='coerce')
libDf['pub_year'] = pd.to_datetime(libDf['pub_year'], errors='coerce')
libDf['pub_year'] = libDf.pub_year.apply(lambda x: x.year).astype('int64', errors='ignore')
print(libDf.shape)
#type(libDf.loc[1, 'pub_year'])
libDf

(586, 12)


numpy.float64

### authorsDf

In [21]:
authorsDf = libDf.reset_index().groupby('author').size().to_frame().rename(columns={0:'num_works'})
authorsDf['num_words'] = libDf.reset_index().groupby('author').sum().num_words
authorsDf['avg_wpw'] = round(authorsDf.num_words/authorsDf.num_works, 2)
authorsDf = authorsDf.reset_index().sort_values(by=['avg_wpw'], ascending=False).rename(columns={'author':'name'})
authorsDf = pd.merge(AsDf.reset_index(), authorsDf.reset_index(), how='right', on='name').set_index('index_x')
authorsDf.index.name = 'a_id'
authorsDf.to_json('authorsDf.json', date_format='iso')
authorsDf

AttributeError: 'DataFrame' object has no attribute 'num_words'

In [99]:
authorsDf = pd.read_json('authorsDf.json')
#authorsDf[['birth', 'death']] = authorsDf[['birth', 'death']].apply(pd.to_datetime, format="%Y-%m-%d")
#[dt.to_datetime().date() for dt in authorsDf[['birth', 'death']]]
authorsDf

Unnamed: 0,name,birth,death,type_of_agent,sex,occupations,family_social_strata,literary_affiliations,political_affiliations,type_of_corporate_body,affiliation,index_y,num_works,num_words,avg_wpw
16,Aleksandr Vasil'evich Chaianov,1888-01-29T00:00:00.000Z,1937-10-03T00:00:00.000Z,person,male,writer,merchant,unknown,agricultural cooperativist,,,8,1,4591,4591.00
57,Mikhail Mikhailovich Prishvin,1873-02-04T00:00:00.000Z,1954-01-16T00:00:00.000Z,person,male,writer,merchant,unknown,nationalist,,,44,1,3067,3067.00
40,Aleksandra Kollontai,1872-03-31T00:00:00.000Z,1952-03-09T00:00:00.000Z,person,female,activist,nobility,unknown,Menshevik,,,9,1,1082,1082.00
65,S. D. Spasskii,1898-12-21T00:00:00.000Z,1956-08-24T00:00:00.000Z,person,male,writer,professional,Futurism,independent,,,56,1,826,826.00
17,Evgenii Nikolaevich Chirikov,1864-08-05T00:00:00.000Z,1932-01-18T00:00:00.000Z,person,male,writer,nobility,Gor'kii circle,Populist,,,25,4,3064,766.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4,Mikhail Dmitrievich Artamonov,1888-02-22T00:00:00.000Z,1958-11-22T00:00:00.000Z,person,male,journalist,peasant,Vologda poets,unknown,,,43,9,387,43.00
66,Sergei Stradnyi,,,person,male,poet,peasant,Smolensk Proletkult,Proletkult Bolshevik,,,63,7,300,42.86
19,Pavel Leonidovich Daletskii,1898-02-02T00:00:00.000Z,1963-07-08T00:00:00.000Z,person,male,poet,professional,unknown,unknown,,,53,3,124,41.33
70,Boris Virganskii,,,person,male,poet,unknown,unknown,Bolshevik-leaning,,,20,6,207,34.50


In [100]:
type(authorsDf.loc[15, 'death'])

str

### worksDf

In [101]:
textsDf = libDf[['text']]
textsDf['num_words'] = libDf['num_words'] = textsDf.text.apply(lambda k: len([a for b in [x.split() for y in k for x in y.split('               ')] for a in b if a.isalpha() == True]))
textsDf.sort_values('num_words', ascending=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  textsDf['num_words'] = libDf['num_words'] = textsDf.text.apply(lambda k: len([a for b in [x.split() for y in k for x in y.split('               ')] for a in b if a.isalpha() == True]))


Unnamed: 0_level_0,text,num_words
w_id,Unnamed: 1_level_1,Unnamed: 2_level_1
140,[в которой благосклонный читатель знакомится с...,4591
403,"[Случалось, на огонек во время перелета, или в...",3067
551,[Так как Волга была великой исторической дорог...,1616
61,"[Вы, которым шестьдесят лет, или даже вы, кото...",1570
142,[Волга! Одна из значительнейших рек всего земн...,1234
...,...,...
523,[Ветер забил с Балта. Мглой упил...,17
479,[Белогвардейцы! Гордиев узел ...,14
482,"[Под рокот гражданских бурь, В л...",14
400,[Пусть стихи мои развеют Ва...,13


In [102]:
worksDf = libDf[['title','year','author','genre','num_lps','num_words']]
worksDf

KeyError: "['title', 'genre', 'year', 'num_lps'] not in index"

### tokenDf

In [31]:
lpDf = libDf[['text']]
lpDf = lpDf.text.apply(lambda x: pd.Series([y for y in x])).stack().to_frame().rename(columns={0:'lp_str'})
lpDf.index.names = OHCO[:2]
lpDf
tokenDf = lpDf.lp_str.apply(lambda x: tokenize(x)).to_frame()#.rename(columns={0:'token'})
#tokenDf = lpDf.lp_str.apply(lambda x: y.text for y in tokenize(x)[1])
tokenDf

  lpDf = lpDf.text.apply(lambda x: pd.Series([y for y in x])).stack().to_frame().rename(columns={0:'lp_str'})


NameError: name 'OHCO' is not defined

In [None]:
for i in tokenize(lpDf.lp_str): 
    print(i)