# MAPRR Textual Analytics

## Intro

### Import


In [1]:
import os 
import time
import logging
from threading import *
import concurrent
import random
import pickle
import json
import pandas as pd 
import re
import requests 
from bs4 import BeautifulSoup
from natasha import (
    Segmenter, 
    MorphVocab, 
    NewsEmbedding, 
    NewsMorphTagger, 
    NewsSyntaxParser, 
    NewsNERTagger, 
    PER, 
    NamesExtractor, 
    Doc)
from razdel import tokenize

In [2]:
logging.basicConfig(filename='maprr_out.log', encoding='utf-8', format='%(asctime)s %(message)s', level=logging.INFO)

In [42]:
lib_cols = ['w_id', 'title_ru', 'text', 'title_en', '1st_line', 'author', 'comp_date', 'comp_loc', 'pub_src', '1st_pub', 'pub_year', 'pub_loc']
a_cols = ['name', 'birth', 'death', 'a_type', 'sex', 'occs', 'fam_soc_str', 'lit_affil', 'pol_affil', 'corp_type', 'corp_affil']

## Pre-processing Function

In [4]:
domain = 'https://maprr.iath.virginia.edu/'
max_threads = 3
tables = {
    'agents/': 332, 
    'works/': 655, 
    'place_based_concepts/': 316, 
    'locations/': 380, 
    'multivalent_markers/': 449
}
tables = {
    'agents/': 3,
    'works/': 6,
    'place_based_concepts/': 3, 
    'locations/': 3,
    'multivalent_markers/': 4
}

In [None]:
allURLs = [domain+item[0]+str(i) for item in tables.items() for i in range(1, item[1]+1)]

In [None]:
urls_to_visit = [] 
for t, i in list(tables.items())[:2]: 
    for j in range(0,i+1): 
        urls_to_visit.append(domain+t+str(j))

In [None]:
class maprr: 
    
    def __init__(self): 
        self.Wsoup = {} 
        self.Asoup = {}
        self.Ws = {}
        self.As = {}
    
    def get_htmlA(self): 
        """This function uses the list of Agent IDs from the tables dict above
        and the grabs it using requests before putting the html reponse in Asoup"""
        
        # initialize list of pages that don't return 200
        aberrantAs = []
        # go through list of Agents from 1 to the number defined in tables
        for i in range(1, (list(tables.values())[0]+1)):
        #for i in range(1, 11):
            # make url
            url = domain+list(tables.keys())[0]+str(i) 
            # initialize connection to url 
            with requests.get(url, verify=False) as r: 
                # log status code 
                logging.info(f"A{i} status code: {r.status_code}")
                # if connection is successful
                if r.status_code == 200: 
                    # make soup from html
                    s = BeautifulSoup(r.content, 'html.parser') 
                    # add to list of A soups
                    self.Asoup.update({i:s})
                else: 
                    # if connection is not successful, add to list
                    aberrantAs.append((i, r.status_code))
                    pass
                # wait a hot second or the server gets >:( 
                time.sleep(.1)
        # report list of A errors if there is one (there always is)
        if len(aberrantAs) > 0: 
            print(f"Aberrant agent pages are #s {aberrantAs}")
        else: 
            print(f"There were no aberrant agent pages!")

    def get_htmlW(self): 
        """This function uses the list of Work IDs from the tables dict above
        and the grabs it using requests before putting the html reponse in Wsoup"""
        
        # initialize list of pages that don't return 200
        aberrantWs = []
        # go through list of Words from 1 to the number defined in tables
        for i in range(1, (list(tables.values())[1]+1)):
        #for i in range(1, 11):
            # make url
            url = domain+list(tables.keys())[1]+str(i)
            # initialize connection to url 
            with requests.get(url, verify=False) as r: 
                # log status code
                logging.info(f"W{i} status code: {r.status_code}")
                # if connection is successful
                if r.status_code == 200: 
                    # make soup from html
                    s = BeautifulSoup(r.content, 'html.parser')
                    # add to list of A soups
                    self.Wsoup.update({i:s})
                else: 
                    # if connection is not successful, add to list
                    aberrantWs.append((i, r.status_code))
                    pass
                # wait a hot second or the server gets >:( 
                time.sleep(.1)
        # report list of A errors if there is one (there always is)
        if len(aberrantWs) > 0: 
            print(f"Aberrant work pages are #s {aberrantWs}")
        else: 
            print(f"There were no aberrant work pages!")

    def parseWs(self, html): 
        """This function retrieves Work info from the HTML provided"""
        
        # extract things we will need
        content = html.find('div', {'class':'col-md-9 fixed-height'})
        # try to locate author text of Work
        try: 
            author = content.div.h3.text
        except: 
            author = "unknown"
        # try to locate title text of Work
        try: 
            title = content.div.h4.text
        except: 
            title = "untitled"
        # try for both stanza and para text because they return [] 
        stanza_text = content.find_all('p',{'class':'stanza'})
        prose_text = content.find_all('p',{'class':'text'})
        # decide which text to use based on length 
        if len(prose_text) > len(stanza_text): 
            text = prose_text
            genre = 'prose'
        else: 
            text = stanza_text
            genre = 'poetry'
        # get actual text 
        Wtext = [x.text.replace('\n','').strip() for x in text]
        # make list of keys of Work types
        typeKeys = [x.text[:-1] for x in html.find('div', {'class':'card-body'}).find_all('h4')]
        # make list of values of Work types 
        typeVals = [x.text for x in html.find('div', {'class':'card-body'}).find_all('p')]
        # make dictionary of keys:values from above
        typeDict = dict(zip(typeKeys, typeVals))
        # initialize sub dictionary of Work
        Wdict = {'title': title, 
                   'genre': genre,
                   'text': Wtext} 
        # add type keys and values to complete sub dictionary of Work
        Wdict.update(typeDict) 
        # return Work dict to be made into a DataFrame row
        return Wdict

    def parseAs(self, html): 
        """This function retrieves Agent info from the HTML provided"""
        
        # get Agent's name
        name = html.find('div', {'class': 'card scrollable'}).h2.text 
        # get Agent's birth- and deathdates 
        bdate, ddate = html.find('div', {'class': 'card scrollable'}).span.text.split(' - ') 
        # initialize dictionary of Agent
        Adict = {'name': name, 'birth': bdate, 'death': ddate}
        # make list of type keys
        typeKeys = [x.h4.text.lower().replace(' ','_') for x in html.find_all('div', {'class': 'col-md-4'})]
        # initialize list of type values
        typeVals = []
        # add type values to list if found or default to 'unknown' (though some real values are also 'unknown')
        for typ in html.find_all('div', {'class': 'col-md-4'}): 
            try: 
                typeVals.append(typ.p or typ.div.span.text)
            except: 
                typeVals.append("unknown")
        # for some reason keys are at different levels and require '.text.' attribute but of course some don't
        typeVals = [x.text if not isinstance(x, str) else x for x in typeVals]
        # make dictionary of keys:values from above
        typeDict = dict(zip(typeKeys, typeVals))
        # add type keys and values to complete sub dictionary of Work
        Adict.update(typeDict) 
        # return Agent dict to be made into a DataFrame row 
        return Adict
    
    def get_single(self, cat, id_num): 
        """This function combines the functions above and returns the DataFrame row""" 
        
        # make url from parameters and initialize request 
        with requests.get(domain+cat+'s/'+str(id_num), verify=False) as r: 
            # double check the URL is correct
            print(r.url)
            # check status code
            print(f"{cat+str(id_num)} status code: {r.status_code}")
            # if connection is successful
            if r.status_code == 200: 
                # make soup from html content 
                s = BeautifulSoup(r.content, 'html.parser')
                # sort by FOO type 
                if cat.lower() == 'work': 
                    # make dictionary if Work with parseWs function
                    SubDict = {id_num:self.parseWs(s)}
                elif cat.lower() == 'agent': 
                    # make dictionary if Agent with parseAs function
                    SubDict = {id_num:self.parseAs(s)} 
                else: 
                    print("You need a category: 'work' or 'agent'...") 
                # make DataFrame row from dictionary
                singleDf = pd.DataFrame.from_dict(newSubDict, orient='index')
                # return DataFrame row
                return singleDf
            # return status code if connection is unsuccessful
            else: 
                print(f"Error: {r.status_code}") 
                
    def save_obj(self, obj, name):
        with open(name + '.pkl', 'wb+') as f:
            pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)

    def load_obj(self, name):
        with open(name + '.pkl', 'rb') as f:
            return pickle.load(f)


    def run(self): 
        """This function runs retrieval and parsing using the functions above, creates DataFrames, and persists them (JSON)"""
        
        logging.info(f"Getting As and Ws")
        print(f"Getting As")
        # get starting time for A retrieval
        at1 = time.time()
        # retrieve As
        self.get_htmlA() 
        # get finishing time for A retrieval
        at2 = time.time()
        # display run times for A retrieval 
        logging.info(f"Got As in {round((at2-at1), 3)} sec ({round(((at2-at1)/list(tables.values())[0]), 5)} sec/ea.)")
        
        print(f"Getting Ws")
        # get starting time for W retrieval
        wt1 = time.time()
        # retrieve As
        self.get_htmlW() 
        # get finishing time for W retrieval
        wt2 = time.time()
        # display run times for W retrieval 
        print(f"Got Ws in {round((wt2-wt1), 3)} sec ({round(((wt2-wt1)/list(tables.values())[1]), 5)} sec/ea.)")
        logging.info(f"Got Ws in {round((wt2-wt1), 3)} sec ({round(((wt2-wt1)/list(tables.values())[1]), 5)} sec/ea.)")
        
        logging.info(f"Done getting As and Ws")
        
        self.save_obj(self.Asoup, 'Asoup')
        self.save_obj(self.Wsoup, 'Wsoup')
        
        logging.info(f"Parsing As and Ws")
        
        logging.info(f"Parsing As")
        print(f"Parsing As")
        # get starting time for A parsing
        pa1 = time.time()
        # parse Agent HTML instances
        self.As = {k: self.parseAs(v) for k, v in self.Asoup.items()}
        # get finishing time for A parsing
        pa2 = time.time()
        # display run times for A parsing 
        logging.info(f"Parsed As in {round((pa2-pa1), 3)} sec ({round((pa2-pa1)/len(list(self.Asoup.items())), 5)} sec/ea.)")
        
        print(f"Parsing Ws")
        # get starting time for W parsing
        pw1 = time.time()
        # parse Work HTML instances 
        self.Ws = {k: self.parseWs(v) for k, v in self.Wsoup.items()}
        # get finishing time for W parsing
        pw2 = time.time() 
        # display run times for W parsing 
        print(f"Parsed Ws in {round((pw2-pw1), 3)} sec ({round((pw2-pw1)/len(list(self.Wsoup.items())), 5)} sec/ea.)")
        logging.info(f"Parsed Ws in {round((pw2-pw1), 3)} sec ({round((pw2-pw1)/len(list(self.Wsoup.items())), 5)} sec/ea.)")
        logging.info(f"Done parsing As and Ws")
        
        logging.info(f"Making dataframes")
        print(f"Making AsDf")
        # create DataFrame of Agent dictionaries
        AsDf = pd.DataFrame.from_dict(self.As, orient='index')
        print(f"Making WsDf")
        # create DataFrame of Work dictionaries
        WsDf = pd.DataFrame.from_dict(self.Ws, orient='index')  
        logging.info(f"Done making dataframes")
        
        logging.info(f"Writing to json")
        # write Work DataFrame to JSON
        WsDf.to_json('WsDf.json')
        # write Agent DataFrame to JSON
        AsDf.to_json('AsDf.json')
        logging.info(f"Done writing to json")

In [None]:
def check_status(urls): 
    aberrantURLs = []
    logging.info(f"Checking status of URLs")
    for url in urls: 
        logging.info(f"Trying {url}")
        with requests.get(url, verify=False) as r: 
            if r.status_code == 200: 
                #logging.info(f"{url} successful")
                pass
            else: 
                logging.info(f"{url}: {r.status_code}")
                aberrantURLs.append({url: r.status_code})

In [None]:
if __name__ == '__main__': 
    maprr().run()

In [22]:
class ParallelMAPRR: 
    
    global domain, tables, max_threads
    
    def __init__(self): 
        self.urls_to_visit = []
        self.aberrantAs = []
        self.aberrantWs = []
        self.soups = {}
        self.Wsoup = {} 
        self.Asoup = {}
        self.WsDict = {}
        self.AsDict = {}
    
    def get_html(self, url): 
        url_format = 'https://maprr.iath.virginia.edu/(\w+)/(\d{1,3})'
        url_match = re.match(url_format, url)
        fco_type = url_match.group(1)
        id_num = url_match.group(2)
        headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36',
    }
        with requests.get(url, headers=headers) as r: 
            logging.info(f"{fco_type}/{id_num} status code: {r.status_code}")
            if r.status_code == 200: 
                s = BeautifulSoup(r.content, 'html.parser')
                self.soups.update({id_num:s})
                if fco_type == 'agents': 
                    self.Asoup.update({id_num:s})
                elif fco_type == 'works': 
                    self.Wsoup.update({id_num:s})
            else: 
                if fco_type == 'agents': 
                    self.aberrantAs.append({'A'+str(id_num): r.status_code})
                elif fco_type == 'works': 
                    self.aberrantWs.append({'W'+str(id_num): r.status_code})
                pass
        time.sleep(.5)
            
    def downloadHTML(self): 
        threads = min(max_threads, len(self.urls_to_visit)) 
        print(f"Downloading with {threads} threads")
        with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor: 
            executor.map(self.get_html, self.urls_to_visit)

    def parse_html(self, html): 
        _attrs = list(html.body.attrs.values())[0]
        logging.info(f"html body attrs: {_attrs}")
        
        if 'works' in _attrs:
            logging.info("parsing W")
            content = html.find('div', {'class':'col-md-9 fixed-height'})
            try: 
                author = content.div.h3.text
            except: 
                author = ""
            logging.info(f"author: {author}")
            try: 
                title = content.div.h4.text
            except: 
                title = ""
            logging.info(f"title: {title}")
            stanza_text = content.find_all('p',{'class':'stanza'})
            prose_text = content.find_all('p',{'class':'text'})
            if len(stanza_text) > len(prose_text): 
                text = stanza_text
            elif len(stanza_text) < len(prose_text): 
                text = prose_text
            Wtext = [x.text.replace('\n','').strip() for x in text]
            metaKeys = [x.text[:-1] for x in html.find('div', {'class':'card-body'}).find_all('h4')]
            metaVals = [x.text for x in html.find('div', {'class':'card-body'}).find_all('p')]
            metaDict = dict(zip(metaKeys, metaVals))
            subDict = {'title': title, 
                       'text': Wtext}
            subDict.update(metaDict)
            logging.info(f"W subdict: {subDict}")
            self.WsDict.update(subDict)
            #return subDict

        elif 'agents' in _attrs: 
            logging.info("parsing A")
            name = html.find('div', {'class': 'card scrollable'}).h2.text
            bdate, ddate = html.find('div', {'class': 'card scrollable'}).span.text.split(' - ')
            subDict = {'name': name, 'birth': bdate, 'death': ddate}
            try: 
                typeKeys = [x.h4.text for x in html.find_all('div', {'class': 'col-md-4'})]
                typeVals = [x.p or x.div.span.text for x in html.find_all('div', {'class': 'col-md-4'})]
                typeVals[:2] = [x.text for x in typeVals[:2]]
                typeDict = dict(zip(typeKeys, typeVals))
                subDict.update(typeDict)
            except: 
                pass

            logging.info(f"A subdict: {subDict}")
            self.AsDict.update(subDict)
            #return subDict
        
        else: 
            logging.info(f"Something went wrong while parsing")
            pass
    
    def parseHTML(self): 
        threads = min(max_threads, len(self.urls_to_visit)) 
        logging.info(f"Parsing with {threads} threads")
        with concurrent.futures.ProcessPoolExecutor(max_workers=threads) as executor: 
            executor.map(self.parse_html, list(self.soups.values()))
        
    def run(self): 
        for t, i in list(tables.items())[:2]: 
            for j in range(1,5): 
                self.urls_to_visit.append(domain+t+str(j))
                
        #logging.info(f"urls: {self.urls_to_visit}")
        
        logging.info(f"Getting As and Ws")
        self.downloadHTML()
        logging.info(f"Done getting As and Ws")

        logging.info(f"soups: {list(self.soups.values())}")
        #logging.info(f"Wsoup: {self.Wsoup}")
        logging.info(f"Abberrant Ws: {self.aberrantWs}")
        logging.info(f"Abberrant As: {self.aberrantAs}")
        
        logging.info(f"Parsing As and Ws")
        self.parseHTML()
        logging.info(f"Done parsing As and Ws")

        #logging.info(f"Making AsDf")
        logging.info(f"As: {self.AsDict}")
        #AsDf = pd.DataFrame.from_dict(self.AsDict)
        #logging.info(f"Making WsDf")
        logging.info(f"Ws: {self.WsDict}")
        #WsDf = pd.DataFrame.from_dict(self.WsDict)        
        #logging.info(f"Done making dataframes")
        
        #logging.info(f"Writing to json")
        #WsDf.to_json('WsDf.json')
        #AsDf.to_json('AsDf.json')
        #logging.info(f"Done writing to json")

In [23]:
%%time
if __name__ == '__main__': 
    ParallelMAPRR().run()

Downloading with 3 threads
CPU times: user 455 ms, sys: 66.4 ms, total: 522 ms
Wall time: 2.93 s


In [None]:
objects = []

with (open('/home/xtra/code/maprr/Wsoup.pkl', "rb")) as f:
    objects.append(pickle.load(f))

In [None]:
"works" in list(list(objects[0].values())[0].html.body.attrs.values())[0]

In [None]:
list(list(objects[0].values())[0].html.body.attrs.values())[0]

In [None]:
list(objects[0].values())[0].html.body.find('div', {'class':'col-md-9 fixed-height'})

In [33]:
AsDf = pd.read_csv('AsDf.csv')
AsDf['birth'] = pd.to_datetime(AsDf['birth'], errors='coerce', infer_datetime_format=True)
AsDf['death'] = pd.to_datetime(AsDf['death'], errors='coerce', infer_datetime_format=True)
AsDf

Unnamed: 0.1,Unnamed: 0,name,birth,death,Type of Agent,Sex,Occupations,Family Social Strata,Literary Affiliations,Political Affiliations,Type of Corporate Body,Affiliation
0,1,Anna Akhmatova,1889-06-23,1966-03-05,person,female,poet,nobility,Acmeism,independent,,
1,2,Vasilii Dmitrievich Aleksandrovskii,1897-01-15,1934-11-16,person,male,soldier,peasant,Kuznitsa,Bolshevik member,,
2,3,Ivan Nikolaevich Antonov,NaT,NaT,person,male,editor,unknown,unknown,independent,,
3,4,Mikhail Dmitrievich Artamonov,1888-02-22,1958-11-22,person,male,journalist,peasant,Vologda poets,unknown,,
4,5,Nikolai Aseev,1889-07-10,1963-07-16,person,male,soldier,nobility,Left Front of Art: LEF,Bolshevik member,,
...,...,...,...,...,...,...,...,...,...,...,...,...
285,300,Moisei Solomonovich Uritskii,1873-01-14,1918-08-30,,,,,,,,
286,301,Maximilien Marie Isidore de Robespierre,1758-05-06,1794-06-28,,,,,,,,
287,302,Iurii Mikhailovich Steklov,1873-08-27,1941-07-15,,,,,,,,
288,303,Christian August Friedrich Peters,1806-09-07,1880-05-08,,,,,,,,


## Dataframe Split

### libDf

In [44]:
libDf = pd.read_csv('WsDf.csv')
#libDf.index.name = 'w_id'
libDf.columns = lib_cols
libDf['comp_date'] = pd.to_datetime(libDf['comp_date'], errors='coerce')
libDf['pub_year'] = pd.to_datetime(libDf['pub_year'], errors='coerce')
libDf['pub_year'] = libDf.pub_year.apply(lambda x: x.year).astype('int64', errors='ignore')
libDf['num_words'] = libDf.text.str.len()
print(libDf.shape)
#type(libDf.loc[1, 'pub_year'])
libDf

(586, 13)


Unnamed: 0,w_id,title_ru,text,title_en,1st_line,author,comp_date,comp_loc,pub_src,1st_pub,pub_year,pub_loc,num_words
0,1,Untitled,"['Сразу стало тихо в доме, Обле...",no title,Srazu stalo tikho v dome…,Anna Akhmatova,1917-07-01,Slepnevo,Podorozhnik,Petropolis,1921.0,,432
1,2,Untitled,['Ты — отступник: за остров зелёный ...,no title,Ty - otstupnik: za ostrov zelenyi…,Anna Akhmatova,NaT,Slepnevo,Podorozhnik,Petropolis,1921.0,,815
2,3,Untitled,['Просыпаться на рассвете Оттог...,no title,Prosypat'sia na rassvete…,Anna Akhmatova,1917-07-01,Slepnevo,Podorozhnik,Petropolis,1921.0,,470
3,4,Untitled,"['И в тайную дружбу с высоким, ...",no title,I v tainuiu druzhbu c vysokim…,Anna Akhmatova,1917-01-01,Petrograd,Podorozhnik,Petropolis,1921.0,,284
4,5,Untitled,"['Словно ангел, возмутивший воду, ...",no title,"Slovno angel, vozmutivshii vodu…",Anna Akhmatova,1916-02-01,Tsarskoe selo,Podorozhnik,Petropolis,1921.0,,336
...,...,...,...,...,...,...,...,...,...,...,...,...,...
581,599,Untitled,"['Любовь распяли на кресте, Но в...","""Liubov' raspiali na kreste""",Liubov' raspiali na kreste,Georgii Andreevich Viatkin,NaT,Omsk,Ranenaia Rossiia: Stikhi; Vernost': rasskaz; E...,Tipografiia Vremennogo Tsentral’nogo Voenno-P...,1919.0,Ekaterinburg,457
582,600,На словах...,[],Na slovakh... (Nesvoevremennye mysli),Na slovakh--vse soglasny...,Maksim Gor'kii,1917-06-29,Petrograd,Novaia zhizn',A. N. Tikhonov,1917.0,Petrograd,2
583,601,Последняя просьба,"['Сестра!.. Сестрица, на минутку подойдите ...",Posledniaia pros'ba,"Sestra! Sestritsa, na minutku podoidite…",M Kolchin,NaT,,Pesni voiny: posviashchaetsia doblestnym sibir...,Tipografiia I. M. Poznera,1915.0,,824
584,602,И рек Сидящий на престоле,"['В борьбе с врагом, в борьбе кровавой, геройс...",I rek Sidiashchii na prestole,"V bor'be s vragom, v bor'be krovavom...",M. Did,NaT,,"Nabat: Stikhotvoreniia, 1914-1915",Tipografiia N. A. Vorob'eva,1916.0,,1624


### authorsDf

In [45]:
authorsDf = libDf.reset_index().groupby('author').size().to_frame().rename(columns={0:'num_works'})
authorsDf['num_words'] = libDf.reset_index().groupby('author').sum().num_words
authorsDf['avg_wpw'] = round(authorsDf.num_words/authorsDf.num_works, 2)
authorsDf = authorsDf.reset_index().sort_values(by=['avg_wpw'], ascending=False).rename(columns={'author':'name'})
authorsDf = pd.merge(AsDf.reset_index(), authorsDf.reset_index(), how='right', on='name').set_index('index_x')
authorsDf.index.name = 'a_id'
authorsDf.to_json('authorsDf.json', date_format='iso')
authorsDf

Unnamed: 0_level_0,Unnamed: 0,name,birth,death,Type of Agent,Sex,Occupations,Family Social Strata,Literary Affiliations,Political Affiliations,Type of Corporate Body,Affiliation,index_y,num_works,num_words,avg_wpw
a_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
63.0,65.0,S. D. Spasskii,1898-12-21,1956-08-24,person,male,writer,professional,Futurism,independent,,,56,1,10654,10654.00
23.0,24.0,Evsei Davydovich Erkin,NaT,1942-12-06,person,male,writer,unknown,Pereval,Bolshevik member,,,26,1,10471,10471.00
9.0,10.0,Aleksandr Il'ich Bezymenskii,1898-01-19,1973-06-06,person,male,poet,unknown,Kuznitsa,Komsomol,,,3,4,21643,5410.75
11.0,12.0,Fëdor Semënovich Bogorodskii,1895-06-02,1959-11-03,person,male,painter,professional,Futurism,Bolshevik member,,,27,5,24528,4905.60
10.0,11.0,Aleksandr Aleksandrovich Blok,1880-11-28,1921-08-07,person,male,poet,nobility,Symbolism,unknown,,,0,4,18077,4519.25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16.0,17.0,Evgenii Nikolaevich Chirikov,1864-08-05,1932-01-18,person,male,writer,nobility,Gor'kii circle,Populist,,,25,4,8,2.00
61.0,63.0,Viacheslav Iakovlevich Shishkov,1873-10-03,1945-03-06,person,male,explorer,merchant,unknown,nationalist,,,68,2,4,2.00
15.0,16.0,Aleksandr Vasil'evich Chaianov,1888-01-29,1937-10-03,,,,,,,,,8,1,2,2.00
5.0,6.0,Arkadii Timofeevich Averchenko,1880-03-27,1925-03-12,person,male,satirist,merchant,New Satirikon circle,nationalist,,,17,9,18,2.00


In [None]:
authorsDf = pd.read_json('authorsDf.json')
#authorsDf[['birth', 'death']] = authorsDf[['birth', 'death']].apply(pd.to_datetime, format="%Y-%m-%d")
#[dt.to_datetime().date() for dt in authorsDf[['birth', 'death']]]
authorsDf

In [None]:
type(authorsDf.loc[15, 'death'])

### worksDf

In [46]:
textsDf = libDf[['text']]
textsDf['num_words'] = libDf['num_words'] = textsDf.text.apply(lambda k: len([a for b in [x.split() for y in k for x in y.split('               ')] for a in b if a.isalpha() == True]))
textsDf.sort_values('num_words', ascending=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


Unnamed: 0,text,num_words
108,"['Посвящаю эту повесть двухлетнему Грише, ...",7758
114,['Каждым рявканьем пушечным ...,7706
134,"['Защищал вход в Сербию, словно стая львов, ...",5941
585,['Черный ветер. Белый снег. ...,5664
430,"['Как ветер с разбегу парус полощет, ...",5467
...,...,...
61,[],0
221,[],0
220,[],0
219,[],0


In [47]:
worksDf = libDf[['title','year','author','genre','num_lps','num_words']]
worksDf

KeyError: "['title', 'year', 'genre', 'num_lps'] not in index"

### tokenDf

In [48]:
lpDf = libDf[['text']]
lpDf = lpDf.text.apply(lambda x: pd.Series([y for y in x])).stack().to_frame().rename(columns={0:'lp_str'})
lpDf.index.names = OHCO[:2]
lpDf
tokenDf = lpDf.lp_str.apply(lambda x: tokenize(x)).to_frame()#.rename(columns={0:'token'})
#tokenDf = lpDf.lp_str.apply(lambda x: y.text for y in tokenize(x)[1])
tokenDf

NameError: name 'OHCO' is not defined

In [None]:
for i in tokenize(lpDf.lp_str): 
    print(i)