In [45]:
#Special module written for this class
#This provides access to data and to helper functions from previous weeks
import lucem_illud #just in case, regularly update your lucem_illud with the following code: pip install git+git://github.com/UChicago-Computational-Content-Analysis/lucem_illud.git

#All these packages need to be installed from pip
import requests #for http requests
import pandas #gives us DataFrames
import matplotlib.pyplot as plt #For graphics
import numpy as np #For divergences/distances
import scipy #For divergences/distances
import seaborn as sns #makes our plots look nicer
import sklearn.manifold #For a manifold plot
import json #For API responses
import urllib.parse #For joining urls

# comp-linguistics
import spacy

#Displays the graphs
import graphviz #You also need to install the command line graphviz

#These are from the standard library
import os.path
import zipfile
import subprocess
import io
import tempfile

import wordcloud #Makes word clouds

#All these packages need to be installed from pip
import requests #for http requests
import bs4 #called `beautifulsoup4`, an html parser
import pandas #gives us DataFrames
import docx #reading MS doc files, install as `python-docx`

#Stuff for pdfs
#Install as `pdfminer2`
import pdfminer.pdfinterp
import pdfminer.converter
import pdfminer.layout
import pdfminer.pdfpage

#These come with Python
import re #for regexs
import urllib.parse #For joining urls
import io #for making http requests look like files
import json #For Tumblr API responses
import os.path #For checking if files exist
import os #For making directories

#This 'magic' command makes the plots work better
#in the notebook, don't use it outside of a notebook
%matplotlib inline

import re

In [2]:
import nltk
nlp = spacy.load('en_core_web_sm')

def word_tokenize(word_list):
    tokenized = []
    # pass word list through language model.
    doc = nlp(word_list)
    for token in doc:
        if not token.is_punct and len(token.text.strip()) > 0:
            tokenized.append(token.text)
    return tokenized

def wordCounter(wordLst):
    wordCounts = {}
    for word in wordLst:
        #We usually need to normalize the case
        wLower = word.lower()
        if wLower in wordCounts:
            wordCounts[wLower] += 1
        else:
            wordCounts[wLower] = 1
    #convert to DataFrame
    countsForFrame = {'word' : [], 'count' : []}
    for w, c in wordCounts.items():
        countsForFrame['word'].append(w)
        countsForFrame['count'].append(c)
    return pandas.DataFrame(countsForFrame)

In [55]:
def make_soup(url):
    '''
    A helper function to simply making a BeautifulSoup object from a function.
    Input: a full URL.
    Returns: a BeautifulSoup object.
    '''
    return bs4.BeautifulSoup(requests.get(url).text, 'html.parser')

def extract_first(lst): 
    '''
    A helper function to get the first element of each sublist within a list.
    Credit: https://www.geeksforgeeks.org/python-get-first-element-of-each-sublist/
    Returns: a list.
    '''
    return list(list(zip(*lst))[0])
        
"""
    class pageNode:
    '''
    A class object to represent a scraped webpage. Recursively generates children up to the specified
    capacity. Contains list of children pageNode objects within it.
    Inputs:
        dist_to_parent: how many "layers" of children separate this pageNode from the origin.
        max_dist: how many levels of recursion/children generation are permissable.
        base_url: the base URL to use when interpreting relative hyperlinks on the page.
        page_url: the URL which defines this pageNode, i.e. the page being scraped.
        sourceParNum: paragraph number of the source of the URL in the original webpage.
        sourceText: the paragraph text of the source of the URL in the original webpage. 
    '''
    def __init__(self, dist_to_parent, max_dist, base_url, page_url, sourceParNum, sourceText):
        #self.soup = make_soup(page_url)
        #if make_soup(base_url) == None:
        #    print("THIS DATAFRAME IS EMPTY")
        #    base_url = 'https://this-page-intentionally-left-blank.org/'
        self.df = extract_marx_text(page_url, sourceParNum, sourceText)
        self.refs = extract_marx_refs(make_soup(page_url),base_url)
        self.dist_to_parent = dist_to_parent
        self.children = []
        if self.dist_to_parent < max_dist:
            for url_tuple in self.refs:
                self.children.append(pageNode(self.dist_to_parent+1,max_dist,base_url,url_tuple[0],
                                              url_tuple[1],url_tuple[2]))
            for child in self.children:
                self.df = self.df.append(child.df)
"""                
                
def scrape_index(link='https://www.marxists.org/archive/marx/works/date/index.htm',
                 base_url='https://www.marxists.org/archive/marx/works/date/index.htm'):
    soup_obj = make_soup(link)
    contentPTags = soup_obj.body.findAll('p')
    
    index_tuples = []
    #parsDict = {'link' : [], 'paragraph-number' : [], 'paragraph_text' : []}
    for paragraphNum, pTag in enumerate(contentPTags):
        tagLinks = pTag.findAll('a',href=re.compile('htm$'),class_=False) #href=re.compile('htm$')
        #print(tagLinks)
        #print("next p tag")
        for aTag in tagLinks:
            relurl = aTag.get('href')
            linkText = aTag.text
            index_tuples.append((
                urllib.parse.urljoin(base_url, relurl),
                linkText
                ))
            #parsDict['link'].append(urllib.parse.urljoin(base_url, relurl))
            #parsDict['paragraph-number'].append(paragraphNum)
            #parsDict['paragraph_text'].append(linkText)
            
    return index_tuples    #pandas.DataFrame(parsDict)

def scrape_mecw(link='http://hiaw.org/defcon6/works/cw/index.html',
                 base_url='http://hiaw.org/defcon6/works/cw/index.html'):
    soup_obj = make_soup(link)
    contentPTags = soup_obj.body.findAll('p')
    
    index_tuples = []
    #parsDict = {'link' : [], 'paragraph-number' : [], 'paragraph_text' : []}
    for paragraphNum, pTag in enumerate(contentPTags):
        tagLinks = pTag.findAll('a') #,,class_=False,href=re.compile(r'index/.html')
        #print(tagLinks)
        #print("next p tag")
        for aTag in tagLinks:
            relurl = aTag.get('href')
            linkText = aTag.text
            index_tuples.append((
                urllib.parse.urljoin(base_url, relurl),
                linkText
                ))
            #parsDict['link'].append(urllib.parse.urljoin(base_url, relurl))
            #parsDict['paragraph-number'].append(paragraphNum)
            #parsDict['paragraph_text'].append(linkText)
            
    return index_tuples    #pandas.DataFrame(parsDict)

def check_atag(atag):
    relurl = atag.get('href')
    link_text = atag.text
    #print(link_text)
    if "Collected Works" in link_text:
        return False
    elif "HISTORY  ISA  WEAPON" in link_text:
        return False
    elif link_text=="":
        return False
    elif "History Is A Weapon" in link_text:
        return False
    elif "for small screens" in link_text:
        return False
    elif re.search(r"Notes$", link_text):
        return False
    else:
        if relurl==re.compile(r"index.html$"):
            return False
        else:
            return True

def scrape_index(link,source_text):
    base_url=link
    soup_obj = make_soup(link)
    #contentPTags = soup_obj.body.findAll('p')
    
    index_tuples = []
    #forbidden_words = ["Collected Works"]
    #parsDict = {'link' : [], 'paragraph-number' : [], 'paragraph_text' : []}
    #for paragraphNum, pTag in enumerate(contentPTags):
        #tagLinks = pTag.findAll('a') #,,class_=False,href=re.compile(r'index/.html')
        #print(tagLinks)
        #print("next p tag")
    for aTag in soup_obj.body.findAll('a'):
        if check_atag(aTag):
            relurl = aTag.get('href')
            linkText = aTag.text
            index_tuples.append((
                link,
                source_text,
                urllib.parse.urljoin(base_url, relurl),
                linkText
                ))
            #parsDict['link'].append(urllib.parse.urljoin(base_url, relurl))
            #parsDict['paragraph-number'].append(paragraphNum)
            #parsDict['paragraph_text'].append(linkText)
            
    return index_tuples    #pandas.DataFrame(parsDict)

def extract_marx_text(link_bigtuple):
    master_link=link_bigtuple[0]
    master_sourcetext=link_bigtuple[1]
    link=link_bigtuple[2]
    link_text=link_bigtuple[3]
    
    marxContentSoup = make_soup(link)
    parsDict = {'vol_source' : [], 'vol_name': [], 'link_source' : [],  
                'link_name' : [], 'paragraph_text': []}
    
    if marxContentSoup.body is None:
        return pandas.DataFrame(parsDict)
    else:
        contentPTags = marxContentSoup.body.findAll('p')
    
        for parNum, pTag in enumerate(contentPTags):
            if pTag=="":
                continue
            else: 
                #print("Appending real text")
                modA = re.sub(r'[\r\n]',' ',pTag.text) #remove random line breaks
                #modB = re.sub(r'\|p.{1,5}\|','',modA) #remove page number references
                #modC = re.sub(r'\[.+\]','',modB) #remove translator's notes
                #modD = re.sub(r'Karl Marx.','',modC) #remove website header
                #modE = re.sub(r'Next Section |  Table of Contents Marx-Engels Archive','',modD) #removed website footer
                #modF = re.sub(r'\xa0',' ',modE)
        
                parsDict['vol_source'].append(master_link)
                parsDict['vol_name'].append(master_sourcetext)
                parsDict['link_source'].append(link)
                parsDict['link_name'].append(link_text)
                parsDict['paragraph_text'].append(modA)
        
        #print(pandas.DataFrame(parsDict).head(5))
        #print("Testing")
        return pandas.DataFrame(parsDict)

def compile_mecw(i,j):
    mecw_index = scrape_mecw()
    #i = vol + 1
    
    volume_links = []
    for link_tuple in mecw_index[i:j]:
        if not link_tuple[1]=="":
            volume_links.append(scrape_index(link_tuple[0],link_tuple[1]))
    
    parsDict = {'vol_source' : [], 'vol_name': [], 'link_source' : [],  
                'link_name' : [], 'paragraph_text': []}
    master_table = pandas.DataFrame(parsDict)
    
    progress = 0
    dataframe_list = []
    for volume_list in volume_links:
        if len(volume_list) > 0:
            for content_tuple in volume_list: 
                #print(content_tuple)
                dataframe_list.append(extract_marx_text(content_tuple))
            progress += 1
            print("DEBUG: Progress =",progress)
    
    return pandas.concat(dataframe_list)

In [56]:
data = compile_mecw(2,51)
data

DEBUG: Progress = 1
DEBUG: Progress = 2
DEBUG: Progress = 3
DEBUG: Progress = 4
DEBUG: Progress = 5
DEBUG: Progress = 6
DEBUG: Progress = 7
DEBUG: Progress = 8
DEBUG: Progress = 9
DEBUG: Progress = 10
DEBUG: Progress = 11
DEBUG: Progress = 12
DEBUG: Progress = 13
DEBUG: Progress = 14
DEBUG: Progress = 15
DEBUG: Progress = 16
DEBUG: Progress = 17
DEBUG: Progress = 18
DEBUG: Progress = 19
DEBUG: Progress = 20
DEBUG: Progress = 21
DEBUG: Progress = 22
DEBUG: Progress = 23
DEBUG: Progress = 24
DEBUG: Progress = 25
DEBUG: Progress = 26
DEBUG: Progress = 27
DEBUG: Progress = 28
DEBUG: Progress = 29
DEBUG: Progress = 30
DEBUG: Progress = 31
DEBUG: Progress = 32
DEBUG: Progress = 33
DEBUG: Progress = 34
DEBUG: Progress = 35
DEBUG: Progress = 36
DEBUG: Progress = 37
DEBUG: Progress = 38
DEBUG: Progress = 39
DEBUG: Progress = 40
DEBUG: Progress = 41
DEBUG: Progress = 42
DEBUG: Progress = 43
DEBUG: Progress = 44
DEBUG: Progress = 45
DEBUG: Progress = 46
DEBUG: Progress = 47
DEBUG: Progress = 48
D

Unnamed: 0,vol_source,vol_name,link_source,link_name,paragraph_text
0,http://hiaw.org/defcon6/works/cw/volume01/inde...,Volume 1,http://hiaw.org/defcon6/works/cw/volume01/intr...,General Introduction,Marx Engels Collected Works
1,http://hiaw.org/defcon6/works/cw/volume01/inde...,Volume 1,http://hiaw.org/defcon6/works/cw/volume01/intr...,General Introduction,KARL MARX and FREDERICK ENGELS were the author...
2,http://hiaw.org/defcon6/works/cw/volume01/inde...,Volume 1,http://hiaw.org/defcon6/works/cw/volume01/intr...,General Introduction,Theirs was a unique collaboration in theoretic...
3,http://hiaw.org/defcon6/works/cw/volume01/inde...,Volume 1,http://hiaw.org/defcon6/works/cw/volume01/intr...,General Introduction,Both Marx and Engels began their adult lives a...
4,http://hiaw.org/defcon6/works/cw/volume01/inde...,Volume 1,http://hiaw.org/defcon6/works/cw/volume01/intr...,General Introduction,They were revolutionary thinkers who assailed ...
...,...,...,...,...,...
12,http://hiaw.org/defcon6/works/cw/volume49/inde...,Volume 49,http://hiaw.org/defcon6/works/1892/letters/92_...,Engels to Franz Mehring. 28 September,2. Adam Heinrich M�ller (1779-1829) – German...
13,http://hiaw.org/defcon6/works/cw/volume49/inde...,Volume 49,http://hiaw.org/defcon6/works/1892/letters/92_...,Engels to Franz Mehring. 28 September,3. Joseph Marie de Maistre (1753-1821) – Fre...
14,http://hiaw.org/defcon6/works/cw/volume49/inde...,Volume 49,http://hiaw.org/defcon6/works/1892/letters/92_...,Engels to Franz Mehring. 28 September,"4. Engels refers to Lavergne-Peguilhen, Grundz..."
15,http://hiaw.org/defcon6/works/cw/volume49/inde...,Volume 49,http://hiaw.org/defcon6/works/1892/letters/92_...,Engels to Franz Mehring. 28 September,


In [52]:
pandas.options.display.max_rows = 5000
data[:4999]

Unnamed: 0,vol_source,vol_name,link_source,link_name,paragraph_text
0,http://hiaw.org/defcon6/works/cw/volume01/inde...,Volume 1,http://hiaw.org/defcon6/works/cw/volume01/intr...,General Introduction,Marx Engels Collected Works
1,http://hiaw.org/defcon6/works/cw/volume01/inde...,Volume 1,http://hiaw.org/defcon6/works/cw/volume01/intr...,General Introduction,KARL MARX and FREDERICK ENGELS were the author...
2,http://hiaw.org/defcon6/works/cw/volume01/inde...,Volume 1,http://hiaw.org/defcon6/works/cw/volume01/intr...,General Introduction,Theirs was a unique collaboration in theoretic...
3,http://hiaw.org/defcon6/works/cw/volume01/inde...,Volume 1,http://hiaw.org/defcon6/works/cw/volume01/intr...,General Introduction,Both Marx and Engels began their adult lives a...
4,http://hiaw.org/defcon6/works/cw/volume01/inde...,Volume 1,http://hiaw.org/defcon6/works/cw/volume01/intr...,General Introduction,They were revolutionary thinkers who assailed ...
5,http://hiaw.org/defcon6/works/cw/volume01/inde...,Volume 1,http://hiaw.org/defcon6/works/cw/volume01/intr...,General Introduction,Marx and Engels were never merely theoretician...
6,http://hiaw.org/defcon6/works/cw/volume01/inde...,Volume 1,http://hiaw.org/defcon6/works/cw/volume01/intr...,General Introduction,The sum total of achievement of Marx and Engel...
7,http://hiaw.org/defcon6/works/cw/volume01/inde...,Volume 1,http://hiaw.org/defcon6/works/cw/volume01/intr...,General Introduction,Marxism offers to the revolutionary movement o...
8,http://hiaw.org/defcon6/works/cw/volume01/inde...,Volume 1,http://hiaw.org/defcon6/works/cw/volume01/intr...,General Introduction,In the light of this the character and consequ...
9,http://hiaw.org/defcon6/works/cw/volume01/inde...,Volume 1,http://hiaw.org/defcon6/works/cw/volume01/intr...,General Introduction,In their studies of the past history and prese...


In [57]:
filtA = [len(text) > 39 for text in data.paragraph_text]
marx_onefilt = data[filtA].reset_index()
marx_onefilt.to_csv(r'C:\Users\super\comp_work\Homework-Notebooks\votava_project_data\mecw_frag.csv', index_label=False, sep=',')