In [1]:
#Special module written for this class
#This provides access to data and to helper functions from previous weeks
import lucem_illud #pip install git+git://github.com/UChicago-Computational-Content-Analysis/lucem_illud.git

#All these packages need to be installed from pip
import requests #for http requests
import bs4 #called `beautifulsoup4`, an html parser
import pandas #gives us DataFrames
import docx #reading MS doc files, install as `python-docx`

#Stuff for pdfs
#Install as `pdfminer2`
import pdfminer.pdfinterp
import pdfminer.converter
import pdfminer.layout
import pdfminer.pdfpage

#These come with Python
import re #for regexs
import urllib.parse #For joining urls
import io #for making http requests look like files
import json #For Tumblr API responses
import os.path #For checking if files exist
import os #For making directories

In [145]:
def make_soup(url):
    '''
    A helper function to simply making a BeautifulSoup object from a function.
    Input: a full URL.
    Returns: a BeautifulSoup object.
    '''
    return bs4.BeautifulSoup(requests.get(url).text, 'html.parser')

def extract_first(lst): 
    '''
    A helper function to get the first element of each sublist within a list.
    Credit: https://www.geeksforgeeks.org/python-get-first-element-of-each-sublist/
    Returns: a list.
    '''
    return list(list(zip(*lst))[0])

def extract_marx_text(target_url, sourceText):
    '''
    Function to return cleaned paragraph bodies from marxists.org webpages.
    Inputs:
        target_url: the url of the webpage to extract from.
        sourceParNum: paragraph number of the source of the URL in the original webpage.
        sourceText: the paragraph text of the source of the URL in the original webpage.
    Returns: a pandas dataframe.
    '''
    marxContentSoup = make_soup(target_url)
    contentPTags = marxContentSoup.body.findAll('p')
    
    parsDict = {'source' : [], 'paragraph_text' : [],  
                'source-paragraph-text' : []}
    for parNum, pTag in enumerate(contentPTags): 
        modA = re.sub(r'[\r\n]',' ',pTag.text) #remove random line breaks
        modB = re.sub(r'\|p.{1,5}\|','',modA) #remove page number references
        modC = re.sub(r'\[.+\]','',modB) #remove translator's notes
        modD = re.sub(r'Karl Marx.','',modC) #remove website header
        modE = re.sub(r'Next Section |  Table of Contents Marx-Engels Archive','',modD) #removed website footer
        modF = re.sub(r'\xa0',' ',modE)
        parsDict['paragraph_text'].append(modF)
        parsDict['source'].append(target_url)
        parsDict['source-paragraph-text'].append(sourceText)

    return pandas.DataFrame(parsDict)

def extract_marx_refs(soup_obj, base_url='https://www.marxists.org/archive/marx/works/date/index.htm'):
    '''
    A function to parse a marxists.org webpage and extract all internal links which point to a page that ends in "htm."
    Inputs:
        soup_obj: a BeautifulSoup object.
        base_url: the base URL to use when interpreting relative hyperlinks on the page.
    Returns: a list of tuples.
    '''
    contentPTags = soup_obj.body.findAll('p')
    otherPAgeURLS = []
    for paragraphNum, pTag in enumerate(contentPTags):
        tagLinks = pTag.findAll('a', href=re.compile('htm$'),class_=False) #href=re.compile('^/.+/')
        #print(tagLinks)
        #print("next p tag")
        for aTag in tagLinks:
            relurl = aTag.get('href')
            linkText = aTag.text
            otherPAgeURLS.append((
                urllib.parse.urljoin(base_url, relurl),
                linkText,
            ))
    return otherPAgeURLS
        
"""
    class pageNode:
    '''
    A class object to represent a scraped webpage. Recursively generates children up to the specified
    capacity. Contains list of children pageNode objects within it.
    Inputs:
        dist_to_parent: how many "layers" of children separate this pageNode from the origin.
        max_dist: how many levels of recursion/children generation are permissable.
        base_url: the base URL to use when interpreting relative hyperlinks on the page.
        page_url: the URL which defines this pageNode, i.e. the page being scraped.
        sourceParNum: paragraph number of the source of the URL in the original webpage.
        sourceText: the paragraph text of the source of the URL in the original webpage. 
    '''
    def __init__(self, dist_to_parent, max_dist, base_url, page_url, sourceParNum, sourceText):
        #self.soup = make_soup(page_url)
        #if make_soup(base_url) == None:
        #    print("THIS DATAFRAME IS EMPTY")
        #    base_url = 'https://this-page-intentionally-left-blank.org/'
        self.df = extract_marx_text(page_url, sourceParNum, sourceText)
        self.refs = extract_marx_refs(make_soup(page_url),base_url)
        self.dist_to_parent = dist_to_parent
        self.children = []
        if self.dist_to_parent < max_dist:
            for url_tuple in self.refs:
                self.children.append(pageNode(self.dist_to_parent+1,max_dist,base_url,url_tuple[0],
                                              url_tuple[1],url_tuple[2]))
            for child in self.children:
                self.df = self.df.append(child.df)
"""                
                
def scrape_index(link='https://www.marxists.org/archive/marx/works/date/index.htm',
                 base_url='https://www.marxists.org/archive/marx/works/date/index.htm'):
    soup_obj = make_soup(link)
    contentPTags = soup_obj.body.findAll('p')
    
    index_tuples = []
    #parsDict = {'link' : [], 'paragraph-number' : [], 'paragraph_text' : []}
    for paragraphNum, pTag in enumerate(contentPTags):
        tagLinks = pTag.findAll('a',href=re.compile('htm$'),class_=False) #href=re.compile('htm$')
        #print(tagLinks)
        #print("next p tag")
        for aTag in tagLinks:
            relurl = aTag.get('href')
            linkText = aTag.text
            index_tuples.append((
                urllib.parse.urljoin(base_url, relurl),
                linkText
                ))
            #parsDict['link'].append(urllib.parse.urljoin(base_url, relurl))
            #parsDict['paragraph-number'].append(paragraphNum)
            #parsDict['paragraph_text'].append(linkText)
            
    return index_tuples    #pandas.DataFrame(parsDict)

In [45]:
#index_link = 'https://www.marxists.org/archive/marx/works/date/index.htm'
#index_soup = make_soup(index_link)
#index_tuples = extract_marx_refs(index_soup,'https://www.marxists.org/archive/marx/')
    

#spidering_node = pageNode(0, 1, index_link, 'https://www.marxists.org/archive/marx/works/', None, None)
#spidering_node.df

pandas.set_option('display.max_colwidth', 200)
pandas.set_option('max_rows', 300)

index_tups = scrape_index() #thru 199

texts = []
counter = 0
for tup in index_tups[:199]:
    new_links = extract_marx_refs(make_soup(tup[0]))
    for link in new_links:  
        new_add = extract_marx_text(link[0],tup[1])
        texts.append(new_add)
    new_add = extract_marx_text(tup[0],tup[1])
    print("Progress:", (counter / 199) * 100,"%")
    counter +=1
    texts.append(new_add)
    
marx_corpus = pandas.concat(texts, ignore_index=True, sort=False)

Progress: 0.0 %
Progress: 0.5025125628140703 %
Progress: 1.0050251256281406 %
Progress: 1.507537688442211 %
Progress: 2.0100502512562812 %
Progress: 2.512562814070352 %
Progress: 3.015075376884422 %
Progress: 3.5175879396984926 %
Progress: 4.0201005025125625 %
Progress: 4.522613065326634 %
Progress: 5.025125628140704 %
Progress: 5.527638190954774 %
Progress: 6.030150753768844 %
Progress: 6.532663316582915 %
Progress: 7.035175879396985 %
Progress: 7.537688442211055 %
Progress: 8.040201005025125 %
Progress: 8.542713567839195 %
Progress: 9.045226130653267 %
Progress: 9.547738693467336 %
Progress: 10.050251256281408 %
Progress: 10.552763819095476 %
Progress: 11.055276381909549 %
Progress: 11.557788944723619 %
Progress: 12.060301507537687 %
Progress: 12.562814070351758 %
Progress: 13.06532663316583 %
Progress: 13.5678391959799 %
Progress: 14.07035175879397 %
Progress: 14.572864321608039 %
Progress: 15.07537688442211 %
Progress: 15.577889447236181 %
Progress: 16.08040201005025 %
Progress: 16

In [49]:
pandas.set_option('max_rows', 300)
marx_corpus
#marx_corpus.to_csv(r'C:\Users\super\comp_work\Homework-Notebooks\week-2\marxcorpus.txt', index=True, sep=',')

In [63]:
len("Marxists Internet Archive: MECW File No Longer Available")

56

In [124]:
filtA = [len(text) > 59 for text in marx_corpus.paragraph_text]
marx_onefilt = marx_corpus[filtA]

filtB = marx_onefilt.paragraph_text.str.contains(r"on an external site or maybe even a link somewhere on this server")
filtC = marx_onefilt.paragraph_text.str.contains(r"Please mention the url of the page where you found the link")
filtD = marx_onefilt.paragraph_text.str.contains(r"Marx Engels Archive Marx Engels Works in Date")
filtE = marx_onefilt.paragraph_text.str.contains(r"publishers of Marx Engels Collected Works, have instructed us")
filtF = marx_onefilt.paragraph_text.str.contains(r"Marxists Internet")
filtG = marx_onefilt.paragraph_text.str.contains(r"access originated from the Marx Engels Collected Works")
filtH = marx_onefilt.paragraph_text.str.contains(r"information about original manuscripts of Marx")
filtI = marx_onefilt.paragraph_text.str.contains("MECW is not the complete")
filtJ = marx_onefilt.paragraph_text.str.contains("You can buy a volume of MECW")
filtK = marx_onefilt.paragraph_text.str.contains("translated by Progress Publishers")
filtL = marx_onefilt.paragraph_text.str.contains(r"View the Marx/Engels Library")
filtM = marx_onefilt.paragraph_text.str.contains(r"Marx-Engels Collected Works")
filtN = marx_onefilt.paragraph_text.str.contains(r" MIA ")
filtO = marx_onefilt.paragraph_text.str.contains(r"See also")
filtP = marx_onefilt.paragraph_text.str.contains(r"See Also")
filtQ = marx_onefilt.paragraph_text.str.contains(" URL ")
filtR = marx_onefilt.paragraph_text.str.contains("webmaster")

sumfilt = ~(filtB | filtC | filtD | filtE | filtF | filtG | filtH | filtI | filtJ | filtK | filtL | filtM | filtN | filtO | filtP | filtQ | filtR)

marx_corpus_filt = marx_onefilt[sumfilt]

In [125]:
marx_corpus_filt[:250]

Unnamed: 0,source,paragraph_text,source-paragraph-text
21,https://www.marxists.org/archive/marx/works/cw/index.htm,"Notes on Ricardo, Marx, 1845 Marx's Mathematical Manuscripts, 1881 Fragments on Literature and Art as well as some of the material in the Biographical section and in the International Workingmen...",The Young Marx
29,https://www.marxists.org/archive/marx/works/1838/index.htm,“that I have now started composing and am working on chorals. But it is terribly difficult. ... I am sending you a specimen. It's the first two lines of Ein' feste Burg ist unser Gott.”,The Young Marx
35,https://www.marxists.org/archive/marx/works/1837-pre/index.htm,1835: Reflections of a young man on the choice of a profession 1836: Jenny 1836: Feelings 1836: My world 1837: Wild Songs 1837: Transformations 1837: Letter to his Father in Trier,The Young Marx
37,https://www.marxists.org/archive/marx/works/1837-pre/index.htm,"“If we have chosen the position in life in which we can most of all work for mankind, no burdens can bow us down, because they are sacrifices for the benefit of all; then we shall experience no pe...",The Young Marx
39,https://www.marxists.org/archive/marx/works/1837-pre/index.htm,1833: To My Grandfather 1836: Poem 1837: The Single Combat of Eteocles and Polynices 1837: Poem 1837: A Pirate Tale,The Young Marx
63,https://www.marxists.org/archive/marx/works/cw/index.htm,"Notes on Ricardo, Marx, 1845 Marx's Mathematical Manuscripts, 1881 Fragments on Literature and Art as well as some of the material in the Biographical section and in the International Workingmen...",The Young Engels
71,https://www.marxists.org/archive/marx/works/1838/index.htm,“that I have now started composing and am working on chorals. But it is terribly difficult. ... I am sending you a specimen. It's the first two lines of Ein' feste Burg ist unser Gott.”,The Young Engels
77,https://www.marxists.org/archive/marx/works/1837-pre/index.htm#engels,1835: Reflections of a young man on the choice of a profession 1836: Jenny 1836: Feelings 1836: My world 1837: Wild Songs 1837: Transformations 1837: Letter to his Father in Trier,The Young Engels
79,https://www.marxists.org/archive/marx/works/1837-pre/index.htm#engels,"“If we have chosen the position in life in which we can most of all work for mankind, no burdens can bow us down, because they are sacrifices for the benefit of all; then we shall experience no pe...",The Young Engels
81,https://www.marxists.org/archive/marx/works/1837-pre/index.htm#engels,1833: To My Grandfather 1836: Poem 1837: The Single Combat of Eteocles and Polynices 1837: Poem 1837: A Pirate Tale,The Young Engels


In [130]:
marx_corpus_filtA = marx_corpus_filt.rename(columns={"source": "source_link", "source-paragraph-text": "source"})

marx_corpus_filtA.to_csv(r'C:\Users\super\comp_work\Homework-Notebooks\week-2\marxcorpus.txt', index=True, sep=',')

In [133]:
print(marx_corpus_filtA.source.unique())

['The Young Marx' 'The Young Engels' 'A Book of Verse' 'Letters'
 'Notebooks on Epicurean Philosophy' 'Telegraph für Deutschland'
 'Articles by Engels' 'Marx’s Doctoral Thesis' 'Anti-Schelling'
 'On Freedom of the Press' 'Rheinische Zeitung'
 "Critique of Hegel's Philosophy of Right" 'The New Moral World'
 'Comment on James Mill' 'Deutsche-Französischer Jahrbücher'
 'Introduction to Critique of Philosophy of Right'
 'Economic & Philosophic Manuscripts' 'On The Jewish Question'
 'Condition of England' 'Condition of Working Class In England'
 'The Holy Family' 'Theses On Feuerbach' 'The German Ideology' 'Saint Max'
 'The Poverty of Philosophy' 'The Communist League'
 'The Principles of Communism' 'Wage Labour & Capital' 'Wages'
 'True Socialism' 'The True Socialists' 'The Northern Star'
 'Deutsche-Brüsseler Zeitung' 'La Réforme' 'On Free Trade' 'On Poland'
 'THE COMMUNIST MANIFESTO' 'Demands of Communist Party'
 'Neue Rheinische Zeitung' 'The June Revolution in Paris'
 'Speeches at Trial

In [148]:
capital_index = scrape_index(link='https://www.marxists.org/archive/marx/works/1867-c1/index.htm',
                             base_url='https://www.marxists.org/archive/marx/works/1867-c1/index.htm') 

texts = []
counter = 0
for tup in capital_index:
    counter +=1
    print(counter, tup[0])
    new_add = extract_marx_text(tup[0],tup[1])
    texts.append(new_add)
    
capital_corpus = pandas.concat(texts, ignore_index=True, sort=False)
capital_corpus

1 https://www.marxists.org/archive/marx/works/1867-c1/part0.htm
2 https://www.marxists.org/archive/marx/works/1867-c1/ch01.htm
3 https://www.marxists.org/archive/marx/works/1867-c1/commodity.htm
4 https://www.marxists.org/archive/marx/works/1867-c1/ch02.htm
5 https://www.marxists.org/archive/marx/works/1867-c1/ch03.htm
6 https://www.marxists.org/archive/marx/works/1867-c1/ch04.htm
7 https://www.marxists.org/archive/marx/works/1867-c1/ch05.htm
8 https://www.marxists.org/archive/marx/works/1867-c1/ch06.htm
9 https://www.marxists.org/archive/marx/works/1867-c1/ch07.htm
10 https://www.marxists.org/archive/marx/works/1867-c1/ch08.htm
11 https://www.marxists.org/archive/marx/works/1867-c1/ch09.htm
12 https://www.marxists.org/archive/marx/works/1867-c1/ch10.htm
13 https://www.marxists.org/archive/marx/works/1867-c1/ch11.htm
14 https://www.marxists.org/archive/marx/works/1867-c1/ch12.htm
15 https://www.marxists.org/archive/marx/works/1867-c1/ch13.htm
16 https://www.marxists.org/archive/marx/wo

Unnamed: 0,source,paragraph_text,source-paragraph-text
0,https://www.marxists.org/archive/marx/works/1867-c1/part0.htm,apital Volume One,Prefaces and\r\nAfterwords
1,https://www.marxists.org/archive/marx/works/1867-c1/part0.htm,1867: Dedication to Wilhelm Wolff 1867: Preface to the First German Edition (Marx) 1872: Preface to the French Edition (Marx) 1873: Afterword to the Second German Edition (Marx) 1875: Afterword ...,Prefaces and\r\nAfterwords
2,https://www.marxists.org/archive/marx/works/1867-c1/part0.htm,Capital Volume One- Index,Prefaces and\r\nAfterwords
3,https://www.marxists.org/archive/marx/works/1867-c1/ch01.htm,Capital Volume One,Commodities
4,https://www.marxists.org/archive/marx/works/1867-c1/ch01.htm,Contents,Commodities
...,...,...,...
4051,https://www.marxists.org/archive/marx/letters/subject/capital.htm,"In the Spring and Summer of 1868, Engels studied Capital, writing a Synopsis. Engels wrote a review for The Westminster Review in late May, but it was never published.",Letters on Capital
4052,https://www.marxists.org/archive/marx/letters/subject/capital.htm,Marx's Economic Works | Letters Index Political Economists | Marx-Engels Archive,Letters on Capital
4053,https://www.marxists.org/archive/marx/letters/subject/capital.htm,,Letters on Capital
4054,https://www.marxists.org/archive/marx/index.htm,MIA: M.I.A. Library: Marx & Engels,Marx-Engels Archive


In [149]:
capital_corpus[50:100]

Unnamed: 0,source,paragraph_text,source-paragraph-text
50,https://www.marxists.org/archive/marx/works/1867-c1/ch01.htm,"Coats and linen, however, are not merely values, but values of definite magnitude, and according to our assumption, the coat is worth twice as much as the ten yards of linen. Whence this differe...",Commodities
51,https://www.marxists.org/archive/marx/works/1867-c1/ch01.htm,"While, therefore, with reference to use value, the labour contained in a commodity counts only qualitatively, with reference to value it counts only quantitatively, and must first be reduced to ...",Commodities
52,https://www.marxists.org/archive/marx/works/1867-c1/ch01.htm,"If the productive power of all the different sorts of useful labour required for the production of a coat remains unchanged, the sum of the values of the coats produced increases with their numb...",Commodities
53,https://www.marxists.org/archive/marx/works/1867-c1/ch01.htm,"An increase in the quantity of use values is an increase of material wealth. With two coats two men can be clothed, with one coat only one man. Nevertheless, an increased quantity of material we...",Commodities
54,https://www.marxists.org/archive/marx/works/1867-c1/ch01.htm,"On the one hand all labour is, speaking physiologically, an expenditure of human labour power, and in its character of identical abstract human labour, it creates and forms the value of commodit...",Commodities
55,https://www.marxists.org/archive/marx/works/1867-c1/ch01.htm,,Commodities
56,https://www.marxists.org/archive/marx/works/1867-c1/ch01.htm,,Commodities
57,https://www.marxists.org/archive/marx/works/1867-c1/ch01.htm,"Commodities come into the world in the shape of use values, articles, or goods, such as iron, linen, corn, &c. This is their plain, homely, bodily form. They are, however, commodities, only beca...",Commodities
58,https://www.marxists.org/archive/marx/works/1867-c1/ch01.htm,"The reality of the value of commodities differs in this respect from Dame Quickly, that we don’t know “where to have it.” The value of commodities is the very opposite of the coarse materiality ...",Commodities
59,https://www.marxists.org/archive/marx/works/1867-c1/ch01.htm,"Every one knows, if he knows nothing else, that commodities have a value form common to them all, and presenting a marked contrast with the varied bodily forms of their use values. I mean their ...",Commodities


In [153]:
filtA = [len(text) > 10 for text in capital_corpus.paragraph_text]
marx_onefilt = capital_corpus[filtA]

filtB = marx_onefilt.paragraph_text.str.contains(r"on an external site or maybe even a link somewhere on this server")
filtC = marx_onefilt.paragraph_text.str.contains(r"Please mention the url of the page where you found the link")
filtD = marx_onefilt.paragraph_text.str.contains(r"Marx Engels Archive Marx Engels Works in Date")
filtE = marx_onefilt.paragraph_text.str.contains(r"publishers of Marx Engels Collected Works, have instructed us")
filtF = marx_onefilt.paragraph_text.str.contains(r"Marxists Internet")
filtG = marx_onefilt.paragraph_text.str.contains(r"access originated from the Marx Engels Collected Works")
filtH = marx_onefilt.paragraph_text.str.contains(r"information about original manuscripts of Marx")
filtI = marx_onefilt.paragraph_text.str.contains("Volume One")
filtJ = marx_onefilt.paragraph_text.str.contains("You can buy a volume of MECW")
filtK = marx_onefilt.paragraph_text.str.contains("translated by Progress Publishers")
filtL = marx_onefilt.paragraph_text.str.contains(r"View the Marx/Engels Library")
filtM = marx_onefilt.paragraph_text.str.contains(r"Marx-Engels Collected Works")
filtN = marx_onefilt.paragraph_text.str.contains(r" MIA ")
filtO = marx_onefilt.paragraph_text.str.contains(r"See also")
filtP = marx_onefilt.paragraph_text.str.contains(r"See Also")
filtQ = marx_onefilt.paragraph_text.str.contains(" URL ")
filtR = marx_onefilt.paragraph_text.str.contains("webmaster")

sumfilt = ~(filtB | filtC | filtD | filtE | filtF | filtG | filtH | filtI | filtJ | filtK | filtL | filtM | filtN | filtO | filtP | filtQ | filtR)

capital_corpus_filt = marx_onefilt[sumfilt]
capital_corpus_filt

Unnamed: 0,source,paragraph_text,source-paragraph-text
4,https://www.marxists.org/archive/marx/works/1867-c1/ch01.htm,Contents,Commodities
5,https://www.marxists.org/archive/marx/works/1867-c1/ch01.htm,Section 1 - The Two Factors of a Commodity: Use-Value and Value Section 2 - The two-fold Character of the Labour Embodied in Commodities Section 3 - The Form of Value or Exchange-Value,Commodities
6,https://www.marxists.org/archive/marx/works/1867-c1/ch01.htm,A. Elementary or Accidental Form of Value,Commodities
7,https://www.marxists.org/archive/marx/works/1867-c1/ch01.htm,1. The Two Poles of the Expression of Value: Relative Form and Equivalent Form 2. The Relative Form of Value,Commodities
8,https://www.marxists.org/archive/marx/works/1867-c1/ch01.htm,a. The Nature and Import of this Form b. Quantitative Determination of Relative Value,Commodities
...,...,...,...
4049,https://www.marxists.org/archive/marx/letters/subject/capital.htm,"In 1864, the founding of the International Workingmen's Association not only drew Marx back out of his study into active involvement in the workers' movement, but also added urgency to the product...",Letters on Capital
4050,https://www.marxists.org/archive/marx/letters/subject/capital.htm,The German edition of Volume I of Capital was published in July 1867.,Letters on Capital
4051,https://www.marxists.org/archive/marx/letters/subject/capital.htm,"In the Spring and Summer of 1868, Engels studied Capital, writing a Synopsis. Engels wrote a review for The Westminster Review in late May, but it was never published.",Letters on Capital
4052,https://www.marxists.org/archive/marx/letters/subject/capital.htm,Marx's Economic Works | Letters Index Political Economists | Marx-Engels Archive,Letters on Capital


In [154]:
#Do before 4038
capital_corpus_filt[capital_corpus_filt['source-paragraph-text'] == 'Letters on Capital']

Unnamed: 0,source,paragraph_text,source-paragraph-text
4039,https://www.marxists.org/archive/marx/letters/subject/capital.htm,Marx & Engels Letters,Letters on Capital
4041,https://www.marxists.org/archive/marx/letters/subject/capital.htm,"Marx's Poverty of Philosophy, published in 1847 was a systematic demolition of the influential French anarchist theorist, Proudhon, which summarises the development of Marx's thinking on politica...",Letters on Capital
4042,https://www.marxists.org/archive/marx/letters/subject/capital.htm,"After the triumph of reaction in Europe, Marx arrives in London on August 26 1849, where, for more than a decade he will be more or less isolated from active struggle, and able to devote time to s...",Letters on Capital
4043,https://www.marxists.org/archive/marx/letters/subject/capital.htm,Between 1850 and 1853 Marx filled 24 notebooks with excerpts from his reading of political economy.,Letters on Capital
4044,https://www.marxists.org/archive/marx/letters/subject/capital.htm,"In the summer of 1857, Marx twice embarked on an exposition of his economic theories, with drafts on Bastiat and Carey and a draft, unfinished general Introduction. (See Volume 28 of the MECW). Ma...",Letters on Capital
4045,https://www.marxists.org/archive/marx/letters/subject/capital.htm,"Between August and October 1858, Marx wrote a Second Draft of the Contribution to the Critique of Political Economy.",Letters on Capital
4046,https://www.marxists.org/archive/marx/letters/subject/capital.htm,"By the end of 1858, Marx had completed his commentary and notes which constitute the unpublished Grundrisse. (See Volume 29 of the MECW). In January 1859, Marx composed the short Preface to the Co...",Letters on Capital
4047,https://www.marxists.org/archive/marx/letters/subject/capital.htm,The Contribution to the Critique of Political Economy was published in Berlin in June 1859.,Letters on Capital
4048,https://www.marxists.org/archive/marx/letters/subject/capital.htm,"Between August 1861 and July 1863, Marx filled 23 notebooks with drafts of what was to become Capital, including Theories of Surplus Value. (See Volumes 30 to 33 of the MECW).",Letters on Capital
4049,https://www.marxists.org/archive/marx/letters/subject/capital.htm,"In 1864, the founding of the International Workingmen's Association not only drew Marx back out of his study into active involvement in the workers' movement, but also added urgency to the product...",Letters on Capital


In [155]:
capital_corpus_filt[:4038].to_csv(r'C:\Users\super\comp_work\Homework-Notebooks\week-2\capital_corpus.txt', index=True, sep=',')