In [1]:
#Special module written for this class
#This provides access to data and to helper functions from previous weeks
import lucem_illud #just in case, regularly update your lucem_illud with the following code: pip install git+https://github.com/UChicago-Computational-Content-Analysis/lucem_illud.git

#All these packages need to be installed from pip
import requests #for http requests
import pandas #gives us DataFrames
import matplotlib.pyplot as plt #For graphics
import numpy as np #For divergences/distances
import scipy #For divergences/distances
import seaborn as sns #makes our plots look nicer
import sklearn.manifold #For a manifold plot
import json #For API responses
import urllib.parse #For joining urls

# comp-linguistics
import spacy

#Displays the graphs
import graphviz #You also need to install the command line graphviz

#These are from the standard library
import os.path
import zipfile
import subprocess
import io
import tempfile

import wordcloud #Makes word clouds

#This 'magic' command makes the plots work better
#in the notebook, don't use it outside of a notebook
%matplotlib inline

In [2]:
import nltk
# Load the spaCy 'en_core_web_sm' pipeline once; the tokenizing and tagging
# helpers defined in this notebook all call this shared `nlp` object.
nlp = spacy.load('en_core_web_sm')

def word_tokenize(word_list):
    """Split text into word tokens using the shared spaCy pipeline.

    Parameters
    ----------
    word_list :
        The text to tokenize. Despite the name it is handed straight to
        ``nlp``; the callers in this notebook pass paragraph/sentence strings.

    Returns
    -------
    list of str
        The text of every token that is neither punctuation nor
        whitespace-only, in document order.
    """
    return [
        tok.text
        for tok in nlp(word_list)
        # Drop punctuation tokens and tokens that are empty once stripped.
        if not (tok.is_punct or tok.text.strip() == "")
    ]

def wordCounter(wordLst):
    """Count case-insensitive word frequencies.

    Parameters
    ----------
    wordLst : iterable of str
        Words to count; each one is lower-cased before counting.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``word`` and ``count``, with one row per distinct
        lower-cased word in order of first appearance.
    """
    tallies = {}
    for raw in wordLst:
        # Normalize the case so 'The' and 'the' share one entry.
        key = raw.lower()
        tallies[key] = tallies.get(key, 0) + 1
    # Convert to DataFrame.
    return pandas.DataFrame({
        'word': list(tallies.keys()),
        'count': list(tallies.values()),
    })

In [3]:
# Read the corpus CSV, then add a column with each paragraph's word tokens.
capital_corpus = pandas.read_csv(
    r'C:\Users\super\comp_work\Homework-Notebooks\week-2\capital_corpus.txt'
)
capital_corpus['tokenized_text'] = capital_corpus['paragraph_text'].apply(word_tokenize)

In [4]:
# Flatten the per-paragraph token lists into one corpus-wide list of tokens.
capital_tokens = [
    token
    for paragraph_tokens in capital_corpus.tokenized_text
    for token in paragraph_tokens
]

In [5]:
# Wrap the flat token list in an nltk.Text object.
capital_text = nltk.Text(capital_tokens)

# Build a concordance index over those tokens and print the
# keyword-in-context lines for the token 'commodity'.
capital_index = nltk.text.ConcordanceIndex(capital_text) 
capital_index.print_concordance('commodity')

Displaying 25 of 917 matches:
ommodities its unit being a single commodity Our investigation must therefore b
efore begin with the analysis of a commodity A commodity is in the first place 
with the analysis of a commodity A commodity is in the first place an object ou
tter a little more closely A given commodity e.g. a quarter of wheat is exchang
e valid exchange values of a given commodity express something equal secondly e
might think that if the value of a commodity is determined by the quantity of l
bourer the more valuable would his commodity be because more time would be requ
far as it requires for producing a commodity no more time than is needed on an 
 for its production The value of a commodity would therefore remain constant if
reater is its value The value of a commodity therefore varies directly as the q
ct of human labour without being a commodity Whoever directly satisfies his wan
en produced for others To become a commodity a product must be transferred to a
 creates n

In [6]:
def sent_tokenize(word_list):
    """Split text into sentences using the shared spaCy pipeline.

    Parameters
    ----------
    word_list :
        The text to segment. Despite the name it is handed straight to
        ``nlp``; the caller in this notebook passes paragraph strings.

    Returns
    -------
    list of str
        One whitespace-stripped string per sentence found by spaCy.
    """
    doc = nlp(word_list)
    # Use Span.text rather than Span.string: `.string` was deprecated in
    # spaCy 2.x and removed in 3.0, where it raises AttributeError. After
    # strip() both give the same sentence text.
    return [sent.text.strip() for sent in doc.sents]

def tag_sents_pos(sentences):
    """
    Part-of-speech tag tokenized sentences, mirroring NLTK-style output.

    Every sentence's tokens are joined with single spaces, the sentences are
    joined into one string, and that string is run through ``nlp`` again.
    The result follows the sentence boundaries spaCy finds in the re-joined
    string, not the boundaries of the input list.

    Parameters
    ----------
    sentences : iterable of iterable of str
        Tokenized sentences.

    Returns
    -------
    list of list of (str, str)
        For each spaCy sentence, the ``(token.text, token.tag_)`` pairs.
    """
    rejoined = ' '.join(' '.join(words) for words in sentences)
    return [
        [(tok.text, tok.tag_) for tok in span]
        for span in nlp(rejoined).sents
    ]

In [7]:
# Split each paragraph into sentences and word-tokenize every sentence.
capital_corpus['sentences'] = capital_corpus['paragraph_text'].apply(
    lambda paragraph: [word_tokenize(sentence) for sentence in sent_tokenize(paragraph)]
)
# POS-tag the tokenized sentences of each paragraph.
capital_corpus['POS_sents'] = capital_corpus['sentences'].apply(tag_sents_pos)

In [8]:
# Display the corpus, now including the derived tokenized_text, sentences
# and POS_sents columns added in the cells above.
capital_corpus

Unnamed: 0.1,Unnamed: 0,source,paragraph_text,source-paragraph-text,tokenized_text,sentences,POS_sents
0,4,https://www.marxists.org/archive/marx/works/18...,Contents,Commodities,[Contents],[[Contents]],"[[(Contents, NNS)]]"
1,5,https://www.marxists.org/archive/marx/works/18...,Section 1 - The Two Factors of a Commodity: ...,Commodities,"[Section, 1, The, Two, Factors, of, a, Commodi...","[[Section, 1], [The, Two, Factors, of, a, Comm...","[[(Section, NN), (1, CD)], [(The, DT), (Two, C..."
2,6,https://www.marxists.org/archive/marx/works/18...,A. Elementary or Accidental Form of Value,Commodities,"[A., Elementary, or, Accidental, Form, of, Value]","[[A., Elementary, or, Accidental, Form, of, Va...","[[(A., NNP), (Elementary, NNP), (or, CC), (Acc..."
3,7,https://www.marxists.org/archive/marx/works/18...,1. The Two Poles of the Expression of Value:...,Commodities,"[1, The, Two, Poles, of, the, Expression, of, ...","[[1], [The, Two, Poles, of, the, Expression, o...","[[(1, CD)], [(The, DT), (Two, CD), (Poles, NNP..."
4,8,https://www.marxists.org/archive/marx/works/18...,a. The Nature and Import of this Form b. Q...,Commodities,"[a., The, Nature, and, Import, of, this, Form,...","[[a.], [The, Nature, and, Import, of, this, Fo...","[[(a., NN)], [(The, DT), (Nature, NNP), (and, ..."
...,...,...,...,...,...,...,...
3336,4049,https://www.marxists.org/archive/marx/letters/...,"In 1864, the founding of the International Wor...",Letters on Capital,"[In, 1864, the, founding, of, the, Internation...","[[In, 1864, the, founding, of, the, Internatio...","[[(In, IN), (1864, CD), (the, DT), (founding, ..."
3337,4050,https://www.marxists.org/archive/marx/letters/...,The German edition of Volume I of Capital was ...,Letters on Capital,"[The, German, edition, of, Volume, I, of, Capi...","[[The, German, edition, of, Volume, I, of, Cap...","[[(The, DT), (German, JJ), (edition, NN), (of,..."
3338,4051,https://www.marxists.org/archive/marx/letters/...,"In the Spring and Summer of 1868, Engels studi...",Letters on Capital,"[In, the, Spring, and, Summer, of, 1868, Engel...","[[In, the, Spring, and, Summer, of, 1868, Enge...","[[(In, IN), (the, DT), (Spring, NNP), (and, CC..."
3339,4052,https://www.marxists.org/archive/marx/letters/...,Marx's Economic Works | Letters Index Politi...,Letters on Capital,"[Marx, 's, Economic, Works, |, Letters, Index,...","[[Marx, 's, Economic, Works, |, Letters, Index...","[[(Marx, NNP), ('s, POS), (Economic, NNP), (Wo..."


In [9]:
# Save the enriched corpus. index=False keeps the row index out of the file:
# writing it adds one more unnamed column on every save/read_csv round trip
# (the frame already carries 'Unnamed: 0.1' and 'Unnamed: 0' from earlier ones).
capital_corpus.to_csv(r'C:\Users\super\comp_work\Homework-Notebooks\week-2\capital_corpus_sents.txt', index=False, sep=',')

In [49]:
def find_sents(corpus, term):
    """Collect every tokenized sentence that contains ``term``.

    Parameters
    ----------
    corpus :
        Object with a ``sentences`` attribute that iterates over
        per-paragraph lists of tokenized sentences.
    term :
        Value looked up with ``in`` on each sentence. For token lists this
        is an exact, case-sensitive match against whole tokens.

    Returns
    -------
    list
        The matching sentences, in corpus order.
    """
    return [
        tokens
        for paragraph in corpus.sentences
        for tokens in paragraph
        if term in tokens
    ]

def join_tokens(sent_list):
    """Turn tokenized sentences back into plain strings.

    Parameters
    ----------
    sent_list : iterable of iterable of str
        Tokenized sentences.

    Returns
    -------
    list of str
        Each sentence's tokens joined with a single space.
    """
    return [" ".join(tokens) for tokens in sent_list]

def do_both(corpus, term):
    """Find the sentences containing ``term`` and return them as plain strings.

    Passes ``corpus`` and ``term`` to ``find_sents`` and feeds the result
    to ``join_tokens``.
    """
    return join_tokens(find_sents(corpus, term))

def sample_table(corpus, term_list):
    """Build a per-term summary table of matching sentences.

    Parameters
    ----------
    corpus :
        Passed unchanged to ``do_both`` for every term.
    term_list : iterable
        Terms to look up.

    Returns
    -------
    pandas.DataFrame
        One row per term with columns ``Term``, ``Count`` (number of
        sentences ``do_both`` returned) and ``Sents`` (those sentences).
    """
    terms = list(term_list)
    matches = [do_both(corpus, term) for term in terms]
    return pandas.DataFrame({
        'Term': terms,
        'Count': [len(found) for found in matches],
        'Sents': matches,
    })

In [31]:
# labour -
# value -
# commodity -
# revolution -
# proletariat
# socialism
# capitalism

# Search only the first 100 paragraphs for 'commodity', then turn the
# matching token lists into plain strings.
commodity_sents_raw = find_sents(capital_corpus.head(100), "commodity")
commodity_sents = join_tokens(commodity_sents_raw)

In [50]:
# Terms to summarize across the whole corpus.
terms = [
    'labour',
    'value',
    'commodity',
    'revolution',
    'capital',
]

# One row per term: the term, its sentence count and the sentences themselves.
test_table = sample_table(corpus=capital_corpus, term_list=terms)
print(test_table)

         Term  Count                                              Sents
0      labour   2357  [If then we leave out of consideration the use...
1       value   2040  [The utility of a thing makes it a use value, ...
2   commodity    723  [The wealth of those societies in which the ca...
3  revolution     51  [It dates from the last third of the 17th cent...
4     capital    862  [And modern economy which looks down with such...


In [26]:
import random

In [33]:
def sample(row_list,num):
    """Draw ``num`` random items from every row.

    Parameters
    ----------
    row_list : iterable
        Rows to sample from; each row is handed to ``random.sample``.
    num : int
        Number of items to draw from each row.

    Returns
    -------
    list of list
        One ``random.sample(row, num)`` result per row, in row order.
    """
    return [random.sample(row, num) for row in row_list]

2040

In [42]:
# Draw 5 random sentences for each of the first four terms.
# The result gets its own name: assigning it to `sample_table` would rebind
# the sample_table() function, and re-running the cell that builds test_table
# would then fail with "'list' object is not callable".
sampled_sents = sample(test_table.Sents[:4],5)
print(sampled_sents)

[['The foundation of every division of labour that is well developed and brought about by the exchange of commodities is the separation between town and country', 'The cause of profit is that labour produces more than is required for its support', 'A definite concrete labour like the labour of tailoring can only possess the form of equality with the labour of a different type contained in a commodity of a different kind for example the linen insofar as its definite form counts as the expression of something which really constitutes the equality of labours of different sorts or what is equal in those labours', 'They furnish to the capitalist an exact measure for the intensity of labour', 'Where reference is made to labour as a measure of value it necessarily implies labour of one particular kind'], ['All commodities by mirroring themselves in one and the same commodity as quantities of value reflect themselves reciprocally as quantities of value', 'He then did what is done by every purc

In [56]:
# Draw 5 sentences at random from the Sents entry at index label 4 of
# test_table (the fifth term in `terms`, 'capital'). No random seed is set
# in this notebook, so the selection changes from run to run.
random.sample(test_table.Sents[4],5)

['But in the flood of production all the capital originally advanced becomes a vanishing quantity magnitudo evanescens in the mathematical sense compared with the directly accumulated capital i.e. with the surplus value or surplus product that is reconverted into capital whether it functions in the hands of its accumulator or in those of others',
 'The change of value that occurs in the case of money intended to be converted into capital can not take place in the money itself since in its function of means of purchase and of payment',
 'The great beauty of capitalist production consists in this that it not only constantly reproduces the wage worker as wage worker but produces always in proportion to the accumulation of capital a relative surplus population of wage workers',
 'The part played in our days by the direct robbery from the labourer ’s necessary consumption fund in the formation of surplus value and therefore of the accumulation fund of capital the so called domestic industry