In [1]:
import pandas as pd
import requests
import tweepy
import numpy as np
from nltk import sent_tokenize
import re
import json


def get_Guten(url):
    # retrieve the source text
    r = requests.get(url)
    r.encoding = 'utf-8'
    text = r.text
    return text

def get_text(path):
    f = open(path, 'r', encoding='utf8')
    text = f.read()
    f.close()
    return text

def clean_text(text):
    with open("data/replace_chars.json") as f:
        replace_these = json.load(f)
        for k in replace_these.keys():
            text = text.replace(k, replace_these[k])
    return text

def make_into_quotes_guten(text, source):
    # make a list of quotes and clean them up
    quotes = sent_tokenize(text)
    # remove unnecessary spaces
    quotes = [x.strip() for x in quotes]
    # remove empty quotes
    quotes = list(filter(None, quotes))
    # cut out very short ones as they often have no real meaning
    quotes = [x for x in quotes if len(x) > 15]
    # remove the titles of sections & citation-type stuff
    quotes = [x for x in quotes if not x.isupper()]
    quotes = [x for x in quotes if not x.replace('the', '').replace('of', '').replace('and', '').replace('II', '').istitle()]
    quotes = [x for x in quotes if not set('Werke').issubset(x)]
    # remove oddities
    quotes = [x for x in quotes if x[0].isupper()]
    quotes = [x.replace('.', '') for x in quotes]
    quotes = [x for x in quotes if not x[-1].isupper()]
    # add the source
    quotes = [x+'\n- '+ source for x in quotes]
    return quotes

def make_into_quotes_pdf(text, source):
    # make the text into a list
    quotes = sent_tokenize(text)
    # remove unnecessary spaces
    quotes = [x.strip() for x in quotes]
    # remove empty quotes
    quotes = list(filter(None, quotes))
    # cut out very short ones as they often have no real meaning
    quotes = [x for x in quotes if len(x) > 15]
    # remove the titles of sections & citation-type stuff
    quotes = [x for x in quotes if not x.isupper()]
    quotes = [x for x in quotes if not x.replace('the', '').replace('of', '').replace('and', '').replace('II', '').istitle()]
    quotes = [x for x in quotes if not set('Werke').issubset(x)]
    # this looks at all quotes and removes headers/footers/page numbers that are sometimes in the text accidentally
    holding = []
    for quote in quotes:
        for word in quote.split(' '):
            if word.isupper() and len(word) > 2 and word != 'A' and word != 'OF':
                quote = quote.replace(word, '')
        quote = re.sub('[1234567890]', '', quote).replace(' s ', ' ').replace(' S ', ' ').replace('OF', '').replace(' ) ', '').replace(' ( ',                   '').replace(' , ', '').replace('- ', '-').replace('  ', ' ').replace('  ', ' ').replace('  ', ' ')
        holding.append(quote)
    # remove oddities
    quotes = [x for x in holding if x[0].isupper()]
    quotes = [x.replace('.', '') for x in quotes]
    quotes = [x for x in quotes if not x[-1].isupper()]
    quotes = [x for x in quotes if not set('~').issubset(x)]
    quotes = [x for x in quotes if not set('=').issubset(x)]
    # add the source
    quotes = [x+'\n- '+ source for x in quotes]
    return quotes

In [2]:
# # import different texts, cut out their front and end matter
hop1 = clean_text(get_Guten('http://www.gutenberg.org/files/51635/51635-0.txt'))[10545:-34150]
hop2 = clean_text(get_Guten('http://www.gutenberg.org/files/51636/51636-0.txt'))[4489:-42865]
hop3 = clean_text(get_Guten('http://www.gutenberg.org/files/58169/58169-0.txt'))[10068:-125524]
enc_logic = clean_text(get_Guten('http://www.gutenberg.org/files/55108/55108-0.txt'))[36755:-134712]
phil_of_nature = clean_text(get_text('.\data\Phil_of_Nature.txt'))[500403:-278448]

In [3]:
# turn these texts into quotes and assemble a list
hop1_quotes = make_into_quotes_guten(hop1, 'HoP 1')
hop2_quotes = make_into_quotes_guten(hop2, 'HoP 2')
hop3_quotes = make_into_quotes_guten(hop3, 'HoP 3')
enc_logic_quotes = make_into_quotes_guten(enc_logic, 'EnL')
pon_quotes = make_into_quotes_pdf(phil_of_nature, 'PoN')

master_q = hop1_quotes + hop2_quotes + hop3_quotes + enc_logic_quotes + pon_quotes

# preview the quote list to see if there are any abberations
random_range_start = np.random.randint(0, len(master_q))
master_q[random_range_start:random_range_start + 10], len(master_q)

(['Here, instead of surveying the process, as we do in history, from the outside, we see the movement of thought clearly defined in its native medium\n- EnL',
  'The thought, which is genuine and self-supporting, must be intrinsically concrete; it must be an Idea; and when it is viewed in the whole of its universality, it is the Idea, or the Absolute\n- EnL',
  'The science of this Idea must form a system\n- EnL',
  'For the truth is concrete; that is, whilst it gives a bond and principle of unity, it also possesses an internal source of development\n- EnL',
  'Truth, then, is only possible as a universe or totality of thought; and the freedom of the whole, as well as the necessity of the several sub-divisions, which it implies, are only possible when these are discriminated and defined\n- EnL',
  'Unless it is a system, a philosophy is not a scientific production\n- EnL',
  'Unsystematic philosophising can only be expected to give expression to personal peculiarities of mind, and has 

In [4]:
# import the original set of quotes and prepare it for merging
old_quotes = pd.read_csv('.\data\Original_Quote_sheet.csv')
old_quotes = old_quotes.drop('Unnamed: 0', axis=1)
old_quotes = old_quotes.rename(columns={'Select one from each column':'quotes'})
old_quotes = old_quotes.iloc[3:]
old_quotes['quotes'] = old_quotes['quotes'].str.capitalize()

In [5]:
# turn the list into a dataframe and weed out untweetabley-long quotes
quote_df = pd.DataFrame(master_q, columns=['quotes'])
quote_df = old_quotes.append(quote_df)
quote_df['length'] = quote_df['quotes'].str.len()
quote_tweetable = quote_df.loc[quote_df['length'] <= 280].copy()

# preview again, see how many we have
quote_tweetable.iloc[random_range_start:random_range_start + 10], len(quote_tweetable), len(quote_df)

(                                                  quotes  length
 15058  This is correct, if it means that a man's cond...     184
 15060  Self-relation in Essence is the form of Identi...     139
 15061  They are both the same abstraction,--self-rela...      56
 15062  The unintelligence of sense, to take everythin...     203
 15063  This identity, as it has descended from Being,...     174
 15064  This external Being, if taken in separation fr...     109
 15065                But that turns out a mistake\n- EnL      34
 15066  Because Essence is Being-in-self, it is essent...     147
 15067  Consequently, it has the unessential as its ow...      91
 15069  The sphere of Essence thus turns out to be a s...     105,
 15889,
 18714)

In [6]:
# export csv for use by tweeter program
quote_tweetable.to_csv('Quote List.csv')

In [13]:
SoL = get_text('.\data\Science_of_Logic.txt').split('preface to the first edition')[1]
len(SoL)

2180175

In [None]:
sol = get_

In [None]:
sol = get_text()

In [8]:

f = open('.\data\Science_of_Logic.txt', 'r', encoding='utf8')
text = f.read()
f.close()
