In [None]:
import pandas as pd
import requests
import tweepy
import numpy as np
from nltk import sent_tokenize
import re

bad_char_list = ['1', '2', '3', '4', '5', '6', '7', '8', '9,' '0']

def get_Guten(url):
    # retrieve the source text
    r = requests.get(url)
    r.encoding = 'utf-8'
    text = r.text
    return text

def get_text(path):
    f = open('Phil_of_Nature.txt', 'r')
    text = f.read()
    f.close()
    return text

def clean_text(text):
    # remove line break notation
    text = text.replace("\r\n", ' ')
    text = text.replace('\n', ' ')
    # remove abbreviative periods and other unwanted periods and characters
    # this helps reduce character count so we get more tweetable quotes
    text = text.replace('_', '')
    text = text.replace('[', '').replace(']', '')
    text = text.replace('e.g.', 'eg')
    text = text.replace('i.e.', 'ie')
    text = text.replace('etc.', 'etc')
    text = text.replace('&c.', '&c')
    text = text.replace('viz.', 'viz')
    text = text.replace('+', '')
    text = text.replace('Â£', 'f')
    text = text.replace('\x0c', '')
    text = text.replace('Â§', '')
    return text

def make_into_quotes_guten(text, source):
    # make a list of quotes and clean them up
    quotes = sent_tokenize(text)
    # remove unnecessary spaces
    quotes = [x.strip() for x in quotes]
    # remove empty quotes
    quotes = list(filter(None, quotes))
    # cut out very short ones as they often have no real meaning
    quotes = [x for x in quotes if len(x) > 15]
    # remove the titles of sections & citation-type stuff
    quotes = [x for x in quotes if not x.isupper()]
    quotes = [x for x in quotes if not x.replace('the', '').replace('of', '').replace('and', '').replace('II', '').istitle()]
    quotes = [x for x in quotes if not set('Werke').issubset(x)]
    # remove oddities
    quotes = [x for x in quotes if x[0].isupper()]
    quotes = [x.replace('.', '') for x in quotes]
    quotes = [x for x in quotes if not x[-1].isupper()]
    # add the source
    quotes = [x+'\n- '+ source for x in quotes]
    return quotes

def make_into_quotes_pdf(text, source):
    # make the text into a list
    quotes = sent_tokenize(text)
    # remove unnecessary spaces
    quotes = [x.strip() for x in quotes]
    # remove empty quotes
    quotes = list(filter(None, quotes))
    # cut out very short ones as they often have no real meaning
    quotes = [x for x in quotes if len(x) > 15]
    # remove the titles of sections & citation-type stuff
    quotes = [x for x in quotes if not x.isupper()]
    quotes = [x for x in quotes if not x.replace('the', '').replace('of', '').replace('and', '').replace('II', '').istitle()]
    quotes = [x for x in quotes if not set('Werke').issubset(x)]
    # this looks at all quotes and removes headers/footers/page numbers that are sometimes in the text accidentally
    holding = []
    for quote in quotes:
        for word in quote.split(' '):
            if word.isupper() and len(word) > 2 and word != 'A' and word != 'OF':
                quote = quote.replace(word, '')
        quote = re.sub('[1234567890]', '', quote).replace(' s ', ' ').replace(' S ', ' ').replace('OF', '').replace(' ) ', '').replace(' ( ', '').replace(' , ', '').replace('  ', ' ').replace('- ', '-').replace('  ', ' ').replace('  ', ' ')
        holding.append(quote)
    # remove oddities
    quotes = [x for x in holding if x[0].isupper()]
    quotes = [x.replace('.', '') for x in quotes]
    quotes = [x for x in quotes if not x[-1].isupper()]
    quotes = [x for x in quotes if not set('~').issubset(x)]
    quotes = [x for x in quotes if not set('=').issubset(x)]
    # add the source
    quotes = [x+'\n- '+ source for x in quotes]
    return quotes

In [173]:
# import different texts, cut out their front and end matter
hop1 = clean_text(get_Guten('http://www.gutenberg.org/files/51635/51635-0.txt'))[10545:-34150]
hop2 = clean_text(get_Guten('http://www.gutenberg.org/files/51636/51636-0.txt'))[4489:-42865]
hop3 = clean_text(get_Guten('http://www.gutenberg.org/files/58169/58169-0.txt'))[10068:-125524]
enc_logic = clean_text(get_Guten('http://www.gutenberg.org/files/55108/55108-0.txt'))[36755:-134712]
phil_of_nature = clean_text(get_text('Phil_of_Nature.txt'))[499657:-278190]

In [179]:
# turn these texts into quotes and assemble a list
hop1_quotes = make_into_quotes_guten(hop1, 'HoP 1')
hop2_quotes = make_into_quotes_guten(hop2, 'HoP 2')
hop3_quotes = make_into_quotes_guten(hop3, 'HoP 3')
enc_logic_quotes = make_into_quotes_guten(enc_logic, 'EnL')
pon_quotes = make_into_quotes_pdf(phil_of_nature, 'PoN')

master_q = hop1_quotes + hop2_quotes + hop3_quotes + enc_logic_quotes + pon_quotes

# preview the quote list to see if there are any abberations
random_range_start = np.random.randint(0, len(master_q))
master_q[random_range_start:random_range_start + 10], len(master_q)

(['If I say I am for myself, I not only am, but I negate in me all else, exclude it from me, in so far as it seems to me to be external\n- HoP 1',
  'As negation of other being, which is just negation in relation to me, being-for-self is the negation of negation and thus affirmation; and this is, as I call it, absolute negativity in which mediation indeed is present, but a mediation which is just as really taken away\n- HoP 1',
  'The principle of the One is altogether ideal and belongs entirely to thought, even though we wish to say that atoms exist\n- HoP 1',
  'The atom may be taken materially, but it is supersensuous, purely intellectual\n- HoP 1',
  'In our times, too, more especially through the instrumentality of Gassendi, this conception of atoms has been renewed\n- HoP 1',
  'The atoms of Leucippus are, however, not molecules, the small particles of Physics\n- HoP 1',
  'In Leucippus, according to Aristotle, (De gen et corr\n- HoP 1',
  'The One can neither be seen nor shown w

In [181]:
# import the original set of quotes and prepare it for merging
old_quotes = pd.read_csv('Original_Quote_sheet.csv')
old_quotes = old_quotes.drop('Unnamed: 0', axis=1)
old_quotes = old_quotes.rename(columns={'Select one from each column':'quotes'})
old_quotes = old_quotes.iloc[3:]
old_quotes['quotes'] = old_quotes['quotes'].str.capitalize()

In [182]:
# turn the list into a dataframe and weed out untweetabley-long quotes
quote_df = pd.DataFrame(master_q, columns=['quotes'])
quote_df = old_quotes.append(quote_df)
quote_df['length'] = quote_df['quotes'].str.len()
quote_tweetable = quote_df.loc[quote_df['length'] <= 240].copy()

# preview again, see how many we have
quote_tweetable.iloc[random_range_start:random_range_start + 10], len(quote_tweetable), len(quote_df)

(                                                 quotes  length
 3157  If I consider the animal merely as externally ...   187.0
 3158  But it is just as one-sided to say that the pl...   173.0
 3160  Thus, for the first time, we reach the determi...   111.0
 3161  These explanations are necessary here, since h...   208.0
 3162  This is the meaning which is present when we r...   123.0
 3165  So far, the ancients really got: it does not s...    61.0
 3167  With this discovery of thought we conclude the...   101.0
 3168  The profit to be derived from the first period...    72.0
 3169  Some, indeed, think that there is still some s...   160.0
 3170  Thought here has but few determinations—water,...   224.0,
 14263,
 18719)

In [177]:
# export csv for use by tweeter program
quote_tweetable.to_csv('Quote List.csv')