In [1]:
import pandas as pd
import requests
import tweepy
import numpy as np
import re

bad_char_list = ['1', '2', '3', '4', '5', '6', '7', '8', '9,' '0', '-', ',']

# this function looks at quotes and eliminates opening characters we don't want, returns a list of cleaned quotes
def remove_bad_starts(quotes):
    new_quotes = []
    for quote in quotes:
        if quote[0] in bad_char_list:
            if quote[1] in bad_char_list:
                new_quotes.append(quote[3:].strip())
                continue
            new_quotes.append(quote[2:].strip())
            continue
        new_quotes.append(quote)
    return new_quotes

def get_and_clean_Guten(url):
    # retrieve the source text
    r = requests.get(url)
    text = r.text
    text = text.replace("\r\n", ' ')
    # remove unicode characters and other unnecessaries
    text = text.replace('â\x80\x9d', ' ')
    text = text.replace('â\x80\x94', '-')
    text = text.replace('â\x80\x9c', '')
    text = text.replace('â\x80\x99', '')
    text = text.replace('â\x80\x98', '')
    text = text.replace('\x86', '')
    text = text.replace('Å\x93', '')
    # remove abbreviative periods and other unwanted periods and characters
    text = re.sub(r".(?=[^)(]*\))", "", text)
    text = text.replace('_', '')
    text = text.replace('[', '').replace(']', '')
    text = text.replace('e.g.', 'eg')
    text = text.replace('i.e.', 'ie')
    text = text.replace('etc.', 'etc')
    text = text.replace('&c.', '&c')
    text = text.replace('viz.', 'viz')
    text = text.replace('(', '').replace(')', '')
    return text

def make_into_quotes(text, source):
    # make a list of quotes and clean them up
    quotes = text.split('.')
    # remove unnecessary spaces
    quotes = [x.strip() for x in quotes]
    # remove empty quotes
    quotes = list(filter(None, quotes))
    # cut out very short ones
    quotes = [x for x in quotes if len(x) > 15]
    # remove the titles of sections
    quotes = [x for x in quotes if not x.isupper()]
    quotes = [x for x in quotes if not x.replace('the', '').replace('of', '').replace('and', '').istitle()]
    # remove oddities
    quotes = remove_bad_starts(quotes)
    quotes = [x for x in quotes if x[0].isupper()]
    quotes = [x for x in quotes if x[-1] not in bad_char_list]

    # add the source
    quotes = [x+'. \n- '+source for x in quotes]

    return quotes

In [36]:
# import different texts, cut out their front and end matter
hop1 = get_and_clean_Guten('http://www.gutenberg.org/files/51635/51635-0.txt')[10520:-32300]
hop2 = get_and_clean_Guten('http://www.gutenberg.org/files/51636/51636-0.txt')[4469:-40015]
hop3 = get_and_clean_Guten('http://www.gutenberg.org/files/58169/58169-0.txt')[10056:-119754]

In [37]:
# turn these texts into quotes and assemble a list
hop1_quotes = make_into_quotes(hop1, 'HoP 1')
hop2_quotes = make_into_quotes(hop2, 'HoP 2')
hop3_quotes = make_into_quotes(hop3, 'HoP 3')

master_q = hop1_quotes + hop2_quotes + hop3_quotes
master_q[0:10], len(master_q)


(['Since the History of Philosophy is to be the subject of these lectures, and to-day I am making my first appearance in this University, I hope you will allow me to say what satisfaction it gives me to take my place once more in an Academy of Learning at this particular time. \n - HoP 1',
  'For the period seems to have been arrived at when Philosophy may again hope to receive some attention and love-this almost dead science may again raise its voice, and hope that the world which had become deaf to its teaching, may once more lend it an ear. \n - HoP 1',
  'The necessities of the time have accorded to the petty interests of every-day life such overwhelming attention: the deep interests of actuality and the strife respecting these have engrossed all the powers and the forces of the mind-as also the necessary means-to so great an extent, that no place has been left to the higher inward life, the intellectual operations of a purer sort; and the better natures have thus been stunted in t

In [3]:
# import the existing set of quotes and prepare it for merging
old = pd.read_csv('Original_Quote_sheet.csv')
old = old.drop('Unnamed: 0', axis=1)
old = old.rename(columns={'Select one from each column':'quotes'})
old = old.iloc[3:]
old['quotes'] = old['quotes'].str.capitalize()

Unnamed: 0,quotes
3,Logic did not fare quite so badly as metaphysics
4,Even the proofs of the existence of god are ci...
5,The fact is that there no longer exists any in...
6,[kantian philosophy] was a justification from ...
7,There was seen the strange spectacle of a cult...


In [39]:
# turn the list into a dataframe and weed out untweetabley-long quotes
quote_df = pd.DataFrame(master_q, columns=['quotes'])
quote_df = old.append(quote_df)
quote_df['length'] = quote_df['quotes'].str.len()
quote_tweetable = quote_df.loc[quote_df['length'] <= 240].copy()

quote_tweetable.head(), len(quote_tweetable), len(quote_df)

(                                              quotes  length
 3   Logic did not fare quite so badly as metaphysics    48.0
 4  Even the proofs of the existence of god are ci...   136.0
 5  The fact is that there no longer exists any in...   121.0
 6  [kantian philosophy] was a justification from ...   113.0
 7  There was seen the strange spectacle of a cult...   139.0,
 11704,
 15579)

In [6]:
# export csv for use by tweeter program
quote_tweetable.to_csv('Quote List.csv')

In [25]:

rand = np.random.randint(0, len(quote_df))
rand_quote = quote_df.iloc[rand]['quotes']

'Thus he at first  proceeds from this point to movement , and says that it is essential that a philosophy of nature should speak of it, but that it is difficult to grasp; in fact, it is one of the most difficult conceptions. \n - HoP 2'

In [38]:
len(list(quote_df['quotes'].loc[quote_df['quotes'].str.contains('viz', na=True)]))

40

In [28]:
quote_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15584 entries, 3 to 15184
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   quotes  15583 non-null  object 
 1   length  15583 non-null  float64
dtypes: float64(1), object(1)
memory usage: 1005.2+ KB
