# Text Extractor for Satoshi Nakamoto Posts

In [1]:
import os

import requests
from bs4 import BeautifulSoup

### Extract links from main site

In [2]:
# Use requests to grab the raw content of the posts index page.
# The timeout keeps this cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of letting the later
# cells parse an error page as if it were the index.
posts_rawlist = requests.get('http://satoshi.nakamotoinstitute.org/posts/', timeout=30)
posts_rawlist.raise_for_status()

In [3]:
# Parse the downloaded HTML text into a BeautifulSoup tree using the html5lib parser
soup = BeautifulSoup(posts_rawlist.text, features="html5lib")

In [4]:
# Locate the div containing the 'li' elements with the post links;
# the div is looked up by its full class string
container_class = 'col-sm-6 col-sm-offset-3 col-md-6 col-md-offset-3 col-lg-6 col-lg-offset-3'
list_items = soup.find(class_=container_class)

In [5]:
# Collect every anchor ('a') tag found inside the div located above
link_items = list_items.find_all(name='a')

In [6]:
# Build the list of post URLs: keep only the anchors that carry an href
# and prefix each href with the site root
links = [
    "http://satoshi.nakamotoinstitute.org" + anchor['href']
    for anchor in link_items
    if anchor.has_attr('href')
]

In [7]:
# Preview the first seven entries to confirm the URLs were assembled properly
links[0:7]

['http://satoshi.nakamotoinstitute.org/posts/p2pfoundation/',
 'http://satoshi.nakamotoinstitute.org/posts/bitcointalk/',
 'http://satoshi.nakamotoinstitute.org/posts/p2pfoundation/1/',
 'http://satoshi.nakamotoinstitute.org/posts/p2pfoundation/2/',
 'http://satoshi.nakamotoinstitute.org/posts/p2pfoundation/3/',
 'http://satoshi.nakamotoinstitute.org/posts/bitcointalk/5/',
 'http://satoshi.nakamotoinstitute.org/posts/bitcointalk/6/']

### Crawl the links for text

In [8]:
# define crawler function
def crawler(links, output_dir='posts', timeout=30):
    """Download each post page and save its text to a numbered .txt file.

    For every URL in ``links`` the page is fetched and parsed, the text of the
    second ``div`` found inside the page's content container is extracted, and
    that text is written to ``<output_dir>/post_<n>.txt``, where ``n`` counts
    the links starting at 1.

    Parameters
    ----------
    links : iterable of str
        URLs of the post pages to scrape.
    output_dir : str, optional
        Directory that receives the text files; created if it does not exist.
        Defaults to ``'posts'``.
    timeout : float, optional
        Seconds to wait on each HTTP request before giving up. Defaults to 30.

    Returns
    -------
    None

    Raises
    ------
    requests.HTTPError
        If a page responds with an HTTP error status.
    AttributeError
        If the content container is not found on a page.
    IndexError
        If the container holds fewer than two ``div`` elements.
    FileExistsError
        If an output file already exists (files are opened in exclusive-create
        mode, so an earlier crawl is never overwritten).
    """
    # Make sure the target directory exists so the first write cannot fail
    # with FileNotFoundError on a fresh checkout.
    os.makedirs(output_dir, exist_ok=True)

    # iterate through the links, numbering the output files from 1
    for file_count, link in enumerate(links, start=1):
        # Same fetch/parse steps used for the index page; this time we gather
        # the body of text from each post page.
        post_raw = requests.get(link, timeout=timeout)
        # Fail on an HTTP error instead of saving the text of an error page.
        post_raw.raise_for_status()
        post_soup = BeautifulSoup(post_raw.text, "html5lib")
        text_body = post_soup.find(class_='col-sm-6 col-sm-offset-3 col-md-6 col-md-offset-3 col-lg-6 col-lg-offset-3')
        text = text_body.find_all('div')
        # presumably the first div is page metadata and the second is the post
        # body -- TODO confirm against the page markup
        post_content = text[1]

        output_path = os.path.join(output_dir, 'post_%s.txt' % file_count)
        # 'x' refuses to overwrite an existing file; the explicit UTF-8 encoding
        # avoids platform-dependent UnicodeEncodeError on non-ASCII post text,
        # and the with-block closes the file even if the write fails.
        with open(output_path, 'x', encoding='utf-8') as output_file:
            output_file.write(post_content.get_text())
    

In [9]:
# The div containing the links collected above has leading and trailing meta links that we don't need to scrape;
# slicing the list with the bounds below excludes those links
first_post, past_last_post = 2, 545
links_no_ends = links[first_post:past_last_post]
len(links_no_ends)

543

In [10]:
# Crawl the trimmed link list to write one text document per post (about 543 files expected)
crawler(links=links_no_ends)