# Scraping blog post content

In [27]:
import csv
import urllib.request
from pathlib import Path, PurePosixPath
from urllib.parse import urlsplit

import pandas as pd
import requests
from bs4 import BeautifulSoup

Define base url

In [2]:
# Root of the blog. get_posts concatenates "<year>/<month>/<day>/<title>/" directly
# onto this string, so the trailing slash is required.
url_base = "https://thelilacelephant.com/"

Example full URL

In [17]:
# Illustrative only: the shape of URL that get_posts assembles for each dictionary entry.
# Nothing visible in this notebook reads this module-level value (get_posts assigns
# its own local post_url).
post_url = "https://thelilacelephant.com/2011/07/18/the-naming-of-things/"

Create post dictionary

In [4]:
# One tuple per blog post: (year, month, day, title slug), in the order the posts are listed.
# All fields are strings; month and day keep their leading zero because they are
# used verbatim as URL path segments.
_POST_ENTRIES = [
    ("2011", "07", "13", "here-i-am"),
    ("2011", "07", "18", "the-naming-of-things"),
    ("2011", "10", "30", "a-few-words"),
    ("2011", "11", "05", "mirror-mirror"),
    ("2011", "11", "23", "home-safe"),
    ("2011", "12", "04", "give-thanks"),
    ("2011", "12", "15", "family"),
    ("2012", "01", "04", "whats-cooking"),
    ("2012", "02", "19", "hello-old-friend"),
    ("2012", "02", "23", "i-wished-i-could-fly"),
    ("2012", "03", "04", "stay"),
    ("2012", "03", "15", "shaken"),
    ("2012", "05", "04", "april"),
    ("2012", "05", "13", "for-george"),
    ("2012", "09", "01", "favourite-photographs"),
    ("2012", "11", "16", "little-boxes"),
    ("2012", "12", "12", "food-love-etc"),
    ("2013", "01", "24", "family-recipes"),
    ("2013", "04", "07", "something-about-faith"),
    ("2013", "06", "13", "where-it-goes"),
    ("2013", "09", "26", "more-than-words"),
    ("2014", "03", "05", "in-progress"),
    ("2014", "05", "04", "what-im-reading"),
    ("2014", "10", "20", "the-land-of-stones"),
    ("2014", "10", "28", "spark"),
    ("2014", "11", "06", "burn"),
    ("2014", "11", "11", "washing-dishes"),
    ("2015", "04", "27", "tiny-cracks"),
    ("2015", "07", "21", "marys-island"),
    ("2015", "09", "21", "here"),
    ("2016", "07", "12", "knowing"),
    ("2016", "07", "25", "run"),
    ("2016", "10", "19", "gifts"),
    ("2017", "05", "29", "bloom"),
]

# Keys are the 1-based position of each post as a string ("1" .. "34");
# each value holds the four URL components under "year", "month", "day" and "title".
posts_dictionary = {
    str(position): {"year": year, "month": month, "day": day, "title": slug}
    for position, (year, month, day, slug) in enumerate(_POST_ENTRIES, start=1)
}

In [24]:
def get_posts(page_dictionary):
    """Download every post described in ``page_dictionary`` and pull out its parts.

    Parameters
    ----------
    page_dictionary : dict
        Mapping whose values are dicts with ``year``, ``month``, ``day`` and
        ``title`` entries. The four values are joined with "/" and appended to
        the module-level ``url_base`` to form each post URL. The keys are not used.

    Returns
    -------
    tuple of (list, list, list)
        ``(post_titles, post_content, image_links)``. Each list receives exactly
        one element per dictionary entry, so the three lists are index-aligned:

        * ``post_titles``  - text of the first ``<h1>`` in the post ("" if absent)
        * ``post_content`` - result of ``find_all`` for the post's ``<p>`` tags
          with ``class=''`` (``[]`` if the post could not be located)
        * ``image_links``  - URL of the first ``<img>``, preferring its
          ``data-large-file`` attribute over ``src`` ("" if there is no image)
    """
    post_content = []
    post_titles = []
    image_links = []

    url_parts = ("year", "month", "day", "title")
    for page_number, entry in enumerate(page_dictionary.values(), start=1):
        # Build url based on dictionary entry
        post_url = url_base + "/".join(str(entry[part]) for part in url_parts) + "/"
        print("Processing page: " + str(page_number))

        # A timeout keeps one unresponsive request from hanging the whole run
        page = requests.get(post_url, timeout=30)

        # Name the parser explicitly: without it BeautifulSoup picks whichever
        # parser happens to be installed, which can differ between machines
        soup = BeautifulSoup(page.content, "html.parser")

        # Find blog post
        bodyarticle = soup.find('article', attrs={'class': 'post'})
        if bodyarticle is None:
            # e.g. an error page: record placeholders so the three lists stay aligned
            print("No post found at: " + post_url)
            post_titles.append("")
            image_links.append("")
            post_content.append([])
            continue

        # Get post title
        heading = bodyarticle.find('h1')
        post_titles.append(heading.text if heading is not None else "")

        # Get post image (first <img> only)
        image = bodyarticle.find('img')
        if image is None:
            image_links.append("")
        elif image.has_attr('data-large-file'):
            image_links.append(image.attrs['data-large-file'])
        else:
            image_links.append(image.attrs.get('src', ""))

        # Get post body
        post_content.append(bodyarticle.find_all('p', attrs={'class': ''}))

    return post_titles, post_content, image_links

In [59]:
# Scrape every post listed in posts_dictionary (one HTTP request per entry).
# get_posts appends one element to each list per post, so index i of the three
# lists refers to the same post.
post_titles, post_content, image_links = get_posts(posts_dictionary)

Processing page: 1
Processing page: 2
Processing page: 3
Processing page: 4
Processing page: 5
Processing page: 6
Processing page: 7
Processing page: 8
Processing page: 9
Processing page: 10
Processing page: 11
Processing page: 12
Processing page: 13
Processing page: 14
Processing page: 15
Processing page: 16
Processing page: 17
Processing page: 18
Processing page: 19
Processing page: 20
Processing page: 21
Processing page: 22
Processing page: 23
Processing page: 24
Processing page: 25
Processing page: 26
Processing page: 27
Processing page: 28
Processing page: 29
Processing page: 30
Processing page: 31
Processing page: 32
Processing page: 33
Processing page: 34


In [28]:
# Save posts to csv
# A single row is written, with one cell per post (each post's scraped paragraphs as text).
# The encoding is set explicitly so the scraped text is not written in whatever the
# platform default happens to be.
with open('posts.csv', 'w', newline='', encoding='utf-8') as csvfile:
    wr = csv.writer(csvfile, quoting=csv.QUOTE_ALL)
    wr.writerow(post_content)

In [29]:
# Save titles to csv
# A single row is written, with one cell per post title.
# The encoding is set explicitly so the scraped text is not written in whatever the
# platform default happens to be.
with open('titles.csv', 'w', newline='', encoding='utf-8') as csvfile:
    wr = csv.writer(csvfile, quoting=csv.QUOTE_ALL)
    wr.writerow(post_titles)

In [30]:
# Save links to csv
# A single row is written, with one cell per image link ("" for posts without an image).
# The encoding is set explicitly so the output does not depend on the platform default.
with open('links.csv', 'w', newline='', encoding='utf-8') as csvfile:
    wr = csv.writer(csvfile, quoting=csv.QUOTE_ALL)
    wr.writerow(image_links)

In [31]:
# Save all images locally
image_dir = Path("lilacelephant_images")
# urlretrieve writes straight to the given path and fails if the folder is missing
image_dir.mkdir(parents=True, exist_ok=True)

for link in image_links:
    if not link:
        # Posts without an image were recorded as ""
        continue
    # Name the file after the last path segment of the URL. Parsing the URL,
    # rather than slicing at a fixed character offset, ignores any query string
    # and works for any host, folder depth or file extension.
    remote_file = PurePosixPath(urlsplit(link).path)
    extension = remote_file.suffix or ".jpg"
    urllib.request.urlretrieve(link, str(image_dir / (remote_file.stem + extension)))

In [60]:
def process_post_content(post, collected=None):
    """Append the text of each kept element of ``post`` to an accumulator list.

    Elements whose text is exactly "", "Tags" or "Gallery" are skipped.

    Parameters
    ----------
    post : iterable
        Objects exposing a ``.text`` attribute (here, the ``<p>`` elements
        scraped for one post).
    collected : list, optional
        List to append to. When omitted, the module-level ``post_text`` list is
        used, so ``post_text`` must exist before the function is called that way.

    Returns
    -------
    list
        The same list that was appended to (not a copy).
    """
    if collected is None:
        # Original behaviour: accumulate into the notebook-level list
        collected = post_text

    skipped = {"", "Tags", "Gallery"}
    for child in post:
        text = str(child.text)
        if text not in skipped:
            collected.append(text)
    return collected

In [70]:
# Flat list of paragraph texts across all posts.
# process_post_content appends to this module-level list in place (and returns
# the same list), so it must be reset here before the loop runs.
post_text = []

for post in post_content:
    process_post_content(post)

In [72]:
# Normalise whitespace: non-breaking spaces become plain spaces, newlines are dropped
post_text_processed = [
    paragraph.replace(u'\xa0', u' ').replace(u'\n', u'')
    for paragraph in post_text
]

In [74]:
# Replace items that are exactly one space with None; every other item is kept
# unchanged. The None placeholders are dropped by the filter in the next cell.
post_text_processed_no_space = [
    None if paragraph == ' ' else paragraph
    for paragraph in post_text_processed
]

In [75]:
# Drop every falsy item: the None placeholders and any empty strings
post_text_processed_no_space = [item for item in post_text_processed_no_space if item]

In [76]:
# Split the flat paragraph list back into one list of paragraphs per post.
post_objects = []
post_tags = []
# Paragraph list of the post currently being filled. Reset on every run so a
# stale list from an earlier execution of this cell is never appended to.
post_object = None

for post_item in post_text_processed_no_space:
    if "\t\t\tPosted" in post_item:
        # Marker item: start a new post; the marker itself is not stored
        post_object = []
        post_objects.append(post_object)
    elif len(post_item.split()) < 4:
        # Heuristic: items of fewer than four words are collected as tags
        post_tags.append(post_item)
    else:
        if post_object is None:
            # Text seen before the first marker: open a post for it instead of
            # failing on an undefined post_object
            post_object = []
            post_objects.append(post_object)
        post_object.append(post_item)

In [81]:
# One row per entry in post_objects (i.e. per post), one column per paragraph position
df = pd.DataFrame(post_objects)

In [82]:
# Transpose so that each post becomes a column and each row a paragraph position
posts_as_columns = df.T

In [86]:
# Save post content to csv (one column per post).
# The dangling "columns=" keyword was removed: it had no value, which is a
# syntax error, and omitting the argument writes every column.
posts_as_columns.to_csv("post_content_temp.csv", sep=",")