In [5]:
import calendar
import csv
import json
import os
import pprint
from datetime import datetime, timedelta

import facebook
from tqdm import *

# Helper Functions

In [6]:
# Format datetime from raw date to python datetime object, convert from UTC to UTC+8 (Local time)
def format_datetime(raw_datetime):
    newdatetime = datetime.strptime(raw_datetime, "%Y-%m-%dT%H:%M:%S+0000")
    
    timestamp = calendar.timegm(newdatetime.timetuple())
    local_dt = datetime.fromtimestamp(timestamp)
    assert newdatetime.resolution >= timedelta(microseconds=1)
    return local_dt.replace(microsecond=newdatetime.microsecond)


# Main Code

In [7]:
# Facebook access token hardcoded. Get this from Facebook Graph Explorer
ACCESS_TOKEN = ""

# Earliest create_date desired. Posts that go back to this date will be scraped.
END_DATE = datetime.strptime("2014-01-01 00:00:00", "%Y-%m-%d %H:%M:%S")

In [8]:
# Making API call to get all posts for given ID (in this case, politician's page)
graph = facebook.GraphAPI(access_token=ACCESS_TOKEN, version='2.7')

# Edit"id" below to specify page to scrape
#all_posts = graph.get_object(id='leehsienloong/posts?limit=100')
all_posts = graph.get_object(id='k.shanmugam.page/posts?limit=100')

In [9]:
# Creating a list of post IDs from the JSON data (returned from API call)  
post_id_list= []

for post in all_posts["data"]:
    post_id_list.append(post["id"])
    
cur_date = format_datetime(all_posts["data"][99]["created_time"])

In [10]:
# Each API call returns a max of 100 posts. Iterate through "next page" link at end of every API call results page. 
# Stop when desired END DATE reached.

while cur_date > END_DATE:
    raw_next_page_url = all_posts["paging"]["next"]
    next_page_url = raw_next_page_url[raw_next_page_url.find("v2.7/")+len("v2.7/"):]
    
    all_posts = graph.get_object(id=next_page_url)
    for post in all_posts["data"]:
        cur_date = format_datetime(post["created_time"])
        if cur_date > END_DATE:
            post_id_list.append(post["id"])

print "No. of posts:",len(post_id_list)
print "Date of oldest post:",format_datetime(all_posts["data"][29]["created_time"])

No. of posts: 850
Date of oldest post: 2014-01-18 00:17:07


In [11]:
# Loop through each post, retrieve relevant post information (message, likes count, reactions count, comments count)
# Store data in list. Each element is still in JSON format.
post_desc_list = []
fields_required = "created_time, message, shares, comments.limit(0).summary(true), likes.limit(0).summary(true),reactions.type(LOVE).limit(0).summary(total_count).as(reactions_love),reactions.type(WOW).limit(0).summary(total_count).as(reactions_wow),reactions.type(HAHA).limit(0).summary(total_count).as(reactions_haha),reactions.type(SAD).limit(0).summary(total_count).as(reactions_sad),reactions.type(ANGRY).limit(0).summary(total_count).as(reactions_angry)"

for ids in tqdm(post_id_list):
    post_desc_list.append(graph.get_object(id=ids, fields=fields_required))

100%|██████████| 850/850 [04:15<00:00,  3.89it/s]


In [12]:
# Loop through post_desc_list, Each interation, retrieve relevant info, create row and write into csv file

headers = ["No.","created_time","status","comments_count","shares_count","likes_count","angry_count","haha_count","love_count","sad_count","wow_count"]

with open('datafile_k_shanmugam.csv', 'wb') as f:
    w = csv.writer(f, delimiter=",")
    w.writerow(headers)
    count = 1
    for post_desc in post_desc_list:
        
        created_time = format_datetime(str(post_desc["created_time"]))
        
        try: 
            status_str = str(post_desc["message"].encode('utf8'))
        except KeyError:
            status_str = ""
        
        comments_count = str(post_desc["comments"]["summary"]["total_count"])
        
        try:
            shares_count = str(post_desc["shares"]["count"])
        except KeyError:
            shares_count = 0
        
        likes_count = str(post_desc["likes"]["summary"]["total_count"])
        angry_count = str(post_desc["reactions_angry"]["summary"]["total_count"])
        haha_count = str(post_desc["reactions_haha"]["summary"]["total_count"])
        love_count = str(post_desc["reactions_love"]["summary"]["total_count"])
        sad_count = str(post_desc["reactions_sad"]["summary"]["total_count"])
        wow_count = str(post_desc["reactions_wow"]["summary"]["total_count"])
        
        row_string = [count,created_time,status_str,comments_count,shares_count,likes_count,angry_count,haha_count,love_count,sad_count,wow_count]
        w.writerow(row_string)
        count += 1

print count-1,"posts' data stored in CSV successfully"

850 posts' data stored in CSV successfully
