Commit
Justin Littman committed Jun 3, 2018
1 parent 1bbed3d · commit 7c3f778
Showing 4 changed files with 305 additions and 1 deletion.

.gitignore
@@ -102,3 +102,5 @@ venv.bak/

# mypy
.mypy_cache/

.idea/

README.md
@@ -1,2 +1,58 @@
# fb-ad-archive-scraper
Scraper for Facebook's [Archive of Ads with Political Content](https://www.facebook.com/politicalcontentads) _... until Facebook provides an API._

fb-ad-archive-scraper will produce:
* A CSV file containing the text and metadata of the ads (see the column sketch below).
* Screenshots of each ad.
* A README file.

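The CSV columns correspond to the fields the scraper collects for each ad (the keys of `blank_ad()` in `scraper.py` below), so the header row should look like this sketch:

    ad_count,page,is_active,start,end,paid_for_by,title,text
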
Like any scraper, fb-ad-archive-scraper is fragile. It will break if Facebook changes the structure or code of the Archive. If fb-ad-archive-scraper breaks, let me know.

Tickets / PRs are welcome.

## Install
1. Clone the repo:

        git clone https://github.com/justinlittman/fb-ad-archive-scraper.git

2. Change to the directory:

        cd fb-ad-archive-scraper

3. Optionally, create a virtual environment:

        virtualenv -p python3 ENV
        source ENV/bin/activate

4. Install requirements:

        pip install -r requirements.txt

5. [Install Chromedriver](https://sites.google.com/a/chromium.org/chromedriver/). On a Mac, this is:

        brew install chromedriver
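
To sanity-check the install, confirm that Chromedriver is on your PATH (`--version` is a standard Chromedriver flag):

    chromedriver --version
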
## Usage

    usage: scraper.py [-h] [--limit LIMIT] email password query [query ...]

    Scrape Facebook's Archive of Ads with Political Content

    positional arguments:
      email          Email address for FB account
      password       Password for FB account
      query          Query

    optional arguments:
      -h, --help     show this help message and exit
      --limit LIMIT  Limit on number of ads to scrape

For example:

    python scraper.py fbuser@gmail.com password pelosi

Notes:
* fb-ad-archive-scraper uses a headless Chrome browser, so you will not see the browser at work.
* The output of each run is placed in a separate directory and includes a README, a CSV file, and PNG images.
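
The headless setting comes from this part of `scraper.py`; as a debugging suggestion (not an option the script exposes), removing the `headless` argument runs a visible browser:

    options = webdriver.ChromeOptions()
    options.add_argument("headless")  # omit this line to watch the browser
    driver = webdriver.Chrome(options=options)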

requirements.txt
@@ -0,0 +1,2 @@
Pillow==5.1.0
selenium==3.12.0

scraper.py
@@ -0,0 +1,244 @@
from time import sleep
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from PIL import Image
from io import BytesIO
from collections import deque
from urllib.parse import urlencode
from datetime import datetime
import os
import csv
import argparse


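# Breadth-first search of the divs under #content for the element styled with
# the ad card's border; its class attribute is returned so that every ad on
# the page can later be selected by class.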
def find_ad_class(driver):
    divs = deque([driver.find_element_by_id('content')])
    while divs:
        div = divs.popleft()
        if '1px solid rgb(233, 234, 235)' == div.value_of_css_property('border'):
            return div.get_attribute('class')
        divs.extend(div.find_elements_by_xpath('div'))
    return None


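# The same breadth-first walk, but for the fixed-position top navigation bar,
# which is unfixed before screenshots so it does not repeat in every scrolled
# screenshot slice.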
def find_topnav_div(driver):
    divs = deque([driver.find_element_by_id('content')])
    while divs:
        div = divs.popleft()
        if 'fixed' == div.value_of_css_property('position'):
            return div
        divs.extend(div.find_elements_by_xpath('div'))
    return None


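# The 'See More' link loads the next page of results; None signals the last
# page has been reached.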
def find_next_link(driver):
    try:
        return driver.find_element_by_link_text('See More')
    except NoSuchElementException:
        return None


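# Template for a single ad's record; the keys double as the CSV fieldnames
# passed to csv.DictWriter in main().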
def blank_ad():
    return {'ad_count': None,
            'page': None,
            'is_active': None,
            'start': None,
            'end': None,
            'paid_for_by': None,
            'title': None,
            'text': None}


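# Screenshot and parse each ad card: the title, 'Paid for by' attribution,
# active status, and run dates are read from the card's text, and one CSV row
# is written per ad. Returns the updated running ad count.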
def process_ad_divs(ad_divs, ad_count, page_count, driver, writer, dirname, ad_limit):
    # Add whitespace to bottom to allow scrolling to bottom row
    window_height = driver.execute_script('return window.innerHeight')
    driver.execute_script("arguments[0].setAttribute('style', 'margin-bottom:{}px;')".format(window_height),
                          ad_divs[-1])
    for ad_div in ad_divs:
        ad_count += 1
        print('Ad {}'.format(ad_count))
        screenshot(ad_div, ad_count, dirname, driver)
        ad = blank_ad()
        ad['ad_count'] = ad_count
        ad['page'] = page_count
        ad['title'] = ad_div.find_element_by_xpath('.//a[text()]').text
        for span in ad_div.find_elements_by_xpath('.//span[text()]'):
            if span.text.startswith('Paid for by '):
                ad['paid_for_by'] = span.text[12:]
        for pos, div in enumerate(ad_div.find_elements_by_xpath('.//div[text()]')):
            if pos == 0:
                ad['is_active'] = div.text == 'Active'
            elif pos == 1:
                if div.text.startswith('Started running on '):
                    ad['start'] = div.text[19:]
                else:
                    split_text = div.text.split(' - ')
                    ad['start'] = split_text[0]
                    ad['end'] = split_text[1]
            elif not ('See Ad Performance' in div.text or (pos == 2 and div.text.startswith('Sponsored'))):
                if ad['text'] is None:
                    ad['text'] = div.text.replace('\n', ' ')
                else:
                    ad['text'] = ' '.join((ad['text'], div.text.replace('\n', ' ')))
        writer.writerow(ad)
        if ad_limit == ad_count:
            break

    return ad_count


def class_to_css_selector(clazz):
    # This handles compound class names.
    return ".{}".format(clazz.replace(' ', '.'))


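# Capture one ad by scrolling it through the viewport, stitching the viewport
# screenshots, and cropping to the ad's bounds. The hard-coded factor of 2
# appears to assume a device pixel ratio of 2 (e.g., a Retina display), where
# screenshot pixels are double the CSS pixel coordinates.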
def screenshot(ad_div, ad_count, dirname, driver):
    window_height = driver.execute_script('return window.innerHeight')
    ad_top = ad_div.location['y']
    ad_height = ad_div.size['height']
    ad_bottom = ad_top + ad_height
    ad_left = ad_div.location['x']
    ad_right = ad_left + ad_div.size['width']

    offset = ad_top
    slices = []
    img_height = 0
    while offset < ad_bottom:
        driver.execute_script("window.scrollTo(0, %s);" % offset)
        img = Image.open(BytesIO(driver.get_screenshot_as_png()))
        img_height += img.size[1]
        slices.append(img)
        offset += window_height

    screenshot_img = Image.new('RGB', (slices[0].size[0], img_height))
    offset = 0

    for img in slices:
        screenshot_img.paste(img, (0, offset))
        offset += img.size[1]

    screenshot_img.crop((ad_left * 2, 0, ad_right * 2, ad_height * 2)).save('{}/ad-{:04}.png'.format(dirname, ad_count))


def write_readme(dirname, timestamp, q, limit):
    with open('{}/README.txt'.format(dirname), 'w') as readme:
        readme.write('Scrape of Facebook Archive of Ads with Political Content\n')
        readme.write('Performed by fb-ad-archive-scraper (https://github.com/justinlittman/fb-ad-archive-scraper).\n\n')
        readme.write('Query: {}\n'.format(q))
        readme.write('Started: {}\n'.format(timestamp.isoformat()))
        if limit:
            readme.write('Limit: {}'.format(limit))


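# Full-page variant of screenshot(): scroll the whole document, stitch the
# slices, and crop the last slice to the remaining height (again assuming a
# device pixel ratio of 2). Nothing else in this file calls it.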
def fullpage_screenshot(driver, filename):
    scrollheight = driver.execute_script('return Math.max( document.body.scrollHeight, '
                                         'document.body.offsetHeight, '
                                         'document.documentElement.clientHeight, '
                                         'document.documentElement.scrollHeight, '
                                         'document.documentElement.offsetHeight);')
    windowheight = driver.execute_script('return window.innerHeight')
    slices = []
    offset = 0
    imgheight = 0
    last_offset = 0
    while offset < scrollheight:
        driver.execute_script("window.scrollTo(0, %s);" % offset)
        img = Image.open(BytesIO(driver.get_screenshot_as_png()))
        imgheight += img.size[1]
        last_offset = offset
        offset += windowheight
        slices.append(img)

    screenshot_img = Image.new('RGB', (slices[0].size[0], imgheight))
    offset = 0
    # Crop the last image.
    width = slices[0].size[0]
    height = slices[0].size[1]
    last_height = (scrollheight - last_offset) * 2
    slices[-1] = slices[-1].crop((0, height - last_height, width, height))

    for img in slices:
        screenshot_img.paste(img, (0, offset))
        offset += img.size[1]

    screenshot_img.save(filename)


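# Entry point: create a timestamped output directory, write the README, then
# drive a headless Chrome session through login, the search, and 'See More'
# pagination, screenshotting and recording each ad along the way.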
def main(q, fb_email, fb_password, ad_limit=None):
    timestamp = datetime.now()
    # Create directory
    dirname = '{}-{}'.format(q.replace(' ', '_'), timestamp.strftime('%Y%m%d%H%M%S'))
    os.makedirs(dirname)
    write_readme(dirname, timestamp, q, ad_limit)

    options = webdriver.ChromeOptions()
    options.add_argument("headless")
    driver = webdriver.Chrome(options=options)
    driver.implicitly_wait(10)
    try:
        driver.get(
            'https://www.facebook.com/politicalcontentads/?{}'.format(urlencode({'active_status': 'all', 'q': q})))
        driver.find_element_by_name('email').send_keys(fb_email)
        driver.find_element_by_name('pass').send_keys(fb_password)
        driver.find_element_by_name('login').click()
        sleep(5)

        try:
            driver.find_element_by_xpath('//span[text()="Log into Facebook"]')
            print('Login failed')
            return
        except NoSuchElementException:
            print('Login succeeded')

        # Has results
        try:
            driver.find_element_by_xpath('//div[contains(text(),"There are no ads matching")]')
            print('No results')
            return
        except NoSuchElementException:
            pass

        # Fix topnav for screenshots
        topnav_div = find_topnav_div(driver)
        assert topnav_div
        driver.execute_script("arguments[0].setAttribute('style', 'position: absolute; top: 0px;')", topnav_div)

        # Find the ad class
        print('Finding ad class')
        ad_clazz = find_ad_class(driver)
        assert ad_clazz

        with open('{}/ads.csv'.format(dirname), 'w') as csv_file:
            writer = csv.DictWriter(csv_file, fieldnames=blank_ad().keys())
            writer.writeheader()

            page = 1
            ad_count = 0
            ad_divs = driver.find_elements_by_css_selector(class_to_css_selector(ad_clazz))
            print("Processing {} ads on page {}".format(len(ad_divs), page))
            ad_count = process_ad_divs(ad_divs, ad_count, page, driver, writer, dirname, ad_limit)
            next_link = find_next_link(driver)
            while next_link and ad_limit != ad_count:
                driver.execute_script("return arguments[0].scrollIntoView(true);", next_link)
                next_link.click()
                page += 1
                sleep(5)
                ad_divs = driver.find_elements_by_css_selector(class_to_css_selector(ad_clazz))
                print("Processing {} ads on page {}".format(len(ad_divs) - ad_count, page))
                ad_count = process_ad_divs(ad_divs[ad_count:], ad_count, page, driver, writer, dirname, ad_limit)
                next_link = find_next_link(driver)

    finally:
        driver.close()
        driver.quit()


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Scrape Facebook\'s Archive of Ads with Political Content')
    parser.add_argument('email', help='Email address for FB account')
    parser.add_argument('password', help='Password for FB account')
    parser.add_argument('query', help='Query', nargs='+')
    parser.add_argument('--limit', help='Limit on number of ads to scrape', type=int)

    args = parser.parse_args()
    main(' '.join(args.query), args.email, args.password, ad_limit=args.limit)