In [None]:
import csv
import datetime as dt
import os
import time
from urllib.parse import quote

from selenium import webdriver
from selenium.webdriver.common.keys import Keys

#### This requires a Facebook username and password

In [None]:
# Search configuration.
# Credentials are read from the FB_USERNAME / FB_PASSWORD environment variables so
# they never have to be saved inside the notebook; both default to "" when unset.
username = os.environ.get("FB_USERNAME", "")
password = os.environ.get("FB_PASSWORD", "")
# URL template of the archive search page; "{}" receives the URL-encoded search term.
starting_url = "https://www.facebook.com/politicalcontentads/?active_status=all&q={}"
# user_request is the search term that is used to go search against FB's archive.
user_request = "North Carolina"

In [None]:
# Seed rows for the output CSV: the column header (in the same order as the rows
# assembled in get_data_off_of_page) followed by one empty row.
data = [
    [
        "user_request", "group_id", "group_name", "group_url",
        "utc_start", "utc_end", "ad_duration", "active",
        "sponsored_by", "impressions", "money_spent", "description",
        "title", "title_description", "title_url", "media_type",
    ],
    [],
]

To prevent pop-ups from blocking Selenium, I recommend narrowing the browser window so that it shows three or fewer ad columns. I usually make Selenium use half of my monitor's resolution, and I have no issues.


In [None]:
# Resize and position the Selenium-controlled browser window (values are in pixels).
# NOTE: `driver` is only created in the "Selenium Prep" cell further down, so on a
# fresh kernel this cell has to be run after that one.
driver.set_window_rect(height=1096, width=960, x=960, y=23)

### File Operations

In [10]:
def add_to_csv(data, user_request=None):
    """
    Append rows to today's CSV file for a search term.

    The file is named ``data_<YYYY-MM-DD>_<search term>.csv`` and is created in the
    current working directory on first use; later calls append to it.

    Parameters
    ----------
    data : iterable of rows (each row an iterable of cell values). May be empty,
        in which case nothing is written.
    user_request : optional search term used in the file name. When omitted, the
        notebook-level ``user_request`` variable is used (or "" if it is not
        defined) so that every call site writes to the same file.
    """
    if user_request is None:
        user_request = globals().get("user_request", "")
    # Keep the file name portable: anything that is not alphanumeric, "-" or "_"
    # (spaces, slashes, ...) becomes an underscore.
    label = "".join(
        ch if ch.isalnum() or ch in "-_" else "_" for ch in str(user_request)
    )
    now = dt.datetime.now().date().strftime("%Y-%m-%d")
    # newline='' is what the csv module requires; utf-8 keeps non-ASCII ad text
    # writable regardless of the platform's default encoding.
    with open("data_{}_{}.csv".format(now, label), 'a', newline='', encoding="utf-8") as f:
        writer = csv.writer(f, quoting=csv.QUOTE_NONNUMERIC)
        writer.writerows(data)

### Selenium Prep

You will need to download ChromeDriver (https://sites.google.com/a/chromium.org/chromedriver/) and put it into the `google` folder. Update the path below if you need to. I'm running this on a Mac.

In [None]:
# Launch Chrome through the chromedriver binary at this relative path.
# `driver` is the browser handle that the later cells and functions use.
driver = webdriver.Chrome(executable_path="google/chromedriver")

In [11]:
def what_are_we_searching_for(user_query):
    """
    Percent-encode the user's search term so it can be placed in the archive URL.

    Spaces become "%20" (so "North Carolina" -> "North%20Carolina"); single-word
    terms are returned as they are, and other characters that are not URL-safe
    (e.g. "&", "/") are escaped as well.

    Returns the encoded search term as a string.
    """
    # safe="" so that "/" is escaped too; the term is a query-string value, not a path.
    return quote(user_query, safe="")

In [None]:
# Wait a few seconds for the Selenium-driven browser to finish opening, then
# load the archive search page for the configured search term.
time.sleep(3)
driver.get(starting_url.format(what_are_we_searching_for(user_request)))

In [None]:
def login():
    """
    Log in to Facebook on the login page that is currently open.

    Types the notebook-level ``username`` and ``password`` into the "email" and
    "pass" fields and clicks the "loginbutton" element. A one-second pause is
    taken before and after so the page can settle.
    """
    time.sleep(1)
    driver.find_element_by_id("email").send_keys(username)
    driver.find_element_by_id("pass").send_keys(password)
    driver.find_element_by_id("loginbutton").click()
    time.sleep(1)
# Log in right away: per the login() docstring, the archive requires a logged-in account.
login()

### Archive Scraping Functions

In [13]:
def find_similar_ads(ad_id, title):
    """
    Report when the current page mentions a "similar ads" grouping.

    Facebook looks to be categorizing some ads with a "similar ads" grouping. I've
    found it once but haven't seen it since, so this only logs a message when the
    word "Similar" or "similar" appears anywhere in the page source.

    Parameters
    ----------
    ad_id : identifier included in the log message.
    title : text included in the log message.

    Returns None; the only effect is the printed message (at most once per call).
    """
    page_source = driver.page_source  # read once; both spellings are checked against it
    if any(term in page_source for term in ("Similar", "similar")):
        print("***FOUND SIMILAR ADS from group {} with title {}".format(ad_id, title))
        

In [None]:
def length_of_ad():
    """
    Work out how long the currently open ad has been running.

    Reads the date text and the "Active"/"Inactive" label from the open ad. The
    date text is either a single start date ("Started running on Jul 8, 2018") or
    a range ("Jul 1, 2018 - Jul 8, 2018").

    Returns
    -------
    (utc_start, utc_end, number_of_days, active)
        utc_start / utc_end : "YYYY-MM-DD" strings ("" when not known, e.g. the
            end date of an ad that is still active).
        number_of_days : int number of days; "<1" for an inactive ad that only
            shows a start date; "" when the status label is neither "Active" nor
            "Inactive". A date range that starts and ends on the same day counts
            as 1 day.
        active : the status label exactly as shown on the page.

    Raises whatever Selenium raises if either element is missing, and ValueError
    if a date does not match the "%b %d, %Y" format.
    """
    time_running = driver.find_element_by_css_selector("body > div._10._d2i.uiLayer._4-hy._3qw > div._59s7 > div > div > div > div > div > div > div._jmh > div > div > div._4hg_ > div._4hh0 > div._4hhk > div._4hhm > div._4hhv > span").text
    active = driver.find_element_by_css_selector("body > div._10._d2i.uiLayer._4-hy._3qw > div._59s7 > div > div > div > div > div > div > div._jmh > div > div > div._4hg_ > div._4hh0 > div._4hhk > div._4hhm > div._4hhp > span").text
    date_format = "%b %d, %Y"
    utc_start = ""
    utc_end = ""
    number_of_days = ""

    if time_running.startswith("Started running"):
        if active in ("Active", "Inactive"):
            # Split on the whole word " on " (once) rather than on the letters "on".
            started = dt.datetime.strptime(
                time_running.split(" on ", 1)[1].strip(), date_format
            )
            utc_start = started.strftime("%Y-%m-%d")
            if active == "Active":
                # Still running: duration is measured up to the present moment.
                number_of_days = (dt.datetime.now() - started).days
            else:
                # Inactive with only a start date: started and ended on the same day.
                utc_end = utc_start
                number_of_days = "<1"
    else:
        first, last = time_running.split(" - ")[:2]
        start = dt.datetime.strptime(first, date_format)
        end = dt.datetime.strptime(last, date_format)
        number_of_days = (end - start).days
        if number_of_days == 0:
            # Same start and end date still means the ad ran for (part of) one day.
            number_of_days = 1
        utc_start = start.strftime("%Y-%m-%d")
        utc_end = end.strftime("%Y-%m-%d")

    return utc_start, utc_end, number_of_days, active

In [None]:
def exit_current_ad():
    """
    Close the ad that is currently open.

    Called once the data has been read off an ad: finds the "x" icon and clicks it.
    """
    close_icon_selector = (
        "body > div._10._d2i.uiLayer._4-hy._3qw > div._59s7"
        " > div > div > div > div > div > div"
        " > button > span._3n5s > i"
    )
    driver.find_element_by_css_selector(close_icon_selector).click()

In [12]:
def get_title_details(group, group_id):
    """
    Grab the title blurb of the open ad: the text that appears below the media
    (image/video).

    Grabbing the full div with all of the text and splitting it up afterwards has
    worked better than querying the smaller sections one by one. The text is split
    on newlines; the first three lines are taken as title, description and URL.

    Parameters
    ----------
    group, group_id : only used in the log message printed when no title
        information is available.

    Returns
    -------
    (title, title_description, title_url) : strings; any piece that is not
        present is "". All three are "" when the blurb cannot be found.
    """
    title = ""
    title_description = ""
    title_url = ""

    try:
        full_text = driver.find_element_by_css_selector("body > div._10._d2i.uiLayer._4-hy._3qw > div._59s7 > div > div > div > div > div > div > div._jmh > div > div > div._4hgt > div > div > div._3pym._3b0i").text
    except Exception:
        # Best effort: an ad without a title blurb is reported, not fatal.
        full_text = ""

    if full_text:
        lines = full_text.split("\n")
        # Pad so a blurb with fewer than three lines still unpacks cleanly.
        padded = lines[:3] + [""] * (3 - len(lines))
        title, title_description, title_url = padded
    else:
        print("No title information on ad {}, from group {}".format(group, group_id))

    return title, title_description, title_url

In [14]:
def find_all_ad_performance_elements():
    """
    Collect the clickable elements for the political ads currently on the
    archive page (FB loads 30 at a time).

    Returns the list of matching elements.
    """
    ad_link_selector = "div > div > div > a > div._235y"
    return driver.find_elements_by_css_selector(ad_link_selector)

In [15]:
def get_next_chunk(found, seen):
    """
    Scroll to the last known ad so more ads load, then return the ads that have
    not been visited yet.

    Parameters
    ----------
    found : list of ad elements already collected; its last entry is scrolled
        into view.
    seen : dict mapping element id -> element for ads already handled. It is
        updated in place with every newly discovered ad.

    Returns
    -------
    (unique_list, seen) : the newly discovered elements in page order, and the
        same ``seen`` dict that was passed in.
    """
    # Bring the last element into view and pause while the page loads.
    driver.execute_script("arguments[0].scrollIntoView();", found[-1])
    time.sleep(3)

    fresh = []
    for candidate in find_all_ad_performance_elements():
        if candidate.id in seen:
            continue
        seen[candidate.id] = candidate
        fresh.append(candidate)

    return fresh, seen

In [16]:
def paid_for_data():
    """
    Identify who paid for the ad that is currently open.

    Returns the text of the "paid for" element, or "" when it cannot be found.
    """
    try:
        paid_for = driver.find_element_by_css_selector("""div._6955._6956 > div._1kwg > div._1kwh > div._4ik4._4ik5 > div > span > span._21_y""").text
    except Exception:
        # Best effort: a missing element is not an error. Unlike a bare ``except``,
        # this still lets KeyboardInterrupt / SystemExit stop the crawl.
        paid_for = ""
    return paid_for

In [17]:
def description_data():
    """
    Return the description of the open ad: the text that appears above the media.

    Several CSS selectors are tried in order; the first one that yields non-empty
    text wins. Returns "" when none of them does.
    """
    selectors = [
        "div._4hgt > div > div > div._681i",
        "div._a28 > div > div > div > div._681i",
        "body > div._10._d2i.uiLayer._4-hy._3qw > div._59s7 > div > div > div > div > div > div > div._jmh > div > div > div._4hgt > div > div > div._681i > div"
    ]
    for selector in selectors:
        try:
            text = driver.find_element_by_css_selector(selector).text
        except Exception:
            # This selector did not match; fall through to the next one.
            continue
        if text:
            return text
    return ""

In [None]:
def get_media_type():
    """
    While in an ad, determine what type of ad was run.

    Looks for an image element first, then a video element.

    Returns "image", "video", or "text" when neither element is present.
    """
    media_type = {
        "image": "body > div._10._d2i.uiLayer._4-hy._3qw > div._59s7 > div > div > div > div > div > div > div._jmh > div > div > div._4hgt > div > div > img",
        "video": "body > div._10._d2i.uiLayer._4-hy._3qw > div._59s7 > div > div > div > div > div > div > div._jmh > div > div > div._4hgt > div > div > div._1oad._1oax > div",
    }

    for kind, selector in media_type.items():
        try:
            driver.find_element_by_css_selector(selector)
        except Exception:
            # No element of this kind in the ad; try the next kind.
            continue
        return kind
    return "text"
            

In [None]:
def navigate_to_search(user_request):
    """
    Type ``user_request`` into the page's search box ("#q") and press ENTER.

    If the first attempt fails, the currently open ad is closed with
    exit_current_ad() and the search is attempted once more; an error from the
    second attempt propagates to the caller.
    """
    def _submit_search():
        # Clear whatever is in the search box, type the term, and submit it.
        search = driver.find_element_by_css_selector("#q")
        search.clear()
        search.send_keys(user_request)
        search.send_keys(Keys.ENTER)

    try:
        _submit_search()
    except Exception:
        # Presumably an open ad is in the way - close it and retry once.
        exit_current_ad()
        _submit_search()

In [None]:
def get_text_through_looping(css_selectors):
    """
    Try a list of CSS selectors in order and return the text of the first element
    that can be found.

    Parameters
    ----------
    css_selectors : iterable of CSS selector strings.

    Returns the found text, or "" (never None) when no selector matches, so that
    callers can safely run string checks on the result.
    """
    for selector in css_selectors:
        try:
            return driver.find_element_by_css_selector(selector).text
        except Exception:
            # This selector did not match; try the next one.
            continue
    return ""
        

In [None]:
def get_impressions():
    """
    Return the impressions text of the open ad.

    The text is looked up through get_text_through_looping. When it contains the
    word 'Impressions', only its first line (the impressions figure) is returned;
    otherwise the text is returned unchanged.
    """
    impressions_cell = "body > div._10._d2i.uiLayer._4-hy._3qw > div._59s7 > div > div > div > div > div > div > div._jmh > div > div > div._4hg_ > div._4hh0 > div._4hhf > div._4hhg"
    candidates = [
        impressions_cell,
        "div > div > div > div._4hhg > div",
        impressions_cell + " > div",
    ]
    found_text = get_text_through_looping(candidates)
    return found_text.partition("\n")[0] if "Impressions" in found_text else found_text

In [None]:
def get_money_spent():
    """
    Return the money-spent text of the open ad.

    The text is looked up through get_text_through_looping. When it contains the
    words 'Money spent', only its first line (the amount) is returned; otherwise
    the text is returned unchanged.
    """
    money_cell = "body > div._10._d2i.uiLayer._4-hy._3qw > div._59s7 > div > div > div > div > div > div > div._jmh > div > div > div._4hg_ > div._4hh0 > div._4hhf > div._4hhh > div"
    candidates = [
        money_cell,
        "div > div > div > div._4hhh > div",
        money_cell + " > div",
    ]
    found_text = get_text_through_looping(candidates)
    return found_text.partition("\n")[0] if "Money spent" in found_text else found_text

In [None]:
def get_sponsored():
    """
    Locate the sponsor of the open ad.

    Reads the ad's disclaimer text and returns what follows "Paid for by ".

    Returns "" when the disclaimer element cannot be found or when it does not
    contain "Paid for by ".
    """
    marker = "Paid for by "
    try:
        disclaimer = driver.find_element_by_css_selector("body > div._10._d2i.uiLayer._4-hy._3qw > div._59s7 > div > div > div > div > div > div > div._jmh > div > div > div._4hgt > div > div > div._6955._6956 > div").text
    except Exception:
        # Best effort, like the other lookups: a missing disclaimer should not
        # abort the crawl.
        return ""
    pieces = disclaimer.split(marker)
    return pieces[1] if len(pieces) > 1 else ""

In [None]:
def get_group_id_url():
    """
    Get the advertising group's ID, name and URL for the open ad.

    Sometimes the group name is different from the sponsored name; the group name
    is the information that appears in the top left in blue. The URL is the href
    of the "View all ads with political content from this Page" link, and the ID
    is whatever follows the first "=" in that href (up to the next "=").

    Returns
    -------
    (group_id, group_name, url)
    """
    name_selectors = [
        "div._10._d2i.uiLayer._4-hy._3qw > div._59s7 > div > div > div > div > div > div > div._jmh > div > div > div._4hgt > div > div > div._6955._6956 > div > span",
        "div > div > div._4hhc > div > a",
    ]
    group_name = get_text_through_looping(name_selectors)

    page_link = driver.find_element_by_partial_link_text("View all ads with political content from this Page")
    group_url = page_link.get_attribute("href")

    return group_url.split("=")[1], group_name, group_url

In [None]:
def get_data_off_of_page(found, seen, user_request):
    """
    Click through every ad in ``found``, scrape it, and append the rows to the CSV.

    For each ad element: open it, read its details through the scraping helpers,
    close it, and queue one row. Rows are flushed to the CSV every 10 ads and once
    more at the end. After every 30th ad the page is scrolled to load more ads
    and this function calls itself on the newly discovered ones.

    Parameters
    ----------
    found : list of clickable ad elements to visit.
    seen : dict mapping element id -> element for ads already handled; updated
        in place and passed on to get_next_chunk.
    user_request : the search term, written as the first column of every row.

    Returns None.
    """
    time.sleep(3)
    data = []
    rows = 1

    for i in found:
        time.sleep(2)
        try:
            i.click()
        except Exception:
            # The click failed (presumably the element is off-screen): scroll it
            # into view and try once more.
            driver.execute_script("arguments[0].scrollIntoView();", i)
            i.click()

        group_id, group_name, group_url = get_group_id_url()

        utc_start, utc_end, ad_duration, active = length_of_ad()
        paid_for = paid_for_data()  # looked up, but not part of the CSV row
        sponsored = get_sponsored()

        impressions = get_impressions()
        money_spent = get_money_spent()
        description = description_data()
        media_type = get_media_type()
        # get_title_details takes (group, group_id); both are only used in its log message.
        title, title_description, title_url = get_title_details(group_name, group_id)
        find_similar_ads(group_id, description) #trying to find the elusive "here are similar ads"
        if description == "":
            print("Description broke on {}, {}".format(group_name, description))
        time.sleep(2)

        exit_current_ad()
        # Column order matches the header row: user_request ... media_type.
        data.append([
            user_request,
            group_id,
            group_name,
            group_url,
            utc_start,
            utc_end,
            ad_duration,
            active,
            sponsored,
            impressions,
            money_spent,
            description,
            title,
            title_description,
            title_url,
            media_type,
        ])
        #if we just started, load the first 30 rows into seen
        if rows<=30:
            seen[i.id]=i

        #add 10 rows of data at a time to csv
        if not rows%10:
            add_to_csv(data)
            data = []
            print("Saving 10 lines")

        #wait for next 30 to load
        if not rows%30:
            time.sleep(5)
            next_chunk, seen = get_next_chunk(found, seen)
            print("1 LEN OF FOUND {}".format(len(next_chunk)))
            # 30 is a multiple of 10, so `data` was just flushed above and this
            # call has nothing left to write.
            add_to_csv(data)
            get_data_off_of_page(next_chunk, seen, user_request)

        rows +=1
    # Flush whatever is left over from the last (partial) batch of 10.
    add_to_csv(data)

### Start the process
The below is what kicks off the code and starts the crawl

In [None]:
# `seen` maps element ids to ad elements that have already been picked up.
seen = {}
# Reload the page and give it a moment before looking for ads.
driver.refresh()
time.sleep(2)
# Write the seed rows held in `data` (the column header) before any scraped rows.
add_to_csv(data, user_request)
# Collect the ads currently on the page and crawl through them.
found = find_all_ad_performance_elements()
get_data_off_of_page(found, seen, user_request)