In [125]:
# Data handling and processing
import os
import re
import time
import pandas as pd
import numpy as np
import statistics
import json
import csv
import sys
from datetime import datetime
from typing import List, Tuple, NamedTuple, Set, Dict, Any, Union, Optional
from pathlib import Path

# Scraping
import requests

# Plotting
import matplotlib as plt

In [133]:
# constants and Facebook API
# The Graph API access token is a credential, so it is read from the
# FB_ACCESS_TOKEN environment variable instead of being stored in the notebook.
# NOTE(review): a token was previously committed in this cell; it should be
# treated as leaked and revoked.
ACCESS_TOKEN = os.environ.get("FB_ACCESS_TOKEN", "")
if not ACCESS_TOKEN:
    print("Warning: FB_ACCESS_TOKEN is not set; Ad Library API requests will be rejected.")

# Endpoint of the Ad Library (ads_archive) edge of the Graph API.
BASE_URL = "https://graph.facebook.com/v21.0/ads_archive/"

# Show every column when a DataFrame is displayed.
pd.set_option("display.max_columns", None)


In [141]:
def get_ads(id, access_token, timeout=30):
    """Download all political/issue ads of one Facebook page from the Ad Library.

    Pages of results are requested one after another, following the ``after``
    cursor returned by the API, until no further page is advertised.

    Args:
        id: Page ID to search for; anything ``int()`` accepts (str, int, float).
        access_token: Graph API access token sent with every request.
        timeout: Seconds to wait for each HTTP response before giving up.

    Returns:
        ``{'data': [...]}`` with the ad records collected so far. The list is
        empty (or partial) when the API answered with an error payload, which
        is printed. ``None`` is returned when a request or JSON decoding raised
        an exception.
    """
    collected = []
    params = {
        'access_token': access_token,
        'ad_type': 'POLITICAL_AND_ISSUE_ADS',
        'search_page_ids': int(id),
        'fields': 'id,page_name,ad_creative_bodies,ad_delivery_start_time,ad_delivery_stop_time,impressions,spend,demographic_distribution,languages,publisher_platforms,bylines',
        'ad_reached_countries': 'US',
        'unmask_removed_content': 'true',
        'limit': 100
    }

    try:
        while True:
            # Without a timeout a stalled connection would block the cell forever.
            response = requests.get(
                'https://graph.facebook.com/v21.0/ads_archive/',
                params=params,
                timeout=timeout,
            )
            payload = response.json()

            if 'error' in payload:
                # e.g. rate limiting (#613): report it and stop paging instead
                # of continuing with a payload that carries no results.
                print(f"API Error: {payload['error']}")
                break

            collected.extend(payload.get('data', []))

            # 'paging' and 'cursors' may each be absent; read them defensively
            # so a missing key cannot throw away the pages already collected.
            paging = payload.get('paging') or {}
            after = (paging.get('cursors') or {}).get('after')
            if 'next' in paging and after:
                params['after'] = after
            else:
                break

    except Exception as e:
        # Best effort: network/JSON failures are reported and signalled with None.
        print(f"Error fetching ads for page {id}: {str(e)}")
        return None

    return {'data': collected}

In [138]:
def _estimate_from_range(bounds):
    """Collapse a ``{'lower_bound', 'upper_bound'}`` range into one integer estimate."""
    bounds = bounds or {}
    lower = int(bounds.get('lower_bound', 0))
    # NOTE(review): the API appears to omit 'upper_bound' for the open-ended top
    # bucket - confirm. Falling back to the lower bound keeps the estimate from
    # dropping below the known minimum (treating it as 0 would halve it).
    upper = int(bounds.get('upper_bound', lower))
    return int((lower + upper + 1) / 2)


def _normalize_date(raw):
    """Validate a ``YYYY-MM-DD`` string and return it, or None when it is missing."""
    if not raw:
        return None
    return datetime.strptime(raw, '%Y-%m-%d').strftime("%Y-%m-%d")


def process_candidate(name, page_id):
    """Fetch the ads of one candidate page and flatten them into row dicts.

    Args:
        name: Candidate name stored in every row.
        page_id: Facebook page ID passed to ``get_ads``.

    Returns:
        A list with one dict per ad (identifiers, dates, estimated impressions
        and cost, language, platform flags, byline, first creative body and one
        ``share_<gender>_<age>`` column per demographic group). An empty list is
        returned when no ads could be fetched, so callers can always ``extend``
        with the result.

    Raises:
        ValueError: if a delivery date is not formatted as ``YYYY-MM-DD`` or a
            range bound / percentage is not numeric.
    """
    print(f"Processing ads for {name} (Page ID: {page_id})")
    data = get_ads(page_id, ACCESS_TOKEN)
    if not data or 'data' not in data:
        print("Ads not found")
        return []

    genders = ['male', 'female', 'unknown']
    age_groups = ['18-24', '25-34', '35-44', '45-54', '55-64', '65+']

    ads_data = []
    for ad in data['data']:
        # 'or' fallbacks also cover fields that are present but empty/None.
        platforms = ad.get('publisher_platforms') or []
        languages = ad.get('languages') or ['']
        bodies = ad.get('ad_creative_bodies') or ['']

        entry = {
            'name': name,
            'page_id': page_id,
            'ad_id': ad.get('id', ''),
            'start_date': _normalize_date(ad.get('ad_delivery_start_time')),
            'end_date': _normalize_date(ad.get('ad_delivery_stop_time')),
            'impressions': _estimate_from_range(ad.get('impressions')),
            'cost': _estimate_from_range(ad.get('spend')),
            'language': languages[0],
            'is_facebook': 1 if 'facebook' in platforms else 0,
            'is_instagram': 1 if 'instagram' in platforms else 0,
            'byline': (ad.get('bylines') or '').lower(),
            'creative': bodies[0]
        }

        # Pre-create every expected demographic column so all rows share the
        # same schema even when an ad reports only some groups.
        for gender in genders:
            for age_group in age_groups:
                entry[f'share_{gender}_{age_group}'] = None

        for demo in ad.get('demographic_distribution') or []:
            gender = demo.get('gender', '').lower()
            age = demo.get('age', '')
            if gender and age:
                entry[f'share_{gender}_{age}'] = float(demo.get('percentage', 0))
        ads_data.append(entry)
    return ads_data

In [129]:
df = pd.read_excel('data_raw/facebook_profiles.xlsx')
ads_data = []

# Only the rows labelled 218-219 are processed here (label-based, inclusive
# slice); widen the slice to cover all candidates.
for _, row in df.loc[218:219].iterrows():
    name = row['name_first_last']

    # Candidates without a primary page ID have nothing to fetch.
    if pd.isna(row['page_id']):
        continue

    page_ids = [row['page_id']]
    if pd.notna(row.get('page_id_2')):
        page_ids.append(row['page_id_2'])

    for page_id in page_ids:
        # process_candidate may return None when no ads were found;
        # extending with None would raise TypeError.
        ads_data.extend(process_candidate(name, page_id) or [])

# Convert to dataframe
df_ads = pd.DataFrame(ads_data)
# np.NaN was removed in NumPy 2.0; np.nan is the supported spelling.
df_ads = df_ads.replace({None: np.nan})

# Make sure the output directory exists before writing.
Path("data_clean").mkdir(parents=True, exist_ok=True)
df_ads.to_csv("data_clean/facebook_ads.csv", index=False)

Processing ads for kay ivey (Page ID: 105251911754539)
Processing ads for ken krawchuk (Page ID: 663297590359183)


In [142]:
# Ad-hoc check of get_ads for a single page. The variable is deliberately not
# called `id`: rebinding that name at notebook scope would hide the builtin
# id() for every later cell.
test_page_id = '105251911754539'
get_ads(test_page_id, ACCESS_TOKEN)

API Error: {'message': '(#613) Calls to this api have exceeded the rate limit.', 'type': 'OAuthException', 'code': 613, 'fbtrace_id': 'AfZmF2udCG60t0TOC9RbH9W'}


{'data': []}