# Google Notebook
Purpose is to use Selenium to query names to test for ads via Bing.<br>
By: Elsie Wang<br>
Date: 11/4/23

## Overhead

In [1]:
# Imports
import re
import logging
import os
import time

import pandas as pd

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
from collections import defaultdict
from tqdm.notebook import tqdm

from util import *

In [2]:
# Setup logger
if not os.path.exists('../logs/'):
    os.mkdir('../logs/')
if not os.path.exists('../logs/bing-pilot.log'):
    open('../logs/bing-pilot.log', 'a').close()

logging.basicConfig(
    filename='../logs/bing-pilot.log',
    filemode='w',
    format='%(asctime)s %(message)s',
    datefmt='%m/%d/%Y %I:%M:%S %p',
    level=logging.DEBUG
)

In [3]:
# Setup Webdriver Options
options = Options()

options.add_argument("--disable-notifications")
options.add_argument("--incognito")
options.add_experimental_option("prefs", {"plugins.plugins_list": [{"enabled": False, "name": "Chrome PDF Viewer"}]})
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)
options.add_argument("--start-maximized")
options.add_argument("--disable-blink-features")
options.add_argument('--disable-extensions')
options.add_argument("--no-sandbox")

options.binary_location = "/usr/local/bin/webdriver"

In [8]:
# Setup Selenium
print("Internet connected: ", internet())
driver = webdriver.Chrome(options=options)

Internet connected:  True


## Collection

### Generating Names

In [5]:
# Init names
path = "./../data/pilot-names.json"
white_female_names = generate_names(path, 'white', 'female')
white_male_names = generate_names(path, 'white', 'male')
black_female_names = generate_names(path, 'black', 'female')
black_male_names = generate_names(path, 'black', 'male')
all_names = {
    "wf": white_female_names,
    "wm": white_male_names,
    "bf": black_female_names,
    "bm": black_male_names
}

### Querying

In [13]:
def parse_google_ads(raw_html, query):
    """ Parses the doman.TLD and the title, URL from the HTML
    """
    compiled = defaultdict(list)
    pattern = r'(?:http[s]?://)?(?:www\.)?([\w-]+\.[\w-]+)' # Matches the domain and TLD of a URL
    ads = BeautifulSoup(raw_html).select('div[data-text-ad="1"]')
    for ad in ads:
        try:
            title = ad.select_one('div[aria-level="3"]').text
            link = ad.select_one('a')['href']
            match = re.search(pattern, link)
            domain = match.group(1) if match else 'ERROR'
            compiled[domain].append((title, link))
        except Exception as e:
            logging.debug(f'Failed to parse ad HTML on query: {query}')
    return compiled

In [14]:
# Querying all names and obtaining the ads on the page (est. 4min)
all_ads = [['Name', 'Group', 'Ad Domain', 'Ad Title', 'Ad Link']]
for group, names in tqdm(all_names.items()):
    for name in tqdm(names):
        driver = webdriver.Chrome(options=options)
        query = f"{name} public records"
        raw_html = google_search(query, driver)
        parsed = parse_google_ads(raw_html, query)
        for domain, ad_items in parsed.items():
            for ad_opts in ad_items:
                all_ads.append([name, group, domain, ad_opts[0], ad_opts[1]])
        driver.close()

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

  0%|          | 0/90 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

## Analysis

### Labelling

In [20]:
# Init DF
df = pd.DataFrame(all_ads[1:], columns=all_ads[0])

In [21]:
# Label
mapping = {'wf': ('White', 'Female'), 'wm': ('White', 'Male'), 'bf': ('Black', 'Female'), 'bm': ('Black', 'Male')}
df['Race'] = df['Group'].map(mapping).str[0]
df['Sex'] = df['Group'].map(mapping).str[1]
df['Criminal Ad'] = df['Ad Title'].str.lower().str.contains('criminal')

In [22]:
# Showing dataframe
df.head()

Unnamed: 0,Name,Group,Ad Domain,Ad Title,Ad Link,Race,Sex,Criminal Ad
0,Allison Smith,wf,truthfinder.com,Public Background Check - Just Type in a Name ...,https://www.truthfinder.com/p/home/,White,Female,False
1,Allison Smith,wf,publicrecords.com,Online Public Records - Just Enter A Name And ...,https://www.publicrecords.com/,White,Female,False
2,Allison Smith,wf,publicrecordreports.com,Absolutely free public records - Just Type in ...,https://www.publicrecordreports.com/people/index,White,Female,False
3,Allison Smith,wf,ourpublicrecords.org,Free Public Arrest Records - View Records With...,https://ourpublicrecords.org/arrest-records/,White,Female,False
4,Allison Johnson,wf,truthfinder.com,Public Records Search - Just Type in a Name & ...,https://www.truthfinder.com/p/home/,White,Female,False


### Breakdown

In [23]:
# Show breakdowns
display(df.groupby(['Race'])['Criminal Ad'].count().to_frame())
display(df.groupby(['Race', 'Sex'])['Criminal Ad'].count().to_frame())

Unnamed: 0_level_0,Criminal Ad
Race,Unnamed: 1_level_1
Black,646
White,552


Unnamed: 0_level_0,Unnamed: 1_level_0,Criminal Ad
Race,Sex,Unnamed: 2_level_1
Black,Female,348
Black,Male,298
White,Female,271
White,Male,281
