# Bing Pilot Notebook
Purpose is to use Selenium to query names to test for ads via Bing.<br>
By: Jonathan Lo<br>
Date: 11/2/23

## Overhead

In [213]:
# Imports
import re
import logging
import os

import pandas as pd
import numpy as np

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
from collections import defaultdict
from tqdm.notebook import tqdm

from util import *
from matplotlib import pyplot as plt
import scipy.stats as stats

In [214]:
# Setup logger
if not os.path.exists('../logs/'):
    os.mkdir('../logs/')
if not os.path.exists('../logs/bing-pilot.log'):
    open('../logs/bing-pilot.log', 'a').close()

logging.basicConfig(
    filename='../logs/bing-pilot.log',
    filemode='w',
    format='%(asctime)s %(message)s',
    datefmt='%m/%d/%Y %I:%M:%S %p',
    level=logging.DEBUG
)

In [215]:
# Setup Webdriver Options
options = Options()

options.add_argument("--disable-notifications")
options.add_argument("--incognito")
options.add_experimental_option("prefs", {"plugins.plugins_list": [{"enabled": False, "name": "Chrome PDF Viewer"}]})
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)
options.add_argument("--start-maximized")
options.add_argument("--disable-blink-features")
options.add_argument('--disable-extensions')
options.add_argument("--no-sandbox")

options.binary_location = "/usr/local/bin/webdriver"

In [216]:
# Setup Selenium
print("Internet connected: ", internet())
driver = webdriver.Chrome(options=options)

Internet connected:  True


## Collection

### Generating Names

In [60]:
#Load in dataset of NYC's most popular baby names by gender and race

first_name_path = "./../data/Popular_Baby_Names.csv"
name_df = pd.read_csv(first_name_path)
name_df.Ethnicity.value_counts(dropna=False)

Ethnicity
HISPANIC                      16930
WHITE NON HISPANIC            16127
BLACK NON HISPANIC             8335
ASIAN AND PACIFIC ISLANDER     7830
WHITE NON HISP                 4142
ASIAN AND PACI                 2125
BLACK NON HISP                 2093
Name: count, dtype: int64

In [61]:
#somewhat arbitrary filter for names occuring more than 25 times

name_df = name_df[name_df['Count'] > 25]

In [62]:
#filter out black and white names only

white_names = name_df[name_df.Ethnicity.str.contains('WHITE')]
black_names = name_df[name_df.Ethnicity.str.contains('BLACK')]

In [63]:
#group by gender

bm_names = black_names[black_names.Gender == 'MALE']
wm_names = white_names[white_names.Gender == 'MALE']
bf_names = black_names[black_names.Gender == 'FEMALE']
wf_names = white_names[white_names.Gender == 'FEMALE']

In [64]:
#sort each one by popularity and get only unique values (same values for different years in dataset)

bm_names_sorted = bm_names.sort_values(by='Count', ascending=False)
wm_names_sorted = wm_names.sort_values(by='Count', ascending=False)
bf_names_sorted = bf_names.sort_values(by='Count', ascending=False)
wf_names_sorted = wf_names.sort_values(by='Count', ascending=False)
unique_bm = bm_names_sorted["Child's First Name"].str.lower().str.capitalize().unique()
unique_wm = wm_names_sorted["Child's First Name"].str.lower().str.capitalize().unique()
unique_bf = bf_names_sorted["Child's First Name"].str.lower().str.capitalize().unique()
unique_wf = wf_names_sorted["Child's First Name"].str.lower().str.capitalize().unique()

In [65]:
#take out the intersection of each of the lists so that end lists will have only names unique
#to that gender + race combo

arrays = [unique_bm, unique_wm, unique_bf, unique_wf]
duplicates = set()
for i in range(len(arrays)):
    for j in range(i+1, len(arrays)):
        duplicates.update(set(arrays[i]).intersection(arrays[j]))
        unique_arrays = []
for array in arrays:
    unique_array = [item for item in array if item not in duplicates]
    unique_arrays.append(unique_array)
bm = unique_arrays[0]
wm = unique_arrays[1]
bf = unique_arrays[2]
wf = unique_arrays[3]

In [66]:
#merge new lists with old pilot-names dataset from original Sweeney study

f = open("./../data/pilot-names.json")
name_data = json.load(f)
wf1 = name_data['white']['first']['female']
bf1 = name_data['black']['first']['female']
wm1 = name_data['white']['first']['male']
bm1 = name_data['black']['first']['male']
wf_fn = list(set(wf1 + wf))
bf_fn = list(set(bf1 + bf))
wm_fn = list(set(wm1 + wm))
bm_fn = list(set(bm1 + bm))
min_fn = min(len(wf_fn), len(bf_fn), len(wm_fn), len(bm_fn))

In [67]:
print(f'bm # of unique first names: {len(bm_fn)}\nwm # of unique first names: {len(wm_fn)}\nbf # of unique first names: {len(bf_fn)}\nwf # of unique first names: {len(wf_fn)}')

bm # of unique first names: 59
wm # of unique first names: 168
bf # of unique first names: 68
wf # of unique first names: 178


In [68]:
#optional step that makes the amount of names equal for each group, cutting off least popular names

wf_final = wf_fn[:min_fn]
wm_final = wm_fn[:min_fn]
bf_final = bf_fn[:min_fn]
bm_final = bm_fn[:min_fn]

In [69]:
#messy, manual way of putting data back into the format for generate_names function

white_last = [
            "Smith",
            "Johnson",
            "Miller",
            "Brown",
            "Jones",
            "Williams",
            "Davis",
            "Anderson",
            "Wilson",
            "Martin",
            "Taylor"
        ]
black_last = [
            "Williams",
            "Johnson",
            "Smith",
            "Jones",
            "Brown",
            "Jackson",
            "Davis",
            "Thomas",
            "Harris",
            "Robinson",
            "Taylor"
        ]
name_dict = {}
name_dict['white'] = {}
name_dict['white']['first'] = {}
name_dict['white']['first']['female'] = wf_final
name_dict['white']['first']['male'] = wm_final
name_dict['black'] = {}
name_dict['black']['first'] = {}
name_dict['black']['first']['female'] = bf_final
name_dict['black']['first']['male'] = bm_final
name_dict['black']['last'] = black_last
name_dict['white']['last'] = white_last

In [70]:
def generate_names(race, sex):
    names = list()
    for first_name in name_dict[race]['first'][sex]:
        for last_name in name_dict[race]['last']:
            names.append(f"{first_name} {last_name}")
    return names

In [71]:
# Init names
white_female_names = generate_names('white', 'female')
white_male_names = generate_names('white', 'male')
black_female_names = generate_names('black', 'female')
black_male_names = generate_names('black', 'male')
all_names = {
    "wf": white_female_names,
    "wm": white_male_names,
    "bf": black_female_names,
    "bm": black_male_names
}

In [72]:
len(all_names['wf'])

649

### Querying

In [73]:
def parse_bing_ads(raw_html, query):
    """ Parses the doman.TLD and the title, URL from the HTML
    """
    compiled = defaultdict(list)
    pattern = r'(?:http[s]?://)?(?:www\.)?([\w-]+\.[\w-]+)' # Matches the domain and TLD of a URL
    ads = BeautifulSoup(raw_html).select('.sb_add')
    for ad in ads:
        try:
            title = ad.select_one('h2').text
            link = ad.select_one('.b_adurl').text
            match = re.search(pattern, link)
            domain = match.group(1) if match else 'ERROR'
            compiled[domain].append((title, link))
        except Exception as e:
            logging.debug(f'Failed to parse ad HTML on query: {query}')
    return compiled

In [74]:
# Querying all names and obtaining the ads on the page (est. 4min)
all_ads = [['Name', 'Group', 'Ad Domain', 'Ad Title', 'Ad Link']]
for group, names in tqdm(all_names.items()):
    for name in tqdm(names):
        query = f"{name} public records"
        raw_html = bing_search(query, driver)
        parsed = parse_bing_ads(raw_html, query)
        for domain, ad_items in parsed.items():
            for ad_opts in ad_items:
                all_ads.append([name, group, domain, ad_opts[0], ad_opts[1]])

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/649 [00:00<?, ?it/s]

  0%|          | 0/649 [00:00<?, ?it/s]

  0%|          | 0/649 [00:00<?, ?it/s]

  0%|          | 0/649 [00:00<?, ?it/s]

In [82]:
len(all_ads)

6983

## Analysis

### Labelling

In [83]:
# Init DF
df = pd.DataFrame(all_ads[1:], columns=all_ads[0])

In [84]:
# Label
mapping = {'wf': ('White', 'Female'), 'wm': ('White', 'Male'), 'bf': ('Black', 'Female'), 'bm': ('Black', 'Male')}
df['Race'] = df['Group'].map(mapping).str[0]
df['Sex'] = df['Group'].map(mapping).str[1]
df['Criminal Ad'] = df['Ad Title'].str.lower().str.contains('criminal')

# For more search term options
df['Criminal Ad'] = df['Ad Title'].str.lower().str.contains('criminal|jail|prison|arrest')

In [104]:
# Showing dataframe
df.head()

Unnamed: 0,Name,Group,Ad Domain,Ad Title,Ad Link,Race,Sex,Criminal Ad
0,Dina Smith,wf,publicrecords.info,Search Public Records | Public Records For State,https://www.publicrecords.info/Birth/Records,White,Female,False
1,Dina Smith,wf,publicrecords.info,Search Public Records | Public Records For State,https://www.publicrecords.info/Birth/Records,White,Female,False
2,Dina Smith,wf,ancestry.com,Search Billions of Records | Find Your Ancesto...,https://www.ancestry.com/ancestry,White,Female,False
3,Dina Smith,wf,ancestry.com,Search Billions of Records | Find Your Ancesto...,https://www.ancestry.com/ancestry,White,Female,False
4,Dina Smith,wf,spokeo.com,Search Public Records - Free | Lookup Public R...,https://www.spokeo.com/Public/Records,White,Female,False


In [78]:
breakdown_regular = df.groupby('Group')['Criminal Ad'].count()

In [79]:
only_criminal = df[df['Criminal Ad']]

In [80]:
breakdown_criminal = only_criminal.groupby('Group')['Criminal Ad'].count()

In [115]:
#prep for racial + gender group statistical test

cont_table = [[breakdown_criminal['bf'], breakdown_regular['bf']], 
              [breakdown_criminal['bm'], breakdown_regular['bm']], 
              [breakdown_criminal['wf'], breakdown_regular['wf']], 
              [breakdown_criminal['wm'], breakdown_regular['wm']]]
cont_table

[[26, 1875], [36, 1800], [36, 1694], [46, 1613]]

In [157]:
#prep for racial group statistical test

obs = [df.groupby('Race')['Criminal Ad'].count()['Black'], df.groupby('Race')['Criminal Ad'].count()['White']]
successes = [only_criminal.groupby('Race')['Criminal Ad'].count()['Black'], only_criminal.groupby('Race')['Criminal Ad'].count()['White']]

In [191]:
#prep for gender group statistical test

obs_gender = [df.groupby('Sex')['Criminal Ad'].count()['Female'], df.groupby('Sex')['Criminal Ad'].count()['Male']]
successes_gender = [only_criminal.groupby('Sex')['Criminal Ad'].count()['Female'], only_criminal.groupby('Sex')['Criminal Ad'].count()['Male']]

In [164]:
#Chi-Square Test
#To compare the rates of criminal ads and regular ads shown to each gender + racial group.


#Null Hypothesis: All four proportions come from the same distribution (i.e. they are equal)
#Alternative Hypothesis: All four proportions are not the same.


chi2, p, dof, expected = stats.chi2_contingency(cont_table)
if p < 0.05:
    print(f"Grouped by race and gender:\nWith a p-value of {round(p, 3)}, we reject the null hypothesis. There is statistically significant difference between the proportions.")
else:
    print(f"Grouped by race and gender:\nWith a p-value of {round(p, 3)}, we fail to reject the null hypothesis. There is no statistically significant difference in the proportions.")

Grouped by race and gender
With a p-value of 0.031, we reject the null hypothesis. There is statistically significant difference between the proportions.


In [188]:
def two_prop_z_test(s, o):
    p1 = s[0] / o[0]
    p2 = s[1] / o[1]
    pooled_p = np.sum(s) / np.sum(o)
    se = np.sqrt(pooled_p * (1 - pooled_p) * ((1 / o[0]) + (1 / o[1])))
    z = (p1 - p2) / se
    p_val = 2 * (1 - stats.norm.cdf(np.abs(z)))
    return p_val

In [189]:
#Two-Proportion Z Test
#To compare just racial groups criminal ads vs. regular ads


#Null Hypothesis: The two proportions are the same
#Alternative Hypothesis: The two proportions are different


p_val = two_prop_z_test(successes, obs)
if p_val < 0.05:
    print(f"Grouped by race:\nWith a p-value of {round(p_val, 3)}, we reject the null hypothesis. There is a statistically significant difference between the two proportions.")
else:
    print(f"Grouped by race:\nWith a p-value of {round(p_val, 3)}, we fail to reject the null hypothesis. There is no statistically significant difference between the two proportions.")

Grouped by race:
With a p-value of 0.02, we reject the null hypothesis. There is a statistically significant difference between the two proportions.


In [192]:
#Two-Proportion Z Test
#To compare just gender groups criminal ads vs. regular ads


#Null Hypothesis: The two proportions are the same
#Alternative Hypothesis: The two proportions are different


p_val2 = two_prop_z_test(successes_gender, obs_gender)
if p_val2 < 0.05:
    print(f"Grouped by gender:\nWith a p-value of {round(p_val2, 3)}, we reject the null hypothesis. There is a statistically significant difference between the two proportions.")
else:
    print(f"Grouped by gender:\nWith a p-value of {round(p_val2, 3)}, we fail to reject the null hypothesis. There is no statistically significant difference between the two proportions.")

Grouped by gender:
With a p-value of 0.051, we fail to reject the null hypothesis. There is no statistically significant difference between the two proportions.


In [212]:
#  same hypothesis tests for the top 3 ad domains represented in the dataset.

#  A rejection of the null hypothesis means that this ad domain is over/under representing a gender, racial,
#  or combined gender + racial group based on what was seen in the original dataset.

#  For example, if we reject the null hypothesis for the domain spokeo.com and the racial grouping, this means
#  spokeo.com has a statistically significantly higher proportion of ads for one racial group (black or white) 
#  than the proportion that was seen in the original dataset.

#  A fail to reject result means that no significant difference was detected between the distribution of ads that
#  just appeared from this domain and the distribution that appeared in the original dataset.



for i in df['Ad Domain'].value_counts().index[:3]:
    print(f"Ad Domain: {i}")
    
    #prep for both two-prop z tests
    df_filt = df[df['Ad Domain'] == i]
    print(f"{len(df_filt)} ads from this domain")
    racial_obs = [df.groupby('Race')['Criminal Ad'].count()['Black'], df.groupby('Race')['Criminal Ad'].count()['White']]
    racial_successes = [df_filt.groupby('Race')['Criminal Ad'].count()['Black'], df_filt.groupby('Race')['Criminal Ad'].count()['White']]
    gender_obs = [df.groupby('Sex')['Criminal Ad'].count()['Female'], df.groupby('Sex')['Criminal Ad'].count()['Male']]
    gender_successes = [df_filt.groupby('Sex')['Criminal Ad'].count()['Female'], df_filt.groupby('Sex')['Criminal Ad'].count()['Male']]
    
    #prep for chi-square test
    breakdown_regular = df.groupby('Group')['Criminal Ad'].count()
    breakdown_domain = df_filt.groupby('Group')['Criminal Ad'].count()
    cont_table = [[breakdown_domain['bf'], breakdown_regular['bf']], 
              [breakdown_domain['bm'], breakdown_regular['bm']], 
              [breakdown_domain['wf'], breakdown_regular['wf']], 
              [breakdown_domain['wm'], breakdown_regular['wm']]]
    
    #both two-prop z tests for difference between just gender and just racial groups
    racial_p = two_prop_z_test(racial_successes, racial_obs)
    if racial_p < 0.05:
        print(f"    p-val for difference in just racial grouping: {round(racial_p, 4)} -- reject")
    else:
        print(f"    p-val for difference in just racial grouping: {round(racial_p, 4)} -- fail to reject")
    gender_p = two_prop_z_test(gender_successes, gender_obs)
    if gender_p < 0.05:
        print(f"    p-val for difference in just gender grouping: {round(gender_p, 4)} -- reject")
    else:
        print(f"    p-val for difference in just gender grouping: {round(gender_p, 4)} -- fail to reject")
        
    #chi-square test for combined racial + gender groupings
    chi2, p, dof, expected = stats.chi2_contingency(cont_table)
    if p < 0.05:
        print(f"    p-val for difference in combined gender + racial grouping: {round(p, 4)} -- reject\n")
    else:
        print(f"    p-val for difference in combined gender + racial grouping: {round(p, 4)} -- fail to reject\n")

Ad Domain: publicrecords.info
1632 ads from this domain
    p-val for difference in just racial grouping: 0.5336 -- fail to reject
    p-val for difference in just gender grouping: 0.5252 -- fail to reject
    p-val for difference in combined gender + racial grouping: 0.4212 -- fail to reject

Ad Domain: spokeo.com
1457 ads from this domain
    p-val for difference in just racial grouping: 0.0 -- reject
    p-val for difference in just gender grouping: 0.1017 -- fail to reject
    p-val for difference in combined gender + racial grouping: 0.0 -- reject

Ad Domain: recordsquarry.com
649 ads from this domain
    p-val for difference in just racial grouping: 0.0003 -- reject
    p-val for difference in just gender grouping: 0.0001 -- reject
    p-val for difference in combined gender + racial grouping: 0.0 -- reject



### Breakdown

In [97]:
df.groupby('Race')['Criminal Ad'].count()

Race
Black    3675
White    3307
Name: Criminal Ad, dtype: int64

In [98]:
df.groupby('Group')['Criminal Ad'].count()

Group
bf    1875
bm    1800
wf    1694
wm    1613
Name: Criminal Ad, dtype: int64

In [100]:
# Show breakdowns
display(only_criminal.groupby(['Race'])['Criminal Ad'].count().to_frame())
display(only_criminal.groupby(['Race', 'Sex'])['Criminal Ad'].count().to_frame())

Unnamed: 0_level_0,Criminal Ad
Race,Unnamed: 1_level_1
Black,62
White,82


Unnamed: 0_level_0,Unnamed: 1_level_0,Criminal Ad
Race,Sex,Unnamed: 2_level_1
Black,Female,26
Black,Male,36
White,Female,36
White,Male,46
