# Analysis Template
This notebook demonstrates how to use `script.py` together with a notebook for analysis.<br>
By: Jonathan Lo<br>
Date: 12/1/23


In [1]:
# Imports
import os
import re
import logging
import pandas as pd

from bs4 import BeautifulSoup
from collections import defaultdict

## Loading Data

### Standard `results.csv`

In [2]:
# Example: read the results file and discard its unnamed first column
# (presumably the index that was written along with the data -- confirm against `script.py`)
df = pd.read_csv('../results.csv').drop(columns='Unnamed: 0')
df.head(3)

Unnamed: 0,Name,Group,Ad Domain,Ad Title,Ad Link
0,Allison Smith,wf,police.us,Criminal Records Are Public | Enter Name- No R...,https://www.police.us.org/State/Records
1,Allison Smith,wf,propertyrecord.com,Public Property Records Search | County Proper...,https://www.propertyrecord.com/Property-Record...
2,Allison Smith,wf,criminalrecords.us,"Enter Name- No Records, No Fee | Find Public R...",https://www.criminalrecords.us.org/Criminal/Re...


### Utilizing `parse_bing_ads()`

In [3]:
def parse_bing_ads(raw_html, query, parser='html.parser'):
    """Parse the ads out of a results page, grouped by advertiser domain.

    Args:
        raw_html: HTML source of the results page.
        query: The query that produced the page; only used in log messages.
        parser: Name of the parser handed to BeautifulSoup. Naming it
            explicitly avoids bs4's ``GuessedAtParserWarning`` and keeps the
            parse the same on machines with different parsers installed.

    Returns:
        A ``defaultdict(list)`` mapping ``domain.TLD`` to a list of
        ``(title, link)`` tuples, one per ad. Ads whose link does not match
        the domain pattern are filed under the key ``'ERROR'``. Ads missing
        a title or a display URL are skipped and logged at DEBUG level.
    """
    compiled = defaultdict(list)
    # Matches the domain and TLD of a URL. Only the first two dot-separated
    # labels after an optional scheme / "www." are captured, so a link such as
    # "https://www.police.us.org/..." is recorded as "police.us".
    pattern = r'(?:http[s]?://)?(?:www\.)?([\w-]+\.[\w-]+)'
    ads = BeautifulSoup(raw_html, parser).select('.sb_add')
    for ad in ads:
        title_tag = ad.select_one('h2')
        link_tag = ad.select_one('.b_adurl')
        # `select_one` returns None when nothing matches; skip such ads
        # explicitly instead of relying on a blanket `except Exception`.
        if title_tag is None or link_tag is None:
            logging.debug(f'Failed to parse ad HTML on query: {query}')
            continue
        title, link = title_tag.text, link_tag.text
        match = re.search(pattern, link)
        domain = match.group(1) if match else 'ERROR'
        compiled[domain].append((title, link))
    return compiled

In [4]:
# Initialize
path_to_raw_html = './../html/'
all_ads = [['Name', 'Group', 'Ad Domain', 'Ad Title', 'Ad Link']]  # first row is the header

# Load HTML and pass into `parse_bing_ads()`.
# `os.listdir()` returns every entry in arbitrary order, so sort for a
# reproducible row order and skip anything that is not a saved HTML page.
for html_path in sorted(os.listdir(path_to_raw_html)):
    if not html_path.endswith('.html'):
        continue

    # The first two characters of the file name are the group; the rest
    # (after one separator character) is the name with '_' for spaces.
    group = html_path[:2]
    name = os.path.splitext(html_path[3:])[0].replace('_', ' ')

    # Explicit encoding so the read does not depend on the platform default.
    # NOTE(review): assumes the pages were saved as UTF-8 -- confirm against `script.py`.
    with open(os.path.join(path_to_raw_html, html_path), 'r', encoding='utf-8') as f:
        raw_html = f.read()

    # Parsing raw HTML (the file is already closed at this point)
    parsed = parse_bing_ads(raw_html, name)

    # Saving for later use
    for domain, ad_items in parsed.items():
        for title, link in ad_items:
            all_ads.append([name, group, domain, title, link])

In [5]:
# Example: the first entry of `all_ads` holds the column names, the rest are rows
column_names = all_ads[0]
ad_rows = all_ads[1:]
df = pd.DataFrame(ad_rows, columns=column_names)
df.head(3)

Unnamed: 0,Name,Group,Ad Domain,Ad Title,Ad Link
0,Hakim Jones,bm,peoplelooker.com,Search For HAKIM JONES | See Results For HAKIM...,https://www.peoplelooker.com/Public_Records/Pe...
1,Hakim Jones,bm,PublicRecordsNow.com,Hakim Jones (Found),http://www.PublicRecordsNow.com/Hakim Jones
2,Kareem Johnson,bm,peoplelooker.com,View Public Records Online | Search Now to Dis...,https://www.peoplelooker.com/Public_Records/Pe...


### Loading only the parsed `BeautifulSoup` objects for future querying

In [6]:
path_to_raw_html = './../html/'
# NOTE: this rebinds `all_ads`; from here on it holds BeautifulSoup objects,
# one per HTML file, rather than the row lists built earlier in the notebook.
all_ads = []

# Load each saved HTML page and keep only its parsed BeautifulSoup tree.
# `os.listdir()` returns every entry in arbitrary order, so sort for a
# reproducible order and skip anything that is not a saved HTML page.
for html_path in sorted(os.listdir(path_to_raw_html)):
    if not html_path.endswith('.html'):
        continue

    # Explicit encoding so the read does not depend on the platform default.
    # NOTE(review): assumes the pages were saved as UTF-8 -- confirm against `script.py`.
    with open(os.path.join(path_to_raw_html, html_path), 'r', encoding='utf-8') as f:
        raw_html = f.read()

    # Saving bs4; the parser is named explicitly so results do not depend on
    # which optional parsers happen to be installed.
    all_ads.append(BeautifulSoup(raw_html, 'html.parser'))

In [7]:
# Example: take the first parsed page and show its first `.sb_add` element
first_page_soup = all_ads[0]
ad_containers = first_page_soup.select('.sb_add')
ad_containers[0]

<div class="sb_add sb_adTA b_adscv" style="contain-intrinsic-size: 245px"><h2 class=""><a class="" h="ID=SERP,5599.1,Ads" href="https://www.bing.com/aclk?ld=e8tx2w_sWxZIa-mSrivXrrxjVUCUyV6j77wFnI_wxK7Ammrd5wiSPokeF34t-8mUcSZ2-bMvD9xn5RVE-esxJGUrEgOw-0HfAmePHSPnJh_o6cBunhTDIfMeX_ca_8FO--tCSrOAERjuvCd8z09FSXgqtOfLzzSpAH0veb2IUx6olhQiJkrKyFt_yUCMPR53AHCg-jXw&amp;u=aHR0cHMlM2ElMmYlMmZ3d3cucGVvcGxlbG9va2VyLmNvbSUyZiUzZmZuJTNkSEFLSU0lMjZtbiUyNmxuJTNkSk9ORVMlMjZ1dG1fc291cmNlJTNkYmluZyUyNnV0bV9tZWRpdW0lM2RjcGMlMjZ1dG1fY2FtcGFpZ24lM2RQTF9QUExfU0VBX1BSUF9OQU1fRnVsbF9GN18xODAwLTIxMDBfUGhyYXNlX0IlMjZ1dG1fdGVybSUzZEhBS0lNJTI1MjBKT05FUyUyNnV0bV9jb250ZW50JTNkJTI2bWF0Y2h0eXBlJTNkcCUyNmFkZ3JvdXAlM2QxMzM1OTA4MjU0MDY0NDA1JTI2ZGV2aWNlJTNkYyUyNmJfY2FtcGFpZ25pZCUzZDQzMTE0MTU4OSUyNmJfcHJvZHVjdGlkJTNkJTI2Yl90ZXJtaWQlM2Rrd2QtODM0OTUyNjEwMDIyMzYlM2Fsb2MtMTkwJTI2Yl9hZGdyb3VwaWQlM2QxMzM1OTA4MjU0MDY0NDA1JTI2Yl9jYW1wYWlnbiUzZFBMX1BQTF9TRUFfUFJQX05BTV9GdWxsX0Y3XzE4MDAtMjEwMF9QaHJhc2VfQiUyNmJfaXNwcm9kdWN0JTNkJTI2Yl90

## Analysis

In [None]:
# Your analysis here
...