In [1]:
import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
import os

load_dotenv()

True

In [2]:
def web_scrape_api_call(url_to_scrape):
    '''
    Sends the url that we would like to scrape to the webscrapingapi
    so that our calls can be anonymized. 
    '''
    url = "https://api.webscrapingapi.com/v1"
    params = {
        "api_key": os.getenv("WEBSCRAPING_API_KEY"),
        "url": url_to_scrape,
        "render_js": 1,
        "proxy_type": "datacenter",
        "wait_until": "networkidle0",
        "country": "us",
        "json_response": 1,
    }
    response = requests.get(url, params=params)
    return response

def extract_text(response):
    '''
    Extracts the text from all div tags with class "b-section b-section--fixed-width"
    in the response content using BeautifulSoup.
    '''
    soup = BeautifulSoup(response.content, 'html.parser')
    divs = soup.find_all('div', class_='b-section b-section--fixed-width')
    target_text = ''
    if divs:
        for div in divs:
            target_text += div.get_text()
    return target_text


In [3]:
url = "https://zety.com/resume-examples"
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')
main_div = soup.find(class_="grid__column--phone--4 grid__column--desktop--9")
titles = main_div.find_all('h2', class_="box title")
hrefs = main_div.find_all('a', href=True)

title_list = [title.text for title in titles]
href_list = [a['href'] for a in hrefs if "-resume-example" in a['href']]
url_list = [f"https://zety.com{href}" for href in href_list]

In [13]:
import concurrent.futures

# Assuming 'urls' is your list of URLs
data = []
counter = 0
error_counter = 0
list_of_urls = url_list[200:210]
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    responses = list(executor.map(web_scrape_api_call, list_of_urls))
    texts = list(executor.map(extract_text, responses))
    for url, response, text in zip(list_of_urls, responses, texts):
        if response.status_code == 400:
            error_counter += 1
        if error_counter > 50:
            print("More than 50 400 responses received. Stopping...")
            break
        data.append({
            'href': url,
            'response': response.content,
            'extracted_text': text
        })
        counter += 1
        if counter % 50 == 0:
            print(f"Current href: {url}")
            print(f"First 20 characters of extracted text: {text[:20]}")

In [18]:
# check if each response is the same
responses[2].content == responses[4].content

False

[{'href': 'https://zety.com/blog/restaurant-manager-resume-example',
  'response': b'{\n  "headers": {\n    "access-control-expose-headers": "X-App-Release",\n    "alt-svc": "h3=\\":443\\"; ma=93600",\n    "cache-control": "public, max-age=345600",\n    "content-encoding": "gzip",\n    "content-type": "text/html; charset=utf-8",\n    "date": "Thu, 09 Nov 2023 18:46:27 GMT",\n    "etag": "\\"6fe76-iMN1VvlN88wa51+/4z5nkkStl/0\\"",\n    "expires": "Mon, 13 Nov 2023 18:46:27 GMT",\n    "referrer-policy": "no-referrer-when-downgrade",\n    "set-cookie": "x-georegion=285,US,TX,DALLAS,623,1920,1922,214+972,DALLAS,32.7874,-96.7989,CST,75201-75212+75214-75238+75240-75244+75246-75254+75260-75267+75270+75275+75277+75283-75285+75287+75301+75303+75312-75313+75315+75320+75326+75336+75339+75342+75354-75360+75367-75368+75370-75374+75376+75378-75382+75389-75395+75397-75398,NA,,,62874,vhigh,5000; path=/\\nak_bmsc=2C83C5F047AEE31B65C6FA4EA1FB7D35~000000000000000000000000000000~YAAQMzovFx/nVrGLAQAAPjZotRU

In [11]:
list_of_urls

['https://zety.com/blog/production-coordinator-resume-example',
 'https://zety.com/blog/senior-graphic-designer-resume-example',
 'https://zety.com/blog/social-media-coordinator-resume-example',
 'https://zety.com/blog/stage-manager-resume-example',
 'https://zety.com/blog/technical-writer-resume-example',
 'https://zety.com/blog/theater-resume-example',
 'https://zety.com/blog/translator-resume-example',
 'https://zety.com/blog/ux-designer-resume-and-ui-developer-resume-examples',
 'https://zety.com/blog/video-editor-resume-example',
 'https://zety.com/blog/video-producer-resume-example']

In [None]:
import pickle
with open('resume_texts.pkl', 'wb') as f:
    pickle.dump(resume_texts, f)

In [56]:
import json

def advanced_web_scrape_api_call(url_to_scrape):
    '''
    Sends the url that we would like to scrape to the webscrapingapi
    with advanced search parameters for more specific scraping. 
    '''
    url = "https://api.webscrapingapi.com/v1"
    extract_rules = {
        "resume": {
            "selector": ".b-section.b-section--fixed-width",
            "output": "text"
        }
    }
    params = {
        "api_key": os.getenv("WEBSCRAPING_API_KEY"),
        "url": url_to_scrape,
        "render_js": 1,
        "proxy_type": "datacenter",
        "wait_until": "networkidle0",
        "extract_rules": json.dumps(extract_rules),
        "country": "us",
        "json_response": 1,
    }
    try:
        response = requests.get(url, params=params)
    except:
        "no resume found"
    return response

def extract_resume_from_response(response):
    '''
    Extracts the resume from the response content.
    '''
    response_json = response.json()
    body_json = json.loads(response_json['body'])
    resume = body_json['resume']
    return resume

In [54]:
new_test = advanced_web_scrape_api_call(url_list[303])
new_test_extract = extract_resume_from_response(new_test)

In [62]:
new_test.status_code

200

In [55]:
new_test_extract

['Linda E. HubbardCompTIA A+ Certified IT Support Technicianlinda.hubbard@gmail.com901-833-6388linkedin.com/in/lindahubbard\xa0Summary of Qualifications\xa0Personable and knowledgeable IT support technician with over 4 years of experience assisting customers with various hardware and software related issues. Provided in-depth technical support to clients at a Tier 2 level, solving 99.2% of issues without transferring to Tier 3 support. Seeking to provide expert technical support to enterprise organizations as the L3 tech support engineer at QuantX Group.\xa0Work Experience\xa0IT Technical Support Specialist L2/Tier 2January 2017–November 2019Jones Software Group, Memphis, TNKey Qualifications & ResponsibilitiesCoordinated with Level 1 technical support specialists to take over calls outside their level of support.Assisted customers with more difficult technical issues requiring a greater level of personalized care and in greater length.Escalated support desk tickets to Level 3 in the m

In [44]:
test_url_list = url_list[300:310]

In [65]:
import concurrent.futures

def process_url(url):
    '''
    Calls the advanced_web_scrape_api_call function and extracts the resume for a given URL.
    '''
    print(f"Processing {url}")
    response = advanced_web_scrape_api_call(url)
    if response.status_code == 200:
        resume = extract_resume_from_response(response)
    else:
        resume = None
    print(f"Finished processing {url}")
    return url, resume

with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    results = list(executor.map(process_url, url_list))

url_resume_dict = dict(results)

Processing https://zety.com/blog/accounting-resume-exampleProcessing https://zety.com/blog/accounting-assistant-resume-example

Processing https://zety.com/blog/accounting-clerk-resume-example
Processing https://zety.com/blog/accounting-manager-resume-example
Processing https://zety.com/blog/accounts-payable-resume-example
Finished processing https://zety.com/blog/accounting-assistant-resume-example
Processing https://zety.com/blog/accounts-receivable-resume-example
Finished processing https://zety.com/blog/accounting-manager-resume-example
Processing https://zety.com/blog/accounts-receivable-resume-example
Finished processing https://zety.com/blog/accounting-resume-example
Processing https://zety.com/blog/actuary-resume-example
Finished processing https://zety.com/blog/accounts-payable-resume-example
Processing https://zety.com/blog/analyst-resume-example
Finished processing https://zety.com/blog/accounting-clerk-resume-example
Processing https://zety.com/blog/auditor-resume-example
F

In [75]:
import pickle

with open('url_resume_dict.pkl', 'wb') as f:
    pickle.dump(url_resume_dict, f)

In [72]:
# Print the first few items in the dictionary
for url, resume in list(filtered_dict.items())[:5]:
    print(f"URL: {url}")
    print(f"Resume: {resume}")
    print()

URL: https://zety.com/blog/accounting-resume-example
Resume: ['Gerard LevineCertified Public Accountant\xa0+1-123-456-7890gerardlevine@email.comlinkedin.com/in/gerardlevine\xa0Summary\xa0Certified Public Accountant with 5+ years of experience in ledger processes, account reconciliations and streamlining accounts. Seeking to assist NovaGen Corporation in creating optimized financial solutions. Reduced annual budget costs by 10% at Radiant Innovations by restructuring my team’s organization, optimizing their performance and time management.\xa0Experience\xa0Senior AccountantRadiant Innovations, Kenosha, WIJuly 2016–PresentKey Qualifications & ResponsibilitiesManaged a diverse portfolio of over 100 clients, providing comprehensive accounting solutions.Led a team of 5 junior accountants to drive all aspects of accounting, from reconciliation to finalizing annual accounts.Streamlined financial reporting processes, improving accuracy and efficiency.Restructured my team to increase efficiency

In [79]:
empty_urls = []
url_success_resume = {}

for url, resume in url_resume_dict.items():
    if resume is None:
        empty_urls.append(url)
    else:
        if isinstance(resume, list):
            resume = ' '.join(resume)
        url_success_resume[url] = resume

In [81]:
# lets check the length of url_success_resume
len(url_success_resume)

428

In [82]:
# turn url_success_resume into a dataframe
import pandas as pd

df = pd.DataFrame.from_dict(url_success_resume, orient='index', columns=['resume'])

In [83]:
df

Unnamed: 0,resume
https://zety.com/blog/accounting-resume-example,Gerard LevineCertified Public Accountant +1-12...
https://zety.com/blog/accounting-assistant-resume-example,Brian WatkinsAccounting Assistantbrian.watkins...
https://zety.com/blog/accounting-clerk-resume-example,Roman ArkellAccounting Clerk781-984-9624romanz...
https://zety.com/blog/accounting-manager-resume-example,Lukas Summerslukas.summers@mail.us646-622-2766...
https://zety.com/blog/accounts-payable-resume-example,Moe Moneymoemoneyzety@gmail.com555-6949-527 Ca...
...,...
https://zety.com/blog/preschool-teacher-resume-example,Jennifer DelgadoPreschool Teacher(989) 555-123...
https://zety.com/blog/high-school-teacher-resume-example,Nicole GoodmanHigh School Teachernicole.goodma...
https://zety.com/blog/gamestop-resume-example,Bill C. SmithHigh-School Senior(262) 945-0337b...
https://zety.com/blog/mall-santa-resume-example,2Mall Santa - Resume Summary or Resume Objecti...
