In [2]:
from urllib.request import Request, urlopen
from urllib.error import HTTPError
from html.parser import HTMLParser
from bs4 import BeautifulSoup
from io import StringIO
import requests

import time
import re

import pandas as pd
import json

import random

import os

import json

class MLStripper(HTMLParser):
    def __init__(self):
        super().__init__()
        self.reset()
        self.strict = False
        self.convert_charrefs= True
        self.text = StringIO()
    def handle_data(self, d):
        self.text.write(d)
    def get_data(self):
        return self.text.getvalue()

class LeverWebScraper:
    def __init__(self):
        self.urls = None
    
    def extract_urls(self, text):
        regex_pattern = r"(https?://[jobs.lever.co][^\s]+)"
        urls = re.findall(regex_pattern, text)
        return urls

    def remove_tags(self, html):
        # parse html content
        soup = BeautifulSoup(html, "html.parser")

        for data in soup(['style', 'script']):
            data.decompose()

        return ' '.join(soup.stripped_strings)
    
    def get_lever_sites(self):
        lever_gsheet = "https://docs.google.com/spreadsheets/d/18u2sKRKjKz9gwRyob0p9KmcyVC6NX8JaJhjOqsRmbKY/edit?usp=sharing"
        csv_url = lever_gsheet
        res = requests.get(url=csv_url)
        content = res.content#.decode('utf-8')

        content = self.remove_tags(content)
        #print("content")
        #print(content)
        urls = self.extract_urls(content)
        return urls
    
    def strip_tags(self, html):
        s = MLStripper()
        s.feed(html)
        return s.get_data()
    
    
    def clean_job_posting(self, posting):
        text = self.strip_tags(posting)

        # Add spaces between consecutive uppercase letters and lowercase letters
        cleaned_text = re.sub(r'([a-z])([A-Z])', r'\1 \2', text)

        # Add spaces between consecutive letters and non-letters
        cleaned_text = re.sub(r'([a-zA-Z])([^a-zA-Z])', r'\1 \2', cleaned_text)
        cleaned_text = re.sub(r'([^a-zA-Z])([a-zA-Z])', r'\1 \2', cleaned_text)

        # Remove any remaining multiple spaces
        cleaned_text = re.sub(r'\s+', ' ', cleaned_text)

        cleaned_text = cleaned_text.replace("( ", "")
        cleaned_text = cleaned_text.replace(" )", "")


        if cleaned_text.startswith('Apply'):
            cleaned_text = cleaned_text.replace('Apply', '')

        if cleaned_text.endswith('Remote'):
            cleaned_text = '[REMOTE] - ' + cleaned_text
            cleaned_text = cleaned_text.replace('Remote', '')  
        elif cleaned_text.endswith('On-site'):
            cleaned_text = '[ON-SITE] - ' + cleaned_text
            cleaned_text = cleaned_text.replace('On-site', '')            
        elif cleaned_text.endswith('Hybrid'):
            cleaned_text = '[HYBRID] - ' + cleaned_text
            cleaned_text = cleaned_text.replace('Hybrid', '')

        if "Full - Time" in cleaned_text:
            cleaned_text = '[Full-Time]' + cleaned_text
            cleaned_text = cleaned_text.replace("Full - Time", "")
        elif "Part - time" in cleaned_text:
            cleaned_text = '[Part-Time]' + cleaned_text
            cleaned_text = cleaned_text.replace("Part - time", "")        

        cleaned_text = cleaned_text.replace("  ", " ")


        return cleaned_text

        all_postings = list()

        map0 = {
            'urls': [],
            'descriptions': [],
            'points': []
        }

    def get_postings(self):

        all_postings = list()

        map0 = {
            'urls': [],
            'descriptions': [],
            'points': []
        }
        links = self.get_lever_sites()
        links = [l + "/" if not l.endswith("/") else l for l in links ]
        links = [l for l in links if ".eu." not in l] # Not looking for work in Europe (yet)

        links = list(set(links))

        random.shuffle(links)
        random.shuffle(links)
        random.shuffle(links)

        current_link_n = 1
        n_links = len(links)

        #if True:
        #    for l in links:
        #        print(l)
        #    return

        for current_url in links:
            #print("Link {0}/{1}".format(current_link_n, n_links))
            current_link_n += 1

            req = Request(current_url, headers={'User-Agent': 'Mozilla/5.0'})
            try:
                html_page = urlopen(req).read()
            except HTTPError as e:
                if e.code == 404:
                    print("HTTP 404 error: Page not found")
                else:
                    print("An HTTP error occurred:", e)
                continue

            soup = BeautifulSoup(html_page, 'html.parser')
            postings = soup.find_all("div", {"class": "posting"})
            for posting in postings:

                clean_posting = self.clean_job_posting(str(posting))
                clean_posting = clean_posting.replace('Sr . ', 'Sr. ')
                points = 0

                if "intern" in clean_posting.lower():
                    points += 2
                if "Data" in clean_posting:
                    points += 1
                if "Analyst" in clean_posting:
                    points += 0.5
                if "REMOTE" in clean_posting:
                    points += 1
                if "Business Analyst" in clean_posting:
                    points += 0.5
                if "Business Intelligence" in clean_posting:
                    points += 1
                if "Data Scientist" in clean_posting:
                    points += 1
                if "Machine Learning" in clean_posting:
                    points += 1          

                skip_posting = False

                data_keyword_prsent = "data" in clean_posting.lower()
                science_keyword_prsent = "science" in clean_posting.lower() or "scientist" in clean_posting.lower()
                data_science_referenced = data_keyword_prsent and science_keyword_prsent

                machine_learning_referenced = "machine" in clean_posting.lower() and "learning" in clean_posting.lower()

                data_analyst_referenced = "data" in clean_posting.lower() and "analyst" in clean_posting.lower()

                onsite_keywords = ["on - site", "onsite"]
                foreign_locations = ["UKLondon", "vienna", "istanbul", "singapore", "philippines", "australia", "santiago", "mumbai"]
                foreign_locations += ["berlin", "bengaluru"]
                senority_keywords = ["head of", "senior", "sr", "director", "lead"]
                keywords_to_exclude = onsite_keywords + foreign_locations + senority_keywords

                for w in keywords_to_exclude:
                    if w in clean_posting.lower():
                        skip_posting = True
                        break


                if not (data_science_referenced or machine_learning_referenced or data_analyst_referenced): #and intern_keyword_present
                    skip_posting = True

                if skip_posting:
                    continue


                map0['urls'].append(current_url)
                map0['descriptions'].append(clean_posting)
                map0['points'].append(points)      
                print(current_url, '\t', clean_posting)


                #if clean_posting.startswith('[REMOTE]'):

            #print('-'*40)
            time.sleep(1.0)
        return map0

lever_scraper = LeverWebScraper()
#lever_sites = lever_scraper.get_lever_sites()
#for l in lever_sites:
#    print(l)
postings_map = lever_scraper.get_postings()

https://jobs.lever.co/gotit/ 	  Machine Learning Engineer / Scientist Burlingame , CAAIFull - time
https://jobs.lever.co/gotit/ 	 [REMOTE] - Machine Learning Internship Palo Alto , CAAIIntern 
https://jobs.lever.co/concurrency/ 	 [REMOTE] - Data Scientist United States Data & AIFull - time 


KeyboardInterrupt: 

In [16]:
pd.set_option('display.max_rows', 500)
df = pd.DataFrame(postings_map)
df = df.sort_values(by=['points'], ascending=False)

from pandas import option_context

with option_context('display.max_colwidth', 400):
    display(df.head(50))

Unnamed: 0,urls,descriptions,points
53,https://jobs.lever.co/gotit/,"[REMOTE] - Machine Learning Internship Palo Alto , CAAIIntern",4.0
95,https://jobs.lever.co/overbond/,[REMOTE] - Data Science Intern - May 2023 PEY / Co - op Canada Data Science Full - time,4.0
51,https://jobs.lever.co/dott/,[HYBRID] - Data Analyst intern Amsterdam Data – Data Internship,3.5
17,https://jobs.lever.co/smartadserver/,[HYBRID] - Data Analyst Intern - Analytics Lab Paris Analytics Internship,3.5
23,https://jobs.lever.co/tokenmetrics/,[Full-Time][REMOTE] - Crypto Data Scientist India Hyderabad Data Science Team,3.0
25,https://jobs.lever.co/tokenmetrics/,[Full-Time][REMOTE] - Crypto Data Scientist Krakow Data Science Team,3.0
27,https://jobs.lever.co/tokenmetrics/,[Full-Time][REMOTE] - Crypto Data Scientist Columbo Data Science Team,3.0
28,https://jobs.lever.co/tokenmetrics/,[Full-Time][REMOTE] - Crypto Data Scientist Dhaka Data Science Team,3.0
29,https://jobs.lever.co/tokenmetrics/,[Full-Time][REMOTE] - Crypto Data Scientist Kinshasa Data Science Team,3.0
32,https://jobs.lever.co/concurrency/,[REMOTE] - Data Scientist United States Data & AIFull - time,3.0
