In [8]:
#Import Libraries
import pandas as pd
import time
import re
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager


In [4]:
# Set up chrome web driver
options = webdriver.ChromeOptions()
options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# set defaault url
url = "https://remoteok.com/remote-data-scientist-jobs"
driver.get(url)

# Wait for jobs to load
wait = WebDriverWait(driver, 15)
job_rows = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "tr.job")))

# Scroll to load more
for _ in range(3):
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(2)

jobs = []
for row in driver.find_elements(By.CSS_SELECTOR, "tr.job"):
    try:
        title = row.find_element(By.TAG_NAME, "h2").text
        company = row.find_element(By.TAG_NAME, "h3").text
        date_posted = row.find_element(By.TAG_NAME, "time").get_attribute("datetime")

        # To get the tags
        tags = [tag.text for tag in row.find_elements(By.CSS_SELECTOR, ".tag")]
        
        # Extract job locations and salaries
        locations = row.find_elements(By.CSS_SELECTOR, ".location")
        location = locations[0].text if len(locations) > 0 else None
        salary = locations[1].text if len(locations) > 1 else None
        
        jobs.append({
            "title": title,
            "company": company,
            "date_posted": date_posted,
            "tags": tags,
            "location": location,
            "salary": salary
        })
    except:
        pass

driver.quit()

# Create dataFrame 
df = pd.DataFrame(jobs)

# Export data to a csv file
df.to_csv("remoteok_jobs.csv", index=False)


In [12]:
df.head(11)

Unnamed: 0,title,company,date_posted,tags,location,salary
0,$1000 Weekly Work from Home,MasterCraft Boating Company,2025-08-21T01:14:09+00:00,"[Sales, Customer Support, Admin, , , , , , ]",🇺🇸 United States,💰 $120k - $160k
1,Senior Full Stack Engineer,Aguru UK,2025-09-06T15:10:56+00:00,"[Engineer, Developer, Technical, ]",🇪🇺 Europe,🇬🇧 United Kingdom
2,Senior DevOps Engineer,Aguru UK,2025-09-06T14:38:18+00:00,"[Engineer, Technical, Backend, ]",🇪🇺 Europe,🇬🇧 United Kingdom
3,Social Media Manager & Shitposter @ Fun Ecom Co,JLS Trading Co,2025-09-06T11:34:45+00:00,"[Marketing, Full Time, Remote, ]",🌏 Worldwide,💰 $20k - $60k
4,Mid level Backend Dev for SaaS the music industry,MelodyIQ,2025-09-05T06:16:35+00:00,"[Developer, JavaScript, React, , , ]",🌏 Worldwide,💰 $80k - $100k
5,Online Data Research with Bonus,TELUS Digital,2025-09-04T08:55:40+00:00,"[Entry-Level, Computer, Online, , , , , , , ]",🇺🇸 United States,💰 $10k - $20k
6,Python Developer,Ampcontrol,2025-08-27T15:24:06+00:00,"[Developer, Python, English, , ]",🇪🇺 Europe,💃 Latin America
7,UX Engineer,Baymard Institute,2025-08-26T08:36:58+00:00,"[Front End, JavaScript, React, , , , , ]",🇪🇺 Europe 🔒,🇬🇧 United Kingdom 🔒
8,DeFi Analyst and Educator,Decentralized Masters,2025-08-22T13:22:49+00:00,"[Edu, Teaching, Crypto, ]",🌏 Worldwide,💰 $40k - $100k
9,Engineering Lead,Aragon,2025-08-28T12:00:03+00:00,"[Design, Osx, Web3, , , , , , , , , , , , , ]",🌏 Worldwide,💰 $60k - $140k*


## Clean and flatten the tags

In [None]:
# Ensure tag is always a list
df["tags"] = df["tags"].apply(lambda x: x if isinstance(x, list) else [])

# Split tags that are joined in one string by commas
cleaned_tags = []
for row in df["tags"]:
    if isinstance(row, list):
        for tag in row:
            # strip on commas and strip spaces
            for t in tag.split(","):
                t = t.strip()
                if t:
                    cleaned_tags.append(t)

# Put flattened skills into a new DataFrame
skills_df = pd.DataFrame(cleaned_tags, columns=["skill"])

# Normalize (lowercase, remove spcial characters)
skills_df["skill"] = (skills_df["skill"]
    .str.lower()
    .str.replace(r"[^a-zA-Z0-9+ ]", "", regex = True)
)
skills_df.head()

In [11]:
skills_df.nunique()

skill    91
dtype: int64

## Parse salary column

In [13]:
def parse_salary(s):
    """
    parse salary column and separate minimum and maximum salary to new columns
    Parameters:
    s - salary to parse

    Returns parsed salary
    
    """

    if not s or not isinstance(s, str):
        return None, None

    # Remove emojis and spaces
    s = s.replace("💰", "").strip()

    # Find numbers with k or digits
    match = re.findall(r"(\d+)[kK]", s)
    if len(match) >= 2:
        return int(match[0])* 1000, int(match[1]) * 1000
    elif len(match) == 1:
        return int(match[0])* 1000, None

df[["salary_min", "salary_max"]] = df["salary"].apply(lambda x: pd.Series(parse_salary(x)))

In [26]:
df.head()

Unnamed: 0,company,date_posted,tags,salary_min,salary_max,location_clean,title_clean
0,MasterCraft Boating Company,2025-08-21T01:14:09+00:00,"[Sales, Customer Support, Admin, , , , , , ]",120000.0,160000.0,United States,1000 Weekly Work from Home
1,Aguru UK,2025-09-06T15:10:56+00:00,"[Engineer, Developer, Technical, ]",,,Europe,Senior Full Stack Engineer
2,Aguru UK,2025-09-06T14:38:18+00:00,"[Engineer, Technical, Backend, ]",,,Europe,Senior DevOps Engineer
3,JLS Trading Co,2025-09-06T11:34:45+00:00,"[Marketing, Full Time, Remote, ]",20000.0,60000.0,Worldwide,Social Media Manager Shitposter Fun Ecom Co
4,MelodyIQ,2025-09-05T06:16:35+00:00,"[Developer, JavaScript, React, , , ]",80000.0,100000.0,Worldwide,Mid level Backend Dev for SaaS the music industry


In [23]:
# Preprocess
def preprocess(df):
    # Normalize location
    df["location_clean"] = df["location"].str.replace(r"[^\w\s]", "", regex = True).str.strip()

    # Clean title
    df["title_clean"] = df["title"].str.replace(r"[^\w\s]", "", regex = True).str.strip()

    # drop cols 
    df.drop(columns = ["title", "location", "salary"], inplace = True)
    
    return df

In [29]:
df_clean = preprocess(df)

In [30]:
df_clean.head()

Unnamed: 0,company,date_posted,tags,salary_min,salary_max,location_clean,title_clean
0,MasterCraft Boating Company,2025-08-21T01:14:09+00:00,"[Sales, Customer Support, Admin, , , , , , ]",120000.0,160000.0,United States,1000 Weekly Work from Home
1,Aguru UK,2025-09-06T15:10:56+00:00,"[Engineer, Developer, Technical, ]",,,Europe,Senior Full Stack Engineer
2,Aguru UK,2025-09-06T14:38:18+00:00,"[Engineer, Technical, Backend, ]",,,Europe,Senior DevOps Engineer
3,JLS Trading Co,2025-09-06T11:34:45+00:00,"[Marketing, Full Time, Remote, ]",20000.0,60000.0,Worldwide,Social Media Manager Shitposter Fun Ecom Co
4,MelodyIQ,2025-09-05T06:16:35+00:00,"[Developer, JavaScript, React, , , ]",80000.0,100000.0,Worldwide,Mid level Backend Dev for SaaS the music industry


In [15]:
# Normalize location
df["location_clean"] = df["location"].str.replace(r"[^\w\s]", "", regex = True).str.strip()

In [17]:
# Clean title
df["title_clean"] = df["title"].str.replace(r"[^\w\s]", "", regex = True).str.strip()