In [2]:
import csv
import random
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent

# First listing page for developer jobs; pages 2+ are addressed as BASE_URL + "?page=N".
BASE_URL = "https://remote.co/remote-jobs/developer"
# Shared UserAgent instance; its `.random` attribute is read for each request's User-Agent header.
user_agent = UserAgent()

# Accumulates the job-detail URLs returned by fetch_links for every listing page (filled in Step 1).
all_job_links = []

# ---------------- Step 1: Fetch all job links with retries ----------------
def fetch_links(url, retries=3):
    """Collect the job-detail links found on one listing page.

    The page is requested up to ``retries`` times with a random User-Agent.
    Any failed attempt (a non-200 status or an exception) is followed by a
    random 1-2 second pause before the next attempt; no pause is taken after
    the final attempt.

    Args:
        url: Address of the listing page to fetch.
        retries: Maximum number of attempts before giving up.

    Returns:
        A list of absolute job-detail URLs, in document order, or an empty
        list when every attempt failed.
    """
    for attempt in range(retries):
        try:
            resp = requests.get(url, headers={"User-Agent": user_agent.random}, timeout=15)
            if resp.status_code == 200:
                soup = BeautifulSoup(resp.content, "html.parser")
                # urljoin leaves already-absolute hrefs untouched and resolves
                # relative ones against the site root, so the result is never
                # a doubled "https://remote.cohttps://..." string.
                return [
                    urljoin("https://remote.co", a["href"])
                    for a in soup.select("a[href*='/job-details/']")
                ]
        except Exception:
            # Best-effort scraping: treat any ordinary error as a failed
            # attempt. KeyboardInterrupt/SystemExit are deliberately not
            # caught so the run can still be interrupted.
            pass
        # Back off before retrying, whether the failure was an exception or a
        # non-200 response; skip the pointless wait after the last attempt.
        if attempt < retries - 1:
            time.sleep(random.uniform(1, 2))
    return []

# Page 1 is the bare BASE_URL; pages 2 through 27 carry an explicit ?page=N query.
page_urls = [BASE_URL] + [f"{BASE_URL}?page={page_no}" for page_no in range(2, 28)]

# Fetch all listing pages concurrently and merge their links as each page finishes.
with ThreadPoolExecutor(max_workers=20) as executor:
    futures = [executor.submit(fetch_links, page_url) for page_url in page_urls]
    for finished in as_completed(futures):
        page_links = finished.result()
        if not page_links:
            # Nothing found (or the page failed after all retries): skip silently.
            continue
        all_job_links.extend(page_links)
        print(f"Found {len(page_links)} links from a page. Total so far: {len(all_job_links)}")

print(f"\nStep 1 complete. Total job links found: {len(all_job_links)}\n")

# ---------------- Step 2: Scrape job details with retries ----------------
# Scraped job records (one dict per job); appended to as Step 2 futures complete.
all_jobs = []

def _text_or_empty(soup, selector):
    """Return the stripped text of the first element matching ``selector``, or "" if none."""
    node = soup.select_one(selector)
    return node.get_text(strip=True) if node is not None else ""


def _parse_job_page(soup, job_url):
    """Build the job record dict from a parsed job-detail page."""
    data = {
        "Title": _text_or_empty(soup, "h1"),
        "Company": _text_or_empty(soup, "h3"),
        "Remote Work Level": "",
        "Location": "",
        "Salary": "",
        "Job Type": "",
        "Job Schedule": "",
        "Career Level": "",
        "Education Level": "",
        "Categories": "",
        "URL": job_url,
    }

    # Each detail row is read as a pair of <p> elements: label, then value.
    for li in soup.select("ul#detail-list-wrapper li"):
        labels = li.find_all("p")
        if len(labels) >= 2:
            key = labels[0].get_text(strip=True).rstrip(":")
            # Only labels matching a predefined column are kept; others are ignored.
            if key in data:
                data[key] = labels[1].get_text(strip=True)

    return data


def fetch_job_data(job_url, retries=3):
    """Scrape one job-detail page into a flat record.

    The page is requested up to ``retries`` times with a random User-Agent.
    Any failed attempt (a non-200 status or an exception) is followed by a
    random 1-2 second pause before the next attempt; no pause is taken after
    the final attempt.

    Args:
        job_url: Address of the job-detail page.
        retries: Maximum number of attempts before giving up.

    Returns:
        A dict with the keys "Title", "Company", "Remote Work Level",
        "Location", "Salary", "Job Type", "Job Schedule", "Career Level",
        "Education Level", "Categories" and "URL" (fields missing from the
        page are empty strings), or None when every attempt failed.
    """
    for attempt in range(retries):
        try:
            resp = requests.get(job_url, headers={"User-Agent": user_agent.random}, timeout=15)
            if resp.status_code == 200:
                soup = BeautifulSoup(resp.content, "html.parser")
                return _parse_job_page(soup, job_url)
        except Exception:
            # Best-effort scraping: treat any ordinary error as a failed
            # attempt. KeyboardInterrupt/SystemExit are deliberately not
            # caught so the run can still be interrupted.
            pass
        # Back off before retrying, whether the failure was an exception or a
        # non-200 response; skip the pointless wait after the last attempt.
        if attempt < retries - 1:
            time.sleep(random.uniform(1, 2))
    return None

# Scrape every job-detail page concurrently, keeping only the pages that parsed.
with ThreadPoolExecutor(max_workers=100) as executor:
    futures = [executor.submit(fetch_job_data, job_link) for job_link in all_job_links]
    for done_count, finished in enumerate(as_completed(futures), start=1):
        record = finished.result()
        if record:
            all_jobs.append(record)
        # Progress line once per 50 completed futures (successful or not).
        if done_count % 50 != 0:
            continue
        print(f"Scraped {done_count}/{len(all_job_links)} jobs")

print(f"\nStep 2 complete. Total jobs scraped: {len(all_jobs)}\n")

# ---------------- Step 3: Save to CSV ----------------
if not all_jobs:
    print("No job data to save.")
else:
    # Column order follows the key order of the first scraped record.
    csv_columns = list(all_jobs[0].keys())
    with open("job_data1.csv", "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=csv_columns)
        writer.writeheader()
        for job_row in all_jobs:
            writer.writerow(job_row)
    print(f"[+] Saved {len(all_jobs)} jobs to job_data1.csv")

Found 50 links from a page. Total so far: 50
Found 50 links from a page. Total so far: 100
Found 50 links from a page. Total so far: 150
Found 50 links from a page. Total so far: 200
Found 50 links from a page. Total so far: 250
Found 50 links from a page. Total so far: 300
Found 50 links from a page. Total so far: 350
Found 50 links from a page. Total so far: 400
Found 50 links from a page. Total so far: 450
Found 50 links from a page. Total so far: 500
Found 50 links from a page. Total so far: 550
Found 50 links from a page. Total so far: 600
Found 50 links from a page. Total so far: 650
Found 50 links from a page. Total so far: 700
Found 50 links from a page. Total so far: 750
Found 50 links from a page. Total so far: 800
Found 50 links from a page. Total so far: 850
Found 50 links from a page. Total so far: 900
Found 30 links from a page. Total so far: 930
Found 50 links from a page. Total so far: 980
Found 50 links from a page. Total so far: 1030
Found 50 links from a page. Total 

In [3]:
import os
# Show the kernel's current working directory (relative paths such as the CSV file resolve against it).
current_dir = os.getcwd()
print(current_dir)

C:\Users\haris
