remove scraping
Part of migration from local Scrapy to Plucker and Apify.
honzajavorek committed Jan 23, 2024
1 parent 6746391 commit bccd685
Showing 73 changed files with 116 additions and 20,057 deletions.
7 changes: 0 additions & 7 deletions .circleci/config.yml
@@ -70,8 +70,6 @@ jobs:
- ./node_modules
- restore_cache:
key: sync-v1-{{ .Branch }}-{{ checksum ".today" }}
- restore_cache:
key: jobs-v2-{{ .Branch }}
- restore_cache:
key: images-v1-{{ .Branch }}
- run:
@@ -177,11 +175,6 @@ jobs:
key: sync-v1-{{ .Branch }}-{{ checksum ".today" }}
paths:
- .sync_cache
- .scrapy
- save_cache:
key: jobs-v2-{{ .Branch }}-{{ checksum ".today" }}
paths:
- juniorguru/data/jobs
- save_cache:
key: images-v1-{{ .Branch }}-{{ .Revision }}
paths:
1 change: 0 additions & 1 deletion .gitignore
@@ -82,7 +82,6 @@ google_service_account.json
# Synced data
/juniorguru/data/*.db
/juniorguru/data/*.db-*
/juniorguru/data/jobs/
/juniorguru/images/avatars-club/*.png
/juniorguru/images/logos-jobs/*.png
/juniorguru/images/posters-events/*.png

This file was deleted.

7 changes: 0 additions & 7 deletions juniorguru/lib/repr.py

This file was deleted.

42 changes: 0 additions & 42 deletions juniorguru/lib/scrapers.py

This file was deleted.

42 changes: 0 additions & 42 deletions juniorguru/lib/url_params.py
@@ -1,4 +1,3 @@
import re
from urllib.parse import parse_qs, urlencode, urlparse, urlunparse


@@ -18,44 +17,3 @@ def strip_params(url, param_names):

def strip_utm_params(url):
return strip_params(url, UTM_PARAM_NAMES)


def set_params(url, params):
parts = urlparse(url)
url_params = {name: value for name, value in parse_qs(parts.query).items()}
for name, value in params.items():
url_params[name] = ["" if value is None else str(value)]
query = urlencode(url_params, doseq=True)
return urlunparse(parts._replace(query=query))


def get_param(url, param_name):
parts = urlparse(url)
values = parse_qs(parts.query).get(param_name, [])
return values[0] if values else None


def increment_param(url, param_name, inc=1):
parts = urlparse(url)
params = parse_qs(parts.query)
params.setdefault(param_name, ["0"])
params[param_name] = str(int(params[param_name][0]) + inc)
query = urlencode(params, doseq=True)
return urlunparse(parts._replace(query=query))


def replace_in_params(url, s, repl, case_insensitive=False):
parts = urlparse(url)
params = parse_qs(parts.query)

if case_insensitive:
replace = lambda value: re.sub(re.escape(s), repl, value, flags=re.I)
else:
replace = lambda value: value.replace(s, repl)

params = {
param_name: [replace(value) for value in values]
for param_name, values in params.items()
}
query = urlencode(params, doseq=True)
return urlunparse(parts._replace(query=query))
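
The functions removed from juniorguru/lib/url_params.py above are thin wrappers around urllib.parse. A minimal usage sketch of the removed helpers, with a made-up URL and parameter names (not taken from the codebase):

url = "https://example.com/jobs?page=1"

url = set_params(url, {"employment": "intern"})   # adds or overwrites a query param
page = get_param(url, "page")                     # -> "1" (always a string, or None)
url = increment_param(url, "page")                # page=1 becomes page=2
url = replace_in_params(url, "intern", "junior", case_insensitive=True)
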
3 changes: 0 additions & 3 deletions juniorguru/models/base.py
@@ -5,7 +5,6 @@
from functools import wraps
from pathlib import Path

import scrapy
from czech_sort import bytes_key as czech_sort_key
from peewee import (
Check,
@@ -69,8 +68,6 @@ def __init__(self, *args, **kwargs):

def json_dumps(value):
def default(o):
if isinstance(o, scrapy.Item):
return dict(o)
if isinstance(o, Set):
return list(o)
try:
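
The json_dumps helper shown above uses the standard json.dumps default hook; the commit only drops the scrapy.Item branch. A stripped-down sketch of the pattern, with the ensure_ascii flag and the final TypeError fallback assumed rather than taken from the diff:

import json
from collections.abc import Set

def json_dumps(value):
    def default(o):
        if isinstance(o, Set):
            return list(o)  # sets serialize as JSON arrays
        raise TypeError(f"{type(o)} is not JSON serializable")  # assumed fallback
    return json.dumps(value, default=default, ensure_ascii=False)  # flags assumed

json_dumps({"tags": frozenset(["python"])})  # -> '{"tags": ["python"]}'
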
46 changes: 23 additions & 23 deletions juniorguru/sync/jobs_scraped/__init__.py
@@ -13,7 +13,6 @@
postprocess_jobs,
process_paths,
)
from juniorguru.sync.scrape_jobs.settings import FEEDS_DIR


PREPROCESS_PIPELINES = [
@@ -39,25 +38,26 @@
@click.option("--reuse-db/--no-reuse-db", default=False)
@click.option("--latest-seen-on", default=None, type=date.fromisoformat)
def main(reuse_db, latest_seen_on):
paths = list(Path(FEEDS_DIR).glob("**/*.jsonl.gz"))
logger.info(f"Found {len(paths)} .json.gz paths")

with db.connection_context():
if reuse_db:
logger.warning("Reusing of existing jobs database is enabled!")
try:
latest_seen_on = ScrapedJob.latest_seen_on()
logger.info(f"Last jobs seen on: {latest_seen_on}")
except OperationalError:
logger.warning("Jobs database not operational!")

if latest_seen_on:
paths = filter_relevant_paths(paths, latest_seen_on)
logger.info(f"Keeping {len(paths)} relevant .json.gz paths")
else:
logger.info("Not reusing jobs database")
ScrapedJob.drop_table()
ScrapedJob.create_table()

process_paths(paths, PREPROCESS_PIPELINES)
postprocess_jobs(POSTPROCESS_PIPELINES)
raise NotImplementedError()
# paths = list(Path(FEEDS_DIR).glob("**/*.jsonl.gz"))
# logger.info(f"Found {len(paths)} .json.gz paths")

# with db.connection_context():
# if reuse_db:
# logger.warning("Reusing of existing jobs database is enabled!")
# try:
# latest_seen_on = ScrapedJob.latest_seen_on()
# logger.info(f"Last jobs seen on: {latest_seen_on}")
# except OperationalError:
# logger.warning("Jobs database not operational!")

# if latest_seen_on:
# paths = filter_relevant_paths(paths, latest_seen_on)
# logger.info(f"Keeping {len(paths)} relevant .json.gz paths")
# else:
# logger.info("Not reusing jobs database")
# ScrapedJob.drop_table()
# ScrapedJob.create_table()

# process_paths(paths, PREPROCESS_PIPELINES)
# postprocess_jobs(POSTPROCESS_PIPELINES)
8 changes: 7 additions & 1 deletion juniorguru/sync/jobs_submitted.py
@@ -3,6 +3,9 @@
from datetime import date, timedelta
from urllib.parse import urlparse

import langdetect
from w3lib.html import remove_tags

from juniorguru.cli.sync import main as cli
from juniorguru.lib import google_sheets, loggers
from juniorguru.lib.google_coerce import (
@@ -24,7 +27,6 @@
from juniorguru.sync.jobs_scraped.pipelines.employment_types_cleaner import (
clean_employment_types,
)
from juniorguru.sync.scrape_jobs.pipelines.language_parser import parse_language


logger = loggers.from_path(__file__)
@@ -102,6 +104,10 @@ def coerce_record(record, today=None):
return data


def parse_language(description_html: str) -> str:
return langdetect.detect(remove_tags(description_html))


def parse_locations(value):
if value:
return [loc.strip() for loc in re.split(r"\snebo\s", value)]
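
The new inline parse_language above just strips HTML with w3lib and runs langdetect on the remaining text. A quick sketch with a made-up job description (the sample HTML is not from the spreadsheet data):

import langdetect
from w3lib.html import remove_tags

html = "<p>We are looking for a junior backend developer.</p>"
langdetect.detect(remove_tags(html))  # returns an ISO 639-1 code, e.g. "en"
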
68 changes: 0 additions & 68 deletions juniorguru/sync/scrape_jobs/__init__.py

This file was deleted.
