Merge pull request #24 from jazzify/feature/auto-scrape
Create a command to scrape news platforms, also fix some dependencies
jazzify committed Feb 5, 2020
2 parents 5c6ffc4 + 584a0db commit bcaa0c8
Showing 7 changed files with 36 additions and 20 deletions.
7 changes: 6 additions & 1 deletion common/docs/source/scraper.rst
@@ -44,4 +44,9 @@ TODO
 
 Scraper views
 --------------------
-``PostViewSet`` will handle our ``posts`` methods logic; it has all the methods inherited from ``ModelViewSet`` and also ``scrape_websites``, which is used to scrape every post on the news platforms.
+``PostViewSet`` will handle our ``posts`` methods logic; it has all the methods inherited from ``ModelViewSet``.
+
+
+Scraper commands
+--------------------
+The ``scrape`` command scrapes every post on the news platforms using the ``ScraperManager``; it is meant to run as a cron job.
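
The docs describe ``scrape`` as a cron-driven command. As a minimal sketch (not part of this commit), it can be run ad hoc with ``python manage.py scrape`` or invoked programmatically through Django's ``call_command``; the crontab line below is purely illustrative, and the project path is an assumption:

# Run the scraper by hand:
#   python manage.py scrape
# Or schedule it, e.g. hourly via a crontab entry (illustrative path):
#   0 * * * * cd /srv/app && python manage.py scrape
# The same command can also be triggered from Python code or tests:
from django.core.management import call_command

call_command("scrape")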
2 changes: 0 additions & 2 deletions requirements-dev.txt
@@ -1,4 +1,2 @@
-PyYAML==5.1.2
-coreapi==2.3.3
 black==19.3b0
 pylint==2.4.2
2 changes: 2 additions & 0 deletions requirements.txt
@@ -33,5 +33,7 @@ websockets==8.0.2
 wrapt==1.11.2
 psycopg2==2.8.4
 django-debug-toolbar==2.2
+PyYAML==5.1.2
+coreapi==2.3.3
 django_heroku
 gunicorn
Empty file.
11 changes: 11 additions & 0 deletions scraper/management/commands/scrape.py
@@ -0,0 +1,11 @@
+from django.core.management.base import BaseCommand
+
+from scraper.services import ScraperManager
+
+
+class Command(BaseCommand):
+    help = 'Scrape the news platforms'
+
+    def handle(self, *args, **options):
+        scraper_manager = ScraperManager()
+        # run_requests() returns the errors collected while scraping
+        errors = scraper_manager.run_requests()
+
+        # Report a failure only when the scraper actually returned errors
+        if errors:
+            self.stdout.write(self.style.ERROR(f'Scraping errors "{errors}"'))
+        else:
+            self.stdout.write(self.style.SUCCESS('Scraping finished without errors'))
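
A hedged sketch of exercising the new command from a Django test, capturing its output via ``call_command``; the test class, patch target, and assertion string are assumptions, and the ``ScraperManager`` is stubbed so the test never hits the network:

from io import StringIO
from unittest.mock import patch

from django.core.management import call_command
from django.test import SimpleTestCase


class ScrapeCommandTests(SimpleTestCase):
    # Patch the manager where the command imports it, so run_requests()
    # returns a canned (empty) error list instead of making requests
    @patch("scraper.management.commands.scrape.ScraperManager")
    def test_scrape_writes_status(self, manager_cls):
        manager_cls.return_value.run_requests.return_value = []
        out = StringIO()
        call_command("scrape", stdout=out)  # collect what the command prints
        self.assertIn("Scraping", out.getvalue())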
15 changes: 11 additions & 4 deletions scraper/services.py
@@ -59,9 +59,15 @@ def _parse_eltiempo(self, request, post, base_url):
         soup = BeautifulSoup(request.text, "html.parser")
 
         # Cover image
-        cover_url = soup.select_one(
-            "figure.foto-seccion-home.image-container"
-        ).select_one("img")["data-original"]
+        try:
+            cover_url = soup.select_one(
+                "figure.foto-seccion-home.image-container"
+            ).select_one("img")["data-original"]
+        except AttributeError:
+            # select_one() returned None: this post uses the video variant
+            # of the cover figure, so fall back to that selector
+            cover_url = soup.select_one(
+                "figure.foto-seccion-home-video.image-container"
+            ).select_one("img")["data-original"]
 
         post.cover_img_url = f"{base_url}{cover_url}"
 
         # Principal post anchor
@@ -116,9 +122,10 @@ def _parse_elpais(self, request, post, base_url):
         # Subtitle
         post.subtitle = post_soup.select_one("h2.articulo-subtitulo").string
         # Cover Img
-        post_img = post_soup.select_one("#articulo_contenedor > figure > img")[
+        post_img = post_soup.select_one("figure.foto > img")[
             "data-src"
         ]
         post.cover_img_url = f"{base_url}{post_img}"
         # Author
         post.author = post_soup.select_one("span.autor-nombre > a").string
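
The ``try``/``except AttributeError`` above works because ``select_one`` returns ``None`` when nothing matches, so the chained ``.select_one("img")`` raises. A small sketch of the same fallback generalized to any number of selectors; the helper name and sample HTML are illustrative, not part of the commit:

from bs4 import BeautifulSoup


def first_image_url(soup, selectors, attr="data-original"):
    # Try each figure selector in order; return the first image URL found
    for selector in selectors:
        figure = soup.select_one(selector)
        if figure is None:
            continue
        img = figure.select_one("img")
        if img is not None and img.has_attr(attr):
            return img[attr]
    return None


html = '<figure class="foto-seccion-home-video image-container"><img data-original="/img/a.jpg"></figure>'
soup = BeautifulSoup(html, "html.parser")
# Mirrors _parse_eltiempo's fallback chain
print(first_image_url(soup, [
    "figure.foto-seccion-home.image-container",
    "figure.foto-seccion-home-video.image-container",
]))  # -> /img/a.jpg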
19 changes: 6 additions & 13 deletions scraper/views.py
@@ -1,23 +1,16 @@
 from django.shortcuts import render
-from rest_framework import viewsets, status
-from rest_framework.decorators import action
+from rest_framework import viewsets
 from rest_framework.response import Response
 
 from scraper.models import Post
 from scraper.serializers import PostSerializer
-from scraper.services import ScraperManager
 
 
 class PostViewSet(viewsets.ModelViewSet):
-    queryset = Post.objects.all()[:10]
+    queryset = Post.objects.all()
     serializer_class = PostSerializer
 
-    @action(detail=False, methods=["get"])
-    def scrape_websites(self, request):
-        scraper_manager = ScraperManager()
-        errors = scraper_manager.run_requests()
+    def list(self, request, *args, **kwargs):
+        queryset = self.get_queryset()[:10]
 
-        if errors:
-            return Response({"errors": errors}, status=status.HTTP_500_INTERNAL_SERVER_ERROR)
-        else:
-            return Response(status=status.HTTP_200_OK)
+        serializer = self.get_serializer(queryset, many=True)
+        return Response(serializer.data)
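
With ``scrape_websites`` removed, scraping now happens only through the management command, and ``PostViewSet`` serves plain read/write ``posts`` endpoints whose ``list`` response is capped at ten posts. A minimal sketch of wiring the viewset up with a DRF router; the ``posts`` prefix and the urls module are assumptions not shown in this diff:

from rest_framework.routers import DefaultRouter

from scraper.views import PostViewSet

router = DefaultRouter()
# GET /posts/ hits the overridden list(), which slices the queryset to 10
router.register(r"posts", PostViewSet)

urlpatterns = router.urls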
