Merge pull request #24 from jazzify/feature/auto-scrape
Create a command to scrape news platforms, also fix some dependencies
jazzify committed Feb 5, 2020
2 parents 5c6ffc4 + 584a0db commit bcaa0c8
Showing 7 changed files with 36 additions and 20 deletions.
7 changes: 6 additions & 1 deletion common/docs/source/scraper.rst
@@ -44,4 +44,9 @@ TODO
 
 Scraper views
 --------------------
-``PostViewSet`` will handle our ``posts`` methods logic; it has all the methods inherited from ``ModelViewSet`` and also ``scrape_websites``, which is used to scrape every post on the news platforms.
+``PostViewSet`` will handle our ``posts`` methods logic; it has all the methods inherited from ``ModelViewSet``.
+
+
+Scraper commands
+--------------------
+The ``scrape`` command scrapes every post on the news platforms using the ``ScraperManager``; it is meant to run as a cron job.
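
The docs describe ``scrape`` as a cron-driven command. As a minimal sketch (not part of this commit), it can be run ad hoc with ``python manage.py scrape`` or invoked programmatically through Django's ``call_command``; the crontab line below is purely illustrative, and the project path is an assumption:

# Run the scraper by hand:
#   python manage.py scrape
# Or schedule it, e.g. hourly via a crontab entry (illustrative path):
#   0 * * * * cd /srv/app && python manage.py scrape
# The same command can also be triggered from Python code or tests:
from django.core.management import call_command

call_command("scrape")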
2 changes: 0 additions & 2 deletions requirements-dev.txt
@@ -1,4 +1,2 @@
-PyYAML==5.1.2
-coreapi==2.3.3
 black==19.3b0
 pylint==2.4.2
2 changes: 2 additions & 0 deletions requirements.txt
@@ -33,5 +33,7 @@ websockets==8.0.2
 wrapt==1.11.2
 psycopg2==2.8.4
 django-debug-toolbar==2.2
+PyYAML==5.1.2
+coreapi==2.3.3
 django_heroku
 gunicorn
Empty file.
11 changes: 11 additions & 0 deletions scraper/management/commands/scrape.py
@@ -0,0 +1,11 @@
+from django.core.management.base import BaseCommand
+
+from scraper.services import ScraperManager
+
+
+class Command(BaseCommand):
+    help = 'Scrape the news platforms'
+
+    def handle(self, *args, **options):
+        scraper_manager = ScraperManager()
+        # run_requests() returns the errors collected while scraping
+        errors = scraper_manager.run_requests()
+
+        # Report a failure only when the scraper actually returned errors
+        if errors:
+            self.stdout.write(self.style.ERROR(f'Scraping errors "{errors}"'))
+        else:
+            self.stdout.write(self.style.SUCCESS('Scraping finished without errors'))
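
A hedged sketch of exercising the new command from a Django test, capturing its output via ``call_command``; the test class, patch target, and assertion string are assumptions, and the ``ScraperManager`` is stubbed so the test never hits the network:

from io import StringIO
from unittest.mock import patch

from django.core.management import call_command
from django.test import SimpleTestCase


class ScrapeCommandTests(SimpleTestCase):
    # Patch the manager where the command imports it, so run_requests()
    # returns a canned (empty) error list instead of making requests
    @patch("scraper.management.commands.scrape.ScraperManager")
    def test_scrape_writes_status(self, manager_cls):
        manager_cls.return_value.run_requests.return_value = []
        out = StringIO()
        call_command("scrape", stdout=out)  # collect what the command prints
        self.assertIn("Scraping", out.getvalue())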
15 changes: 11 additions & 4 deletions scraper/services.py
@@ -59,9 +59,15 @@ def _parse_eltiempo(self, request, post, base_url):
         soup = BeautifulSoup(request.text, "html.parser")
 
         # Cover image
-        cover_url = soup.select_one(
-            "figure.foto-seccion-home.image-container"
-        ).select_one("img")["data-original"]
+        try:
+            cover_url = soup.select_one(
+                "figure.foto-seccion-home.image-container"
+            ).select_one("img")["data-original"]
+        except AttributeError:
+            # select_one() returned None: this post uses the video variant
+            # of the cover figure, so fall back to that selector
+            cover_url = soup.select_one(
+                "figure.foto-seccion-home-video.image-container"
+            ).select_one("img")["data-original"]
 
         post.cover_img_url = f"{base_url}{cover_url}"
 
         # Principal post anchor
@@ -116,9 +122,10 @@ def _parse_elpais(self, request, post, base_url):
         # Subtitle
         post.subtitle = post_soup.select_one("h2.articulo-subtitulo").string
         # Cover Img
-        post_img = post_soup.select_one("#articulo_contenedor > figure > img")[
+        post_img = post_soup.select_one("figure.foto > img")[
             "data-src"
         ]
         post.cover_img_url = f"{base_url}{post_img}"
         # Author
         post.author = post_soup.select_one("span.autor-nombre > a").string
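
The ``try``/``except AttributeError`` above works because ``select_one`` returns ``None`` when nothing matches, so the chained ``.select_one("img")`` raises. A small sketch of the same fallback generalized to any number of selectors; the helper name and sample HTML are illustrative, not part of the commit:

from bs4 import BeautifulSoup


def first_image_url(soup, selectors, attr="data-original"):
    # Try each figure selector in order; return the first image URL found
    for selector in selectors:
        figure = soup.select_one(selector)
        if figure is None:
            continue
        img = figure.select_one("img")
        if img is not None and img.has_attr(attr):
            return img[attr]
    return None


html = '<figure class="foto-seccion-home-video image-container"><img data-original="/img/a.jpg"></figure>'
soup = BeautifulSoup(html, "html.parser")
# Mirrors _parse_eltiempo's fallback chain
print(first_image_url(soup, [
    "figure.foto-seccion-home.image-container",
    "figure.foto-seccion-home-video.image-container",
]))  # -> /img/a.jpg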
19 changes: 6 additions & 13 deletions scraper/views.py
@@ -1,23 +1,16 @@
 from django.shortcuts import render
-from rest_framework import viewsets, status
-from rest_framework.decorators import action
+from rest_framework import viewsets
 from rest_framework.response import Response
 
 from scraper.models import Post
 from scraper.serializers import PostSerializer
-from scraper.services import ScraperManager
 
 
 class PostViewSet(viewsets.ModelViewSet):
-    queryset = Post.objects.all()[:10]
+    queryset = Post.objects.all()
     serializer_class = PostSerializer
 
-    @action(detail=False, methods=["get"])
-    def scrape_websites(self, request):
-        scraper_manager = ScraperManager()
-        errors = scraper_manager.run_requests()
+    def list(self, request, *args, **kwargs):
+        queryset = self.get_queryset()[:10]
 
-        if errors:
-            return Response({"errors": errors}, status=status.HTTP_500_INTERNAL_SERVER_ERROR)
-        else:
-            return Response(status=status.HTTP_200_OK)
+        serializer = self.get_serializer(queryset, many=True)
+        return Response(serializer.data)
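
With ``scrape_websites`` removed, scraping now happens only through the management command, and ``PostViewSet`` serves plain read/write ``posts`` endpoints whose ``list`` response is capped at ten posts. A minimal sketch of wiring the viewset up with a DRF router; the ``posts`` prefix and the urls module are assumptions not shown in this diff:

from rest_framework.routers import DefaultRouter

from scraper.views import PostViewSet

router = DefaultRouter()
# GET /posts/ hits the overridden list(), which slices the queryset to 10
router.register(r"posts", PostViewSet)

urlpatterns = router.urls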
