moved out instagram crawler
jkowalleck committed Dec 12, 2020
1 parent 424b285 commit d2da41d
Showing 34 changed files with 97 additions and 20 deletions.
6 changes: 5 additions & 1 deletion python-package/HISTORY.md
@@ -6,6 +6,10 @@ Unreleased

-- see the [milestone tracking at github](https://github.com/k4cg/nichtparasoup/milestone/2).

* Breaking Changes
* The imagecrawler for Instagram is no longer part of the standard distribution.
It might come back as a plugin one day.
This crawler was simply too brittle due to Instagram's web application firewall and other bot protections.
* Changes
* The API supports the HTTP method "GET" only. It accepted all HTTP methods in the past.
* Added
@@ -14,7 +18,7 @@ Unreleased
* API `/get` no longer responds with a false "404 EXHAUSTED" HTTP status code.
* `nichtparasoup.server.get_image()` no longer returns a false `None`.
* Removed
* Crawler `nichtparasoup.imagecrawlers.instagram.InstagramProfile` no longer has a new optional config `profile_id`.
* Crawler `nichtparasoup.imagecrawlers.instagram` was removed from the shipped imagecrawlers.

## 3.0.0a2

3 changes: 1 addition & 2 deletions python-package/README.md
@@ -10,10 +10,9 @@

_nichtparasoup_ is a hackspace entertainment system.
It randomly displays images from
[instagram](https://instagram.com),
[pr0gramm](https://pr0gramm.com) and
[reddit](https://reddit.com).
Other crawlers are currently removed, but will be back soon.
Other crawlers are currently removed, but might be back soon as plugins.

If an ImageCrawler for your favourite ImageBoard is missing, feel free to write your own.
Contribute it to the _nichtparasoup_ project or write it as an independent plugin :-)
2 changes: 0 additions & 2 deletions python-package/docs/imagecrawlers/index.md
@@ -2,8 +2,6 @@

Adaptable:
* [Echo](echo.md)
* [InstagramHashtag](instagram_hashtag.md)
* [InstagramProfile](instagram_profile.md)
* [Picsum](picsum.md)
* [Pr0gramm](pr0gramm.md)
* [Reddit](reddit.md)
1 change: 0 additions & 1 deletion python-package/setup.cfg
@@ -30,7 +30,6 @@ classifiers =
Typing :: Typed
keywords =
image-crawler
instagram
pr0gramm
reddit
project_urls =
10 changes: 0 additions & 10 deletions python-package/src/nichtparasoup/config/defaults.yaml
@@ -57,16 +57,6 @@ crawlers:
config:
width: 300
height: 600
-
name: "InstagramHashtag"
weight: 1
config:
tag_name: "earthporn"
-
name: "InstagramProfile"
weight: 1
config:
user_name: "natgeo"


## logging settings
3 changes: 0 additions & 3 deletions python-package/src/nichtparasoup/imagecrawlers/__init__.py
@@ -11,7 +11,6 @@
from .._internals import _log
from ..core.imagecrawler import BaseImageCrawler
from .echo import Echo
from .instagram import InstagramHashtag, InstagramProfile
from .picsum import Picsum
from .pr0gramm import Pr0gramm
from .reddit import Reddit
@@ -29,8 +28,6 @@ def _builtins() -> Dict[_ImagecrawlerName, _ImagecrawlerClass]:
'Echo': Echo,
'Picsum': Picsum,
'Reddit': Reddit,
'InstagramProfile': InstagramProfile,
'InstagramHashtag': InstagramHashtag,
'Pr0gramm': Pr0gramm,
}

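For context on the diff above: the removed `InstagramProfile`/`InstagramHashtag` entries came out of a plain name-to-class registry. A crawler dropped from the built-ins could later be merged back in from a plugin. A minimal sketch of that idea, with stand-in classes and a hypothetical `known_imagecrawlers()` helper (not the project's actual plugin API):

```python
from typing import Dict, Type


class BaseImageCrawler:
    """Stand-in for nichtparasoup.core.imagecrawler.BaseImageCrawler."""


class Echo(BaseImageCrawler):
    """A built-in crawler that ships with the package."""


class InstagramHashtag(BaseImageCrawler):
    """Would live in an external plugin package after this commit."""


def _builtins() -> Dict[str, Type[BaseImageCrawler]]:
    # built-in crawlers shipped with the standard distribution
    return {'Echo': Echo}


def known_imagecrawlers(plugins: Dict[str, Type[BaseImageCrawler]]
                        ) -> Dict[str, Type[BaseImageCrawler]]:
    # start from the built-ins, then merge externally discovered crawlers;
    # a plugin entry with the same name would shadow a built-in
    known = _builtins()
    known.update(plugins)
    return known
```

With this shape, moving Instagram out of the package only removes two entries from `_builtins()`; a plugin would supply them through the `plugins` mapping instead.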
@@ -139,7 +139,7 @@ def test_reset_done(self) -> None:
assert not crawler.is_exhausted()


_FILE_FETCHER = FileFetcher({ # relative to "./testdata_instagram"
_FILE_FETCHER = FileFetcher({ # relative to "./testdata_reddit"
'/r/aww.json?after=': 'aww.json',
'/r/awwwwwwww.json?after=': 'awwwwwwww.json',
}, base_url='https://www.reddit.com', base_dir=path_join(dirname(__file__), 'testdata_reddit'))
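The `FileFetcher` test double above maps request paths to files under a local test-data directory. Roughly, the resolution works like the following simplified sketch (an illustration of the idea, not the actual implementation):

```python
from os.path import join
from typing import Dict


class FileFetcherSketch:
    """Simplified sketch: resolve a full URL to a local test-data file."""

    def __init__(self, name_map: Dict[str, str],
                 base_url: str, base_dir: str) -> None:
        self._name_map = name_map  # request path -> file name
        self._base_url = base_url
        self._base_dir = base_dir

    def local_path(self, url: str) -> str:
        # strip the base URL, then look up the relative file name
        if url.startswith(self._base_url):
            relative = url[len(self._base_url):]
        else:
            relative = url
        return join(self._base_dir, self._name_map[relative])
```

This is why the commit also fixes the stale `testdata_instagram` comment: the mapping and `base_dir` must point at the reddit test data.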
6 changes: 6 additions & 0 deletions python-plugin-instagram/README.md
@@ -0,0 +1,6 @@
# Instagram image crawlers

These were removed from the list of shipped crawlers.

The Instagram crawler is stable yet brittle:
fetching the initial query often fails due to Instagram's WAF and other bot protections.
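A future plugin distribution could re-register these crawlers via a setuptools entry point. A sketch of such a packaging snippet; the entry-point group name and package name here are hypothetical, and the project's actual plugin mechanism may differ:

```ini
# setup.cfg of a hypothetical "nichtparasoup-imagecrawler-instagram" plugin package
[options.entry_points]
nichtparasoup_imagecrawler =
    InstagramHashtag = nichtparasoup_imagecrawler_instagram:InstagramHashtag
    InstagramProfile = nichtparasoup_imagecrawler_instagram:InstagramProfile
```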
4 changes: 4 additions & 0 deletions python-plugin-instagram/examples/config/.gitignore
@@ -0,0 +1,4 @@
# ignore everything besides this file and yaml files
*
!/.gitignore
!/*.yaml
80 changes: 80 additions & 0 deletions python-plugin-instagram/examples/config/defaults.yaml
@@ -0,0 +1,80 @@
## This is a config file for nichtparasoup (v2.2)


## WebServer config
## type: map
webserver:
## hostname the WebServer recognizes. can also be a unix socket
## type: string
hostname: "0.0.0.0"
## port the webserver runs on
## type: integer
## constraint: 1 <= port <= 65535
port: 5000

## ImageServer config
## type: map
imageserver:
## number of images the server must keep at all times
## type: integer
## constraint: >= 10
## optional
## default: 30
crawler_upkeep: 30
## number of seconds the server must not be reset
## type: integer
## constraint: >= 600
## optional
## default: 3600
reset_timeout: 3600

## list of ImageCrawlers to use.
## ATTENTION: crawlers are treated like a unique list. The combination of type and config makes them unique.
## for a list of available types see the commandline help: nichtparasoup info --imagecrawler-list
## for description of a crawler and how to configure, see commandline help: nichtparasoup info --imagecrawler-desc <crawler>
## type: list
crawlers:
-
## name of the crawler
## for a list of available types see the commandline help: nichtparasoup info --imagecrawler-list
## type: string
name: "Reddit"
## probability to be chosen randomly
## type: integer or float
## constraint: > 0
## optional
## default: 1
weight: 3
## the crawler's own config
## for description of a crawler and how to configure, see commandline help: nichtparasoup info --imagecrawler-desc <crawler>
## type: map
## optional
config:
subreddit: 'EarthPorn'
-
name: "Picsum"
weight: 1
config:
width: 300
height: 600
-
name: "InstagramHashtag"
weight: 1
config:
tag_name: "earthporn"
-
name: "InstagramProfile"
weight: 1
config:
user_name: "natgeo"


## logging settings
## type: map
## optional
logging:
## log level settings
## type: enum('CRITICAL', 'ERROR', 'WARNING', 'INFO', 'DEBUG')
## optional
## default: 'INFO'
level: 'INFO'
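The constraints documented in the comments above (port range, `crawler_upkeep >= 10`, `reset_timeout >= 600`, `weight > 0`, uniqueness by name plus config) can be checked mechanically. A minimal validation sketch, assuming the YAML has already been parsed into a dict; `validate_config` is a hypothetical helper, not part of nichtparasoup:

```python
from typing import Any, Dict, List


def validate_config(config: Dict[str, Any]) -> List[str]:
    """Check the documented constraints; return human-readable problems."""
    problems: List[str] = []
    port = config.get('webserver', {}).get('port', 5000)
    if not 1 <= port <= 65535:
        problems.append(f'webserver.port out of range: {port}')
    imageserver = config.get('imageserver', {})
    if imageserver.get('crawler_upkeep', 30) < 10:
        problems.append('imageserver.crawler_upkeep must be >= 10')
    if imageserver.get('reset_timeout', 3600) < 600:
        problems.append('imageserver.reset_timeout must be >= 600')
    seen = set()
    for crawler in config.get('crawlers', []):
        if crawler.get('weight', 1) <= 0:
            problems.append(f'crawler weight must be > 0: {crawler.get("name")}')
        # crawlers are unique by the combination of name and config
        key = (crawler.get('name'), repr(crawler.get('config')))
        if key in seen:
            problems.append(f'duplicate crawler: {crawler.get("name")}')
        seen.add(key)
    return problems
```

Running this against the defaults above would return an empty list, since every value in the shipped example satisfies its stated constraint.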
File renamed without changes.
