Skip to content

Commit

Permalink
Save metadata JSON with --metadata-json
Browse files Browse the repository at this point in the history
With --metadata-json, a JSON file is created for each post, storing the
Post properties defined in the instaloader.Post class, e.g. caption, number
of likes, people tagged in the caption or in the picture itself, etc.

This closes #33 and closes #47.
  • Loading branch information
aandergr committed Sep 29, 2017
1 parent e471bd5 commit e0ed4cf
Show file tree
Hide file tree
Showing 3 changed files with 57 additions and 15 deletions.
7 changes: 5 additions & 2 deletions docs/basic-usage.rst
Original file line number Diff line number Diff line change
Expand Up @@ -70,10 +70,13 @@ Instaloader supports the following targets:
Instaloader goes through all media matching the specified targets and
downloads the pictures and videos and their captions. You can specify

- :option:`--comments`, to also **download comments** of each post and
- :option:`--comments`, to also **download comments** of each post,

- :option:`--geotags`, to **download geotags** of each post and save them as
Google Maps link.
Google Maps link,

- :option:`--metadata-json`, to store further post metadata in a separate JSON
file.

.. _filename-specification:

Expand Down
7 changes: 7 additions & 0 deletions docs/cli-options.rst
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,13 @@ Instead of a *profile* or a *#hashtag*, the special targets
Also **download stories** of each profile that is downloaded. Requires
:option:`--login`.

.. option:: --metadata-json

Create a JSON file containing the metadata of each post. This includes
neither comments (see :option:`--comments`) nor geotags (see
:option:`--geotags`). The JSON files contain the properties of
:class:`instaloader.Post`.

.. option:: --stories-only

Rather than downloading regular posts of each specified profile, only
Expand Down
58 changes: 45 additions & 13 deletions instaloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,8 +170,8 @@ class Post:
metadata, if required. This class unifies access to the properties associated with a post. It implements == and is
hashable.
The properties defined here are accessable by the filter expressions specified with the :option:`--only-if`
parameter.
The properties defined here are accessible by the filter expressions specified with the :option:`--only-if`
parameter and exported into JSON files with :option:`--metadata-json`.
"""

LOGIN_REQUIRING_PROPERTIES = ["viewer_has_liked"]
Expand Down Expand Up @@ -361,6 +361,22 @@ def get_location(self) -> Optional[Dict[str, str]]:
params={'__a': 1})
return location_json["location"]

@staticmethod
def json_encoder(obj) -> Dict[str, Any]:
    """Build a JSON-serializable dictionary from a :class:`Post` instance.

    Every attribute name listed by ``dir(Post)`` that starts neither with an
    underscore nor with an uppercase letter is read from ``obj``. Values that
    are booleans, strings, numbers or lists are copied as they are, datetime
    values are stored as ISO 8601 strings, and everything else is left out.

    :param obj: The object to convert; must be an instance of :class:`Post`.
    :return: Dictionary mapping attribute names to JSON-serializable values.
    :raises TypeError: If ``obj`` is not a :class:`Post`.
    """
    if not isinstance(obj, Post):
        raise TypeError("Object of type {} is not a Post object.".format(obj.__class__.__name__))
    serializable = {}
    for name in dir(Post):
        initial = name[0]
        if initial == '_' or initial.isupper():
            # private names and uppercase (constant-like) names are not exported
            continue
        value = obj.__getattribute__(name)
        if isinstance(value, datetime):
            serializable[name] = value.isoformat()
        elif isinstance(value, (bool, str, int, float, list)):
            serializable[name] = value
    return serializable

class Tristate(Enum):
"""Tri-state to encode whether we should save certain information, i.e. videos, captions, comments or geotags.
Expand All @@ -387,8 +403,9 @@ def __init__(self,
filename_pattern: Optional[str] = None,
download_videos: Tristate = Tristate.always,
download_geotags: Tristate = Tristate.no_extra_query,
download_captions: Tristate = Tristate.no_extra_query,
download_comments: Tristate = Tristate.no_extra_query):
save_captions: Tristate = Tristate.no_extra_query,
download_comments: Tristate = Tristate.no_extra_query,
save_metadata: Tristate = Tristate.never):

# configuration parameters
self.user_agent = user_agent if user_agent is not None else default_user_agent()
Expand All @@ -401,16 +418,15 @@ def __init__(self,
if filename_pattern is not None else '{date:%Y-%m-%d_%H-%M-%S}'
self.download_videos = download_videos
self.download_geotags = download_geotags
self.download_captions = download_captions
self.save_captions = save_captions
self.download_comments = download_comments
self.previous_queries = dict()
self.save_metadata = save_metadata

# error log, filled with error() and printed at the end of Instaloader.main()
self.error_log = []

# For the adaption of sleep intervals (rate control)
self.request_count = 0
self.last_request_time = 0
self.previous_queries = dict()

@property
def is_logged_in(self) -> bool:
Expand All @@ -423,7 +439,7 @@ def anonymous_copy(self):
new_loader = Instaloader(self.sleep, self.quiet, self.user_agent,
self.dirname_pattern, self.filename_pattern,
self.download_videos, self.download_geotags,
self.download_captions, self.download_comments)
self.save_captions, self.download_comments)
new_loader.previous_queries = self.previous_queries
yield new_loader
self.error_log.extend(new_loader.error_log)
Expand Down Expand Up @@ -688,6 +704,12 @@ def download_pic(self, filename: str, url: str, mtime: datetime,
os.utime(filename, (datetime.now().timestamp(), mtime.timestamp()))
return True

def save_metadata_json(self, filename: str, post: Post) -> None:
    """Save the metadata of a :class:`Post` as a JSON file.

    :param filename: Target path without extension; ``.json`` is appended.
    :param post: The post to serialize; it is converted with
        :meth:`Post.json_encoder`.
    """
    filename += '.json'
    # A context manager closes (and flushes) the file deterministically, even
    # if serialization raises, instead of leaving the handle to the garbage
    # collector.
    with open(filename, 'w') as fp:
        json.dump(post, fp=fp, indent=4, default=Post.json_encoder)
    self._log('json', end=' ', flush=True)

def update_comments(self, filename: str, post: Post) -> None:
filename += '_comments.json'
try:
Expand Down Expand Up @@ -885,7 +907,7 @@ def download_post(self, post: Post, target: str) -> bool:
downloaded = False

# Save caption if desired
if self.download_captions is not Tristate.never:
if self.save_captions is not Tristate.never:
if post.caption:
self.save_caption(filename, post.date, post.caption)
else:
Expand All @@ -905,6 +927,11 @@ def download_post(self, post: Post, target: str) -> bool:
if self.download_comments is Tristate.always:
self.update_comments(filename, post)

# Save metadata as JSON if desired. It might require an extra query, depending on which information has been
# already obtained. Regarding Tristate interpretation, we always assume that it requires an extra query.
if self.save_metadata is Tristate.always:
self.save_metadata_json(filename, post)

self._log()
return downloaded

Expand Down Expand Up @@ -989,7 +1016,7 @@ def download_stories(self,
self._log("Warning: Unable to find story image.")
downloaded = False
if "caption" in item and item["caption"] is not None and \
self.download_captions is not Tristate.never:
self.save_captions is not Tristate.never:
caption = item["caption"]
if isinstance(caption, dict) and "text" in caption:
caption = caption["text"]
Expand Down Expand Up @@ -1385,6 +1412,9 @@ def main():
'server for each post, which is why it is disabled by default.')
g_what.add_argument('--no-captions', action='store_true',
help='Do not store media captions, although no additional request is needed to obtain them.')
g_what.add_argument('--metadata-json', action='store_true',
help='Create a JSON file containing the metadata of each post. This does not include comments '
'nor geotags.')
g_what.add_argument('-s', '--stories', action='store_true',
help='Also download stories of each profile that is downloaded. Requires --login.')
g_what.add_argument('--stories-only', action='store_true',
Expand Down Expand Up @@ -1458,7 +1488,8 @@ def main():

download_videos = Tristate.always if not args.no_videos else Tristate.no_extra_query
download_comments = Tristate.always if args.comments else Tristate.no_extra_query
download_captions = Tristate.no_extra_query if not args.no_captions else Tristate.never
save_captions = Tristate.no_extra_query if not args.no_captions else Tristate.never
save_metadata = Tristate.always if args.metadata_json else Tristate.never

if args.geotags and args.no_geotags:
raise SystemExit("--geotags and --no-geotags given. I am confused and refuse to work.")
Expand All @@ -1473,7 +1504,8 @@ def main():
user_agent=args.user_agent,
dirname_pattern=args.dirname_pattern, filename_pattern=args.filename_pattern,
download_videos=download_videos, download_geotags=download_geotags,
download_captions=download_captions, download_comments=download_comments)
save_captions=save_captions, download_comments=download_comments,
save_metadata=save_metadata)
loader.main(args.profile, args.login.lower() if args.login is not None else None, args.password,
args.sessionfile,
int(args.count) if args.count is not None else None,
Expand Down

0 comments on commit e0ed4cf

Please sign in to comment.