In [59]:
import json  
import pandas as pd  
import requests
from requests.auth import HTTPBasicAuth
from os import path
import sys
import genson
from genson import SchemaBuilder
import hashlib

In [60]:
# Return the cache file name for a given key and file extension
def get_cache_file_name(key, ext):
    """Build the relative cache file path for ``key``.

    Args:
        key: Value used as the base name of the file.
        ext: File extension, without the leading dot.

    Returns:
        A string of the form ``./<key>.<ext>``.
    """
    return f"./{key}.{ext}"

# Given a key return cached DataFrame or None
def get_cached_df(key):
    """Look up a previously pickled DataFrame for ``key``.

    Args:
        key: Cache key; the file looked up is ``get_cache_file_name(key, 'pkl')``.

    Returns:
        The unpickled object when the cache file exists, otherwise ``None``.
        A "Cache found" message is printed on a hit.
    """
    cache_path = get_cache_file_name(key, 'pkl')

    # Cache miss: nothing has been stored under this key.
    if not path.exists(cache_path):
        return None

    print(f"Cache found: {cache_path}")
    # NOTE(review): unpickling can execute arbitrary code; only keep cache
    # files you trust next to this notebook.
    return pd.read_pickle(cache_path)
    
# Store pickled DataFrame
def cache_df(df, key):
    """Pickle ``df`` into the cache file associated with ``key``.

    Args:
        df: Object exposing ``to_pickle`` (a pandas DataFrame in this notebook).
        key: Cache key; the target file is ``get_cache_file_name(key, 'pkl')``.
    """
    df.to_pickle(get_cache_file_name(key, 'pkl'))
    
# Acquire text from URL or file
def read_json_text(path, timeout=30):
    """Return the raw text found at ``path``, which may be a URL or a local file.

    Args:
        path: An ``http://`` / ``https://`` URL (scheme matched
            case-insensitively) or a filesystem path. Inside this function the
            parameter shadows the notebook's ``from os import path`` import;
            the name is kept so existing callers keep working.
        timeout: Seconds to wait for the HTTP server before giving up.
            Ignored for local files.

    Returns:
        The response body or the file contents as a ``str``.

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status.
        requests.Timeout: The server did not answer within ``timeout`` seconds.
        OSError: The local file could not be opened.
    """
    # Require a full scheme so a local file such as "http_notes.json" is not
    # mistaken for a URL.
    if path.lower().startswith(('http://', 'https://')):
        response = requests.get(path, timeout=timeout)
        # Fail loudly on error pages instead of handing their body to the
        # JSON parser (and, via load_json, to the cache).
        response.raise_for_status()
        return response.text

    # The context manager closes the handle even if read() raises. JSON files
    # are read as UTF-8 rather than the platform's locale default.
    with open(path, encoding='utf-8') as data_file:
        return data_file.read()
        
# Load JSON into a DataFrame given a path (URL or local file) to the JSON data
def load_json(path):
    """Return the JSON document at ``path`` flattened into a DataFrame.

    The MD5 hex digest of ``path`` serves as the cache key. On a cache hit the
    cached frame is returned without reading ``path`` again; otherwise the
    text is read, parsed, flattened with ``pd.json_normalize``, cached, and
    returned.

    Args:
        path: URL or local file path, passed on to ``read_json_text``.

    Returns:
        The cached or newly built DataFrame.
    """
    cache_key = hashlib.md5(path.encode('utf-8')).hexdigest()

    cached = get_cached_df(cache_key)
    if cached is not None:
        return cached

    # Cache miss: fetch, parse, flatten, then store for the next call.
    frame = pd.json_normalize(json.loads(read_json_text(path)))
    cache_df(frame, cache_key)
    return frame

def display_schema(path):
    """Print a JSON Schema inferred from the JSON document at ``path``.

    The builder is seeded with an empty object schema before the parsed data
    is added. The resulting schema is printed as JSON with one-space
    indentation.

    Args:
        path: URL or local file path, passed on to ``read_json_text``.
    """
    document = json.loads(read_json_text(path))

    schema_builder = SchemaBuilder()
    schema_builder.add_schema({"type": "object", "properties": {}})
    schema_builder.add_object(document)

    print(json.dumps(schema_builder.to_schema(), indent=1))
    

In [61]:
# Print the schema inferred from the movie dataset before loading it.
display_schema(
    'https://raw.githubusercontent.com/prust/wikipedia-movie-data/master/movies.json'
)

{
 "$schema": "http://json-schema.org/schema#",
 "anyOf": [
  {
   "type": "object"
  },
  {
   "type": "array",
   "items": {
    "type": "object",
    "properties": {
     "title": {
      "type": "string"
     },
     "year": {
      "type": "integer"
     },
     "cast": {
      "type": "array",
      "items": {
       "type": "string"
      }
     },
     "genres": {
      "type": "array",
      "items": {
       "type": "string"
      }
     }
    },
    "required": [
     "cast",
     "genres",
     "title",
     "year"
    ]
   }
  }
 ]
}


In [66]:
# Load the movie dataset into a DataFrame.
df = load_json('https://raw.githubusercontent.com/prust/wikipedia-movie-data/master/movies.json')
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() only added a stray "None" line to the output.
df.info()
df

Cache found: ./babe64edfb6d1bf102a5e33651932f64.pkl
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28795 entries, 0 to 28794
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   28795 non-null  object
 1   year    28795 non-null  int64 
 2   cast    28795 non-null  object
 3   genres  28795 non-null  object
dtypes: int64(1), object(3)
memory usage: 900.0+ KB
None


Unnamed: 0,title,year,cast,genres
0,After Dark in Central Park,1900,[],[]
1,Boarding School Girls' Pajama Parade,1900,[],[]
2,Buffalo Bill's Wild West Parad,1900,[],[]
3,Caught,1900,[],[]
4,Clowns Spinning Hats,1900,[],[]
...,...,...,...,...
28790,Bumblebee,2018,"[Hailee Steinfeld, John Cena, Jorge Lendeborg ...","[Action, Adventure, Science Fiction]"
28791,Welcome to Marwen,2018,"[Steve Carell, Leslie Mann, Diane Kruger, Falk...","[Fantasy, Drama]"
28792,Holmes and Watson,2018,"[Will Ferrell, John C. Reilly, Rebecca Hall, R...","[Action, Mystery, Comedy]"
28793,On the Basis of Sex,2018,"[Felicity Jones, Armie Hammer, Justin Theroux,...","[Biography, Drama]"
