**_analysis-fix_Article_Data_Notes_JSON.ipynb_ - convert JSON strings to JSON objects in content_json**

# Setup

## Setup - Imports

In [None]:
# python base imports
import datetime
import gc
import json

# import six
import six

# confirm that the base imports completed, with a timestamp.
print( "packages imported at {}".format( datetime.datetime.now() ) )

## Setup - Initialize Django

- Back to [Table of Contents](#Table-of-Contents)

First, initialize my dev Django project, so that code in this notebook can reference my Django models and talk to the database using my project's settings.

In [None]:
%run ../django_init.py

In [None]:
# django imports
from django.contrib.auth.models import User

# sourcenet imports
from context_text.shared.context_text_base import ContextTextBase

# context_analysis imports
from context_analysis.network.network_person_info import NetworkPersonInfo

# sourcenet imports
from context_text.models import Article
from context_text.models import Article_Author
from context_text.models import Article_Data
from context_text.models import Article_Data_Notes
from context_text.models import Article_Subject
from context_text.models import Newspaper
from context_text.models import Person

# article coding
from context_text.article_coding.article_coder import ArticleCoder
#from context_text.article_coding.article_coding import ArticleCoding
from context_text.article_coding.open_calais_v2.open_calais_v2_article_coder import OpenCalaisV2ArticleCoder

# import class that actually processes requests for outputting networks.
from context_text.export.network_output import NetworkOutput

# context_text shared
from context_text.shared.context_text_base import ContextTextBase

# confirm that the Django model imports completed, with a timestamp.
print( "django model packages imported at {}".format( datetime.datetime.now() ) )

## Setup - Important instances

In [None]:
# ArticleCoding instance is not currently needed.
#article_coding = ArticleCoding()

# user returned by ArticleCoder for automated coding.
automated_coder = ArticleCoder.get_automated_coding_user()

# Grand Rapids Press and Detroit News, looked up (in that order) by their
#     newsbank_code values.
grand_rapids_press, detroit_news = (
    Newspaper.objects.get( newsbank_code = paper_code )
    for paper_code in ( "GRPB", "DTNB" )
)

# fix Article_Data_Notes JSON

## Check out Article_Data_Notes instance

First, just play with one, to see what we need to do.

In [None]:
# Exploratory cell: loop over Article_Data_Notes records whose content is
#     NULL and print, for each, the type (and length, when it has one) of
#     content and content_json, plus whether the two values are equal.
#     Nothing is saved here - the statements that would modify records are
#     left commented out.

# declare variables
notes_id = None
notes_qs = None
article_data_note = None
article_data_note_id = None
note_content = None
note_content_json = None
status_message = None

# init - ID of a single record to inspect; only used by the commented-out
#     ID filter below.
notes_id = 12

# retrieve the Article_Data_Notes to look at: those with NULL content.
notes_qs = Article_Data_Notes.objects.all()
#notes_qs = notes_qs.filter( id = notes_id )
notes_qs = notes_qs.filter( content__isnull = True )

# loop, so I can see more than one at a time.
for article_data_note in notes_qs:

    # get data from model instance
    article_data_note_id = article_data_note.id
    note_content = article_data_note.content
    note_content_json = article_data_note.content_json

    # Hi!
    status_message = "\nArticle_Data_Notes {}:".format( article_data_note_id )
    print( status_message )

    # look at content - only ask for a length when there is a value.
    if ( note_content is not None ):
        status_message = "- content: type = {note_type}; len = {note_length}".format(
            note_type = type( note_content ),
            note_length = len( note_content )
        )
    else:
        status_message = "- content: type = {note_type}".format(
            note_type = type( note_content )
        )
    #-- END check if content is empty --#
    print( status_message )

    # look at content_json - it can be None or some other value with no
    #     length, and len() on such a value raises TypeError, so only report
    #     a length when the value actually has one.
    if ( hasattr( note_content_json, "__len__" ) == True ):
        status_message = "- content_json: type = {note_type}; len = {note_length}".format(
            note_type = type( note_content_json ),
            note_length = len( note_content_json )
        )
    else:
        status_message = "- content_json: type = {note_type}".format(
            note_type = type( note_content_json )
        )
    #-- END check if content_json has a length --#
    print( status_message )

    # try storing content_json string in content
    #article_data_note.content = note_content_json
    #article_data_note.save()

    # are the content and content_json the same?
    if ( note_content == note_content_json ):

        print( "- TWINS!" )

    else:

        print( "- not the same." )

    #-- END check if same --#

    # what do I need to do to be able to treat it like JSON?
    #test_json = json.loads( note_content_json )
    #test_value = test_json.get( "doc", None )
    #print( test_value )

    # try chucking this parsed JSON back into content_json and saving.
    #article_data_note.content_json = test_json
    #article_data_note.save()

#-- END loop over Article_Data_Notes --#

## Fix Article_Data_Notes unparsed content_json

To fix:

- loop over records
- for each where:

    - no content
    - content_json set
    - content_json is type string
    
        - retrieve content_json.
        - store it in content
        - parse it into JSON.
        - store the JSON in content_json.
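        - save()

- for each where there is no content and content_json is already a parsed dictionary: serialize it to a JSON string, store that string in content, and save().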

In [None]:
# Fix cell: for every Article_Data_Notes record whose content is NULL:
#     - content_json is a string: copy the string into content, parse it, and
#         store the parsed JSON back in content_json.
#     - content_json is already a dict: serialize it to a string and store
#         that string in content.
#     - anything else (no content_json, unparseable string, other type):
#         leave the record alone and remember its ID.
#
# Records are processed in batches of limit_count, ordered by id, and each
#     batch starts after the last id already visited. Records that are NOT
#     updated still have NULL content, so re-running the same
#     "content IS NULL" query from the start for each batch would return them
#     again and again and the while loop would never end (same if do_save is
#     False). Paging by id guarantees each record is visited exactly once.

# declare variables
article_data_notes_list = None
article_data_note = None
note_id = None
note_content = None
note_content_json = None
content_json_parsed = None
content_json_string = None
status_message = None

# declare variables - process control
notes_counter = None
notes_count = None
remaining_count = None
last_processed_id = None
output_every_x = None
limit_count = None
do_update = None
content_populated_id_list = None
content_json_parsed_id_list = None
content_json_unknown_type_id_list = None
content_json_empty_id_list = None
content_json_invalid_id_list = None
updated_id_list = None
do_save = None

# init
limit_count = 1000
do_save = True
output_every_x = 100
content_populated_id_list = list()
content_json_parsed_id_list = list()
content_json_unknown_type_id_list = list()
content_json_empty_id_list = list()
content_json_invalid_id_list = list()
updated_id_list = list()


def get_next_notes_batch( after_id_IN, batch_size_IN ):

    """
    Retrieves the next batch of Article_Data_Notes with NULL content.

    Accepts:
    - after_id_IN - only records with id greater than this are considered;
        None means start from the beginning.
    - batch_size_IN - maximum number of records to return.

    Returns a tuple of ( count of matching records not yet visited, list of
        at most batch_size_IN instances ordered by id ).
    """

    pending_qs = Article_Data_Notes.objects.filter( content__isnull = True )

    if ( after_id_IN is not None ):
        pending_qs = pending_qs.filter( id__gt = after_id_IN )
    #-- END check if we have a place to resume from --#

    pending_count = pending_qs.count()

    # explicit ordering makes the slice deterministic; list() runs the query
    #     once so the batch can't shift underneath us while we save.
    batch_list = list( pending_qs.order_by( "id" )[ : batch_size_IN ] )

    return ( pending_count, batch_list )

#-- END function get_next_notes_batch() --#


def print_update_summary():

    """
    Prints the running totals collected in the ID lists above.
    """

    print( "\nArticle_Data_Notes update complete @ {}.".format( datetime.datetime.now() ) )
    print( "- Updated count: {}".format( len( updated_id_list ) ) )
    print( "- no update - content has value count: {}".format( len( content_populated_id_list ) ) )
    # records with a dict in content_json ARE updated (content is filled
    #     in), so they are reported as updated rather than "no update".
    print( "- updated - content_json was already a dict count: {}".format( len( content_json_parsed_id_list ) ) )
    print( "- no update - content_json unknown type count: {}".format( len( content_json_unknown_type_id_list ) ) )
    print( "- no update - content_json empty count: {}".format( len( content_json_empty_id_list ) ) )
    print( "- no update - content_json string not valid JSON count: {}".format( len( content_json_invalid_id_list ) ) )

#-- END function print_update_summary() --#


# retrieve first batch of Article_Data_Notes to process.
( remaining_count, article_data_notes_list ) = get_next_notes_batch( last_processed_id, limit_count )
notes_count = len( article_data_notes_list )

status_message = "Processing {remaining} records @ {right_now}.".format(
    remaining = remaining_count,
    right_now = datetime.datetime.now()
)
print( status_message )

while ( notes_count > 0 ):

    status_message = "\nBeginning Article_Data_Notes update ( {notes_count} of remaining {remaining} records ) @ {right_now}.".format(
        notes_count = notes_count,
        remaining = remaining_count,
        right_now = datetime.datetime.now()
    )
    print( status_message )

    # loop
    notes_counter = 0
    for article_data_note in article_data_notes_list:

        # init
        do_update = False
        notes_counter += 1

        # get content and content_json
        note_id = article_data_note.id
        note_content = article_data_note.content
        note_content_json = article_data_note.content_json

        # is content empty?
        if ( ( note_content is not None ) and ( note_content != "" ) ):

            # content is not empty - leave it alone.
            content_populated_id_list.append( note_id )

        elif ( note_content_json is None ):

            # no content or content_json. nothing to do.
            content_json_empty_id_list.append( note_id )

        elif ( isinstance( note_content_json, str ) == True ):

            # unparsed string - parse BEFORE touching the instance, so a bad
            #     string leaves the record unmodified.
            try:

                content_json_parsed = json.loads( note_content_json )

            except ValueError:

                # json.JSONDecodeError is a ValueError - not valid JSON.
                #     Remember the ID and move on rather than abort the run.
                content_json_invalid_id_list.append( note_id )

            else:

                # move original string to content, parsed JSON to
                #     content_json.
                article_data_note.content = note_content_json
                article_data_note.content_json = content_json_parsed
                do_update = True

            #-- END try to parse JSON string --#

        elif ( isinstance( note_content_json, dict ) == True ):

            # content already parsed - convert from dictionary to string
            content_json_string = json.dumps( note_content_json, sort_keys = True, indent = 4, separators = ( ',', ': ' ) )

            # store resulting string in content
            article_data_note.content = content_json_string
            content_json_parsed_id_list.append( note_id )
            do_update = True

        else:

            # not a string or a dictionary - don't guess.
            content_json_unknown_type_id_list.append( note_id )

        #-- END check what we have in content and content_json --#

        # do the update?
        if ( do_update == True ):

            # and save().
            if ( do_save == True ):
                article_data_note.save()
            #-- END check if we want to save. --#

            # add ID to the list.
            updated_id_list.append( note_id )

        #-- END check to see if update --#

        if ( ( notes_counter % output_every_x ) == 0 ):

            # update time!
            status_message = "\n--> Finished {counter} of {total} Article_Data_Notes records @ {right_now}".format(
                counter = notes_counter,
                total = notes_count,
                right_now = datetime.datetime.now()
            )
            print( status_message )
            print( "- Updated count: {}".format( len( updated_id_list ) ) )

            # garbage collect?
            gc.collect()

        #-- END check if we output an update --#

    #-- END loop over Article_Data_Notes --#

    # remember where this batch ended (batch is ordered by id, so the last
    #     item has the highest id), then retrieve the next batch after it.
    last_processed_id = article_data_notes_list[ -1 ].id
    ( remaining_count, article_data_notes_list ) = get_next_notes_batch( last_processed_id, limit_count )
    notes_count = len( article_data_notes_list )

    print_update_summary()

#-- END while loop over subsets of Article_Data_Notes.objects.all() --#

print_update_summary()
