**analysis-network_data_output-GRP.ipynb - Programmatic network data output**

# Setup

## Setup - Debug

- Back to [Table of Contents](#Table-of-Contents)

In [None]:
# Notebook-wide debug switch, off by default.
# NOTE(review): no cell visible in this part of the notebook reads this flag -
#     confirm whether it is still needed.
debug_flag = False

## Setup - Imports

In [None]:
# python base imports
import copy
import datetime
import hashlib
import json
import logging

# import six
import six

print( "packages imported at " + str( datetime.datetime.now() ) )

## Setup - working folder paths

- Back to [Table of Contents](#Table-of-Contents)

In [None]:
%pwd

In [None]:
# Folder layout for this notebook, all derived from the project name:
#     base folder    = /home/jonathanmorgan/work/django/<project_name>
#     django project = <base folder>/<project_name>
#     working folder = <django project>/work/phd_work/analysis/network_data
project_name = "research"
project_base_folder = "/home/jonathanmorgan/work/django/" + project_name
django_project_folder = project_base_folder + "/" + project_name
current_working_folder = django_project_folder + "/work/phd_work/analysis/network_data"

# date-time string for this run ( to the second ), used later in file names.
current_datetime = datetime.datetime.now()
current_date_string = current_datetime.strftime( "%Y-%m-%d-%H-%M-%S" )

## Setup - logging

- Back to [Table of Contents](#Table-of-Contents)

Configure logging for this notebook's kernel. (If you do not run this cell, you'll get the django application's logging configuration.)

In [None]:
# The log file goes in the "logs" folder under the project base folder, and
#     its name carries this run's date-time string.
project_log_folder = "{base_folder}/logs".format( base_folder = project_base_folder )
logging_file_name = "{}/network_data_output-GRP-{}.log.txt".format(
    project_log_folder,
    current_date_string
)

# configure logging: DEBUG and above, written to the file named above.
logging.basicConfig(
    filename = logging_file_name,
    filemode = 'w', # overwrites each time - set to 'a' if you want to append instead.
    format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
    level = logging.DEBUG
)

## Setup - Initialize Django

- Back to [Table of Contents](#Table-of-Contents)

First, initialize my dev django project, so I can run code in this notebook that references my django models and can talk to the database using my project's settings.

In [None]:
# Work out where django_init.py is expected: in the "work/phd_work" folder
#     inside the django project folder.
django_init_folder = "{django_project_folder}/work/phd_work".format(
    django_project_folder = django_project_folder
)
django_init_path = "django_init.py"

# only prefix the file name when there is a folder ( not None, not empty ).
if django_init_folder:

    django_init_path = "{}/{}".format( django_init_folder, django_init_path )

#-- END check to see if django_init folder. --#

In [None]:
%run $django_init_path

### Setup - django-related imports

In [None]:
# python utilities
from python_utilities.strings.string_helper import StringHelper

# import class that actually processes requests for outputting networks.
from context_text.export.network_output import NetworkOutput

print( "django model packages imported at " + str( datetime.datetime.now() ) )

## Setup - functions

### Setup - function `make_string_hash()`

In [None]:
def make_string_hash( value_IN, hash_function_IN = hashlib.sha256 ):

    '''
    Thin wrapper around StringHelper.make_string_hash().

    Accepts:
    - value_IN - value to hash; passed through to StringHelper unchanged.
    - hash_function_IN - hash function handed to StringHelper as its
        hash_function_IN argument.  Defaults to hashlib.sha256.

    Returns whatever StringHelper.make_string_hash() returns for those
        arguments.
    '''

    # no processing of our own - just delegate.
    return StringHelper.make_string_hash( value_IN, hash_function_IN = hash_function_IN )

#-- END function make_string_hash() --#

print( "function make_string_hash() defined at " + str( datetime.datetime.now() ) )

## Setup - base data spec

Network data spec that includes:

- `Article_Data` and `Person` queries the same...:

    - _`coders` (`person_coders`)_: 2 (automated coder, id = 2)
    - coder type "OpenCalais_REST_API_v2"
    
        - _`coder_type_filter_type` (`person_coder_type_filter_type`)_: "automated"
        - _`coder_types_list` (`person_coder_types_list`)_: "OpenCalais_REST_API_v2"
    
    - _`publications` (`person_publications`)_: 1 (Grand Rapids Press)
    - all dates in database (from 2005-01-01 to 2010-11-30)
    
        - _`start_date` (`person_start_date`)_: "2005-01-01"
        - _`end_date` (`person_end_date`)_: "2010-11-30"
    
    - only articles tagged with `local_hard_news` and `coded-OpenCalaisV2ArticleCoder`.

        - _`tags_list` (`person_tags_list`)_: "local_hard_news,coded-OpenCalaisV2ArticleCoder"

- ...EXCEPT allowing duplicate articles for person so you get absolutely all persons, but not for `Article_Data` query.

    - _`person_allow_duplicate_articles`_: "yes"

- Network data creation options:

    - excludes persons with single word (no spaces) `verbatim_name`.
    
        - _`include_persons_with_single_word_name`_: "no"
    
    - exclude render details
        
        - _`network_include_render_details`_: "no"
        
    - output as tab-delimited matrix, with node attributes as additional columns on the far right of the square network part of the matrix.

        - _`output_type`_: "tab_delimited_matrix"
        - _`network_data_output_type`_: "net_and_attr_cols"

    - label - _`network_label`_: "all_grp_hard_news"
    - include header row in the matrix output file.
    
        - _`network_include_headers`_: "yes"

    - output spec plus the resulting network data to the database, with label set to `network_label` plus a date-time string.
    
        - _`database_output`_: "yes",
        - _`db_add_timestamp_to_label`_: "yes"

_NOTE: only pass True to `network_outputter.process_network_output_request( debug_flag_IN )` if you really need to debug - it adds garbage data at the end of the output, even if you ask for no render details._


In [None]:
# Base network data spec, kept as a JSON string and parsed below into the
#     dictionary of request parameters used by the rest of the notebook.
# NOTE(review): a few values here differ from the markdown description of the
#     base spec above - confirm which is intended before relying on either:
#     - "end_date" is "2005-12-31" ( description: "2010-11-30" ).
#     - "tags_list" and "person_tags_list" are "local_hard_news" only
#         ( description also lists "coded-OpenCalaisV2ArticleCoder" ).
#     - "network_include_render_details" is "yes" ( description: "no" ).
#     The expected hash, length and person count checked in later cells were
#     presumably computed from the values as written here.
base_data_spec_json_string = """{
    "start_date": "2005-01-01",
    "end_date": "2005-12-31",
    "date_range": "",
    "publications": "1",
    "coders": "2",
    "coder_id_priority_list": "",
    "coder_type_filter_type": "automated",
    "coder_types_list": "OpenCalais_REST_API_v2",
    "tags_list": "local_hard_news",
    "unique_identifiers": "",
    "allow_duplicate_articles": "no",
    "person_query_type": "custom",
    "person_start_date": "2005-01-01",
    "person_end_date": "2010-11-30",
    "person_date_range": "",
    "person_publications": "1",
    "person_coders": "2",
    "person_coder_id_priority_list": "",
    "person_coder_type_filter_type": "automated",
    "person_coder_types_list": "OpenCalais_REST_API_v2",
    "person_tags_list": "local_hard_news",
    "person_unique_identifiers": "",
    "person_allow_duplicate_articles": "yes",
    "include_source_contact_types": [
        "direct",
        "event",
        "past_quotes",
        "document",
        "other"
    ],
    "exclude_persons_with_tags_in_list": "",
    "include_persons_with_single_word_name": "no",
    "network_download_as_file": "no",
    "network_include_render_details": "yes",
    "output_type": "tab_delimited_matrix",
    "network_data_output_type": "net_and_attr_cols",
    "network_label": "all_grp_hard_news",
    "network_include_headers": "yes",
    "database_output": "yes",
    "db_add_timestamp_to_label": "yes"
}"""

# parse the JSON string into a dictionary and echo it so the values in use
#     are visible in the notebook output.
base_data_spec_json = json.loads( base_data_spec_json_string )
print( base_data_spec_json ) 

### Setup - update base data spec for different time slices

To update this for different time slices:

- make a copy of `base_data_spec_json`:

    - not threadsafe:
    
            my_timeslice_spec = copy.deepcopy( base_data_spec_json )
    
    - threadsafe (but doesn't handle complex data types - ours is just JSON, though, so fine here):
    
            my_timeslice_spec = json.loads( json.dumps( base_data_spec_json ) )

- update the `start_date` and `end_date` to the period you want for your time slice.

        my_timeslice_spec[ NetworkOutput.PARAM_START_DATE ] = "2009-12-01"
        my_timeslice_spec[ NetworkOutput.PARAM_END_DATE ] = "2009-12-31"

- update the `network_label` value so that it captures what time slice you are making.

        my_timeslice_spec[ NetworkOutput.PARAM_NETWORK_LABEL ] = "month-grp-automated-20091201-20091231"

    - example pattern: `<type>-<paper>-<coder>-<start_date>-<end_date>`
    - examples:
        
            week-grp-automated-20050501-20050507
            7day-grp-automated-20050502-20050508

    - type would be either:

        - actual time period:

            - week
            - month
            - quarter
            - half-year
            - year

        - conceptual time period:

            - sliding week = "7day"
            - sliding month = "31day"
            - sliding quarter = "92day"
            - sliding half-year = "183day"
            - sliding year = "365day"

_NOTE: leave person query parameters the same for all networks if you want all your network matrices to have same set of people (same count and position of rows and columns) so each network can be compared to all others, regardless of time period of a given network slice._

# network data output example - base data spec

In [None]:
# try creating network data.
network_outputter = NetworkOutput()
network_data = network_outputter.process_network_output_request(
    params_IN = base_data_spec_json,
    debug_flag_IN = True
)

- if include_persons_with_single_word_name = "yes": 2427606
- if include_persons_with_single_word_name = "no": 2344545

In [None]:
# Fingerprint the network data so this run can be compared against the
#     expected hash below.
should_be = "0f8a530f18a724b3d724d7fe9caa3082954c049abdc02b77bc480fc432d0a770"
network_data_hash = make_string_hash( network_data )
print( "Network data hash: {}".format( network_data_hash ) )

# same as expected?
if ( network_data_hash == should_be ):

    # a match
    print( "MATCH - network data hash {} matches expected. hooray!".format( network_data_hash ) )

else:

    # not right hash. Error.
    print( "ERROR! network data hash is {}, should be {}".format( network_data_hash, should_be ) )

#-- END debug/test --#

In [None]:
# Second sanity check: the string length of the network data should equal
#     the expected length below.
should_be = 11534
network_data_length = len( network_data )
print( "Network data length: {}".format( network_data_length ) )

# same as expected?
if ( network_data_length == should_be ):

    # a match
    print( "MATCH - string len()gth of {} matches expected. hooray!".format( network_data_length ) )

else:

    # not right length. Error.
    print( "ERROR! network data length is {}, should be {}".format( network_data_length, should_be ) )

#-- END debug/test --#

In [None]:
# Look at the master person dict: check how many persons it holds, check that
#     persons who should have been excluded are absent, then list everyone.
master_person_dict = network_outputter.create_person_dict( load_person_IN = True )

# how many entries?
person_count = len( master_person_dict )
print( "- person count: {person_count}".format( person_count = person_count ) )

# right number?
should_be = 66
if ( person_count != should_be ):

    # not right count. Error.
    print( "ERROR! person count is {}, should be {}".format( person_count, should_be ) )

else:

    # a match
    print( "MATCH - person count of {} matches expected. hooray!".format( person_count ) )

#-- END debug/test --#

# The following persons should not be present.  Each ID is paired with the
#     reason it is expected to be missing, so the message printed for it names
#     the right reason ( every person used to be reported as "single-name",
#     including those listed here because of a tag ).
# NOTE(review): the base data spec in this notebook sets
#     "exclude_persons_with_tags_in_list" to "" - confirm the tag-based
#     exclusions below are actually expected with that spec.
excluded_person_list = [
    # 1049, 752 (single names)
    ( 1049, "single-name" ),
    ( 752, "single-name" ),
    # 102, 224, 261 (tag `from_press_release`)
    ( 102, "`from_press_release`-tagged" ),
    ( 224, "`from_press_release`-tagged" ),
    ( 261, "`from_press_release`-tagged" ),
    # 187, 188, 189 (tag `godwin_heights`)
    ( 187, "`godwin_heights`-tagged" ),
    ( 188, "`godwin_heights`-tagged" ),
    ( 189, "`godwin_heights`-tagged" ),
]

# check for people who should have been removed.  find_person_list is still
#     built ( same IDs, same order ) in case later cells read it.
find_person_list = list()
for find_person_id, find_person_reason in excluded_person_list:

    find_person_list.append( find_person_id )

    if ( find_person_id in master_person_dict ):

        print( "ERROR - {} person {} is in dictionary".format( find_person_reason, find_person_id ) )

    else:

        print( "SUCCESS - {} person {} not in dictionary".format( find_person_reason, find_person_id ) )

    #-- END check for person --#

#-- END loop over persons to find. --#

# output all persons.
for person_id, person_instance in master_person_dict.items():

    print( "\n==> Person {person_id}: {person_instance}".format( person_id = person_id, person_instance = person_instance ) )

#-- END loop over persons --#

# write network data to file

In [None]:
# Write the network data to a text file whose name carries a date-time stamp
#     ( to the second ).  The path is relative, so the file is created in the
#     kernel's current working directory.
my_file_extension = "txt"
current_date_time = datetime.datetime.now().strftime( '%Y%m%d-%H%M%S' )
network_data_file_path = "context_text_data-{timestamp}.{file_extension}".format(
    file_extension = my_file_extension,
    timestamp = current_date_time
)

# output all the data to the file in one write.
with open( network_data_file_path, 'w' ) as network_data_file:

    network_data_file.write( network_data )

#-- END with open( network_data_file_path, 'w' ) as network_data_file --#

print( "network data written to file {} at {}".format( network_data_file_path, datetime.datetime.now() ) )