# Setup

## Setup - Imports

In [1]:
# Standard library.
import datetime
import json

# Python 2/3 compatibility helpers.
import six

# Timestamp the cell so it is obvious when the kernel last ran it
#     ( isoformat with a space separator renders the same text as str() ).
print( "packages imported at " + datetime.datetime.now().isoformat( sep = " " ) )

packages imported at 2022-05-11 14:35:23.246217


## Setup - Initialize Django

- Back to [Table of Contents](#Table-of-Contents)

First, initialize my dev django project, so I can run code in this notebook that references my django models and can talk to the database using my project's settings.

In [2]:
# Run the Django bootstrap script that lives one directory above this notebook,
#     so later cells can import the project's models and talk to its database.
%run ../django_init.py

django initialized at 2022-05-11 14:35:26.467878


In [21]:
# Django: auth user model plus the aggregate functions used for the
#     publication-date range cells.
from django.contrib.auth.models import User
from django.db.models import Max, Min

# context_text - shared base class.
from context_text.shared.context_text_base import ContextTextBase

# context_analysis - network person info.
from context_analysis.network.network_person_info import NetworkPersonInfo

# context_text - models.
from context_text.models import (
    Article,
    Article_Author,
    Article_Data,
    Article_Subject,
    Newspaper,
    Person,
)

# context_text - article coding ( ArticleCoding import is intentionally left disabled ).
from context_text.article_coding.article_coder import ArticleCoder
#from context_text.article_coding.article_coding import ArticleCoding
from context_text.article_coding.open_calais_v2.open_calais_v2_article_coder import OpenCalaisV2ArticleCoder

# context_text - class that actually processes requests for outputting networks.
from context_text.export.network_output import NetworkOutput

# Timestamp the cell so it is obvious when the kernel last ran it
#     ( isoformat with a space separator renders the same text as str() ).
print( "django model packages imported at " + datetime.datetime.now().isoformat( sep = " " ) )

django model packages imported at 2022-05-11 15:14:44.090555


## Setup - shared variables

In [28]:
# Values shared by the filtering cells below.

# automated coding user.
automated_coder = ArticleCoder.get_automated_coding_user()

# coder type constant for the OpenCalais v2 article coder.
ocv2_coder_type = OpenCalaisV2ArticleCoder.CONFIG_APPLICATION

# Newspaper rows for Grand Rapids Press and Detroit News, looked up by
#     their newsbank codes.
grand_rapids_press = Newspaper.objects.get( newsbank_code = "GRPB" )
detroit_news = Newspaper.objects.get( newsbank_code = "DTNB" )

# Article_Data Overview

In [5]:
# Baseline for the filters that follow: every Article_Data row, unfiltered.
article_data_qs = Article_Data.objects.all()

# Record the starting size so each later filter can be compared against it.
article_data_count = article_data_qs.count()

print( "Starting with {} total Article_Data instances.".format( article_data_count ) )

Starting with 45657 total Article_Data instances.


## Detect single-name people within Article_Data

- Moved this work off to separate work notebook: [analysis-omit_single_names.ipynb](./omit_single_names.ipynb)

## Filter on coder

### Only automated coder

In [6]:
# Keep only the Article_Data coded by the automated coding user
#     ( automated_coder is looked up in the shared variables cell ).
article_data_qs = article_data_qs.filter( coder = automated_coder )

# Row count after the coder filter.
article_data_count = article_data_qs.count()

print( "{} Article_Data instances for coder {}.".format( article_data_count, automated_coder ) )

44211 Article_Data instances for coder automated.


### Only OpenCalais_v2

In [10]:
# Coder type to keep: OpenCalais v2 ( "OpenCalais_REST_API_v2" ).
my_coder_type = ocv2_coder_type

# Narrow the running QuerySet to that coder_type.
article_data_qs = article_data_qs.filter( coder_type = my_coder_type )

# Row count after the coder_type filter.
article_data_count = article_data_qs.count()

print( "{} Article_Data instances for coder_type {}.".format( article_data_count, my_coder_type ) )

43843 Article_Data instances for coder_type OpenCalais_REST_API_v2.


## Filter on paper

### Only Detroit News

In [None]:
# Detroit News subset of the Article_Data selected so far.
#
# The subset is stored in its own variable instead of being written back to
#     article_data_qs.  The Grand Rapids Press cell that follows also filters
#     article_data_qs by newspaper; if this cell overwrote it, a top-to-bottom
#     run would stack both newspaper filters on one QuerySet and (assuming
#     each article belongs to a single newspaper) match no rows.
my_newspaper = detroit_news

# filter to just Article_Data whose article is from this newspaper.
detroit_article_data_qs = article_data_qs.filter( article__newspaper = my_newspaper )

# how many for this newspaper?
article_data_count = detroit_article_data_qs.count()

print( "{} Article_Data instances for newspaper {}.".format( article_data_count, my_newspaper ) )

### Only Grand Rapids Press

In [15]:
# Newspaper to keep: Grand Rapids Press.
my_newspaper = grand_rapids_press

# Narrow the running QuerySet to Article_Data whose article is from that newspaper.
article_data_qs = article_data_qs.filter( article__newspaper = my_newspaper )

# Row count after the newspaper filter.
article_data_count = article_data_qs.count()

print( "{} Article_Data instances for newspaper {}.".format( article_data_count, my_newspaper ) )

43816 Article_Data instances for newspaper 1 - Grand Rapids Press, The ( GRPB ).


# Article_Data for GRP analysis

In [15]:
# Reset the working variables for the GRP analysis section.
grp_article_data_qs = article_data_count = work_qs = None

In [16]:
# GRP analysis starts from every Article_Data row, unfiltered.
grp_article_data_qs = Article_Data.objects.all()

# Record the starting size before any filter is applied.
article_data_count = grp_article_data_qs.count()

print( "Starting with {} total Article_Data instances.".format( article_data_count ) )

Starting with 45657 total Article_Data instances.


## GRP - Only OpenCalais v.2

In [17]:
# Coder type to keep: OpenCalais v2 ( "OpenCalais_REST_API_v2" ).
my_coder_type = ocv2_coder_type

# Two filters applied back to back: coded by the automated coding user,
#     then tagged with the OpenCalais v2 coder_type.
grp_article_data_qs = (
    grp_article_data_qs
    .filter( coder = automated_coder )
    .filter( coder_type = my_coder_type )
)

# Row count after both filters.
article_data_count = grp_article_data_qs.count()

print( "{} Article_Data instances for coder_type {}.".format( article_data_count, my_coder_type ) )

43843 Article_Data instances for coder_type OpenCalais_REST_API_v2.


## GRP - Only Grand Rapids Press

In [18]:
# Newspaper to keep: Grand Rapids Press.
my_newspaper = grand_rapids_press

# Narrow the GRP QuerySet to Article_Data whose article is from that newspaper.
grp_article_data_qs = grp_article_data_qs.filter( article__newspaper = my_newspaper )

# Row count after the newspaper filter.
article_data_count = grp_article_data_qs.count()

print( "{} Article_Data instances for newspaper {}.".format( article_data_count, my_newspaper ) )

43816 Article_Data instances for newspaper 1 - Grand Rapids Press, The ( GRPB ).


## GRP - date ranges

Get min and max publication dates for articles within Article_Data QuerySet.

- Django aggregation function guide: [https://docs.djangoproject.com/en/4.0/topics/db/aggregation/](https://docs.djangoproject.com/en/4.0/topics/db/aggregation/)

In [23]:
# Earliest article publication date across the GRP Article_Data.
work_qs = grp_article_data_qs

# Min() is passed positionally, so the result is read back under the
#     "article__pub_date__min" key.
aggregate_value_dict = work_qs.aggregate( Min( "article__pub_date" ) )
min_pubdate = aggregate_value_dict[ "article__pub_date__min" ]

# Show the extracted date alongside the raw aggregate dictionary.
print(
    "Min pubdate: {min_pubdate} ( {agg_dict} )".format(
        min_pubdate = min_pubdate,
        agg_dict = aggregate_value_dict
    )
)

Min pubdate: 2005-01-01 ( {'article__pub_date__min': datetime.date(2005, 1, 1)} )


In [24]:
# Latest article publication date across the GRP Article_Data.
work_qs = grp_article_data_qs

# Max() is passed positionally, so the result is read back under the
#     "article__pub_date__max" key.
aggregate_value_dict = work_qs.aggregate( Max( "article__pub_date" ) )
max_pubdate = aggregate_value_dict[ "article__pub_date__max" ]

# Show the extracted date alongside the raw aggregate dictionary.
print(
    "Max pubdate: {max_pubdate} ( {agg_dict} )".format(
        max_pubdate = max_pubdate,
        agg_dict = aggregate_value_dict
    )
)

Max pubdate: 2010-11-30 ( {'article__pub_date__max': datetime.date(2010, 11, 30)} )


## GRP - missing names in Article_Subject

### GRP - Article_Subject - name

In [31]:
# Scratch variables reset at the top of each missing-name cell; this cell
#     itself only works with name_qs and name_count.
empty_name_list = []
ad_id_to_empty_name_map = {}
article_data = None

# ==> Article_Subject.name

# Baseline: every Article_Subject row.
name_qs = Article_Subject.objects.all()
name_count = name_qs.count()
print( "Article_Subject count: {}".format( name_count ) )

# Narrow one filter at a time, printing the surviving row count after each:
#     NULL name --> automated coder --> OpenCalais v2 coder type --> GRP only.
narrowing_steps = [
    ( "Empty name count: {}", { "name__isnull" : True } ),
    ( "Empty name by automated coder count: {}", { "article_data__coder" : automated_coder } ),
    ( "Empty name by OpenCalais v2: {}", { "article_data__coder_type" : ocv2_coder_type } ),
    ( "Empty name only GRP articles: {}", { "article_data__article__newspaper" : grand_rapids_press } ),
]
for count_message, filter_criteria in narrowing_steps:
    name_qs = name_qs.filter( **filter_criteria )
    name_count = name_qs.count()
    print( count_message.format( name_count ) )

Article_Subject count: 220076
Empty name count: 2302
Empty name by automated coder count: 1335
Empty name by OpenCalais v2: 55
Empty name only GRP articles: 53


### GRP - Article_Subject - verbatim_name

In [32]:
# Scratch variables reset at the top of each missing-name cell; this cell
#     itself only works with name_qs and name_count.
empty_name_list = []
ad_id_to_empty_name_map = {}
article_data = None

# ==> Article_Subject.verbatim_name

# Baseline: every Article_Subject row.
name_qs = Article_Subject.objects.all()
name_count = name_qs.count()
print( "Article_Subject count: {}".format( name_count ) )

# Narrow one filter at a time, printing the surviving row count after each:
#     NULL verbatim_name --> automated coder --> OpenCalais v2 coder type --> GRP only.
narrowing_steps = [
    ( "Empty verbatim_name count: {}", { "verbatim_name__isnull" : True } ),
    ( "Empty verbatim_name by automated coder count: {}", { "article_data__coder" : automated_coder } ),
    ( "Empty verbatim_name by OpenCalais v2: {}", { "article_data__coder_type" : ocv2_coder_type } ),
    ( "Empty verbatim_name only GRP articles: {}", { "article_data__article__newspaper" : grand_rapids_press } ),
]
for count_message, filter_criteria in narrowing_steps:
    name_qs = name_qs.filter( **filter_criteria )
    name_count = name_qs.count()
    print( count_message.format( name_count ) )

Article_Subject count: 220076
Empty verbatim_name count: 6589
Empty verbatim_name by automated coder count: 1335
Empty verbatim_name by OpenCalais v2: 55
Empty verbatim_name only GRP articles: 53


### GRP - Article_Subject - lookup_name

In [33]:
# Scratch variables reset at the top of each missing-name cell; this cell
#     itself only works with name_qs and name_count.
empty_name_list = []
ad_id_to_empty_name_map = {}
article_data = None

# ==> Article_Subject.lookup_name

# Baseline: every Article_Subject row.
name_qs = Article_Subject.objects.all()
name_count = name_qs.count()
print( "Article_Subject count: {}".format( name_count ) )

# Narrow one filter at a time, printing the surviving row count after each:
#     NULL lookup_name --> automated coder --> OpenCalais v2 coder type --> GRP only.
narrowing_steps = [
    ( "Empty lookup_name count: {}", { "lookup_name__isnull" : True } ),
    ( "Empty lookup_name by automated coder count: {}", { "article_data__coder" : automated_coder } ),
    ( "Empty lookup_name by OpenCalais v2: {}", { "article_data__coder_type" : ocv2_coder_type } ),
    ( "Empty lookup_name only GRP articles: {}", { "article_data__article__newspaper" : grand_rapids_press } ),
]
for count_message, filter_criteria in narrowing_steps:
    name_qs = name_qs.filter( **filter_criteria )
    name_count = name_qs.count()
    print( count_message.format( name_count ) )

Article_Subject count: 220076
Empty lookup_name count: 6589
Empty lookup_name by automated coder count: 1335
Empty lookup_name by OpenCalais v2: 55
Empty lookup_name only GRP articles: 53


### GRP - Article_Subject - all three

In [35]:
# Scratch variables reset at the top of each missing-name cell; this cell
#     itself only works with name_qs and name_count.
empty_name_list = []
ad_id_to_empty_name_map = {}
article_data = None

# ==> Article_Subject: name, verbatim_name and lookup_name together

# Baseline: every Article_Subject row.
name_qs = Article_Subject.objects.all()
name_count = name_qs.count()
print( "Article_Subject count: {}".format( name_count ) )

# Keep only rows where all three name fields are NULL ( one chained
#     filter() per field ), then report a single count.
for null_lookup in ( "name__isnull", "verbatim_name__isnull", "lookup_name__isnull" ):
    name_qs = name_qs.filter( **{ null_lookup : True } )
name_count = name_qs.count()
print( "Empty all names count: {}".format( name_count ) )

# Narrow one filter at a time, printing the surviving row count after each:
#     automated coder --> OpenCalais v2 coder type --> GRP only.
narrowing_steps = [
    ( "Empty all names by automated coder count: {}", { "article_data__coder" : automated_coder } ),
    ( "Empty all names by OpenCalais v2: {}", { "article_data__coder_type" : ocv2_coder_type } ),
    ( "Empty all names only GRP articles: {}", { "article_data__article__newspaper" : grand_rapids_press } ),
]
for count_message, filter_criteria in narrowing_steps:
    name_qs = name_qs.filter( **filter_criteria )
    name_count = name_qs.count()
    print( count_message.format( name_count ) )

Article_Subject count: 220076
Empty all names count: 2302
Empty all names by automated coder count: 1335
Empty all names by OpenCalais v2: 55
Empty all names only GRP articles: 53


## GRP - missing names in Article_Author

### GRP - Article_Author - name

In [36]:
# Scratch variables reset at the top of each missing-name cell; this cell
#     itself only works with name_qs and name_count.
empty_name_list = []
ad_id_to_empty_name_map = {}
article_data = None

# ==> Article_Author.name

# Baseline: every Article_Author row.
name_qs = Article_Author.objects.all()
name_count = name_qs.count()
print( "Article_Author count: {}".format( name_count ) )

# Narrow one filter at a time, printing the surviving row count after each:
#     NULL name --> automated coder --> OpenCalais v2 coder type --> GRP only.
narrowing_steps = [
    ( "Empty name count: {}", { "name__isnull" : True } ),
    ( "Empty name by automated coder count: {}", { "article_data__coder" : automated_coder } ),
    ( "Empty name by OpenCalais v2: {}", { "article_data__coder_type" : ocv2_coder_type } ),
    ( "Empty name only GRP articles: {}", { "article_data__article__newspaper" : grand_rapids_press } ),
]
for count_message, filter_criteria in narrowing_steps:
    name_qs = name_qs.filter( **filter_criteria )
    name_count = name_qs.count()
    print( count_message.format( name_count ) )

Article_Author count: 47190
Empty name count: 707
Empty name by automated coder count: 393
Empty name by OpenCalais v2: 11
Empty name only GRP articles: 11


### GRP - Article_Author - verbatim_name

In [37]:
# Scratch variables reset at the top of each missing-name cell; this cell
#     itself only works with name_qs and name_count.
empty_name_list = []
ad_id_to_empty_name_map = {}
article_data = None

# ==> Article_Author.verbatim_name

# Baseline: every Article_Author row.
name_qs = Article_Author.objects.all()
name_count = name_qs.count()
print( "Article_Author count: {}".format( name_count ) )

# Narrow one filter at a time, printing the surviving row count after each:
#     NULL verbatim_name --> automated coder --> OpenCalais v2 coder type --> GRP only.
narrowing_steps = [
    ( "Empty verbatim_name count: {}", { "verbatim_name__isnull" : True } ),
    ( "Empty verbatim_name by automated coder count: {}", { "article_data__coder" : automated_coder } ),
    ( "Empty verbatim_name by OpenCalais v2: {}", { "article_data__coder_type" : ocv2_coder_type } ),
    ( "Empty verbatim_name only GRP articles: {}", { "article_data__article__newspaper" : grand_rapids_press } ),
]
for count_message, filter_criteria in narrowing_steps:
    name_qs = name_qs.filter( **filter_criteria )
    name_count = name_qs.count()
    print( count_message.format( name_count ) )

Article_Author count: 47190
Empty verbatim_name count: 1809
Empty verbatim_name by automated coder count: 393
Empty verbatim_name by OpenCalais v2: 11
Empty verbatim_name only GRP articles: 11


### GRP - Article_Author - lookup_name

In [38]:
# Scratch variables reset at the top of each missing-name cell; this cell
#     itself only works with name_qs and name_count.
empty_name_list = []
ad_id_to_empty_name_map = {}
article_data = None

# ==> Article_Author.lookup_name

# Baseline: every Article_Author row.
name_qs = Article_Author.objects.all()
name_count = name_qs.count()
print( "Article_Author count: {}".format( name_count ) )

# Narrow one filter at a time, printing the surviving row count after each:
#     NULL lookup_name --> automated coder --> OpenCalais v2 coder type --> GRP only.
narrowing_steps = [
    ( "Empty lookup_name count: {}", { "lookup_name__isnull" : True } ),
    ( "Empty lookup_name by automated coder count: {}", { "article_data__coder" : automated_coder } ),
    ( "Empty lookup_name by OpenCalais v2: {}", { "article_data__coder_type" : ocv2_coder_type } ),
    ( "Empty lookup_name only GRP articles: {}", { "article_data__article__newspaper" : grand_rapids_press } ),
]
for count_message, filter_criteria in narrowing_steps:
    name_qs = name_qs.filter( **filter_criteria )
    name_count = name_qs.count()
    print( count_message.format( name_count ) )

Article_Author count: 47190
Empty lookup_name count: 1809
Empty lookup_name by automated coder count: 393
Empty lookup_name by OpenCalais v2: 11
Empty lookup_name only GRP articles: 11


### GRP - Article_Author - all three

In [39]:
# Scratch variables reset at the top of each missing-name cell; this cell
#     itself only works with name_qs and name_count.
empty_name_list = []
ad_id_to_empty_name_map = {}
article_data = None

# ==> Article_Author: name, verbatim_name and lookup_name together

# Baseline: every Article_Author row.
name_qs = Article_Author.objects.all()
name_count = name_qs.count()
print( "Article_Author count: {}".format( name_count ) )

# Keep only rows where all three name fields are NULL ( one chained
#     filter() per field ), then report a single count.
for null_lookup in ( "name__isnull", "verbatim_name__isnull", "lookup_name__isnull" ):
    name_qs = name_qs.filter( **{ null_lookup : True } )
name_count = name_qs.count()
print( "Empty all names count: {}".format( name_count ) )

# Narrow one filter at a time, printing the surviving row count after each:
#     automated coder --> OpenCalais v2 coder type --> GRP only.
narrowing_steps = [
    ( "Empty all names by automated coder count: {}", { "article_data__coder" : automated_coder } ),
    ( "Empty all names by OpenCalais v2: {}", { "article_data__coder_type" : ocv2_coder_type } ),
    ( "Empty all names only GRP articles: {}", { "article_data__article__newspaper" : grand_rapids_press } ),
]
for count_message, filter_criteria in narrowing_steps:
    name_qs = name_qs.filter( **filter_criteria )
    name_count = name_qs.count()
    print( count_message.format( name_count ) )

Article_Author count: 47190
Empty all names count: 707
Empty all names by automated coder count: 393
Empty all names by OpenCalais v2: 11
Empty all names only GRP articles: 11
