**analysis-article_overview.ipynb - Article overview**

# Setup

## Setup - Imports

In [1]:
# python base imports
import datetime
import json

# import six
import six

# report when the import cell finished running.
print( "packages imported at ", datetime.datetime.now(), sep = "" )

packages imported at 2022-05-10 03:16:14.934844


## Setup - Initialize Django

- Back to [Table of Contents](#Table-of-Contents)

First, initialize my dev django project, so I can run code in this notebook that references my django models and can talk to the database using my project's settings.

In [2]:
# execute ../django_init.py, the script that initializes the dev Django
#     project for this notebook (it prints a "django initialized at ..."
#     timestamp when it is done).
%run ../django_init.py

django initialized at 2022-05-10 03:16:17.119833


In [3]:
# django imports
from django.contrib.auth.models import User
from django.db.models import Max
from django.db.models import Min

# context_text imports
from context_text.shared.context_text_base import ContextTextBase

# context_analysis imports
from context_analysis.network.network_person_info import NetworkPersonInfo

# context_text model imports
from context_text.models import Article
from context_text.models import Article_Author
from context_text.models import Article_Data
from context_text.models import Article_Subject
from context_text.models import Newspaper
from context_text.models import Person

# article coding
from context_text.article_coding.article_coder import ArticleCoder
#from context_text.article_coding.article_coding import ArticleCoding
from context_text.article_coding.open_calais_v2.open_calais_v2_article_coder import OpenCalaisV2ArticleCoder

# context_text shared
from context_text.shared.context_text_base import ContextTextBase

# report when the Django/model import cell finished running.
print( "django model packages imported at ", datetime.datetime.now(), sep = "" )

django model packages imported at 2022-05-10 03:16:17.845201


## Setup - Important instances

In [4]:
# ArticleCoding instance - disabled, since the ArticleCoding import in the
#     import cell is commented out as well.
#article_coding = ArticleCoding()

# user returned by ArticleCoder.get_automated_coding_user().
automated_coder = ArticleCoder.get_automated_coding_user()

# Newspaper instances for Grand Rapids Press and Detroit News, looked up by
#     newsbank_code.  Objects.get() is expected to match exactly one row each
#     - NOTE(review): confirm both codes exist in the target database.
grand_rapids_press = Newspaper.objects.get( newsbank_code = "GRPB" )
detroit_news = Newspaper.objects.get( newsbank_code = "DTNB" )

## Setup - functions

### Setup - function create_tag_summary

In [14]:
def create_tag_summary( work_qs_IN, output_every_IN = 1000 ):

    '''
    Tallies the tags found on the articles in work_qs_IN.

    Loops over work_qs_IN, calls tags.names() on each item, and adds one to
        that tag name's count for every name returned.

    Parameters:
    - work_qs_IN - iterable of articles (for example an Article QuerySet).
        Each item must expose tags.names().  If None, nothing is processed
        and an empty dict is returned.
    - output_every_IN - print a "--> processed: <n>" progress line each time
        this many articles have been processed.  Defaults to 1000.  None, 0,
        or a negative value turns progress output off (0 used to raise
        ZeroDivisionError, None a TypeError).

    Returns:
    - dict that maps each tag name to the number of times it was seen.
        Empty dict if there were no articles or no tags.
    '''

    # return reference
    tag_info_OUT = None

    # declare variables
    tag_to_count_map = None
    is_progress_enabled = None
    article_counter = None
    article = None
    tag_name = None

    # init
    tag_to_count_map = dict()

    # only a positive interval can be used as a modulo divisor - anything else
    #     means "no progress output".
    is_progress_enabled = ( ( output_every_IN is not None ) and ( output_every_IN > 0 ) )

    # got anything to loop over?
    if ( work_qs_IN is not None ):

        # loop, counting articles from 1.
        for article_counter, article in enumerate( work_qs_IN, 1 ):

            # add one to the count for each of this article's tags.
            for tag_name in article.tags.names():

                tag_to_count_map[ tag_name ] = tag_to_count_map.get( tag_name, 0 ) + 1

            #-- END loop over tags. --#

            # output?
            if ( ( is_progress_enabled == True ) and ( article_counter % output_every_IN == 0 ) ):

                print( "--> processed: {}".format( article_counter ) )

            #-- END check if time to output something. --#

        #-- END loop over articles. --#

    #-- END check to see if anything to process. --#

    tag_info_OUT = tag_to_count_map

    return tag_info_OUT

#-- END function create_tag_summary() --#

# report when create_tag_summary() was (re)defined in this kernel.
print( "function create_tag_summary() defined at ", datetime.datetime.now(), sep = "" )

function create_tag_summary() defined at 2022-05-10 03:29:53.710254


# Articles

- Using details from: [newsbank-article_coding.ipynb](../data/article_coding/newsbank-article_coding.ipynb)

In [5]:
# QuerySet of all Article instances; the per-newspaper sections below filter
#     it by newspaper.
article_qs = Article.objects.all()

## Grand Rapids Press (GRP)

In [6]:
# newspaper this section summarizes.
my_newspaper = grand_rapids_press

# filter to just Article instances from that newspaper.
grp_article_qs = article_qs.filter( newspaper = my_newspaper )

# how many are there?
article_count = grp_article_qs.count()

print( "{} Article instances for newspaper {}.".format( article_count, my_newspaper ) )

354315 Article instances for newspaper 1 - Grand Rapids Press, The ( GRPB ).


### GRP - coded tag

In [7]:
# GRP articles that carry the OpenCalaisV2ArticleCoder.TAG_CODED_BY_ME tag.
tags_in_list = [ OpenCalaisV2ArticleCoder.TAG_CODED_BY_ME ]
grp_coded_article_qs = grp_article_qs.filter( tags__name__in = tags_in_list )

# report the tag list and how many articles matched it.
print( "Tags {} - Matching article count: {}".format( tags_in_list, grp_coded_article_qs.count() ) )

Tags ['coded-OpenCalaisV2ArticleCoder'] - Matching article count: 43816


### GRP - hard news tag

In [8]:
# GRP articles that carry the ContextTextBase.TAG_LOCAL_HARD_NEWS tag.
tags_in_list = [ ContextTextBase.TAG_LOCAL_HARD_NEWS ]
grp_news_article_qs = grp_article_qs.filter( tags__name__in = tags_in_list )

# report the tag list and how many articles matched it.
print( "Tags {} - Matching article count: {}".format( tags_in_list, grp_news_article_qs.count() ) )

Tags ['local_hard_news'] - Matching article count: 43816


### GRP - date ranges

Get min and max publication dates for articles within a QuerySet.

- Django aggregation function guide: [https://docs.djangoproject.com/en/4.0/topics/db/aggregation/](https://docs.djangoproject.com/en/4.0/topics/db/aggregation/)

In [9]:
# minimum publication date - computed over the GRP local hard news articles
#     ( grp_news_article_qs ), not over every GRP article.
work_qs = grp_news_article_qs

# aggregate() hands back a dict; the Min( "pub_date" ) result is stored under
#     the key "pub_date__min".
aggregate_value_dict = work_qs.aggregate( Min( "pub_date" ) )
min_pubdate = aggregate_value_dict[ "pub_date__min" ]
print( "Min pubdate: {min_pubdate} ( {agg_dict} )".format( min_pubdate = min_pubdate, agg_dict = aggregate_value_dict ) )

Min pubdate: 2005-01-01 ( {'pub_date__min': datetime.date(2005, 1, 1)} )


In [10]:
# maximum publication date - computed over the GRP local hard news articles
#     ( grp_news_article_qs ), not over every GRP article.
work_qs = grp_news_article_qs

# aggregate() hands back a dict; the Max( "pub_date" ) result is stored under
#     the key "pub_date__max".
aggregate_value_dict = work_qs.aggregate( Max( "pub_date" ) )
max_pubdate = aggregate_value_dict[ "pub_date__max" ]
print( "Max pubdate: {max_pubdate} ( {agg_dict} )".format( max_pubdate = max_pubdate, agg_dict = aggregate_value_dict ) )

Max pubdate: 2010-11-30 ( {'pub_date__max': datetime.date(2010, 11, 30)} )


### GRP - tags overview 

#### GRP - all article tags

In [16]:
# reset every name this cell assigns.
tag_to_count_map = tag_list = tag_name = tag_count = None

# tally tags across all Grand Rapids Press articles.
tag_to_count_map = create_tag_summary( grp_article_qs )

# print one line per tag, in sorted tag-name order.
tag_list = sorted( tag_to_count_map )
for tag_name in tag_list:

    tag_count = tag_to_count_map[ tag_name ]
    print( "- tag {}: {}".format( tag_name, tag_count ) )

#-- END loop over tag counts --#

--> processed: 1000
--> processed: 2000
--> processed: 3000
--> processed: 4000
--> processed: 5000
--> processed: 6000
--> processed: 7000
--> processed: 8000
--> processed: 9000
--> processed: 10000
--> processed: 11000
--> processed: 12000
--> processed: 13000
--> processed: 14000
--> processed: 15000
--> processed: 16000
--> processed: 17000
--> processed: 18000
--> processed: 19000
--> processed: 20000
--> processed: 21000
--> processed: 22000
--> processed: 23000
--> processed: 24000
--> processed: 25000
--> processed: 26000
--> processed: 27000
--> processed: 28000
--> processed: 29000
--> processed: 30000
--> processed: 31000
--> processed: 32000
--> processed: 33000
--> processed: 34000
--> processed: 35000
--> processed: 36000
--> processed: 37000
--> processed: 38000
--> processed: 39000
--> processed: 40000
--> processed: 41000
--> processed: 42000
--> processed: 43000
--> processed: 44000
--> processed: 45000
--> processed: 46000
--> processed: 47000
--> processed: 48000
-

#### GRP - hard news article tags

In [15]:
# reset every name this cell assigns.
tag_to_count_map = tag_list = tag_name = tag_count = None

# tally tags across just the Grand Rapids Press articles tagged as local
#     hard news.
tag_to_count_map = create_tag_summary( grp_news_article_qs )

# print one line per tag, in sorted tag-name order.
tag_list = sorted( tag_to_count_map )
for tag_name in tag_list:

    tag_count = tag_to_count_map[ tag_name ]
    print( "- tag {}: {}".format( tag_name, tag_count ) )

#-- END loop over tag counts --#

--> processed: 1000
--> processed: 2000
--> processed: 3000
--> processed: 4000
--> processed: 5000
--> processed: 6000
--> processed: 7000
--> processed: 8000
--> processed: 9000
--> processed: 10000
--> processed: 11000
--> processed: 12000
--> processed: 13000
--> processed: 14000
--> processed: 15000
--> processed: 16000
--> processed: 17000
--> processed: 18000
--> processed: 19000
--> processed: 20000
--> processed: 21000
--> processed: 22000
--> processed: 23000
--> processed: 24000
--> processed: 25000
--> processed: 26000
--> processed: 27000
--> processed: 28000
--> processed: 29000
--> processed: 30000
--> processed: 31000
--> processed: 32000
--> processed: 33000
--> processed: 34000
--> processed: 35000
--> processed: 36000
--> processed: 37000
--> processed: 38000
--> processed: 39000
--> processed: 40000
--> processed: 41000
--> processed: 42000
--> processed: 43000
- tag coded-OpenCalaisV2ArticleCoder: 43816
- tag export_to_context-20191126-164206: 43816
- tag grp_month

## The Detroit News (TDN)

In [17]:
# newspaper this section summarizes.
my_newspaper = detroit_news

# filter to just Article instances from that newspaper.
tdn_article_qs = article_qs.filter( newspaper = my_newspaper )

# how many are there?
article_count = tdn_article_qs.count()

print( "{} Article instances for newspaper {}.".format( article_count, my_newspaper ) )

159716 Article instances for newspaper 2 - Detroit News, The ( DTNB ).


### TDN - coded tag

In [18]:
# TDN articles that carry the OpenCalaisV2ArticleCoder.TAG_CODED_BY_ME tag.
tags_in_list = [ OpenCalaisV2ArticleCoder.TAG_CODED_BY_ME ]
tdn_coded_article_qs = tdn_article_qs.filter( tags__name__in = tags_in_list )

# report the tag list and how many articles matched it.
print( "Tags {} - Matching article count: {}".format( tags_in_list, tdn_coded_article_qs.count() ) )

Tags ['coded-OpenCalaisV2ArticleCoder'] - Matching article count: 27


### TDN - hard news tag

In [19]:
# TDN articles that carry the ContextTextBase.TAG_LOCAL_HARD_NEWS tag.
tags_in_list = [ ContextTextBase.TAG_LOCAL_HARD_NEWS ]
tdn_news_article_qs = tdn_article_qs.filter( tags__name__in = tags_in_list )

# report the tag list and how many articles matched it.
print( "Tags {} - Matching article count: {}".format( tags_in_list, tdn_news_article_qs.count() ) )

Tags ['local_hard_news'] - Matching article count: 0


### TDN - date ranges

Get min and max publication dates for articles within a QuerySet.

- Django aggregation function guide: [https://docs.djangoproject.com/en/4.0/topics/db/aggregation/](https://docs.djangoproject.com/en/4.0/topics/db/aggregation/)

In [20]:
# minimum publication date - computed over all Detroit News articles
#     ( tdn_article_qs ).
work_qs = tdn_article_qs

# aggregate() hands back a dict; the Min( "pub_date" ) result is stored under
#     the key "pub_date__min".
aggregate_value_dict = work_qs.aggregate( Min( "pub_date" ) )
min_pubdate = aggregate_value_dict[ "pub_date__min" ]
print( "Min pubdate: {min_pubdate} ( {agg_dict} )".format( min_pubdate = min_pubdate, agg_dict = aggregate_value_dict ) )

Min pubdate: 2005-01-01 ( {'pub_date__min': datetime.date(2005, 1, 1)} )


In [22]:
# maximum publication date - computed over all Detroit News articles
#     ( tdn_article_qs ).
work_qs = tdn_article_qs

# aggregate() hands back a dict; the Max( "pub_date" ) result is stored under
#     the key "pub_date__max".
aggregate_value_dict = work_qs.aggregate( Max( "pub_date" ) )
max_pubdate = aggregate_value_dict[ "pub_date__max" ]
print( "Max pubdate: {max_pubdate} ( {agg_dict} )".format( max_pubdate = max_pubdate, agg_dict = aggregate_value_dict ) )

Max pubdate: 2010-10-31 ( {'pub_date__max': datetime.date(2010, 10, 31)} )


### TDN - tags overview 

#### TDN - all article tags

In [23]:
# reset every name this cell assigns.
tag_to_count_map = tag_list = tag_name = tag_count = None

# tally tags across all Detroit News articles.
tag_to_count_map = create_tag_summary( tdn_article_qs )

# print one line per tag, in sorted tag-name order.
tag_list = sorted( tag_to_count_map )
for tag_name in tag_list:

    tag_count = tag_to_count_map[ tag_name ]
    print( "- tag {}: {}".format( tag_name, tag_count ) )

#-- END loop over tag counts --#

--> processed: 1000
--> processed: 2000
--> processed: 3000
--> processed: 4000
--> processed: 5000
--> processed: 6000
--> processed: 7000
--> processed: 8000
--> processed: 9000
--> processed: 10000
--> processed: 11000
--> processed: 12000
--> processed: 13000
--> processed: 14000
--> processed: 15000
--> processed: 16000
--> processed: 17000
--> processed: 18000
--> processed: 19000
--> processed: 20000
--> processed: 21000
--> processed: 22000
--> processed: 23000
--> processed: 24000
--> processed: 25000
--> processed: 26000
--> processed: 27000
--> processed: 28000
--> processed: 29000
--> processed: 30000
--> processed: 31000
--> processed: 32000
--> processed: 33000
--> processed: 34000
--> processed: 35000
--> processed: 36000
--> processed: 37000
--> processed: 38000
--> processed: 39000
--> processed: 40000
--> processed: 41000
--> processed: 42000
--> processed: 43000
--> processed: 44000
--> processed: 45000
--> processed: 46000
--> processed: 47000
--> processed: 48000
-