In [1]:
# Make packages that live one directory above this notebook importable by
# putting that directory at the front of the import search path.
# Background: https://stackoverflow.com/questions/34478398
import os
import sys
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    # Prepend (rather than append) so the parent directory is searched first.
    sys.path = [module_path, *sys.path]

In [2]:
from tagnews.utils import load_data as ld
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tagnews

In [3]:
# Build the tagger and load the article corpus.
crimetags = tagnews.CrimeTags()
df = ld.load_data()

# Parse the creation time as a UTC-aware timestamp so it can be compared
# against the UTC year boundaries below.
df['timestamp'] = pd.to_datetime(df['created'], utc=True)

# Keep only articles created in the half-open window [2017, 2018).
start_of_2017 = pd.to_datetime('2017', utc=True)
start_of_2018 = pd.to_datetime('2018', utc=True)
is_2017 = (df['timestamp'] >= start_of_2017) & (df['timestamp'] < start_of_2018)
df = df.loc[is_2017]

# Score every remaining article body with the tagger. Later cells use the
# result as a table with one row per article and one column per tag.
preds = df['bodytext'].apply(crimetags.tagtext_proba)


  exec(code_obj, self.user_global_ns, self.user_ns)


In [4]:
# Report how many 2017 articles have at least one tag whose predicted
# probability clears each threshold, then the per-tag breakdown.
above_75 = preds > 0.75
above_85 = preds > 0.85

print('{} total articles in 2017'.format(len(df)))
print('{} have a tag with > 75% chance of applying'.format(above_75.any(axis=1).sum()))
print('{} have a tag with > 85% chance of applying'.format(above_85.any(axis=1).sum()))
print('\n\nPer Category')
# Summing a boolean column counts the articles above the threshold for that tag.
per_category = pd.DataFrame({'75% or more': above_75.sum(axis=0),
                             '85% or more': above_85.sum(axis=0)})
print(per_category)


139339 total articles in 2017
47202 have a tag with > 75% chance of applying
40933 have a tag with > 85% chance of applying


Per Category
       75% or more  85% or more
ARSN           273          196
BEAT           103           73
BURG          1067          915
CCCC           736          533
CCJ            337          264
CCSP           715          579
CPBD            23           16
CPD          12016         9006
CPLY          1389          997
CPS           2115         1754
CPUB            11            8
DOMV           616          466
DRUG          1751         1550
DUI            480          427
ENVI             9            2
FRUD          1232          997
GANG          1220         1017
GLBTQ          817          690
GUNV         14071        13107
HOMI          8957         7808
IDOC           102           76
ILSC            14           12
ILSP           597          462
IMMG           869          717
IPRA           132          106
JUVE          2086         15

In [5]:
# Write the per-tag counts of articles above each probability threshold to CSV.
counts_per_crimetype = pd.DataFrame({
    '75% or more': (preds > 0.75).sum(axis=0),
    '85% or more': (preds > 0.85).sum(axis=0),
})
counts_per_crimetype.to_csv('2017-counts-per-crimetype.csv')

In [6]:
# Export the 2017 articles that have at least one tag above each probability
# threshold, with the model probabilities attached as extra columns.
#
# The prediction columns are prefixed with 'model-' so they do not collide
# with the tag columns already present in `df`. Only columns that are not
# yet prefixed are renamed, so re-running this cell does not produce
# 'model-model-...' column names.
preds.columns = [c if c.startswith('model-') else 'model-' + c for c in preds.columns]

# Build the combined frame once and reuse it for both exports.
articles_with_preds = pd.concat([df, preds], axis=1)
articles_with_preds.loc[(preds > 0.75).any(axis=1), :].to_csv('2017-articles-75-percent.csv')
articles_with_preds.loc[(preds > 0.85).any(axis=1), :].to_csv('2017-articles-85-percent.csv')

In [7]:
# Display the column labels of the article frame `df`.
df.columns

Index(['feedname', 'url', 'title', 'bodytext', 'relevant', 'created',
       'last_modified', 'news_source_id', 'author', 'locations', 'OEMC', 'CPD',
       'SAO', 'CCCC', 'CCJ', 'CCSP', 'CPUB', 'IDOC', 'DOMV', 'SEXA', 'POLB',
       'POLM', 'GUNV', 'GLBTQ', 'JUVE', 'REEN', 'VIOL', 'BEAT', 'PROB', 'PARL',
       'CPLY', 'DRUG', 'CPS', 'GANG', 'ILSP', 'HOMI', 'IPRA', 'CPBD', 'IMMG',
       'ENVI', 'UNSPC', 'ILSC', 'ARSN', 'BURG', 'DUI', 'FRUD', 'ROBB', 'TASR',
       'COPA', 'DIGP', 'timestamp'],
      dtype='object')

In [8]:
# Count articles per feed name, leaving out rows where the feed name is missing.
known_feednames = df['feedname'].dropna()
known_feednames.value_counts()

T    6018
L    2655
O    2133
G    1840
S    1413
F    1284
A    1238
B    1159
V     718
I     713
R     456
Z     381
D     261
W     252
M     181
C     165
U      58
a      48
E      39
X      22
b      12
Name: feedname, dtype: int64

In [9]:
# Save the per-feed article counts (rows with a missing feed name excluded) to CSV.
feedname_counts = df['feedname'].dropna().value_counts()
feedname_counts.to_csv('2017-feedname-article-counts.csv')

In [10]:
# Save the number of articles per author to CSV.
author_counts = df['author'].value_counts()
author_counts.to_csv('2017-author-counts.csv')