In [22]:
from pathlib import Path
import sqlite3
from collections import namedtuple
import requests
import json

import numpy as np
import sklearn
import pandas as pd
from pandas import Int64Index

from secrets.config import config

In [38]:
# One row of the `article` table; field order matches the SELECT column order below.
Article = namedtuple('Article', ['article_id', 'source_id', 'headline', 'excerpt', 'full_text', 'image_url', 'article_url'])

path_data = Path('sql')
path_data.mkdir(exist_ok=True)
db_file = path_data / 'db.sqlite'

# sqlite3's connection context manager only manages the transaction; it does
# not close the connection. This cell only reads, so open/close explicitly.
con = sqlite3.connect(str(db_file))
try:
    cur = con.cursor()
    cur.execute('select article_id, source_id, headline, excerpt, full_text, image_url, article_url from article')
    articles = [Article(*row) for row in cur.fetchall()]
finally:
    con.close()

# Keep only articles that have a body of more than one character. A NULL
# full_text arrives as None, which would make len() raise, so drop it first.
articles = [a for a in articles if a.full_text is not None and len(a.full_text) > 1]
print(len(articles))
print(articles[0].headline)

128
Boris throws down the gauntlet to May over Brexit


In [35]:
def get_response(url: str, token, text, timeout: float = 60) -> 'requests.Response':
    """POST `text` to `url` and return the response object on success.

    Parameters
    ----------
    url : str
        Endpoint to post to.
    token
        Value sent in the ``X-AG-Access-Token`` header.
    text
        Request body, passed to ``requests.post`` as ``data``.
    timeout : float, optional
        Seconds handed to ``requests.post`` as its ``timeout``. Without one a
        stalled connection would block the notebook indefinitely.

    Returns
    -------
    requests.Response
        The response, returned only when ``response.ok`` is true.

    Raises
    ------
    Exception
        With args ``('invalid response code', response)`` when
        ``response.ok`` is false. Errors raised by ``requests`` itself
        (connection failures, timeouts) propagate unchanged.
    """
    headers = {
        'X-AG-Access-Token': token,
        'Content-Type': 'text/raw',
        'outputformat': 'application/json',
    }
    response = requests.post(url, data=text, headers=headers, timeout=timeout)
    if not response.ok:
        raise Exception('invalid response code', response)
    return response

# Access token for the tagging API, read from the local secrets config; it is
# passed to get_response, which sends it as the X-AG-Access-Token header.
token = config['tagger_token']

def get_dict(token, a):
    """Send one document to the tagging endpoint and return the parsed JSON reply.

    `a` is the request body; it is forwarded unchanged to `get_response`
    together with `token`.
    """
    calais_url = 'https://api.thomsonreuters.com/permid/calais'
    reply = get_response(calais_url, token, a)
    # The decoded body is re-encoded as UTF-8 bytes; json.loads accepts bytes.
    payload = reply.text.encode('utf-8')
    return json.loads(payload)

# Tag every article body: each body is UTF-8 encoded and sent through get_dict,
# which issues one request per article.
tags = [
    get_dict(token, body)
    for body in (article.full_text.encode('utf-8') for article in articles)
]
tags[0]

{'doc': {'info': {'calaisRequestID': '21392d7b-5a68-884c-15e8-d6164c822741',
   'docDate': '2017-09-17 01:08:44.675',
   'docId': 'http://d.opencalais.com/dochash-1/72003004-56a0-30f3-9bd5-8c4c52304c87',
   'docTitle': '',
   'id': 'http://id.opencalais.com/hoKTMyubZGNNS6f6wgl4wA',
   'ontology': 'http://mdaas-virtual-onecalais.int.thomsonreuters.com/owlschema/11.0/onecalais.owl.allmetadata.xml'},
  'meta': {'contentType': 'text/raw',
   'language': 'English',
   'processingVer': 'AllMetadata',
   'serverVersion': 'OneCalais_11.0-RELEASE:169',
   'signature': 'digestalg-1|TTNgisai0f8GMEVadc6CTvlIPO8=|MOE9vesMKT6z3hubteCioEir3ZZw9sA/sRyC02aMXABVHjymtVy42A==',
   'stagsVer': 'OneCalais_11.0-RELEASE-b10-2017-07-31_12:36:53',
   'submissionDate': '2017-09-17 01:08:43.848',
   'submitterCode': '2362b867-7324-6cab-fb2e-2a48761bfb77'}},
 'http://d.opencalais.com/comphash-1/35f4ba16-782b-3ca0-8d1e-91ff16d3b99f': {'_type': 'Company',
  '_typeGroup': 'entities',
  '_typeReference': 'http://s.ope

In [110]:
def get_relevant(tag):
    """Extract (name, relevance) pairs from one tagging response.

    Only entries whose key starts with 'http' and whose value contains both
    a 'name' and a 'relevance' field are kept, in the response's key order.
    """
    pairs = []
    for key, entry in tag.items():
        if not key.startswith('http'):
            continue
        if 'relevance' in entry and 'name' in entry:
            pairs.append((entry['name'], entry['relevance']))
    return pairs

# Reduce each raw tagging response to its list of (name, relevance) pairs.
cleaned = list(map(get_relevant, tags))
cleaned[0]

[('Newspaper Publishing', 0),
 ('Broadcasting - NEC', 0),
 ('Newspaper & Magazine Printing Services', 0),
 ('Archbishop', 0.2),
 ('Prime Minister', 0.8),
 ('foreign secretary', 0.2),
 ('the Daily Telegraph', 0.2),
 ('Canterbury', 0.2),
 ('BBC', 0),
 ('journalist', 0.2),
 ('stroke', 0.2),
 ('looking forward', 0.2),
 ('senior Tory MP', 0.2),
 ('Kevin Schofield Tweeted', 0.2),
 ('Chancellor', 0.2),
 ('United Kingdom', 0.2),
 ('senior diplomat', 0.2),
 ('European Union', 0.2),
 ('Boris Johnson', 0.8),
 ('Barack Obama', 0.2),
 ('Sky News', 0),
 ('NHS', 0.8),
 ('Brussels', 0.2),
 ('Politics Home editor', 0.2),
 ('Bill', 0.2),
 ('Philip Hammond', 0.2),
 ('Theresa May', 0.2),
 ('the Daily Telegraph', 0),
 ('Florence', 0.2),
 ('Britain', 0.2)]

In [111]:
# Pair each article id with its (name, relevance) tags. The pairs are derived
# from `tags` rather than from the previous value of `cleaned`, so re-running
# this cell gives the same result instead of wrapping the tuples a second time.
cleaned = [(article.article_id, get_relevant(tag))
           for article, tag in zip(articles, tags)]
cleaned[0]

(1,
 [('Newspaper Publishing', 0),
  ('Broadcasting - NEC', 0),
  ('Newspaper & Magazine Printing Services', 0),
  ('Archbishop', 0.2),
  ('Prime Minister', 0.8),
  ('foreign secretary', 0.2),
  ('the Daily Telegraph', 0.2),
  ('Canterbury', 0.2),
  ('BBC', 0),
  ('journalist', 0.2),
  ('stroke', 0.2),
  ('looking forward', 0.2),
  ('senior Tory MP', 0.2),
  ('Kevin Schofield Tweeted', 0.2),
  ('Chancellor', 0.2),
  ('United Kingdom', 0.2),
  ('senior diplomat', 0.2),
  ('European Union', 0.2),
  ('Boris Johnson', 0.8),
  ('Barack Obama', 0.2),
  ('Sky News', 0),
  ('NHS', 0.8),
  ('Brussels', 0.2),
  ('Politics Home editor', 0.2),
  ('Bill', 0.2),
  ('Philip Hammond', 0.2),
  ('Theresa May', 0.2),
  ('the Daily Telegraph', 0),
  ('Florence', 0.2),
  ('Britain', 0.2)])

In [112]:
# Keep only tags with relevance above 0.5, then drop articles left with no tags.
cleaned = [
    (article_id, strong)
    for article_id, strong in (
        (article_id, [pair for pair in pairs if pair[1] > 0.5])
        for article_id, pairs in cleaned
    )
    if len(strong) > 0
]
cleaned[0]

(1, [('Prime Minister', 0.8), ('Boris Johnson', 0.8), ('NHS', 0.8)])

In [113]:
# def calc_similarity(d1, d2):
#     k1, v1 = list(zip(*d1))
#     k2, v2 = list(zip(*d2))
    
#     max_sim = len(set(k1 + k2))
#     shared_keys = set(k1).intersection(k2)
#     if len(shared_keys) == 0:
#         return 0

#     x = [(v1[k1.index(sk)] - v2[k2.index(sk)]) for sk in shared_keys]
#     x = np.abs(x)
#     x = [1 - y for y in x]
#     x = sum(x)
#     return x / max_sim

# # calc_similarity(cleaned[0][1], cleaned[1][1])

# sims = [(a1[0], a2[0], calc_similarity(a1[1], a2[1]))
#         for a1 in cleaned
#         for a2 in cleaned
#         if a1[0] > a2[0]]
# sims[:20]

In [114]:
def calc_similarity(d1, d2):
    """Score how similar two tag lists are.

    Parameters
    ----------
    d1, d2 : sequence of (name, relevance) pairs
        Tags of the two articles. If a name occurs more than once in a list,
        its first relevance is used.

    Returns
    -------
    number
        ``0`` when either list is empty or no name is shared. Otherwise the
        sum over shared names of ``1 - |relevance_1 - relevance_2|``, divided
        by the number of distinct names across both lists.
    """
    # First occurrence wins, mirroring a left-to-right search of the list.
    first_1 = {}
    for name, relevance in d1:
        first_1.setdefault(name, relevance)
    first_2 = {}
    for name, relevance in d2:
        first_2.setdefault(name, relevance)

    shared_keys = first_1.keys() & first_2.keys()
    # Also covers empty inputs, which have no shared names.
    if not shared_keys:
        return 0

    max_sim = len(first_1.keys() | first_2.keys())
    closeness = sum(1 - abs(first_1[sk] - first_2[sk]) for sk in shared_keys)
    return closeness / max_sim

# calc_similarity(cleaned[0][1], cleaned[1][1])

# Score every pair of articles once, keeping the larger article id first.
sims = [
    (id_a, id_b, calc_similarity(tags_a, tags_b))
    for id_a, tags_a in cleaned
    for id_b, tags_b in cleaned
    if id_a > id_b
]
sims[:20]

[(2, 1, 0),
 (3, 1, 0),
 (3, 2, 0),
 (4, 1, 0),
 (4, 2, 0),
 (4, 3, 0.20000000000000001),
 (5, 1, 0),
 (5, 2, 0),
 (5, 3, 0),
 (5, 4, 0),
 (6, 1, 0),
 (6, 2, 0),
 (6, 3, 0),
 (6, 4, 0),
 (6, 5, 0),
 (7, 1, 0),
 (7, 2, 0),
 (7, 3, 0),
 (7, 4, 0),
 (7, 5, 0)]

In [115]:
def clear_table(cur):
    """Reset the `permid` column to 0 on every row of `similarities`.

    Rows are kept; only the column value is overwritten.
    """
    reset_sql = 'UPDATE similarities SET permid = 0'
    cur.execute(reset_sql)

def update_db(cur, id_1, id_2, value):
    """Write `value` into the `permid` column for one article pair.

    Only rows of `similarities` whose `article_id_1`/`article_id_2` match
    `id_1`/`id_2` are updated; no row is inserted.
    """
    statement = 'UPDATE similarities SET permid = ? WHERE article_id_1 = ? AND article_id_2 = ?'
    bindings = (value, id_1, id_2)
    cur.execute(statement, bindings)

# Persist the similarity scores. `with con` runs the reset and all updates as
# one transaction (committed on success, rolled back on error), so a failure
# part-way through cannot leave the column zeroed. The connection is closed
# explicitly because the context manager does not close it.
con = sqlite3.connect(str(db_file))
try:
    with con:
        cur = con.cursor()
        clear_table(cur)
        # A plain loop rather than a list comprehension: the calls are made
        # only for their side effect.
        for id_1, id_2, value in sims:
            if id_1 > id_2:
                update_db(cur, id_1, id_2, value)
finally:
    con.close()