In [170]:
import json
import itertools
from collections import Counter
import numpy as np
from scipy.stats import zscore
from sklearn.metrics.pairwise import manhattan_distances

In [171]:
data = json.load(open("data2178datesfound.json"))
data = [d for d in data if d["author_string"]]
data = [d for d in data if len(d["text"].split()) > 100]

In [172]:
sourceAuthorPairs = list(set([d["source"] + " - " + d["author_string"] for d in data]))

In [173]:
len(sourceAuthorPairs)

1071

In [174]:
sourceAuthorsWordCount = {
    s:
    sum([len(d["text"].split())
         for d in data
         if d["source"] == s.split(" - ")[0] and
         d["author_string"] == s.split(" - ")[1]])
    for s in sourceAuthorPairs}

In [175]:
len([sawc for sawc in sourceAuthorsWordCount if sourceAuthorsWordCount[sawc] > 500])

837

In [176]:
labelledData = {
    s:
    " ".join([d["text"]
              for d in data
              if d["source"] == s.split(" - ")[0] and
              d["author_string"] == s.split(" - ")[1]])
    for s in sourceAuthorPairs if sourceAuthorsWordCount[s] > 500}

In [177]:
len(labelledData)

# labelledData["bostonglobe - adam liptak"]

837

In [35]:
n = 3 # n for n-gram
vsm = [] # output vector space matrix
columnHeaders = list(labelledData.keys()) # documents as columns of vsm
rowHeaders = [] # terms/chargrams as rows of vsm 

for label in labelledData:
    cid = columnHeaders.index(label) # column index of target output cell in vsm
    text = labelledData[label] # text in a document
    charngrams = [text[i:i+n] for i in range(len(text)-n+1)] # list all char n-grams in text
    for cgram in charngrams:
        if cgram not in rowHeaders: # new char ngram encountered, create new row
            rowHeaders.append(cgram)
            vsm.append([0]*len(columnHeaders))
        rid = rowHeaders.index(cgram) # row index of target output cell in vsm
        vsm[rid][cid] += 1 # increment count of target vsm cell
            

In [178]:
docTags = columnHeaders
termTags = rowHeaders
len(vsm)

30988

In [179]:
countMatrix = np.array([np.array(vsmi) for vsmi in vsm]) # convert to ndarray
countMatrix.shape

(30988, 837)

In [180]:
# total count of each term t in entire corpus
totalCounts = countMatrix.sum(axis=0)

# select number of features based on 90th percentile of total counts
nFeatures = np.argwhere(totalCounts > np.percentile(totalCounts, 90)).shape[0]

# get index of termTags sorted in descending order of total count in corpus
topChargramIndex = np.argsort(countMatrix.sum(axis=0))[::-1]

# get index in termTags for top n features
featureIndex = topChargramIndex[:nFeatures]

# keep original counts for backup as their computation was expensive
X = countMatrix.copy()

# only keep top n features
X = X[featureIndex, :]

# convert counts to relative frequencies
X = X/X.sum(axis=0, keepdims=True)

# Transpose: resulting in -> documents on rows, terms on columns
X = X.T

# scale X by z score (x - mu)/sigma
scaledX = zscore(X, axis=0, ddof=1)

# row-normalize X with l1 norm
row_sums = X.sum(axis=1)
X = X / row_sums[:, np.newaxis]

# compute pairwise delta as L1 dist. between normalized and scaled X
delta = manhattan_distances(X)

In [181]:
authors = list(set([dt.split(" - ")[1] for dt in docTags]))
authorGroups = {a: [dt for dt in docTags if a == dt.split(" - ")[1]] for a in authors}
authorGroupIndex = {a: [docTags.index(dt) for dt in authorgroups[a]] for a in authors}
print(dict(Counter(map(len, authorgroups.values()))))
print(len([dt for dt in docTags if 1 < len(authorgroups[dt.split(" - ")[1]])]))

{1: 706, 2: 55, 3: 7}
131


In [182]:
result = {}
for a in authorGroupIndex:
    if len(authorGroupIndex[a]) > 1:
        result[a] = {
            "sources": [dt.split(" - ")[0] for dt in docTags if dt.split(" - ")[1] == a],
            "mean delta": np.mean(list(map(lambda x: delta[x[0]][x[1]],
                                           itertools.combinations(authorGroupIndex[a], 2))))
        }
result

{'aaron blake': {'mean delta': 0.45412822370650829,
  'sources': ['washingtonpost', 'chicagotribune', 'bostonglobe']},
 'abha bhattarai': {'mean delta': 0.46293856346935702,
  'sources': ['washingtonpost', 'chicagotribune']},
 'adam liptak': {'mean delta': 0.2947331523332441,
  'sources': ['bostonglobe', 'nytimes']},
 'adam liptak AND michael d. shear': {'mean delta': 0.20388265447492387,
  'sources': ['nytimes', 'bostonglobe']},
 'adam liptak AND peter baker': {'mean delta': 0.27329769927957237,
  'sources': ['nytimes', 'bostonglobe']},
 'alan feuer AND michael d. shear AND nicholas kulish': {'mean delta': 0.19727597682731426,
  'sources': ['bostonglobe', 'nytimes']},
 'alanna durkin richer': {'mean delta': 0.2676100628930817,
  'sources': ['chicagotribune', 'bostonglobe']},
 'alexandra zavis': {'mean delta': 0.40934684684684708,
  'sources': ['latimes', 'chicagotribune']},
 'alicia a. caldwell AND elliot spagat': {'mean delta': 0.21388939698798867,
  'sources': ['bostonglobe', 'chica