In [1]:
import json
from collections import defaultdict

import numpy as np
import pandas as pd
from gensim.corpora.dictionary import Dictionary
from gensim.matutils import hellinger
from gensim.models.ldamulticore import LdaMulticore
from gensim.test.utils import common_texts
from nltk import word_tokenize, wordpunct_tokenize
from nltk.tokenize import NLTKWordTokenizer

In [2]:
def transform(s):
    """Lowercase ``s`` and split it into a list of token strings.

    Tokens are cut out of the lowercased text using the character spans
    reported by ``NLTKWordTokenizer.span_tokenize``, so every token is an
    exact substring of the lowercased input.

    Args:
        s: The raw text to tokenize.

    Returns:
        A list of token substrings in order of appearance.
    """
    lowered = s.lower()
    tokens = []
    for begin, stop in NLTKWordTokenizer().span_tokenize(lowered):
        tokens.append(lowered[begin:stop])
    return tokens

In [3]:
# Load the raw test/train splits, tokenize the `text` column with
# `transform`, and persist the tokenized frames as JSON records.
# Context managers guarantee the input files are closed once parsed.
with open("new_test.json", "r") as testfile:
    loaded_test = json.load(testfile)
with open("new_train.json", "r") as trainfile:
    loaded_train = json.load(trainfile)

# After this step each `text` cell holds a list of lowercase tokens.
traindf = pd.DataFrame(loaded_train, columns=['id', 'text', 'author'])
traindf['text'] = traindf.text.apply(transform)
testdf = pd.DataFrame(loaded_test, columns=['id', 'text', 'author'])
testdf['text'] = testdf.text.apply(transform)

testdf.to_json('tokenized_test.json', orient='records')
traindf.to_json('tokenized_train.json', orient='records')


In [4]:
# Row count after exploding the token lists: one row per list element of
# `text` (pandas emits a single NaN row for an empty list).
traindf.explode('text')[['id', 'author', 'text']].shape[0]

475449

In [5]:
# Long-format view of the training set: one row per element of each `text`
# list, keeping the originating document's id and author. The original row
# index is repeated for every row that came from the same document.
etdf = traindf.explode('text').loc[:, ['id', 'author', 'text']]
etdf

Unnamed: 0,id,author,text
0,id10914,EAP,it
0,id10914,EAP,will
0,id10914,EAP,have
0,id10914,EAP,been
0,id10914,EAP,","
...,...,...,...
15662,id27509,MWS,never
15662,id27509,MWS,in
15662,id27509,MWS,better
15662,id27509,MWS,health


In [8]:
# Build the vocabulary from the tokenized training documents, then encode
# every training document with `doc2bow` (displayed below as lists of
# (token_id, count) pairs).
common_dictionary = Dictionary(traindf["text"])
common_corpus = list(map(common_dictionary.doc2bow, traindf["text"]))
common_corpus

[[(0, 4),
  (1, 1),
  (2, 1),
  (3, 1),
  (4, 2),
  (5, 1),
  (6, 1),
  (7, 1),
  (8, 1),
  (9, 1),
  (10, 1),
  (11, 1),
  (12, 1),
  (13, 2),
  (14, 1),
  (15, 1),
  (16, 1)],
 [(0, 1),
  (1, 1),
  (17, 1),
  (18, 1),
  (19, 1),
  (20, 1),
  (21, 1),
  (22, 1),
  (23, 1),
  (24, 1),
  (25, 1),
  (26, 1)],
 [(0, 16),
  (1, 1),
  (2, 1),
  (4, 1),
  (9, 1),
  (12, 4),
  (19, 4),
  (27, 1),
  (28, 1),
  (29, 2),
  (30, 1),
  (31, 1),
  (32, 1),
  (33, 1),
  (34, 1),
  (35, 1),
  (36, 1),
  (37, 1),
  (38, 1),
  (39, 1),
  (40, 1),
  (41, 1),
  (42, 1),
  (43, 1),
  (44, 1),
  (45, 1),
  (46, 1),
  (47, 1),
  (48, 2),
  (49, 1),
  (50, 1),
  (51, 1),
  (52, 1),
  (53, 1),
  (54, 1),
  (55, 1),
  (56, 2),
  (57, 2),
  (58, 2),
  (59, 1),
  (60, 1),
  (61, 1),
  (62, 1),
  (63, 1),
  (64, 1),
  (65, 1),
  (66, 1),
  (67, 8),
  (68, 1),
  (69, 1),
  (70, 1),
  (71, 1),
  (72, 1),
  (73, 1),
  (74, 1),
  (75, 2),
  (76, 1),
  (77, 1),
  (78, 1),
  (79, 1),
  (80, 1),
  (81, 1),
  (82, 1),
  

In [13]:
# Fit the LDA topic model on the training bag-of-words corpus.
# NUM_TOPICS is named so the topic count lives in one place; downstream code
# can read it back from the fitted model via `lda.num_topics`.
NUM_TOPICS = 50
# Fixed seed so the learned topics do not change arbitrarily between runs.
# NOTE(review): with several worker processes the result may still vary
# slightly between runs — confirm if exact reproducibility is required.
RANDOM_SEED = 42

lda = LdaMulticore(
    common_corpus,
    id2word=common_dictionary,
    num_topics=NUM_TOPICS,
    random_state=RANDOM_SEED,
)
# Sanity check: topic mixture inferred for the first training document.
lda.get_document_topics(common_corpus[0])

[(30, 0.06359295), (47, 0.35182008), (48, 0.5435907)]

In [24]:
# Dense per-document topic distributions (theta) for the training corpus.
# `get_document_topics` returns a sparse list of (topic_id, probability)
# pairs, so each document is scattered into a vector with one slot per topic;
# topics that are not returned keep a weight of 0.
# The vector length comes from the fitted model rather than a hard-coded 50,
# so it cannot drift out of sync with the model's topic count.
common_theta = []
for doc in common_corpus:
    topic_probs = lda.get_document_topics(doc, minimum_probability=0, minimum_phi_value=0)
    dense = [0.0] * lda.num_topics
    for topic_id, prob in topic_probs:
        dense[topic_id] += prob
    common_theta.append(dense)
common_theta


[[0.0008722911588847637,
  0.0008722911588847637,
  0.0008722911588847637,
  0.0008722911588847637,
  0.0008722911588847637,
  0.0008722911588847637,
  0.0008722911588847637,
  0.0008722911588847637,
  0.0008722911588847637,
  0.0008722911588847637,
  0.0008722911588847637,
  0.0008722911588847637,
  0.0008722911588847637,
  0.0008722911588847637,
  0.0008722911588847637,
  0.0008722911588847637,
  0.0008722911588847637,
  0.0008722911588847637,
  0.0008722911588847637,
  0.0008722911588847637,
  0.0008722911588847637,
  0.0008722911588847637,
  0.0008722911588847637,
  0.0008722911588847637,
  0.0008722911588847637,
  0.0008722911588847637,
  0.0008722911588847637,
  0.0008722911588847637,
  0.0008722911588847637,
  0.0008722911588847637,
  0.06474904716014862,
  0.0008722911588847637,
  0.0008722911588847637,
  0.0008722911588847637,
  0.0008722911588847637,
  0.0008722911588847637,
  0.0008722911588847637,
  0.0008722911588847637,
  0.0008722911588847637,
  0.0008722911588847637,
  

In [23]:
# Encode the test documents with the training vocabulary and infer a dense
# topic distribution (theta) for each one using the fitted model.
test_corpus = [common_dictionary.doc2bow(text) for text in testdf["text"]]

# `get_document_topics` returns sparse (topic_id, probability) pairs; scatter
# them into a vector with one slot per topic (unreturned topics stay at 0).
# The vector length comes from the fitted model rather than a hard-coded 50,
# so it cannot drift out of sync with the model's topic count.
test_theta = []
for doc in test_corpus:
    topic_probs = lda.get_document_topics(doc, minimum_probability=0, minimum_phi_value=0)
    dense = [0.0] * lda.num_topics
    for topic_id, prob in topic_probs:
        dense[topic_id] += prob
    test_theta.append(dense)

# Sanity check: a topic distribution should sum to approximately 1.
sum(test_theta[0])

1.000000003958121

In [48]:
# Nearest-author classification: each evaluated test document is assigned to
# the author whose training documents are, on average, closest to it in
# Hellinger distance between topic distributions.
MAX_EVAL_DOCS = 1000   # only a leading subset of the test set is evaluated
PROGRESS_EVERY = 20    # print a progress line every this many documents

# author -> positional row indices into traindf (and therefore common_theta,
# which was built in the same row order).
author_indices = traindf.groupby("author").indices

# Hellinger distance is sqrt(0.5 * sum((sqrt(p) - sqrt(q)) ** 2)). The square
# roots of the training distributions do not depend on the test document, so
# they are computed once and grouped per author instead of once per pair.
sqrt_train_theta = np.sqrt(np.asarray(common_theta, dtype=float))
sqrt_theta_by_author = {
    author: sqrt_train_theta[rows] for author, rows in author_indices.items()
}

# Never index past the end of test_theta if fewer documents are available.
n_eval = min(MAX_EVAL_DOCS, len(test_theta))
predictions = []
for i in range(n_eval):
    if i % PROGRESS_EVERY == 0:
        # Report against the number of documents actually evaluated.
        print("iteration", i, "of", n_eval)
    sqrt_doc = np.sqrt(np.asarray(test_theta[i], dtype=float))
    # Mean Hellinger distance from this document to each author's documents.
    authors = {
        author: np.sqrt(0.5 * ((sqrt_doc - sqrt_rows) ** 2).sum(axis=1)).mean()
        for author, sqrt_rows in sqrt_theta_by_author.items()
    }
    # Smallest mean distance wins; ties go to the first author in dict order.
    predictions.append(min(authors, key=authors.get))
predictions
        



iteration 0 of 3916
iteration 20 of 3916
iteration 40 of 3916
iteration 60 of 3916
iteration 80 of 3916
iteration 100 of 3916
iteration 120 of 3916
iteration 140 of 3916
iteration 160 of 3916
iteration 180 of 3916
iteration 200 of 3916
iteration 220 of 3916
iteration 240 of 3916
iteration 260 of 3916
iteration 280 of 3916
iteration 300 of 3916
iteration 320 of 3916
iteration 340 of 3916
iteration 360 of 3916
iteration 380 of 3916
iteration 400 of 3916
iteration 420 of 3916
iteration 440 of 3916
iteration 460 of 3916
iteration 480 of 3916
iteration 500 of 3916
iteration 520 of 3916
iteration 540 of 3916
iteration 560 of 3916
iteration 580 of 3916
iteration 600 of 3916
iteration 620 of 3916
iteration 640 of 3916
iteration 660 of 3916
iteration 680 of 3916
iteration 700 of 3916
iteration 720 of 3916
iteration 740 of 3916
iteration 760 of 3916
iteration 780 of 3916
iteration 800 of 3916
iteration 820 of 3916
iteration 840 of 3916
iteration 860 of 3916
iteration 880 of 3916
iteration 900 of

['EAP',
 'MWS',
 'EAP',
 'MWS',
 'EAP',
 'EAP',
 'EAP',
 'EAP',
 'EAP',
 'EAP',
 'EAP',
 'EAP',
 'MWS',
 'EAP',
 'EAP',
 'EAP',
 'HPL',
 'EAP',
 'MWS',
 'HPL',
 'EAP',
 'EAP',
 'EAP',
 'MWS',
 'EAP',
 'MWS',
 'EAP',
 'EAP',
 'HPL',
 'EAP',
 'MWS',
 'EAP',
 'HPL',
 'HPL',
 'MWS',
 'MWS',
 'EAP',
 'EAP',
 'EAP',
 'HPL',
 'EAP',
 'MWS',
 'MWS',
 'MWS',
 'EAP',
 'EAP',
 'EAP',
 'EAP',
 'HPL',
 'MWS',
 'HPL',
 'MWS',
 'EAP',
 'MWS',
 'HPL',
 'MWS',
 'MWS',
 'EAP',
 'EAP',
 'EAP',
 'EAP',
 'EAP',
 'EAP',
 'HPL',
 'MWS',
 'EAP',
 'HPL',
 'EAP',
 'MWS',
 'HPL',
 'EAP',
 'EAP',
 'EAP',
 'EAP',
 'EAP',
 'MWS',
 'MWS',
 'EAP',
 'EAP',
 'MWS',
 'MWS',
 'EAP',
 'HPL',
 'MWS',
 'MWS',
 'MWS',
 'EAP',
 'EAP',
 'MWS',
 'EAP',
 'EAP',
 'EAP',
 'MWS',
 'EAP',
 'EAP',
 'EAP',
 'EAP',
 'MWS',
 'EAP',
 'HPL',
 'EAP',
 'EAP',
 'EAP',
 'EAP',
 'MWS',
 'EAP',
 'MWS',
 'EAP',
 'EAP',
 'EAP',
 'EAP',
 'EAP',
 'MWS',
 'EAP',
 'HPL',
 'HPL',
 'EAP',
 'EAP',
 'HPL',
 'EAP',
 'EAP',
 'EAP',
 'MWS',
 'EAP',
 'HPL',


In [49]:
# Confusion table of true author (rows) vs. predicted author (columns).
# Predictions exist only for the leading test documents, so the slice length
# is taken from `predictions` itself instead of repeating a hard-coded count.
# `.assign` builds a new frame, so `testdf` is left untouched.
predicdf = testdf.iloc[:len(predictions)].assign(predictions=predictions)
predicdf.groupby(["author", "predictions"]).size().unstack(fill_value=0)


predictions,EAP,HPL,MWS
author,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
EAP,264,56,87
HPL,135,77,67
MWS,153,37,124
