In [2]:
import pandas as pd
from nltk import word_tokenize, wordpunct_tokenize
from nltk.tokenize import NLTKWordTokenizer
import numpy as np
from gensim.test.utils import common_texts
from gensim.corpora.dictionary import Dictionary
import json

In [3]:
def transform(s):
    s = s.lower()
    return [s[start:end] for start, end in NLTKWordTokenizer().span_tokenize(s)]

In [4]:
testfile =  open("new_test.json", "r")
trainfile = open("new_train.json", "r")
loaded_test = json.load(testfile)
loaded_train = json.load(trainfile)
traindf = pd.DataFrame(loaded_train, columns=['id', 'text', 'author'])
traindf['text'] = traindf.text.apply(lambda text: transform(text))
testdf = pd.DataFrame(loaded_test, columns=['id', 'text', 'author'])
testdf['text'] = testdf.text.apply(lambda text: transform(text))
traindf
testdf.to_json('tokenized_test.json', orient='records')
traindf.to_json('tokenized_train.json', orient='records')

In [5]:
len(traindf.explode('text')[['id', 'author', 'text']])

475449

In [6]:
train_data = traindf.groupby("author").agg(
    text=pd.NamedAgg(column="text", aggfunc=lambda x: x.explode()),
).reset_index()
train_data

Unnamed: 0,author,text
0,EAP,"[it, will, have, been, ,, by, that, time, ,, t..."
1,HPL,"[what, the, whole, proceeding, meant, ,, i, co..."
2,MWS,"[make, me, happy, ,, and, i, shall, again, be,..."


In [7]:

common_dictionary = Dictionary(train_data["text"])
common_corpus = [common_dictionary.doc2bow(text) for text in train_data["text"]]
common_corpus

[[(0, 2340),
  (1, 303),
  (2, 1),
  (3, 1),
  (4, 1),
  (5, 1),
  (6, 1),
  (7, 1),
  (8, 2),
  (9, 1),
  (10, 1),
  (11, 1),
  (12, 2),
  (13, 1),
  (14, 1),
  (15, 1),
  (16, 1),
  (17, 1),
  (18, 1),
  (19, 1),
  (20, 1),
  (21, 1),
  (22, 1),
  (23, 1),
  (24, 2),
  (25, 1),
  (26, 1),
  (27, 1),
  (28, 1),
  (29, 1),
  (30, 1),
  (31, 2),
  (32, 1),
  (33, 1),
  (34, 2),
  (35, 2),
  (36, 1),
  (37, 1),
  (38, 2),
  (39, 1),
  (40, 3),
  (41, 1),
  (42, 1),
  (43, 1),
  (44, 1),
  (45, 1),
  (46, 1),
  (47, 1),
  (48, 1),
  (49, 1),
  (50, 1),
  (51, 3),
  (52, 1),
  (53, 1),
  (54, 1),
  (55, 1),
  (56, 2),
  (57, 1),
  (58, 6),
  (59, 1),
  (60, 1),
  (61, 1),
  (62, 1),
  (63, 1),
  (64, 1),
  (65, 1),
  (66, 2),
  (67, 1),
  (68, 5),
  (69, 1),
  (70, 1),
  (71, 1),
  (72, 2),
  (73, 1),
  (74, 1),
  (75, 1),
  (76, 1),
  (77, 2),
  (78, 6),
  (79, 1),
  (80, 1),
  (81, 1),
  (82, 1),
  (83, 1),
  (84, 1),
  (85, 1),
  (86, 5),
  (87, 6),
  (88, 1),
  (89, 5),
  (90, 1),
  (9

In [8]:
from gensim.models.ldamulticore import LdaMulticore

lda = LdaMulticore(common_corpus, id2word=common_dictionary, num_topics=50)
lda.get_document_topics(common_corpus[0])

[(8, 0.8147896), (10, 0.037030254), (13, 0.078657605), (38, 0.06313503)]

In [9]:
common_theta = []
for doc in common_corpus:
    res = lda.get_document_topics(doc, minimum_probability=0, minimum_phi_value=0)
    temp = [0 for i in range(200)]
    for (x,y) in res:
        temp[x] += y
    common_theta.append(temp)
common_theta


[[0.001401084940880537,
  1.114138399316289e-07,
  1.114138399316289e-07,
  1.114138399316289e-07,
  1.114138399316289e-07,
  1.114138399316289e-07,
  1.114138399316289e-07,
  1.114138399316289e-07,
  0.8161296248435974,
  1.114138399316289e-07,
  0.0302121601998806,
  1.114138399316289e-07,
  1.114138399316289e-07,
  0.09553585946559906,
  1.114138399316289e-07,
  1.114138399316289e-07,
  1.114138399316289e-07,
  0.0028826126363128424,
  1.114138399316289e-07,
  5.7997549447463825e-05,
  2.475722794770263e-06,
  1.1849314432765823e-05,
  1.114138399316289e-07,
  1.114138399316289e-07,
  7.05721031408757e-05,
  1.114138399316289e-07,
  1.114138399316289e-07,
  1.114138399316289e-07,
  1.114138399316289e-07,
  1.7014344848576002e-05,
  1.114138399316289e-07,
  0.00015307588910218328,
  2.6668083592085168e-05,
  1.114138399316289e-07,
  1.114138399316289e-07,
  1.114138399316289e-07,
  0.00039791164454072714,
  1.114138399316289e-07,
  0.05288263410329819,
  1.114138399316289e-07,
  1.11

In [19]:
list(map(lambda x: x[0], list(filter(lambda _: randint(0,10)==0, enumerate(test_corpus)))))

[5,
 63,
 78,
 85,
 99,
 102,
 103,
 129,
 144,
 147,
 157,
 185,
 198,
 218,
 220,
 232,
 246,
 253,
 262,
 267,
 268,
 276,
 285,
 293,
 299,
 305,
 306,
 307,
 317,
 329,
 330,
 332,
 356,
 365,
 369,
 371,
 388,
 389,
 406,
 416,
 419,
 428,
 446,
 450,
 453,
 454,
 522,
 533,
 543,
 550,
 551,
 552,
 558,
 580,
 584,
 585,
 589,
 600,
 611,
 612,
 621,
 630,
 634,
 657,
 662,
 668,
 679,
 687,
 709,
 726,
 735,
 752,
 761,
 787,
 791,
 810,
 818,
 822,
 831,
 836,
 838,
 842,
 868,
 884,
 885,
 887,
 909,
 931,
 936,
 939,
 959,
 963,
 1000,
 1016,
 1017,
 1039,
 1067,
 1070,
 1086,
 1088,
 1094,
 1163,
 1166,
 1168,
 1173,
 1183,
 1186,
 1204,
 1226,
 1231,
 1252,
 1273,
 1277,
 1322,
 1330,
 1354,
 1356,
 1357,
 1385,
 1386,
 1392,
 1395,
 1407,
 1430,
 1458,
 1483,
 1484,
 1493,
 1498,
 1499,
 1508,
 1513,
 1517,
 1521,
 1525,
 1529,
 1531,
 1532,
 1535,
 1544,
 1552,
 1557,
 1559,
 1562,
 1575,
 1647,
 1655,
 1656,
 1668,
 1670,
 1679,
 1684,
 1693,
 1696,
 1705,
 1712,
 1715,

In [25]:
from collections import defaultdict
from gensim.matutils import hellinger
import math
from random import randint

test_corpus = [common_dictionary.doc2bow(text) for text in testdf["text"]]
predictions = []
test_doc_sample = list(filter(lambda _: randint(0,4)==0, enumerate(test_corpus)))
test_doc_sample_indices = list(map(lambda x: x[0], test_doc_sample))
filter(lambda _: randint(0,10)==0, enumerate(test_corpus[:200]))
for i, doc in test_doc_sample:
    if i % 20 == 0:
        print("iteration", i, "of", len(test_corpus))
    min = math.inf
    min_index = -1
    for j, doc2 in enumerate(common_corpus):
        dist = hellinger(lda[doc],
                    lda[doc2])
        if dist < min:
            min = dist
            min_index = j
    predictions.append(train_data["author"].iloc[min_index])
predictions


iteration 60 of 3916
iteration 80 of 3916
iteration 140 of 3916
iteration 260 of 3916
iteration 400 of 3916
iteration 740 of 3916
iteration 780 of 3916
iteration 900 of 3916
iteration 920 of 3916
iteration 1060 of 3916
iteration 1100 of 3916
iteration 1360 of 3916
iteration 1440 of 3916
iteration 1720 of 3916
iteration 1780 of 3916
iteration 1860 of 3916
iteration 1940 of 3916
iteration 2040 of 3916
iteration 2060 of 3916
iteration 2140 of 3916
iteration 2240 of 3916
iteration 2260 of 3916
iteration 2280 of 3916
iteration 2420 of 3916
iteration 2540 of 3916
iteration 2580 of 3916
iteration 2720 of 3916
iteration 2800 of 3916
iteration 2840 of 3916
iteration 2880 of 3916
iteration 2960 of 3916
iteration 3020 of 3916
iteration 3080 of 3916
iteration 3140 of 3916
iteration 3180 of 3916
iteration 3340 of 3916
iteration 3360 of 3916
iteration 3480 of 3916
iteration 3520 of 3916
iteration 3600 of 3916
iteration 3620 of 3916
iteration 3660 of 3916
iteration 3720 of 3916
iteration 3760 of 3916

['EAP',
 'HPL',
 'EAP',
 'HPL',
 'HPL',
 'MWS',
 'HPL',
 'MWS',
 'HPL',
 'HPL',
 'MWS',
 'MWS',
 'EAP',
 'EAP',
 'MWS',
 'MWS',
 'MWS',
 'EAP',
 'EAP',
 'HPL',
 'MWS',
 'HPL',
 'MWS',
 'MWS',
 'MWS',
 'EAP',
 'MWS',
 'EAP',
 'MWS',
 'MWS',
 'EAP',
 'EAP',
 'MWS',
 'HPL',
 'HPL',
 'EAP',
 'MWS',
 'MWS',
 'EAP',
 'MWS',
 'EAP',
 'HPL',
 'HPL',
 'EAP',
 'MWS',
 'MWS',
 'MWS',
 'MWS',
 'MWS',
 'HPL',
 'EAP',
 'EAP',
 'MWS',
 'HPL',
 'HPL',
 'HPL',
 'HPL',
 'HPL',
 'EAP',
 'EAP',
 'HPL',
 'EAP',
 'EAP',
 'EAP',
 'MWS',
 'EAP',
 'HPL',
 'MWS',
 'MWS',
 'MWS',
 'EAP',
 'EAP',
 'MWS',
 'HPL',
 'MWS',
 'HPL',
 'EAP',
 'EAP',
 'HPL',
 'HPL',
 'MWS',
 'HPL',
 'HPL',
 'EAP',
 'MWS',
 'EAP',
 'EAP',
 'EAP',
 'EAP',
 'HPL',
 'HPL',
 'HPL',
 'HPL',
 'EAP',
 'EAP',
 'MWS',
 'EAP',
 'HPL',
 'HPL',
 'EAP',
 'MWS',
 'MWS',
 'HPL',
 'HPL',
 'HPL',
 'MWS',
 'EAP',
 'EAP',
 'MWS',
 'EAP',
 'EAP',
 'EAP',
 'EAP',
 'EAP',
 'MWS',
 'MWS',
 'EAP',
 'EAP',
 'EAP',
 'HPL',
 'EAP',
 'MWS',
 'MWS',
 'EAP',
 'EAP',


In [26]:
predicdf = pd.DataFrame([testdf.iloc[i] for i in test_doc_sample_indices])
predicdf["predictions"] = predictions
predicdf
predicdf.groupby(["author", "predictions"]).size().unstack(fill_value=0)


predictions,EAP,HPL,MWS
author,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
EAP,186,60,61
HPL,73,132,34
MWS,115,46,98


In [28]:
from sklearn.metrics import accuracy_score

accuracy_score(predicdf["author"], predicdf["predictions"])


ModuleNotFoundError: No module named 'sklearn'