In [7]:
import pandas as pd
from nltk.tokenize import NLTKWordTokenizer
from gensim.corpora.dictionary import Dictionary

In [8]:
df = pd.read_json("../preprocessing_output/preprocessed_train_L.json")
df.head()

Unnamed: 0,id,token,author
0,id10914,"[time, totally, destroy, three, different, per...",EAP
1,id03295,"[make, happy, shall, virtuous]",MWS
2,id04103,"[margin, river, many, dazzling, rivulet, glide...",EAP
3,id26189,"[heat, become, intolerable]",EAP
4,id07955,"[indeed, passion, young, girl, valley, compari...",EAP


In [9]:
train_data = df.groupby("author").agg(
    token=pd.NamedAgg(column="token", aggfunc=lambda x: x.explode().dropna()),
).reset_index()
train_data

Unnamed: 0,author,token
0,EAP,"[time, totally, destroy, three, different, per..."
1,HPL,"[whole, proceeding, meant, could, imagine, unl..."
2,MWS,"[make, happy, shall, virtuous, strange, incred..."


In [10]:
from gensim.corpora import Dictionary

vocab = Dictionary(train_data["token"])

2023-03-05 11:01:51,444 : INFO : adding document #0 to Dictionary<0 unique tokens: []>
2023-03-05 11:01:51,558 : INFO : built Dictionary<18408 unique tokens: ["'about", "'ah", "'alexander", "'all", "'although"]...> from 3 documents (total 205950 corpus positions)
2023-03-05 11:01:51,558 : INFO : Dictionary lifecycle event {'msg': 'built Dictionary<18408 unique tokens: ["\'about", "\'ah", "\'alexander", "\'all", "\'although"]...> from 3 documents (total 205950 corpus positions)', 'datetime': '2023-03-05T11:01:51.558693', 'gensim': '4.3.0', 'python': '3.10.6 (main, Nov 14 2022, 16:10:14) [GCC 11.3.0]', 'platform': 'Linux-5.10.102.1-microsoft-standard-WSL2-x86_64-with-glibc2.35', 'event': 'created'}


In [11]:
from gensim.models.ldamodel import LdaModel
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

num_topics = 200
chunksize = 2000
passes = 20
iterations = 400
eval_every = None  # Don't evaluate model perplexity, takes too much time.

# Make an index to word dictionary.
temp = vocab[0]  # This is only to "load" the dictionary.
id2word = vocab.id2token

train_corpus = [vocab.doc2bow(doc) for doc in train_data["token"]]
lda = LdaModel(
    corpus=train_corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=1
)

2023-03-05 11:01:52,739 : INFO : using autotuned alpha, starting with [0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005

In [None]:
df = pd.read_json("../preprocessing_output/preprocessed_test_L.json")
df.head()

Unnamed: 0,id,token,author
0,id18154,"[first, indication, revival, afford, partial, ...",EAP
1,id22950,"[upon, recovery, felt, oh, inexpressibly, sick...",EAP
2,id25862,"[vast, polyphemus, like, loathsome, dart, like...",HPL
3,id21059,"[sit, near, raymond, tell, story, convulse, re...",MWS
4,id24764,"[sydney, bulletin, mention, friend, wide, affi...",HPL


In [None]:
test_data = df

In [None]:
import math
from random import randint
from gensim.matutils import hellinger


test_corpus = [vocab.doc2bow(doc) for doc in test_data["token"]]
predictions = []
for i, doc in enumerate(test_corpus):
    if i % 20 == 0:
        print("iteration", i, "of", len(test_corpus))
    min = math.inf
    min_index = -1
    for j, doc2 in enumerate(train_corpus):
        dist = hellinger(lda[doc],
                    lda[doc2])
        if dist < min:
            min = dist
            min_index = j
    predictions.append(train_data["author"].iloc[min_index])
predictions

iteration 20 of 3916
iteration 80 of 3916
iteration 120 of 3916
iteration 160 of 3916
iteration 400 of 3916
iteration 460 of 3916
iteration 500 of 3916
iteration 680 of 3916
iteration 860 of 3916
iteration 1060 of 3916
iteration 1100 of 3916
iteration 1120 of 3916
iteration 1140 of 3916
iteration 1160 of 3916
iteration 1340 of 3916
iteration 1500 of 3916
iteration 1580 of 3916
iteration 1600 of 3916
iteration 1920 of 3916
iteration 2200 of 3916
iteration 2360 of 3916
iteration 2380 of 3916
iteration 2560 of 3916
iteration 2740 of 3916
iteration 3040 of 3916
iteration 3120 of 3916
iteration 3300 of 3916
iteration 3460 of 3916
iteration 3480 of 3916
iteration 3740 of 3916
iteration 3760 of 3916
iteration 3800 of 3916
iteration 3900 of 3916


['HPL',
 'HPL',
 'EAP',
 'HPL',
 'EAP',
 'MWS',
 'EAP',
 'EAP',
 'HPL',
 'MWS',
 'HPL',
 'HPL',
 'MWS',
 'HPL',
 'EAP',
 'HPL',
 'HPL',
 'MWS',
 'MWS',
 'EAP',
 'EAP',
 'EAP',
 'HPL',
 'EAP',
 'HPL',
 'HPL',
 'MWS',
 'MWS',
 'MWS',
 'EAP',
 'MWS',
 'HPL',
 'HPL',
 'EAP',
 'HPL',
 'EAP',
 'MWS',
 'EAP',
 'MWS',
 'EAP',
 'MWS',
 'HPL',
 'EAP',
 'MWS',
 'HPL',
 'EAP',
 'HPL',
 'HPL',
 'HPL',
 'MWS',
 'MWS',
 'EAP',
 'HPL',
 'EAP',
 'EAP',
 'EAP',
 'MWS',
 'EAP',
 'MWS',
 'MWS',
 'HPL',
 'MWS',
 'EAP',
 'EAP',
 'HPL',
 'HPL',
 'MWS',
 'MWS',
 'EAP',
 'MWS',
 'MWS',
 'EAP',
 'EAP',
 'MWS',
 'MWS',
 'MWS',
 'EAP',
 'HPL',
 'MWS',
 'HPL',
 'MWS',
 'EAP',
 'MWS',
 'EAP',
 'EAP',
 'EAP',
 'EAP',
 'HPL',
 'EAP',
 'HPL',
 'HPL',
 'HPL',
 'MWS',
 'EAP',
 'HPL',
 'HPL',
 'EAP',
 'HPL',
 'MWS',
 'HPL',
 'HPL',
 'MWS',
 'MWS',
 'EAP',
 'EAP',
 'EAP',
 'EAP',
 'HPL',
 'HPL',
 'MWS',
 'EAP',
 'MWS',
 'HPL',
 'MWS',
 'MWS',
 'HPL',
 'MWS',
 'MWS',
 'HPL',
 'EAP',
 'EAP',
 'EAP',
 'HPL',
 'MWS',
 'EAP',


In [None]:
predicdf = pd.DataFrame([test_data.iloc[i] for i in test_doc_sample_indices])
predicdf["predictions"] = predictions
predicdf
predicdf.groupby(["author", "predictions"]).size().unstack(fill_value=0)


predictions,EAP,HPL,MWS
author,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
EAP,222,39,31
HPL,26,189,15
MWS,16,20,207


In [None]:
from sklearn.metrics import accuracy_score
accuracy_score(predicdf["author"], predicdf["predictions"])

0.807843137254902

In [None]:
predicdf

NameError: name 'predicdf' is not defined