## 0. Preprocessing

Files in the package:

- get_corpus_titles.py (Follow links from the page "Cognitive Science" to get the page titles)
- titles_to_json.py (I first pickled the titles list, but then decided to use json for transparency)
- read_wikidump.py (Process wikipedia dump and get nouns for articles in titles file)
- wikireader.py (custom ContentHandler implementation for xml.sax)


Corpus files and model (compressed sizes: corpus 83 MB, model 32 MB):

https://drive.google.com/drive/folders/1PFi-n-vCxsyd3YQZ_sBstgGLzM4VGEsL?usp=sharing

## 1. Import corpus

In [2]:
import os
import json
from tqdm import tqdm

titles = []
corpus_raw = []
# Expected number of documents; only used to size the progress bars.
n_docs = 59335

corpus_file = os.path.join('corpus_redirects', 'corpus_fulltext.json')

# The corpus file holds one JSON object per line, each with a 'page' (title)
# and a 'text' (space-separated tokens) field.
# The encoding is given explicitly so decoding does not depend on the
# platform default (assumes the file was written as UTF-8/ASCII JSON).
# Using tqdm as a context manager closes the bar even if parsing fails.
with open(corpus_file, 'r', encoding='utf-8') as f, tqdm(total=n_docs) as progress:
    progress.set_description('Importing corpus\t')
    for jdoc in f:
        # Skip blank lines (e.g. a trailing newline at the end of the file)
        # instead of letting json.loads fail on them.
        if not jdoc.strip():
            continue

        doc = json.loads(jdoc)
        titles.append(doc['page'])
        corpus_raw.append(doc['text'].split(' '))
        progress.update(1)

Importing corpus	: 100%|██████████| 59335/59335 [00:06<00:00, 8893.26it/s] 


In [5]:
# Peek at the first ten tokens of the first document
corpus_raw[0][:10]

['name',
 'precision',
 'coordinate',
 'time',
 'standard',
 'passage',
 'time',
 'geoid',
 'realisation',
 'offset']

## 2. Count word frequencies and keep only words that appear more than once

In [3]:
from collections import defaultdict

# Tally how often each token occurs across the whole corpus
freq = defaultdict(int)
progress = tqdm(corpus_raw, total=n_docs)
progress.set_description('Count word frequences\t')
for tokens in progress:
    for word in tokens:
        freq[word] += 1

# Drop tokens that occur only once in the corpus or are a single character long
progress = tqdm(corpus_raw, total=n_docs)
progress.set_description('Remove single words\t')
corpus = [
    [word for word in tokens if len(word) > 1 and freq[word] > 1]
    for tokens in progress
]

corpus[0][:10]

Count word frequences	: 100%|██████████| 59335/59335 [00:11<00:00, 5089.28it/s]
Remove single words	: 100%|██████████| 59335/59335 [00:11<00:00, 5386.56it/s]


['name',
 'precision',
 'coordinate',
 'time',
 'standard',
 'passage',
 'time',
 'geoid',
 'realisation',
 'offset']

## 3. Build gensim dictionary and BOW

In [4]:
import gensim

# WARNING: SLOW! Uncomment the next two lines to rebuild the dictionary
# from the corpus and save it to disk.
#dictionary = gensim.corpora.Dictionary(corpus)
#dictionary.save("dictionary.gensim")

# Load the previously saved dictionary
dictionary = gensim.corpora.Dictionary.load("dictionary.gensim")

# Convert every document into its bag-of-words representation
bow = list(map(dictionary.doc2bow, tqdm(corpus)))

100%|██████████| 59335/59335 [00:19<00:00, 3089.37it/s]


In [4]:
# Bag-of-words of the first document, as produced by dictionary.doc2bow
print(bow[0])

[(0, 1), (1, 2), (2, 1), (3, 3), (4, 1), (5, 1), (6, 3), (7, 2), (8, 3), (9, 2), (10, 5), (11, 1), (12, 2), (13, 2), (14, 18), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 2), (21, 1), (22, 1), (23, 1), (24, 2), (25, 1), (26, 3), (27, 1), (28, 1), (29, 2), (30, 3), (31, 1), (32, 1), (33, 6), (34, 3), (35, 2), (36, 1), (37, 1), (38, 2), (39, 3), (40, 1), (41, 1), (42, 4), (43, 1), (44, 2), (45, 1), (46, 1), (47, 1), (48, 1), (49, 1), (50, 3), (51, 1), (52, 1), (53, 1), (54, 2), (55, 1), (56, 1), (57, 1), (58, 1), (59, 1), (60, 1), (61, 1), (62, 1), (63, 2), (64, 1), (65, 1), (66, 3), (67, 1), (68, 1), (69, 1), (70, 1), (71, 1), (72, 23), (73, 2), (74, 12), (75, 1), (76, 1), (77, 1), (78, 4), (79, 1), (80, 2), (81, 1), (82, 1), (83, 1), (84, 2), (85, 1), (86, 1), (87, 29), (88, 1), (89, 1), (90, 2), (91, 1), (92, 1), (93, 1)]


## 4. Train and save the model

In [6]:
# WARNING: SLOW!
#topic_model = gensim.models.LdaModel(bow, id2word=dictionary, num_topics=30, passes=10)

In [9]:
#topic_model.save("topic_model.gensim")

## 5. Test the model 

In [5]:
# Load model
from gensim.models.ldamodel import LdaModel
tm = LdaModel.load("topic_model.gensim")

# Print up to 30 topics, one (topic_id, weighted-words string) tuple per line
for topic in tm.show_topics(num_topics=30):
    print(topic)

(0, '0.028*"design" + 0.020*"system" + 0.013*"power" + 0.012*"vehicle" + 0.012*"control" + 0.011*"car" + 0.010*"machine" + 0.009*"engine" + 0.007*"use" + 0.007*"technology"')
(1, '0.043*"animal" + 0.027*"specie" + 0.014*"human" + 0.012*"evolution" + 0.011*"bird" + 0.011*"fish" + 0.010*"population" + 0.009*"organism" + 0.008*"gene" + 0.008*"group"')
(2, '0.025*"century" + 0.014*"year" + 0.011*"day" + 0.010*"time" + 0.010*"period" + 0.008*"culture" + 0.008*"stone" + 0.008*"art" + 0.008*"site" + 0.007*"god"')
(3, '0.020*"state" + 0.019*"century" + 0.015*"country" + 0.014*"people" + 0.014*"government" + 0.013*"power" + 0.012*"p." + 0.011*"history" + 0.011*"population" + 0.010*"war"')
(4, '0.017*"philosophy" + 0.013*"religion" + 0.011*"world" + 0.010*"life" + 0.009*"nature" + 0.009*"view" + 0.009*"idea" + 0.008*"form" + 0.008*"philosopher" + 0.008*"belief"')
(5, '0.059*"film" + 0.020*"art" + 0.014*"series" + 0.013*"medium" + 0.012*"story" + 0.012*"character" + 0.010*"television" + 0.009*"ar

In [7]:
# Pick a document from the corpus and look up its topics
doc_id = 10

# Topics for the document, ordered from highest to lowest weight
doc_topics = sorted(
    tm.get_document_topics(bow[doc_id]),
    key=lambda pair: pair[1],
    reverse=True,
)
print(doc_topics)

# NOTE: topic_map is defined in the cell that follows this one in the file;
# that cell has to be run before this one.
print("\nTop 3 topics:")
for topic_id, _ in doc_topics[:3]:
    print(f"{topic_id}: {topic_map[topic_id]}")

print()

# Show the document title, then its first 50 tokens as a text sample
print(titles[doc_id] + '\n')
print(' '.join(corpus_raw[doc_id][:50]))

[(8, 0.55607367), (29, 0.22885469), (27, 0.12050904), (0, 0.05737104), (22, 0.02335535)]

Top 3 topics:
8: Software development
29: Computers
27: Language

ASCII

ascii character standard communication ascii code text computer telecommunication equipment device character encoding scheme character internet name character encoding ascii milestone ascii telegraph code use bit teleprinter code datum service work standard meeting x3.2 subcommittee edition standard revision update telegraph code code sorting alphabetization list feature device teleprinter alphabet encode


In [6]:
# Some topic title guesses based on the data above (for demonstration purposes).
# Labels are listed in topic-id order: the label at position i names topic i.
topic_map = dict(enumerate([
    'Engineering',           # 0
    'Ecology, evolution',    # 1
    'History',               # 2
    'State, history',        # 3
    'Philosophy, religion',  # 4
    'Film, TV',              # 5
    'Medicine',              # 6
    'Politics',              # 7
    'Software development',  # 8
    'Gender',                # 9
    'Biology',               # 10
    'Statistics',            # 11
    'Music',                 # 12
    'Physics',               # 13
    'Climate',               # 14
    'Children, parents?',    # 15
    'Nutrition',             # 16
    'Books, articles',       # 17
    'Research',              # 18
    'Video games',           # 19
    'Psychology, brain',     # 20
    'Economics',             # 21
    'Organizations?',        # 22
    'Cities and towns',      # 23
    'Chemistry',             # 24
    'War',                   # 25
    'Education',             # 26
    'Language',              # 27
    'Mathematics',           # 28
    'Computers',             # 29
]))