In [None]:
import pathlib

# Every working directory is resolved relative to the directory the notebook
# is launched from.
SCRIPT_PATH = pathlib.Path.cwd()

# Directory for artefacts written by this notebook; created on first run and
# left untouched when it already exists.
OUTPUT_PATH = SCRIPT_PATH / 'part2a_output'
OUTPUT_PATH.mkdir(parents=True, exist_ok=True)

# Directory holding the plain-text documents used as model input.
INPUT_PATH = SCRIPT_PATH / 'part2a_input'
INPUT_PATH.mkdir(parents=True, exist_ok=True)

# Location of the MALLET launcher. It is an absolute path, so it is built
# directly: joining an absolute path onto SCRIPT_PATH discards SCRIPT_PATH and
# only makes the location look notebook-relative.
# For a MALLET checkout that lives next to the notebook, use instead:
#   MALLET_PATH = SCRIPT_PATH / 'mallet/bin/mallet'
MALLET_PATH = pathlib.Path('/srv/mallet/bin/mallet')

# Echo the resolved locations so a wrong working directory is spotted early.
for label, location in (('Script', SCRIPT_PATH),
                        ('Output', OUTPUT_PATH),
                        ('Input', INPUT_PATH),
                        ('MALLET', MALLET_PATH)):
    print('{} path: {}'.format(label, str(location)))

In [None]:
# Alternative client library (not used in this cell): https://pypi.org/project/Wikipedia-API/

from urllib import request
from bs4 import BeautifulSoup

# Wikipedia REST endpoint returning the rendered HTML of the index page whose
# links are harvested below.
URL = "https://en.wikipedia.org/api/rest_v1/page/html/List_of_fields_of_doctoral_studies_in_the_United_States"

####################################################################

# Download and parse the index page.
with request.urlopen(URL) as f:
    links_source_html = BeautifulSoup(f, 'html.parser')

# Look each href up once. An anchor without an href yields None, which is
# filtered out below instead of raising TypeError on the `in` test.
candidate_hrefs = [anchor.get('href')
                   for anchor in links_source_html.select('.mw-redirect')]

# Keep only page-relative links ('./Title') that are neither in-page fragments
# nor references back to the index page itself.
source_links = [href
                for href in candidate_hrefs
                if href
                and href.startswith('./')
                and '#' not in href
                and 'List_of_fields_of_doctoral_studies_in_the_United_States' not in href]

source_links

In [None]:
# NOTE: this cell rebinds `source_links`, so it is not idempotent. Re-run the
# scraping cell before running it a second time.
# NOTE(review): the [2:-3] window presumably trims links at the start and end
# of the page that are not fields of study. TODO: confirm against the live page.
trimmed_links = source_links[2:-3]

# Drop duplicates, strip the leading './' from each href, and sort: iterating
# a bare set of strings gives an order that changes between interpreter runs.
source_links = sorted(link[2:] for link in set(trimmed_links))

len(source_links)

In [None]:
# Display the current contents of `source_links` for a visual check.
source_links

In [None]:
# Base URL of the Wikipedia REST endpoint that serves a page's rendered HTML;
# the page title is appended to it.
API_ROOT_URL = "https://en.wikipedia.org/api/rest_v1/page/html/"

# Fetch every linked page and store its paragraph text as one .txt file in
# INPUT_PATH.
for source_link in source_links:
    with request.urlopen(API_ROOT_URL + source_link) as f:
        target_html = BeautifulSoup(f, 'html.parser')

    # Keep only paragraphs with at least five whitespace-separated words.
    paragraph_texts = [paragraph.get_text() for paragraph in target_html.select('p')]
    paragraphs = [text for text in paragraph_texts if len(text.split()) >= 5]

    # A '/' inside a title would be treated as a directory separator and make
    # open() fail on a missing sub-directory, so it is replaced in the file name.
    text_path = INPUT_PATH.joinpath(source_link.replace('/', '_') + '.txt')

    # Write as UTF-8 explicitly: the text is not ASCII-only and the platform
    # default encoding may be unable to represent it.
    with open(text_path, 'w', encoding='utf-8') as f:
        f.write(' '.join(paragraphs))
    print('Saved to {}.'.format(text_path.name))

In [None]:
# Paths (as strings) of every .txt file directly inside INPUT_PATH.
# iterdir() yields entries in arbitrary order, so the result is sorted to keep
# the document order identical on every run.
input_text_filenames = sorted(str(file)
                              for file in INPUT_PATH.iterdir()
                              if file.suffix == '.txt')

input_text_filenames

In [None]:
import gensim
from nltk.corpus import stopwords


# English stop words, held in a set so each membership test is a hash lookup.
# A separate name is used so the imported `stopwords` object is not overwritten
# by its own word list.
english_stopwords = set(stopwords.words('english'))

# One list of tokens per input file, in the order of `input_text_filenames`.
data = []

for input_text_filename in input_text_filenames:
    # Decode as UTF-8 explicitly instead of relying on the locale default.
    with open(input_text_filename, 'r', encoding='utf-8') as f:
        raw_text = f.read()

    # Tokenise with gensim's simple_preprocess (deacc=True requests accent
    # stripping), then drop stop words.
    words = gensim.utils.simple_preprocess(raw_text, deacc=True)
    words = [word for word in words if word not in english_stopwords]
    data.append(words)

len(data)

In [None]:
# Display the token list of the second document as a spot check
# (raises IndexError if fewer than two documents were loaded).
data[1]

In [None]:
# Build the gensim Dictionary from the tokenised documents.
id2word = gensim.corpora.Dictionary(data)

# Run every tokenised document through the dictionary's doc2bow(), keeping the
# results in the same order as `data`.
corpus = list(map(id2word.doc2bow, data))

In [None]:
# Settings handed to gensim's LdaMallet wrapper, gathered in one mapping so the
# tunable values are visible at a glance.
mallet_settings = {
    'mallet_path': str(MALLET_PATH),   # launcher location, passed as a plain string
    'corpus': corpus,
    'num_topics': 20,
    'id2word': id2word,
    'optimize_interval': 10,
    'iterations': 100,
    'random_seed': 20190923,           # fixed seed passed to the wrapper
}

# Constructing the wrapper with a corpus trains the model.
model = gensim.models.wrappers.LdaMallet(**mallet_settings)

In [None]:
# Display 20 topics with 10 words each (20 matches the num_topics the model
# was trained with).
model.show_topics(num_topics=20, num_words=10)

In [None]:
# Persist the trained model as 'mallet_output' inside the output directory.
saved_model_path = OUTPUT_PATH / 'mallet_output'
model.save(str(saved_model_path))

In [None]:
# read_doctopics() requires the path of the doc-topics file MALLET wrote, and
# calling it with no argument raises TypeError. fdoctopics() supplies that path.
# The result is read lazily, so it is materialised before display.
doc_topic_rows = list(model.read_doctopics(model.fdoctopics()))

# Show only the first few documents to keep the output readable.
doc_topic_rows[:5]

In [None]:
# Materialise the result of model.get_topics() as a list and display it.
# NOTE(review): gensim documents get_topics() as the topic-by-word matrix, so
# the name `document_topics` may be misleading. Confirm before relying on it.
document_topics = list(model.get_topics())
document_topics

In [None]:
# Display the value returned by ftopickeys(), presumably the path of MALLET's
# topic-keys file (a '*_topickeys.txt' file is printed in the following cell).
model.ftopickeys()

In [None]:
! cat /var/folders/80/2t7ffq3n05n2jcz8p2500h0m0000gn/T/caf971_topickeys.txt