In [36]:
#General
import numpy as np
import pandas as pd
import json
import csv
import glob

#Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

#spacy
import spacy
from nltk.corpus import stopwords

#visualization
import pyLDAvis
import pyLDAvis.gensim_models

In [37]:

import os

path = "/Users/mirakotamaki/Desktop/python/miras_final_project2/tekstidata"

os.chdir(path)

def read_text_file(file_path):
	with open(file_path, 'r',encoding="iso-8859-15") as f:
		return f.read()

texts = [read_text_file(f"{path}/{file}") for file in os.listdir() if file.endswith(".txt")]
texts


['/gemBase\t\t\n/unit/@alt\t/unit/@id\t/unit\n\tu-1.01\tHelsinki City Tourist Office\nIllustration: Coat of Arms\tu-1.02\t\n\tu-1.03\tPohjoisesplanadi 19, 00100 Helsinki 10\n\tu-1.04\tphone 1623 217 and 174 088\n\tu-1.05\tCables: citytourist\nLine\tu-1.06\t\n\tu-1.07\tmeet the sea\n\tu-1.08\tBoat tours\n\tu-1.09\tIn the summer months there are daily motor-boat trips around Helsinki.\n\tu-1.10\tDepartures are from the Market Square, Pohjoisranta (North Harbour, the end of Aleksanterinkatu), and Hakaniemi Square.\n\tu-1.11\tOn some tours it is possible to get off at Suomenlinna or Korkeasaari.\n\tu-1.12\tTaxi boats can also be hired.\n\tu-1.13\tFishing in the Helsinki area\n\tu-1.14\tTwo permits are needed for fishing in Helsinki: a general permit, available in any post-office, and a local permit, issued by the City Sports Department (Urheilu- ja ulkoiluvirasto), Toinen linja 4 A, phone 716 011.\n\tu-1.15\tPRINTED IN FINLAND\n\tu-1.16\tYhteiskirjapaino Oy.\nIllustration: Four round symbo

In [38]:
stopwords = stopwords.words("english")

In [47]:
#lemmatization with spacy
def lemmatization(texts, allowed_postags=["NOUN", "ADJ", "VERB", "ADV"]):
    nlp = spacy.load("en_core_web_sm")
    texts_out = []
    for text in texts:
        doc = nlp(text)
        new_text = []
        for token in doc:
            if token.pos_ in allowed_postags and token.text not in stopwords:
                new_text.append(token.lemma_)
        final = " ".join(new_text)
        texts_out.append(final)
    return (texts_out)


lemmatized_texts = lemmatization(texts)
print (lemmatized_texts)

['/unit/@alt u-1.01 coat arm u-1.02 u-1.03 phone citytourist line u-1.06 meet sea tour summer month daily motor boat trip departure end u-1.11 tour possible get boat also hire fishing area permit need fish general permit available post - office local permit issue linja phone print round symbol line u-1.19 tourist island photo cardboard symbol u-1.21 u-2.01 illustration round symbol green line u-2.03 island national park open air museum little west centre reach footbridge museum part establish u-2.06 fully furnish farmhouse different part bring - erect complete outbuilding u-2.08 museum also contain wooden church 17th century service hold summer 18th century manor house antique furniture also workshop mill barn various kind church boat swing village store next item vicarage wind mill u-2.11 summer month outdoor performance folk dancing folk concert traditional popular u-2.13 activity organize also restaurant beach information kiosk museum shop open air museum bus no u-2.18 illustration 

In [49]:
#stopwords
def gen_words(texts):
    final = []
    for text in texts:
        new = gensim.utils.simple_preprocess(text, deacc=True)
        final.append(new)
    return (final)

data_words = gen_words(lemmatized_texts)
print (data_words)

[['unit', 'alt', 'coat', 'arm', 'phone', 'citytourist', 'line', 'meet', 'sea', 'tour', 'summer', 'month', 'daily', 'motor', 'boat', 'trip', 'departure', 'end', 'tour', 'possible', 'get', 'boat', 'also', 'hire', 'fishing', 'area', 'permit', 'need', 'fish', 'general', 'permit', 'available', 'post', 'office', 'local', 'permit', 'issue', 'linja', 'phone', 'print', 'round', 'symbol', 'line', 'tourist', 'island', 'photo', 'cardboard', 'symbol', 'illustration', 'round', 'symbol', 'green', 'line', 'island', 'national', 'park', 'open', 'air', 'museum', 'little', 'west', 'centre', 'reach', 'footbridge', 'museum', 'part', 'establish', 'fully', 'furnish', 'farmhouse', 'different', 'part', 'bring', 'erect', 'complete', 'outbuilding', 'museum', 'also', 'contain', 'wooden', 'church', 'th', 'century', 'service', 'hold', 'summer', 'th', 'century', 'manor', 'house', 'antique', 'furniture', 'also', 'workshop', 'mill', 'barn', 'various', 'kind', 'church', 'boat', 'swing', 'village', 'store', 'next', 'item

In [50]:
id2word = corpora.Dictionary(data_words)

corpus = []
for text in data_words:
    new = id2word.doc2bow(text)
    corpus.append(new)
    

word = id2word

In [75]:
#topic model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=13,
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha="auto")

In [76]:
#visualization
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word, mds="mmds", R=8)
vis

  default_term_info = default_term_info.sort_values(
