In [None]:
# we need NLTK
! pip install nltk

In [None]:
# NLTK ships its data packages separately; they have to be downloaded once.
import nltk
# Uncomment the lines below on first use. The stopword lists are read further
# down via stopwords.words(...); 'punkt' is presumably what the sentence
# splitting behind corpus.sents() relies on -- TODO confirm for your NLTK version.
# Calling nltk.download() without arguments opens an interactive downloader
# (a GUI window, according to the original note).
#nltk.download('punkt') # 'punkt' 'stopwords'
#nltk.download('stopwords') # 'punkt' 'stopwords' 'wordnet' 'omw-1.4'
#nltk.download()

In [None]:
from os.path import isfile, join
from os import listdir

from nltk.corpus.reader import PlaintextCorpusReader
from nltk.corpus import stopwords

corpus_root = "./txt/"
file_ext = "txt"
# Match on ".txt" (with the dot) so that names merely ending in the letters
# "txt" are not picked up, and sort because listdir() returns entries in
# arbitrary order, which would make the document order differ between runs.
file_suffix = "." + file_ext.lower()
file_ids = sorted(
    name
    for name in listdir(corpus_root)
    if isfile(join(corpus_root, name)) and name.lower().endswith(file_suffix)
)
corpus = PlaintextCorpusReader(corpus_root, file_ids)

# Walk the sentences once and derive both counts from that single pass,
# instead of calling corpus.sents() once per statistic.
n_sentences = 0
n_words = 0
for sentence in corpus.sents():
    n_sentences += 1
    n_words += len(sentence)

print("The number of documents:", len(corpus.fileids()))
print("The number of sentences =", n_sentences)
print("The number of words =", n_words)
#print("The number of characters =", len([char for sentence in corpus.sents() for word in sentence for char in word]))

In [None]:
# Show the corpus sentences as returned by the reader.
corpus.sents()

In [None]:
# Save the corpus sentences to disk.
import pickle

# Copy the reader's sentences into plain lists first, so the pickle holds the
# tokens themselves rather than whatever view object the reader returns
# (NOTE(review): assumes corpus.sents() is a reader-backed view -- confirm).
corpus_sentences = [list(sentence) for sentence in corpus.sents()]

# The context manager closes the file even if pickling raises.
with open('corpus_sentences.pkl', 'wb') as pickleFile:
    pickle.dump(corpus_sentences, pickleFile)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

# NOTE(review): an integer max_df is an absolute document count, so max_df=2
# drops every term that occurs in more than two documents -- confirm that this
# (and not e.g. min_df=2) is what is intended.
count_vect = CountVectorizer(max_df=2)
# Despite the variable name this is a document-term matrix:
# one row per document, one column per vocabulary term.
term_document_matrix = count_vect.fit_transform([corpus.raw(i) for i in file_ids])
# Label the rows through the index directly; going through a temporary
# 'file_ids' column would overwrite the counts of a vocabulary term that
# happens to be spelled "file_ids".
df_dtm = pd.DataFrame(
    term_document_matrix.toarray(),
    index=pd.Index(file_ids, name='file_ids'),
    columns=count_vect.get_feature_names_out(),
)
df_dtm

In [None]:
# Download the WordNet data packages -- presumably for the WordNetLemmatizer
# created further down (its use inside preprocess is currently commented out).
nltk.download('wordnet')
nltk.download('omw-1.4')

In [None]:
# takes some time
# Build the frame from its real columns in one go. Do not pre-declare
# columns=['Text']: the documents are stored under lowercase 'text', so a
# capitalised 'Text' column would just sit there unused and filled with NaN.
df = pd.DataFrame({
    'text': [corpus.raw(i) for i in file_ids],
    'file_ids': file_ids,
})

import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer,PorterStemmer
from nltk.corpus import stopwords
import re
# Optional normalisers: a Porter stemmer and a WordNet lemmatizer.
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# English stopwords followed by Dutch stopwords, combined into one list.
all_stopwords = [
    word
    for language in ('english', 'dutch')
    for word in stopwords.words(language)
]

# Patterns are compiled once here instead of on every preprocess() call.
_HTML_TAG_RE = re.compile('<.*?>')
_URL_RE = re.compile(r'http\S+')
_DIGITS_RE = re.compile('[0-9]+')
_WORD_RE = re.compile(r'\w+')


def preprocess(sentence):
    """Reduce a raw text to a space-separated string of content words.

    The input is converted with ``str()`` and lower-cased. Then, in this
    order, the literal marker ``{html}``, every ``<...>`` tag, everything
    from ``http`` up to the next whitespace, and all digit runs are removed.
    The remainder is split into runs of word characters; a token is kept when
    it is longer than two characters and not contained in the notebook-level
    ``all_stopwords`` collection.

    Parameters
    ----------
    sentence : object
        Text to clean; anything that is not a ``str`` goes through ``str()``.

    Returns
    -------
    str
        The surviving tokens joined by single spaces (``""`` if none survive).
    """
    text = str(sentence).lower().replace('{html}', "")
    text = _HTML_TAG_RE.sub('', text)
    text = _URL_RE.sub('', text)
    text = _DIGITS_RE.sub('', text)
    # Build a set once per call: membership tests against a list scan the
    # whole list for every single token.
    stopword_set = frozenset(all_stopwords)
    # findall with \w+ yields the same tokens as RegexpTokenizer(r'\w+').tokenize().
    filtered_words = [
        w for w in _WORD_RE.findall(text)
        if len(w) > 2 and w not in stopword_set
    ]
    # Stemming / lemmatising is switched off; the original note says that
    # Dutch-capable tools are needed first.
    #stem_words=[stemmer.stem(w) for w in filtered_words]
    #lemma_words=[lemmatizer.lemmatize(w) for w in stem_words]
    return " ".join(filtered_words)


# Clean every document; preprocess is passed directly as the mapping function.
df['clean_text'] = df['text'].map(preprocess)

In [None]:
#stopwords.words('dutch')
#stopwords.words('english')
#all_stopwords = stopwords.words('english') + stopwords.words('dutch')

In [None]:
# Checkpoint: persist the frame to disk so that later cells can resume from the file.
df.to_pickle("wob_data.pkl")

In [None]:
# RESTART here: after a kernel restart, execution can resume from this cell.
# The imports are repeated so that this cell (and the nltk.FreqDist call a few
# cells below) works on a fresh kernel without re-running the cells above.
import nltk
import pandas as pd

# NOTE: loading a pickle can execute arbitrary code; only read a
# wob_data.pkl that this notebook wrote itself.
df = pd.read_pickle("wob_data.pkl")

In [None]:
# Keep only the document id and the cleaned text for the analysis that follows.
corpus_clean = df[['file_ids','clean_text']]

In [None]:
# Cleaned text of the first document. .iloc selects by position, so this keeps
# working when the row labels are not the default 0..n-1.
corpus_clean['clean_text'].iloc[0]

In [None]:
# Count every whitespace-separated token across all cleaned documents.
freq = nltk.FreqDist(
    token
    for document in corpus_clean['clean_text']
    for token in document.split()
)
# The 20 most frequent tokens together with their counts.
topWords = freq.most_common(20)
topWords

In [None]:
# topic model
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import pandas as pd
import numpy as np
%matplotlib inline

In [None]:
from sklearn.feature_extraction import _stop_words
# max_features limits the vocabulary to the 1000 most frequent terms.
# No stop_words argument on purpose: a list passed there is taken as the
# literal words to drop, so ['engilsh','dutch'] would only remove the tokens
# "engilsh" and "dutch" and never load a stopword list for either language.
# Stopwords are already filtered out when clean_text is built by preprocess.
vect = CountVectorizer(max_features=1000, ngram_range=(1, 1))

In [None]:
# Build the document-term matrix: fit the vectorizer on the cleaned texts
# and transform them in the same step.
dtm=vect.fit_transform(corpus_clean['clean_text'])

In [None]:
# Show the document-term matrix object produced by the vectorizer.
dtm

In [None]:
# Number of topics to extract.
# random_state pins the otherwise random initialisation, so re-running the
# notebook yields the same topics.
lda = LatentDirichletAllocation(n_components=35, random_state=42)

In [None]:
# Fit the model and keep the returned document-topic matrix in a variable
# instead of discarding it; it is still shown as the cell output.
doc_topic_matrix = lda.fit_transform(dtm)
doc_topic_matrix

In [None]:
#!pip install pyLDAvis

In [None]:
# vizualization
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

In [None]:
zit=pyLDAvis.sklearn.prepare(lda,dtm,vect)

In [None]:
# Display the prepared topic-model visualisation in the notebook.
pyLDAvis.display(zit)