In [None]:
import json
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from stopwords import get_stopwords
import string

In [None]:
# Extra stopwords that are merged with the library-provided Dutch and English
# lists further down. The double-quoted entries are an alphabetical list of
# (mostly Dutch) function words; the single-quoted entries at the very end are
# extra words appended by hand.
# NOTE(review): 'vanfaag' looks like a typo for 'vandaag' -- confirm whether
# the misspelling really occurs in the logs before changing it.
additional_stopwords = [
    "aan","aangaande","aangezien","achte","achter","achterna","af","afgelopen","al","aldaar","aldus","alhoewel","alias","alle",
    "allebei","alleen","alles","als","alsnog","altijd","altoos","ander","andere","anders","anderszins","beetje","behalve","behoudens",
    "beide","beiden","ben","beneden","bent","bepaald","betreffende","bij","bijna","bijv","binnen","binnenin","blijkbaar","blijken","boven",
    "bovenal","bovendien","bovengenoemd","bovenstaand","bovenvermeld","buiten","bv","daar","daardoor","daarheen","daarin","daarna","daarnet",
    "daarom","daarop","daaruit","daarvanlangs","dan","dat","de","deden","deed","der","derde","derhalve","dertig","deze","dhr","die","dikwijls",
    "dit","doch","doe","doen","doet","door","doorgaand","drie","duizend","dus","echter","een","eens","eer","eerdat","eerder","eerlang","eerst",
    "eerste","eigen","eigenlijk","elk","elke","en","enig","enige","enigszins","enkel","er","erdoor","erg","ergens","etc","etcetera","even","eveneens",
    "evenwel","gauw","ge","gedurende","geen","gehad","gekund","geleden","gelijk","gemoeten","gemogen","genoeg","geweest","gewoon","gewoonweg",
    "haar","haarzelf","had","hadden","hare","heb","hebben","hebt","hedden","heeft","heel","hem","hemzelf","hen","het","hetzelfde","hier","hierbeneden",
    "hierboven","hierin","hierna","hierom","hij","hijzelf","hoe","hoewel","honderd","hun","hunne","ieder","iedere","iedereen","iemand","iets","ik","ikzelf",
    "in","inderdaad","inmiddels","intussen","inzake","is","ja","je","jezelf","jij","jijzelf","jou","jouw","jouwe","juist","jullie","kan","klaar","kon",
    "konden","krachtens","kun","kunnen","kunt","laatst","later","liever","lijken","lijkt","maak","maakt","maakte","maakten","maar","mag","maken","me","meer",
    "meest","meestal","men","met","mevr","mezelf","mij","mijn","mijnent","mijner","mijzelf","minder","miss","misschien","missen","mits","mocht","mochten",
    "moest","moesten","moet","moeten","mogen","mr","mrs","mw","na","naar","nadat","nam","namelijk","nee","neem","negen","nemen","nergens","net","niemand",
    "niet","niets","niks","noch","nochtans","nog","nogal","nooit","nu","nv","of","ofschoon","om","omdat","omhoog","omlaag","omstreeks","omtrent","omver",
    "ondanks","onder","ondertussen","ongeveer","ons","onszelf","onze","onzeker","ooit","ook","op","opnieuw","opzij","over","overal","overeind","overige",
    "overigens","paar","pas","per","precies","recent","redelijk","reeds","rond","rondom","samen","sedert","sinds","sindsdien","slechts","sommige","spoedig",
    "steeds","tamelijk","te","tegen","tegenover","tenzij","terwijl","thans","tien","tiende","tijdens","tja","toch","toe","toen","toenmaals","toenmalig",
    "tot","totdat","tussen","twee","tweede","u","uit","uitgezonderd","uw","vaak","vaakwat","van","vanaf","vandaan","vanuit","vanwege","veel","veeleer",
    "veertig","verder","verscheidene","verschillende","vervolgens","via","vier","vierde","vijf","vijfde","vijftig","vol","volgend","volgens","voor",
    "vooraf","vooral","vooralsnog","voorbij","voordat","voordezen","voordien","voorheen","voorop","voorts","vooruit","vrij","vroeg","waar","waarom",
    "waarschijnlijk","wanneer","want","waren","was","wat","we","wederom","weer","weg","wegens","weinig","wel","weldra","welk","welke","werd","werden",
    "werder","wezen","whatever","wie","wiens","wier","wij","wijzelf","wil","wilden","willen","word","worden","wordt","zal","ze","zei","zeker","zelf",
    "zelfde","zelfs","zes","zeven","zich","zichzelf","zij","zijn","zijne","zijzelf","zo","zoals","zodat","zodra","zonder","zou","zouden","zowat","zulk",
    "zulke","zullen","zult", 'vanfaag','usually','mooie','voelde','gebruikte','gebruik','gaat','velen','use','hallo','ging','lang'
]

# Get Dutch and English stopwords
stop_words_dutch = set(get_stopwords("dutch"))
stop_words_english = set(get_stopwords("english"))

# Combine the stopwords for Dutch, English, and the additional custom stopwords
stop_words = stop_words_dutch.union(stop_words_english).union(additional_stopwords)

# Load the interaction logs (one JSON object per line). Blank lines -- for
# example an extra newline at the end of the file -- are skipped because
# json.loads() raises on empty input.
with open('interaction_logs.json', 'r', encoding='utf-8') as f:
    data = [json.loads(line) for line in f if line.strip()]

# Collect the user queries that make up the text corpus
queries = [entry['user_query'] for entry in data]

# Translation table that deletes ASCII punctuation; built once instead of
# once per query.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def _preprocess_query(query):
    """Return *query* lowercased, with punctuation and stopwords removed.

    Each whitespace-separated token is checked against ``stop_words`` in
    three forms: as written, with surrounding punctuation trimmed, and with
    all punctuation deleted. Checking only the fully stripped form (as a
    plain "strip punctuation, then filter" pass does) can never match a
    stopword that itself contains punctuation, such as a contraction.

    Args:
        query: Raw query text.

    Returns:
        The surviving tokens, punctuation-free, joined by single spaces.
    """
    kept = []
    for raw_token in query.lower().split():
        trimmed = raw_token.strip(string.punctuation)
        cleaned = raw_token.translate(_PUNCTUATION_TABLE)
        # `cleaned` is empty for tokens made only of punctuation.
        if cleaned and stop_words.isdisjoint((raw_token, trimmed, cleaned)):
            kept.append(cleaned)
    return " ".join(kept)


# Preprocess text: tokenize, remove stopwords, punctuation, and lowercase
processed_queries = [_preprocess_query(query) for query in queries]

# Vectorize the text for LDA
vectorizer = CountVectorizer(max_features=1000)
X = vectorizer.fit_transform(processed_queries)

# Apply LDA for topic modeling
lda = LatentDirichletAllocation(n_components=5, random_state=42)  # Adjust n_components for more topics
lda.fit(X)

# Display the topics
def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted words of every topic, most important first.

    Args:
        model: Object exposing ``components_``, an iterable with one row of
            per-feature weights per topic; each row must support
            ``argsort()``.
        feature_names: Sequence mapping a feature index to its word.
        no_top_words: Number of words to print per topic. Values of zero or
            less print an empty word line instead of the whole vocabulary.
    """
    # Clamp so that 0 and negative requests yield no words; an unguarded
    # "[-0:]" slice would select every feature.
    top_n = max(no_top_words, 0)
    for idx, topic in enumerate(model.components_):
        # argsort() is ascending, so walk it backwards from the end to get
        # the top_n largest weights in descending order.
        top_indices = topic.argsort()[: -top_n - 1 : -1]
        print(f"Topic {idx+1}:")
        print(", ".join(feature_names[i] for i in top_indices))
        print("\n")

# Print the fitted topics, ten vocabulary terms per topic.
no_top_words = 10
feature_names = vectorizer.get_feature_names_out()
display_topics(model=lda, feature_names=feature_names, no_top_words=no_top_words)


Topic 1:
schilder, technieken, soort, painting, techniques, vroeger, theo, hechte, broer, band


Topic 2:
starry, gewerkt, boodschap, painting, techniques, sterrennacht, soort, onderwerpen, schilderde, liefst


Topic 3:
mauve, haag, den, bezoeken, vertellen, sterrennacht, geschilderd, tevreden, zonnebloemen, schilderij


Topic 4:
schilder, technieken, starry, night, gewerkt, boodschap, painting, techniques, sterrennacht, soort


Topic 5:
soort, vincent, populair, bekendste, kunstwerken, sterrennacht, night, starry, technieken, schilder


