# Latent Dirichlet Allocation for persona modelling

This Jupyter notebook builds an LDA model to approximate human personalities. It does so by computing the document-topic matrix and the per-topic word distributions from ParlAI persona descriptions. The idea is that these topics describe what a person likes, and that similar persons can then be used to find more topics the person might enjoy. This is similar to the Netflix challenge, with topics taking the place of movies.

The structure of this notebook follows this tutorial: https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24

## 1 - Packages
The needed packages. Read the `readme` of the folder if you have issues with gensim.

In [None]:
# Importing the libraries used. Use the .yml file to create the conda environment. Check the instructions
# for how to create a new kernel choice for gensim to make it work

import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim import corpora, models

import numpy as np
np.random.seed(2018)

from surprise import SVD
from surprise.model_selection import cross_validate

from collections import Counter
import pandas
import matplotlib

from pprint import pprint

import string

## 2 - Functions
The relevant functions are defined here. They can then be applied to any text.

In [None]:
# Function to import the desired corpus, and normalize it (all lower case, small lines removed, no punctuation)
def import_text(filename):
    """Read a text file and return its lines in normalized form.

    Lines shorter than three characters (line terminator included) are
    skipped. Every remaining line is lower-cased and stripped of ASCII
    punctuation; its trailing newline is kept.

    Args:
        filename: Path of the text file to read.

    Returns:
        list[str]: The normalized lines, in file order.
    """
    # Build the punctuation-stripping table once instead of once per line.
    strip_punctuation = str.maketrans('', '', string.punctuation)
    # Explicit encoding so the result does not depend on the platform locale.
    with open(filename, "r", encoding="utf-8") as source:
        return [
            line.lower().translate(strip_punctuation)
            for line in source
            if len(line) >= 3
        ]

In [None]:
# Function to further preprocess the text for NLP (remove stopwords and small words)
def preprocess(text):
    """Tokenize `text` and keep only the informative tokens.

    The text is tokenized with gensim's `simple_preprocess`; a token is kept
    when it is longer than two characters and is not in gensim's STOPWORDS.

    Args:
        text: The string to tokenize.

    Returns:
        list[str]: The surviving tokens, in their original order.
    """
    stopwords = gensim.parsing.preprocessing.STOPWORDS
    return [
        token
        for token in gensim.utils.simple_preprocess(text)
        if len(token) > 2 and token not in stopwords
    ]

In [None]:
# Function to generate the document-topic matrix. Dimensions will be #documents x #topics
# Relevant later
def get_doc_topic(corpus, model):
    """Build the dense document-topic matrix for `corpus`.

    Args:
        corpus: Iterable of bag-of-words documents accepted by
            `model.get_document_topics`.
        model: Trained topic model exposing `num_topics` and
            `get_document_topics(bow, minimum_probability=...)`.

    Returns:
        list[list[float]]: One row per document with exactly
        `model.num_topics` entries; entry `t` is the probability of topic `t`.
    """
    num_topics = model.num_topics
    doc_topic = []
    for doc in corpus:
        # Start from zeros and place every probability by its topic id, so a
        # topic the model leaves out cannot shorten or shift the row.
        row = [0.0] * num_topics
        for topic_idx, prob in model.get_document_topics(doc, minimum_probability=0):
            row[topic_idx] = prob
        doc_topic.append(row)
    return doc_topic

In [None]:
# This function uses the LDA model to generate a new sentence BOW style from a BOW sentence given to it.
# At the moment very inefficient
def generate_sentence_from_bow(bow, model, topic_words, dictionary):
    """Generate as many words as `bow` has entries, using the topic model.

    Each pass walks the document's topics in order and accepts the first one
    whose probability exceeds a fresh uniform random number. Within that
    topic it walks the word probabilities in id order and accepts the first
    word whose probability exceeds another random number. A pass may add
    nothing, in which case the loop simply tries again.

    Args:
        bow: Bag-of-words document; only its length and its topic mixture
            are used.
        model: Topic model exposing `get_document_topics(bow)`.
        topic_words: Indexable as `topic_words[topic_idx]`, giving the word
            probabilities of that topic ordered by word id.
        dictionary: Mapping from word id to word, queried with `.get(idx)`.

    Returns:
        list: The generated words. Empty when `bow` is empty or when the
        model reports no topics for it.
    """
    gen_sentence = []
    target_length = len(bow)
    topics = model.get_document_topics(bow) # Might have to use the whole 400 length vector instead
    if not topics:
        # Without any topic the loop below could never add a word and would
        # spin forever.
        return gen_sentence
    # NOTE(review): accepting the first word that beats the random draw
    # favours low word ids; it is not a draw from the topic's distribution.
    while len(gen_sentence) < target_length:
        for topic_idx, topic_prob in topics:
            if topic_prob > np.random.rand():
                for idx, word_prob in enumerate(topic_words[topic_idx]):
                    if word_prob > np.random.rand():
                        gen_sentence.append(dictionary.get(idx))
                        break
                # Only one topic is tried per pass, whether or not it
                # produced a word.
                break
    return gen_sentence

## 3 - Preprocess
All the steps before generating the actual LDA model.

In [None]:
# Load the persona descriptions as normalized lines, one document per line
documents = import_text("personas_with_id_train_both_all.txt")

# Tokenize every document (stopwords and short tokens are dropped)
processed_docs = [preprocess(document) for document in documents]

# Nothing is printed by this cell

In [None]:
# Global switch: the inspection cells below only run their prints/plots when this is True
do_print = True

In [None]:
# NOT IMPORTANT CELL. Sanity check: show a few raw documents and their tokenized form
if do_print:
    print(len(documents))
    print(documents[5:10])

    # One example document, first split on single spaces, then fully preprocessed
    doc_sample = documents[101]
    words = doc_sample.split(' ')
    print('original document: ')
    print(words)
    print('\n\n tokenized document: ')
    print(preprocess(doc_sample))

    print('\n\n First ten tokenized document: ')
    print(processed_docs[:10])

In [None]:
# Build the gensim Dictionary from the tokenized documents. It is used below
# (doc2bow) to turn each document into the bag-of-words form the LDA expects.
dictionary = gensim.corpora.Dictionary(processed_docs)

In [None]:
if do_print:
    count = 0

    # Show only the first seven (id, token) pairs of the dictionary
    for count, (k, v) in enumerate(dictionary.iteritems(), start=1):
        print(k, v)
        if count > 6:
            break

In [None]:
# Filter out overly common tokens: the persona texts are full of words such as
# "like" that carry no topic information. no_above=0.3 targets tokens found in
# more than 30% of the documents.
# NOTE(review): the other filter_extremes arguments keep their gensim defaults,
# which presumably also drop very rare tokens -- confirm against the gensim docs.
dictionary.filter_extremes(no_above=0.3)

In [None]:
# Convert every tokenized document to its bag-of-words representation
# (LDA wants the corpus as a BOW)
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

In [None]:
if do_print:
    # Index of the example document; later inspection cells reuse `ex`,
    # so it only exists when do_print is True
    ex = 1675
    print(bow_corpus[ex])
    print(processed_docs[ex])
    print(dictionary)

In [None]:
# A TF-IDF weighted corpus can also be used to train an LDA model.
# Fit the TF-IDF model on the BOW corpus, then apply it to that same corpus.
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

In [None]:
if do_print:
    # TF-IDF weights of the example document (`ex` is set in an earlier inspection cell)
    pprint(corpus_tfidf[ex])

## 4 - Training the models
This section trains the models, once with the BOW corpus and once with the TF-IDF corpus. There does not seem to be much of a difference between the two, even though BOW is presumably the preferred input for LDA.

In [None]:
# Training the LDA model with the BOW corpus. The number of topics is a very
# relevant hyperparameter; the number of passes is probably less so.
lda_model = gensim.models.ldamodel.LdaModel(corpus=bow_corpus, num_topics=10, id2word=dictionary, passes=2)

In [None]:
if do_print:
    # Show the topics of the BOW-trained model as returned by print_topics()
    pprint(lda_model.print_topics())

In [None]:
# Training a second LDA model with the TF-IDF corpus, using the same
# num_topics, dictionary and passes as the BOW model
lda_model_tfidf = gensim.models.ldamodel.LdaModel(corpus_tfidf, num_topics=10, id2word=dictionary, passes=2)

In [None]:
if do_print:
    # A bare expression inside an `if` is not displayed by the notebook, so
    # the TF-IDF topics have to be printed explicitly.
    pprint(lda_model_tfidf.print_topics())
    # Topic mixture of the example document under the BOW-trained model
    pprint(lda_model.get_document_topics(bow_corpus[ex]))

In [None]:
if do_print:
    # Two hand-written persona texts used to eyeball the topic assignment
    test_sentence1 = "I like to cook. I also watch the night sky. I don't like horseback riding"
    test_sentence2 = "i work with computers. i love drinking coffee in the morning. i brew beer"
    # Same pipeline as for the corpus: preprocess, then convert to BOW
    # (test_bow1 is reused by a later inspection cell)
    test_bow1 = dictionary.doc2bow(preprocess(test_sentence1))
    print(test_bow1)
    sent1_topics = lda_model.get_document_topics(test_bow1)
    pprint(sent1_topics)

    test_bow2 = dictionary.doc2bow(preprocess(test_sentence2))
    sent2_topics = lda_model.get_document_topics(test_bow2)
    pprint(sent2_topics)

In [None]:
# Optional experiment: print the topic mixture of every line of a second text file
run_barnados = False
if run_barnados:
    barnardos = import_text("barnardo_lines.txt")
    processed_barnardos = [preprocess(entry) for entry in barnardos]
    for i, bar in enumerate(processed_barnardos):
        print(i)
        print(lda_model.get_document_topics(dictionary.doc2bow(bar)))

## 5 - Creating matrices
Here the relevant matrices are created: mainly the document-topic matrix and the topic-word distribution matrix.

In [None]:
# Uses the gensim function directly to create the topic-word distribution matrix.
# Rows are topics and columns are dictionary words; each value is the
# probability of that word in that topic.
topic_word_matrix = lda_model.get_topics()

In [None]:
if do_print:
    # Matrix dimensions, then the first five word probabilities of topic 3
    print(topic_word_matrix.shape)
    print(topic_word_matrix[3][:5])

In [None]:
# Here the function defined at the top (get_doc_topic) is used.
# Generates a matrix of the form document x topic.
document_topic_matrix = get_doc_topic(bow_corpus, lda_model)

In [None]:
# Convert the list of rows to a float64 numpy array; easier to slice and
# modify in the cells below
document_topic_matrix = np.asarray(document_topic_matrix, dtype=np.float64)

In [None]:
if do_print:
    print(document_topic_matrix.shape)
    # The model has only 10 topics, so a slice such as 241:246 is always
    # empty; show the first five topic probabilities of the example document.
    print(document_topic_matrix[ex][:5])
    # Row sums of a few documents and a few topics, to check that the rows
    # look like probability distributions
    for doc_idx in (23, 123, 2343):
        print(sum(document_topic_matrix[doc_idx]))
    for topic_idx in (3, 7, 8):
        print(sum(topic_word_matrix[topic_idx]))

In [None]:
# Prepare the matrix for the FUNK SVD: every probability below the threshold
# is set to exactly zero, in place. The intention is that the SVD then
# generates recommendations for those zeroed entries.
thresh = 0.1
below_threshold = np.less(document_topic_matrix, thresh)
document_topic_matrix[below_threshold] = 0

In [None]:
if do_print:
    # The model has only 10 topics, so a slice such as 90:97 is always empty;
    # show the first seven (thresholded) topic values of the example document.
    print(document_topic_matrix[ex][:7])
    print(len(bow_corpus[ex]))
    # Number of (word id, probability) pairs get_topic_terms returns for topic 3
    topic_vector = lda_model.get_topic_terms(3)
    print(len(topic_vector))
    # Word <-> id lookups in both directions
    print(dictionary.doc2idx(["educated", "highly", "dog"]))
    print(dictionary.get(2470))
    print(dictionary.get(56))
    print(dictionary.get(1))

In [None]:
if do_print:
    # Generate a sentence from the first test sentence's BOW
    # (test_bow1 only exists when the earlier inspection cell has run)
    generated_test = generate_sentence_from_bow(test_bow1, lda_model, topic_word_matrix, dictionary)

In [None]:
if do_print:
    # Input BOW, its topic mixture and the words generated from it
    print(test_bow1)
    print(lda_model.get_document_topics(test_bow1))
    print(generated_test)

## 6 - Testing with Shakespeare
This section tests the same approach on Shakespeare. The corpus is smaller, but it still has plenty of text per character for persona modelling.

In [None]:
# The same pipeline as for the personas, applied to Hamlet:
# load -> tokenize -> dictionary -> bag of words -> LDA -> topic-word matrix
shakespear = import_text("formatted_hamlet.txt")
processed_spear = [preprocess(spear_line) for spear_line in shakespear]

dictionary_spear = gensim.corpora.Dictionary(processed_spear)
# NOTE: the extreme-token filtering is disabled for this corpus
#dictionary_spear.filter_extremes(no_above=0.3)

bow_spear = list(map(dictionary_spear.doc2bow, processed_spear))

spear_model = gensim.models.ldamodel.LdaModel(
    corpus=bow_spear,
    num_topics=40,
    id2word=dictionary_spear,
    passes=2,
)
spear_tw_matrix = spear_model.get_topics()

In [None]:
# Re-generate the whole play: one generated sentence per original BOW line
generated_corpus = [
    generate_sentence_from_bow(spear_bow, spear_model, spear_tw_matrix, dictionary_spear)
    for spear_bow in bow_spear
]

In [None]:
if do_print:
    # Compare one generated line with its BOW and its original tokens
    sp_ex = 45
    print(generated_corpus[sp_ex])
    print(bow_spear[sp_ex])
    print(processed_spear[sp_ex])

In [None]:
# Word frequencies for the bar graphs below: one Counter for the
# LDA-generated text and one for the original (tokenized) play
lda_spear_cnt = Counter(word for sentence in generated_corpus for word in sentence)
proc_spear_cnt = Counter(word for sentence in processed_spear for word in sentence)

# The 25 most frequent words of each
lda_most = lda_spear_cnt.most_common(25)
org_most = proc_spear_cnt.most_common(25)

In [None]:
if do_print:
    # Most common words: generated text first, original text second
    print("lda")
    print(lda_most)
    print("org")
    print(org_most)

In [None]:
# Counter results as a plain dict so pandas can plot them
lda_most_dict = dict(lda_most)

# For each of the most common LDA words, how often it occurs in the original
org_dict = {word: proc_spear_cnt[word] for word in lda_most_dict}


In [None]:
if do_print:
    # Import pyplot explicitly: `import matplotlib` alone does not load the
    # submodule, so matplotlib.pyplot would only resolve if another library
    # happened to import it first.
    import matplotlib.pyplot as plt

    # Bar graph of the most common words in the LDA-generated text
    df1 = pandas.DataFrame.from_dict(lda_most_dict, orient='index')
    ax1 = df1.plot(kind='bar')
    # Fixed y-range so this plot is comparable with the original-text plot
    ax1.set_ylim(0,220)
    plt.show()

In [None]:
if do_print:
    # Import pyplot explicitly: `import matplotlib` alone does not load the
    # submodule, so matplotlib.pyplot would only resolve if another library
    # happened to import it first.
    import matplotlib.pyplot as plt

    # Bar graph of how often the same words occur in the original text
    df2 = pandas.DataFrame.from_dict(org_dict, orient='index')
    ax2 = df2.plot(kind='bar')
    # Fixed y-range so this plot is comparable with the LDA plot
    ax2.set_ylim(0,220)
    plt.show()

## 7 - Replace persona descriptions with a topic description (WIP)

In this section the previously trained LDA model is used to generate topics from the persona lines of a training file (four lines per persona). Those lines are then replaced with, or accompanied by, the resulting topic list.

In [None]:
# Input file with persona/dialogue lines, and the output file that receives the topic lines
fromFile = "sample.txt"
toFile = "topicAndDescription.txt"


In [None]:
def extractTopicsFromPersonaLines(personaLines, outputFile=None):
    """Write a "topics: ..." line describing the given persona text.

    The text is stripped of punctuation, preprocessed and converted to a bag
    of words. For every topic the LDA model assigns to it, the words returned
    by `lda_model.show_topic` are collected (a word can appear more than once
    if several topics share it) and written as one space-separated line.

    Args:
        personaLines: The concatenated persona sentences of one speaker.
        outputFile: Writable text file that receives the line. When None, the
            module-level `personaAsTopicFile` is used, which is how the
            function behaved before this parameter existed.

    Returns:
        str: The space-separated topic words that were written.
    """
    if outputFile is None:
        # Fallback for existing callers that rely on the global handle
        outputFile = personaAsTopicFile

    strippedLines = personaLines.translate(str.maketrans('', '', string.punctuation)).rstrip()
    bowPersonaLines = dictionary.doc2bow(preprocess(strippedLines))

    personaDescriptionInWords = [
        word
        for topic, _weight in lda_model.get_document_topics(bowPersonaLines)
        for word, _share in lda_model.show_topic(topic)
    ]
    stringPersonaDescriptionInWords = " ".join(personaDescriptionInWords)
    outputFile.write("topics: " + stringPersonaDescriptionInWords + "\n")
    return stringPersonaDescriptionInWords


In [None]:

####################
### TODO ###########
####################
# Line numbering does not work properly. If writePersonaLinesAlso is false
# then line numbers skip. If true the topicLine is kind of an extra.

# Copies fromFile to toFile and inserts a "topics: ..." line after every run
# of consecutive persona lines belonging to the same speaker. At most one of
# the two counters is non-zero at any time: it marks whose persona lines are
# currently being collected in personaLines.
print("running..")
personaLines = ""
yourPersonaLinesCounter = 0
partnersPersonaLinesCounter = 0
writePersonaLinesAlso = True

with open(fromFile, 'r') as personachatFile,\
    open(toFile, 'w') as personaAsTopicFile:
    for line in personachatFile:
        isYourLine = "your persona:" in line
        isPartnersLine = "partner's persona:" in line
        # Everything after the FIRST ": " is the persona text. partition keeps
        # text that itself contains ": " and yields "" if the separator is
        # missing, instead of truncating or raising IndexError.
        currentPersonaLine = line.partition(": ")[2].rstrip()

        if isYourLine and partnersPersonaLinesCounter == 0:
            # Another line of the persona currently being collected ("your")
            personaLines += currentPersonaLine + " "
            yourPersonaLinesCounter += 1
            if writePersonaLinesAlso:
                personaAsTopicFile.write(line)

        elif isPartnersLine and yourPersonaLinesCounter == 0:
            # Another line of the persona currently being collected ("partner's")
            personaLines += currentPersonaLine + " "
            partnersPersonaLinesCounter += 1
            if writePersonaLinesAlso:
                personaAsTopicFile.write(line)

        elif ((isPartnersLine and yourPersonaLinesCounter != 0) or
              (isYourLine and partnersPersonaLinesCounter != 0)):
            # The speaker changed: emit the topics of the finished persona,
            # then start collecting the other one with the current line.
            extractTopicsFromPersonaLines(personaLines)

            if yourPersonaLinesCounter != 0:
                yourPersonaLinesCounter = 0
                partnersPersonaLinesCounter += 1
            else:
                partnersPersonaLinesCounter = 0
                yourPersonaLinesCounter += 1

            personaLines = currentPersonaLine + " "
            if writePersonaLinesAlso:
                personaAsTopicFile.write(line)

        elif ("\t" in line and
                (yourPersonaLinesCounter != 0 or
                 partnersPersonaLinesCounter != 0)):
            # First dialogue line after a persona block: emit the topics of
            # the pending persona and reset the state.
            extractTopicsFromPersonaLines(personaLines)
            personaLines = ""
            yourPersonaLinesCounter = 0
            partnersPersonaLinesCounter = 0
            personaAsTopicFile.write(line)

        else:
            # Any other line is copied through unchanged
            personaAsTopicFile.write(line)

    # The input may end right after a persona block; emit its topics as well
    # instead of silently dropping them.
    if yourPersonaLinesCounter != 0 or partnersPersonaLinesCounter != 0:
        extractTopicsFromPersonaLines(personaLines)

                

In [None]:
# Scratch cell for poking at the model. The results of the bare expressions
# are not stored anywhere; presumably only the final `a1` is displayed by the
# notebook.
# Words of topic 8 (first element of each pair returned by show_topic)
[wordvalue[0] for wordvalue in lda_model.show_topic(8)]
lda_model.get_topic_terms(8)
# Topic mixture of document 125
a1 = lda_model.get_document_topics(bow_corpus[125])
dictionary.get(56)
# Print only the topic ids of that mixture
for topic, weight in a1:
    print(topic)
a1