In [1]:
!pip install nltk



In [2]:
import nltk
# Request the NLTK data packages used by this notebook.  Each call reports on the
# console whether the package was downloaded or is already up to date.
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jrubi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\jrubi\AppData\Roaming\nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\jrubi\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [8]:
# nltk.download('stopwords')
# Path in Collab: /root/nltk_data/corpora/stopwords/english

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
#P1 Utility

""" File:  P1_utility_functions.py
    Utility functions used in Phase 1 Pre-processing of the raw chat (.csv) file
"""
import os.path
import re
import nltk
import gensim
from nltk.stem import WordNetLemmatizer


def getStopWords(stopWordFileName):
    """Read a stop-word text file and return the words as dictionary keys.

    The file is assumed to have one word per line.  Every line is stripped of
    surrounding whitespace and lower-cased before it is stored.

    Args:
        stopWordFileName: path of the stop-word text file.

    Returns:
        stopWordDict: dict whose keys are the stop words; every value is None.
    """
    # 'with' releases the file handle even when reading fails part-way through.
    with open(stopWordFileName, 'r') as stopWordFile:
        return {line.strip().lower(): None for line in stopWordFile}

def getFileName(prompt):
    """Ask the user for a path, repeating the prompt until the path exists.

    The prompt is shown followed by a single space.  The first entered path for
    which os.path.exists() is true is returned.
    """
    fileName = input(prompt+" ")
    while not os.path.exists(fileName):
        print("File not found! Make sure that the file is inside this directory.")
        fileName = input(prompt+" ")
    return fileName

def readRawChats(inFile):
    """Read the raw chat .csv file and group its lines into one list per chat.

    A new chat starts on every line whose first six characters contain exactly
    two '/' characters, i.e. the line begins with the date of a Timestamp.  Any
    other line is treated as a continuation of the chat being collected.  Lines
    seen before the first dated line (the column-headings line) are discarded.
    If the file contains no dated line at all, the single returned inner list
    holds every line of the file.

    Args:
        inFile: path of the .csv file, assumed to have a column-headings line.

    Returns:
        transcriptList: list-of-lists of stripped line strings where each
        "outer" list item contains the lines of a single chat.
    """
    transcriptList = []
    currentTranscriptLines = []
    seenFirstChat = False

    # 'with' guarantees the file is closed; the path parameter is no longer
    # rebound to the open file object.
    with open(inFile, "r") as rawChatFile:
        for line in rawChatFile:
            if line[:6].count("/") == 2:
                if seenFirstChat:
                    transcriptList.append(currentTranscriptLines)
                # On the first dated line this drops the collected header line(s).
                seenFirstChat = True
                currentTranscriptLines = [line.strip()]
            else:
                currentTranscriptLines.append(line.strip())
    transcriptList.append(currentTranscriptLines)

    return transcriptList


def findInitialQuestion(transList, transIndex):
    """Return the Initial Question field of a chat, or None when it is absent.

    The field is the text between the second and third comma of transList[0].
    When there is no third comma the field runs to the end of the string (the
    earlier code stopped one character short, dropping the last character and
    mis-reporting one-character and empty fields).

    Args:
        transList: list of strings containing the information about a single
            chat; only the index 0 string is examined.
        transIndex: not used by this function.

    Returns:
        The Initial Question string, or None if the first or second comma is
        missing (a message is printed in that case) or the field is empty.
    """
    firstLine = transList[0]

    firstCommaIndex = firstLine.find(",")
    if firstCommaIndex == -1:
        print("First comma not found")
        return None

    secondCommaIndex = firstLine.find(",", firstCommaIndex+1)
    if secondCommaIndex == -1:
        print("Second comma not found")
        return None

    thirdCommaIndex = firstLine.find(",", secondCommaIndex+1)
    if thirdCommaIndex == -1:
        # No further column on this line: the question extends to the end.
        thirdCommaIndex = len(firstLine)

    question = firstLine[secondCommaIndex+1:thirdCommaIndex]
    return question if question else None

            
def generateTranscriptDialogList(trans):
    """Split one chat into pieces that each begin at a '##:##:##'-style time-stamp.

    The strings in trans are joined with single spaces.  A time-stamp is
    recognised at a ':' that is followed by two digits, another ':' and two
    more digits; the piece starts two characters before that first ':'.  If no
    time-stamp is found the whole joined string is returned as the only item.
    """
    merged = " ".join(trans)

    # start offsets of every recognised time-stamp
    starts = []
    for colonAt in range(2, len(merged) - 6):
        if merged[colonAt] != ":" or merged[colonAt + 3] != ":":
            continue
        if merged[colonAt + 1:colonAt + 3].isdigit() and merged[colonAt + 4:colonAt + 6].isdigit():
            starts.append(colonAt - 2)

    if not starts:
        return [merged]

    # consecutive start offsets (plus the end of the string) delimit the pieces
    bounds = starts + [len(merged)]
    return [merged[begin:end] for begin, end in zip(bounds, bounds[1:])]

def findInitialQuestionInDialog(dialogList, chatIndex):
    """Find the initial question inside the chat dialog.

    Called when the 'Initial question' column in the .csv file was empty.  The
    first dialog entry that is at least 40 characters long and contains none of
    the phrases "help you", "welcome" or "info desk" (case-insensitive) is
    returned.

    Args:
        dialogList: list of dialog strings of one chat.
        chatIndex: index of the chat, used only in the diagnostic message.

    Returns:
        The matching dialog string, or None when no entry qualifies (a
        "NO QUESTION FOUND!" message is printed in that case).
    """
    greetingPhrases = ("help you", "welcome", "info desk")

    for dialog in dialogList:
        lowered = dialog.lower()
        if len(dialog) >= 40 and not any(phrase in lowered for phrase in greetingPhrases):
            return dialog

    # The earlier version put this report in a bare 'except' around code that
    # cannot raise, so it was never shown; report the miss explicitly instead.
    print("\n\nNO QUESTION FOUND! ",chatIndex)
    return None

def removeTags(fileStr):
    """
        Strip every '<xyz ...' through '</xyz>' span from the text.

        The tag name is the text between '<' and the next space, so only
        opening tags that contain a space are matched.  When the matching
        closing tag is missing, scanning resumes after that space; when no '<'
        or no following space remains, the text is returned.
    """
    searchFrom = 0
    while True:
        tagStart = fileStr.find('<', searchFrom)
        if tagStart == -1:
            return fileStr

        nameEnd = fileStr.find(' ', tagStart + 1)
        if nameEnd == -1:
            return fileStr

        closingTag = "</" + fileStr[tagStart + 1:nameEnd] + '>'
        closeAt = fileStr.find(closingTag, nameEnd)

        if closeAt == -1:
            # unmatched opening tag: leave it in place and keep scanning
            searchFrom = nameEnd
            continue

        # cut out everything from the '<' through the end of the closing tag
        fileStr = fileStr[:tagStart] + fileStr[closeAt + len(closingTag):]
        searchFrom = tagStart


"""
NOTE: The nltk.pos_tag function returns the Penn Treebank tag for the word but we just want
whether the word is a noun, verb, adjective or adverb. We need a short simplification routine to translate from
the Penn tag to a simpler tag.
"""
def simplify(penn_tag):
    """Simplify a Penn Treebank tag to n (NOUN), v (VERB), a (ADJECTIVE) or r (ADVERB).

    Only the first character of the tag is examined: 'J' -> 'a', 'R' -> 'r',
    'V' -> 'v', 'N' -> 'n'.  Every other tag, including an empty string, maps
    to 'r', which is what the earlier code effectively returned (its trailing
    `return 'other'` could never execute and has been removed).
    """
    simpleTagByPrefix = {'J': 'a', 'R': 'r', 'V': 'v', 'N': 'n'}
    # penn_tag[:1] rather than penn_tag[0] so an empty tag does not raise IndexError.
    return simpleTagByPrefix.get(penn_tag[:1], 'r')

def preprocess(text, stop_words, POS_list):
    """ Tokenize text, drop stop words, keep only wanted parts of speech and lemmatize.

        text is tokenized with gensim.utils.simple_preprocess (deacc=True) and
        tagged with nltk.pos_tag.  A token is kept when it is not in stop_words
        and its simplified tag is in the POS_list argument; kept tokens are
        lemmatized with that simplified tag.  Returns the list of lemmas.
    """
    tokens = gensim.utils.simple_preprocess(str(text), deacc=True)
    lemmatizer = WordNetLemmatizer()

    lemmas = []
    for token, pennTag in nltk.pos_tag(tokens):
        if token in stop_words:
            continue
        simpleTag = simplify(pennTag)
        if simpleTag in POS_list:
            lemmas.append(lemmatizer.lemmatize(token, simpleTag))
    return lemmas
        
def writeInitialQuestion(chatIndexInCSV, questionFile,  wholeChatsFileTxt, question, questionCount, stopWordsDict, POS_list):
    """ Write a cleaned up version of the initial question to the question file.

        Cleaning steps, in order:
        1) lower-case the text,
        2) drop a leading '##:##:## - person:' prefix (three or more colons:
           everything up to the third colon) or 'person:' prefix (otherwise
           everything up to the first colon),
        3) replace the HTML entities &#x27; &#x2f; &nbsp; &quot;,
        4) remove '<xyz ...</xyz>' spans with removeTags,
        5) split into sentences on '.', '!' and '?',
        6) keep only the letters a-z of each word, drop words that end up empty
           or 30+ characters long, lemmatize, then run preprocess() on each
           sentence.

        Args:
            chatIndexInCSV: not used by this function.
            questionFile, wholeChatsFileTxt: open, writable files; the cleaned
                text is written to both (no newline is added).
            question: raw text to clean.
            questionCount: running count of questions written so far.
            stopWordsDict, POS_list: passed through to preprocess().

        Returns:
            questionCount, incremented by one if any cleaned text was written.
    """
    lemmatizer = nltk.WordNetLemmatizer()
    question = question.lower()

    colonCount = question.count(":")
    if colonCount >= 3:  # time-stamp ##:##:## - person: question
        question = question.split(":", 3)[3]
    elif colonCount >= 1:
        question = question.split(":", 1)[1]

    # The text is already lower-cased, so entities must be matched in lower case.
    # The earlier pattern '&#x2F;' (capital F) could therefore never match.
    question = question.replace('&#x27;', "'")
    question = question.replace('&#x2f;', " ")
    question = question.replace('&nbsp;', " ")
    question = question.replace('&quot;','"')

    ### CLEAN UP <xyz ......</xyz>, e.g., <a href.....</a>, <span ... </span>
    question = removeTags(question)

    # 'Z' is used as the sentence separator: after lower() the text contains no 'Z'.
    for terminator in ('.', '!', '?'):
        question = question.replace(terminator, 'Z')

    masterWordList = []
    for sentence in question.split("Z"):
        cleanQuestion = ""
        for word in sentence.split():
            cleanWord = "".join(char for char in word if 'a' <= char <= 'z')
            if 0 < len(cleanWord) < 30:  #upper bound to eliminate url's
                cleanQuestion += lemmatizer.lemmatize(cleanWord) + " "
        masterWordList.extend(preprocess(cleanQuestion, stopWordsDict, POS_list))

    chatCleaned = " ".join(masterWordList)
    if len(chatCleaned) > 0:
        questionFile.write(chatCleaned)
        wholeChatsFileTxt.write(chatCleaned)
        questionCount += 1
    return questionCount

def writeChatDialog(excelLineNumber, wholeChatsFile,  wholeChatsFileTxt, dialogList, stopWordsDict, POS_list):
    """ Append every entry of a chat's dialog, cleaned, to both output files.

        Each entry is written through writeInitialQuestion() (with a question
        count of 0 whose result is ignored) and is followed by a single space
        in each file so consecutive entries do not run together.
    """
    for dialogEntry in dialogList:
        writeInitialQuestion(excelLineNumber, wholeChatsFile, wholeChatsFileTxt,
                             dialogEntry, 0, stopWordsDict, POS_list)
        for outFile in (wholeChatsFile, wholeChatsFileTxt):
            outFile.write(" ")  # separator between this entry and the next
        
   
def writeWholeChatsToFile(transcriptDialogList, dataFileName, stopWordsDict, POS_list):
    """ Writes a whole chat's dialog one per line to dataFileName+".csv" and
        dataFileName+".txt".  Removed from the line of text is:
        1) time-stamps and names:  e.g., '13:45:42 - Jordan:'
        2) all punctuations

        Args:
            transcriptDialogList: list of [chat index, initial question, dialog list]
                items; chats whose initial question is None are skipped.
            dataFileName: output file name without extension.
            stopWordsDict, POS_list: passed through to the cleaning functions.

        In the .csv file each line starts with the chat index followed by ','.
        The number of chats written is printed at the end.
    """
    wholeChatsCount = 0

    # 'with' closes both output files even if cleaning a chat raises.
    with open(dataFileName+".csv", "w") as wholeChatsFile, \
         open(dataFileName+".txt", "w") as wholeChatsFileTxt:
        for transcriptDialog in transcriptDialogList:
            chatIndex = transcriptDialog[0]
            initialQuestion = transcriptDialog[1]
            if initialQuestion is None:
                continue

            wholeChatsFile.write(str(chatIndex)+",")

            # check to see if initial question is already in the chat dialog
            hasTimeStampAndName = re.search(r'[0-9][0-9]:[0-9][0-9]:[0-9][0-9] - [\w\s]+:',
                                            initialQuestion) is not None
            if not hasTimeStampAndName:  # no time-stamp so from 'initial question' column of .csv
                # write initial question to file since it is not part of the chat dialog
                writeInitialQuestion(chatIndex, wholeChatsFile, wholeChatsFileTxt,
                                     initialQuestion, 0, stopWordsDict, POS_list)
                wholeChatsFile.write(" ")
                wholeChatsFileTxt.write(" ")
            writeChatDialog(chatIndex, wholeChatsFile, wholeChatsFileTxt,
                            transcriptDialog[2], stopWordsDict, POS_list)

            wholeChatsCount += 1
            wholeChatsFile.write("\n")
            wholeChatsFileTxt.write("\n")

    print("Whole Chats Count:", wholeChatsCount, "written to",dataFileName+".txt")

def writeQuestionsOnlyToFile(transcriptDialogList, dataFileName, stopWordsDict, POS_list):
    """ Writes only the initial questions one per line to dataFileName+".csv"
        and dataFileName+".txt".

        Args:
            transcriptDialogList: list of [chat index, initial question, dialog list]
                items; chats whose initial question is None are skipped.
            dataFileName: output file name without extension.
            stopWordsDict, POS_list: passed through to writeInitialQuestion().

        A line break is written only after a question that produced some
        cleaned text.  The total number of questions written is printed.
    """
    questionCount = 0

    # 'with' closes both output files even if cleaning a question raises.
    with open(dataFileName+".csv", "w") as questionFile, \
         open(dataFileName+".txt", "w") as questionTxtFile:
        for transcriptDialog in transcriptDialogList:
            if transcriptDialog[1] is None:
                continue
            previousCount = questionCount
            questionCount = writeInitialQuestion(transcriptDialog[0], questionFile, questionTxtFile,
                                                 transcriptDialog[1], questionCount,
                                                 stopWordsDict, POS_list)
            if previousCount < questionCount:  # something was written for this chat
                questionFile.write("\n")
                questionTxtFile.write("\n")

    print("Total Question Count:", questionCount, "written to",dataFileName+".txt")


ModuleNotFoundError: No module named 'gensim'

In [9]:
#P1 Script

""" File:  P1_preprocess_data.py
    Description:  Takes as input raw chat data .csv file from the LibChat keeping only the columns:
    Timestamp, Duration (seconds), Initial Question, Message Count, and Transcript

    Additionally the chat text data is "cleaned" by: 
    1) removing timestamps, 
    2) removing chat patron and librarian identifiers, 
    3) removing http tags (e.g., URLs), 
    4) removing non-ASCII characters,
    5) removing stopwords, and 
    6) lemmatized words using nltk.WordNetLemmatizer() 

    Four data-set versions of the “cleaned” chat transcripts were prepared:
    1) "onlyQuestionsFile.txt" - Questions only: consists of only the initial question asked by
        the library patron in each chat transcript
    2) "wholeChatsFile.txt" - Whole chats: consists of the whole cleaned chat transcripts
    3) "wholeChatsFilePOS_N_ADJ.txt" - Whole chats with POS (Noun and Adjective): consists of only
       the nouns and adjectives parts-of-speech (POS) from the whole cleaned chat transcripts
    4) "wholeChatsFilePOS_N_ADJ_V.txt" - Whole chats with POS (Noun, Adjective, and Verb): consists
       of only the nouns, adjectives, and verbs parts-of-speech (POS) from the whole cleaned chat transcripts
    The goal of the first two data sets was to see if looking at only the initial question in the
    chats was better than the whole chats. The goal of the last two data sets was to see if varying
    the parts-of-speech retained had any effect on the topic modeling analyses. 

    Takes as input raw chat data .csv file and produces a list-of-lists called transcriptDialogList with a format:
    [[<excel index int>, "Initial question string", [Transcript split by chat responses, which includes the initial
    question]], ...]. This transcriptDialogList is used to write two text files for each of the four
    data-set versions.  Each chat dialog is used to produce one line in the two text files:
    1) the .csv file has one chat per line formatted as:
       chat line # in original .csv, cleaned and pre-processed text of the chat, and
    2) the .txt file is cleaned and pre-processed text of the chat

"""
import nltk

#from P1_utility_functions import *

def main():
    """Run Phase 1: pre-process a raw chat .csv file into four cleaned data sets.

    Prompts for the raw chat .csv file and the stop-words file, splits the raw
    file into chats, determines each chat's initial question (from the .csv
    column, or else from the dialog) and writes the "onlyQuestionsFile",
    "wholeChatsFile", "wholeChatsFilePOS_N_ADJ" and "wholeChatsFilePOS_N_ADJ_V"
    output files (.csv and .txt each).

    Returns:
        transcriptDialogList: list of [chat index, initial question, dialog list].
    """
    print('Welcome to Phase 1 of the chat analysis which pre-processes a raw chat data .csv file',
          '\nfrom the LibChat keeping only the 5 columns (with column-headings):',
          '\nTimestamp, Duration (seconds), Initial Question, Message Count, and Transcript.',
          '\n\nRunning Phase 1 to pre-process your raw chat data (.csv) will generate four cleaned chat',
          '\nfiles varying the parts of speech or question-only.',
          '\n1) "onlyQuestionsFile.txt" - consists of only the initial questions asked by the library patrons',
          '\n2) "wholeChatsFile.txt" - consists of the whole cleaned chat transcripts',
          '\n3) "wholeChatsFilePOS_N_ADJ.txt" - consists of only the nouns and adjectives parts-of-speech (POS)',
          '\n4) "wholeChatsFilePOS_N_ADJ_V.txt" - consists of only the nouns, adjectives, and verbs parts-of-speech\n')

    prompt = "\nStep 1. Please input the raw LibChat (.csv) file." + \
             '\n(For example: "chatFile.csv"):'
    inputCSVFileName = getFileName(prompt)

    prompt = "\nStep 2. Please input the stop words (.txt) file." + \
             '\n(For example: "stop_words.txt"):'
    stopWordFileName = getFileName(prompt)

    print("\n\nWARNING:  Depending on the size of your chat data file.  This step might take several minutes.")

    stopWordsDict = getStopWords(stopWordFileName)
    transcriptList = readRawChats(inputCSVFileName)

    initialQuestionCount = 0
    transIndex = 2  # Assumes Excel .csv had a column-header in line 1
    transcriptDialogList = []
    for trans in transcriptList:
        transDialogList = generateTranscriptDialogList(trans)
        initialQuestion = findInitialQuestion(trans, transIndex)
        if initialQuestion is None:
            # 'Initial Question' column was empty: look for it in the dialog instead
            initialQuestion = findInitialQuestionInDialog(transDialogList,transIndex)
        else:
            initialQuestionCount+= 1

        transcriptDialogList.append([transIndex, initialQuestion, transDialogList])
        transIndex += 1

    print("Number of initial questions from Initial Question column of .csv:", initialQuestionCount)

    # POS codes: n - noun, a - adjective, v - verb, r - adverb, 'other'
    POS_list = ['n','a','v','r','other']  # keep every part of speech
    writeQuestionsOnlyToFile(transcriptDialogList, "onlyQuestionsFile", stopWordsDict, POS_list)

    writeWholeChatsToFile(transcriptDialogList, "wholeChatsFile", stopWordsDict, POS_list)

    POS_list = ['n','a']  # nouns and adjectives only
    writeWholeChatsToFile(transcriptDialogList, "wholeChatsFilePOS_N_ADJ", stopWordsDict, POS_list)

    POS_list = ['n','a','v']  # nouns, adjectives and verbs only
    writeWholeChatsToFile(transcriptDialogList, "wholeChatsFilePOS_N_ADJ_V", stopWordsDict, POS_list)

    return transcriptDialogList
    
# Run Phase 1 right away; the transcriptDialogList returned by main() is kept in t.
t = main()  # start main running

Welcome to Phase 1 of the chat analysis which pre-processes a raw chat data .csv file 
from the LibChat keeping only the 5 columns (with column-headings): 
Timestamp, Duration (seconds), Initial Question, Message Count, and Transcript. 

Running Phase 1 to pre-process your raw chat data (.csv) will generate four cleaned chat 
files varying the parts of speech or question-only. 
1) "onlyQuestionsFile.txt" - consists of only the initial questions asked by the library patrons 
2) "wholeChatsFile.txt" - consists of the whole cleaned chat transcripts 
3) "wholeChatsFilePOS_N_ADJ.txt" - consists of only the nouns and adjectives parts-of-speech (POS) 
4) "wholeChatsFilePOS_N_ADJ_V.txt" - consists of only the nouns, adjectives, and verbs parts-of-speech


Step 1. Please input the raw LibChat (.csv) file.
(For example: "chatFile.csv"): /content/LibChat.csv

Step 2. Please input the stop words (.txt) file.
(For example: "stop_words.txt"): /root/nltk_data/corpora/stopwords/english


Number of ini

In [22]:
""" File:  P2_utility_functions.py
    Utility functions for Phase 2 which performs the topic modeling.

    Latent Dirichlet Allocation (LDA), PyMallet
    Here we use the LDA implementation from GitHub PyMallet at:
    https://github.com/mimno/PyMallet
    The LDA code below is based on their lda_reference.py code written in Python
    The PyMallet project has an MIT License see below.
================================================================================
MIT License

Copyright (c) 2019 mimno

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
===========================================================================
"""
from pprint import pprint  # pretty-printer
from collections import defaultdict
from gensim import corpora
from six import iteritems

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import pandas as pd
import numpy as np
import os.path

from sklearn.pipeline import Pipeline
from time import time


def getStopWords(stopWordFileName):
    """Read a stop-word text file and return the words as a dict and as a set.

    The file is assumed to have one word per line.  Every line is stripped of
    surrounding whitespace and lower-cased before it is stored.

    Args:
        stopWordFileName: path of the stop-word text file.

    Returns:
        (stopWordDict, stopWordSet): a dict whose keys are the stop words (all
        values None) and a set holding the same words.
    """
    # 'with' releases the file handle even when reading fails part-way through.
    with open(stopWordFileName, 'r') as stopWordFile:
        stopWordDict = {line.strip().lower(): None for line in stopWordFile}

    return stopWordDict, set(stopWordDict)

def getPositiveInteger(prompt):
    """Prompts the user for a valid positive integer which it returns.

    The prompt (followed by a space) is repeated until the input parses as an
    integer greater than zero.  Input that is not an integer prints
    "Invalid positive integer"; an integer that is zero or negative prints
    "Please enter a positive integer." followed by "Invalid positive integer".
    """
    while True:
        inputStr = input(prompt+" ")
        try:
            intValue = int(inputStr)
        except ValueError:
            # Only a failed conversion is handled here; a bare 'except' would
            # also have hidden unrelated errors.
            print("Invalid positive integer")
            continue

        if intValue > 0:
            return intValue

        print("Please enter a positive integer.")
        print("Invalid positive integer")

def getFileName(prompt):
    """Return the first user-entered path that exists.

    The user is asked with prompt followed by one space; a "File not found!"
    message is printed after every entry that does not exist.
    """
    fullPrompt = prompt+" "
    while True:
        candidate = input(fullPrompt)
        if not os.path.exists(candidate):
            print("File not found! Make sure that the file is inside this directory.")
            continue
        return candidate

# used in LDA sklearn
def readChatCorpusFile(chatFileName):
    """ Read specified chat corpus file (which should be a preprocessed
        text file with one chat per line) and returns the documents list
        with each chat being a lower-cased string in the list.

        Every line of the file is kept, including its line terminator; blank
        lines are kept too (the earlier length check never excluded them,
        because a line read from a file is never empty).
    """
    # 'with' guarantees the corpus file is closed after reading.
    with open(chatFileName, 'r') as documentsFile:
        return [documentLine.lower() for documentLine in documentsFile]

def print_top_words(model, feature_names, n_top_words):
    """ Print, for every row of model.components_, its n_top_words highest-valued
        features: first the words on one line, then each word with its value."""
    for topic_idx, topic in enumerate(model.components_):
        # indices of the largest values, largest first
        top_ids = topic.argsort()[:-n_top_words - 1:-1]

        header = "Topic #%d: " % topic_idx
        words_only = " ".join([feature_names[i] for i in top_ids])
        words_scored = " ".join([feature_names[i]+" ("+str(model.components_[topic_idx][i])+")\n"
                                 for i in top_ids])
        print(header + words_only + "\n " + words_scored)
    print()

def write_file_top_words(model, feature_names, n_top_words, fileName):
    """ Writes the specified top topics and top words to the specified fileName.

        Two files are written:
        - fileName: a "File: ..." header followed by one line per topic holding
          its n_top_words highest-valued words,
        - "raw_"+fileName: the same topics, each followed by every word with
          its value from model.components_.

        Returns:
            topicList: list with one space-joined word string per topic.
    """
    topicList = []

    # 'with' closes both files even if writing a topic fails.
    with open("raw_"+fileName, 'w') as outputFile, open(fileName, 'w') as outputFileTopics:
        outputFile.write("File: "+"raw_"+fileName+"\n\n")
        outputFileTopics.write("File: "+fileName+"\n\n")

        for topic_idx, topic in enumerate(model.components_):
            # indices of the largest values, largest first (computed once per topic)
            top_ids = topic.argsort()[:-n_top_words - 1:-1]

            topicStr = " ".join([feature_names[i] for i in top_ids])
            topicList.append(topicStr)
            outputFileTopics.write(topicStr+"\n")

            message = "Topic #%d: " % topic_idx
            message += topicStr + "\n "
            message += " ".join([feature_names[i]+" ("+str(model.components_[topic_idx][i])+")\n"
                                 for i in top_ids])
            outputFile.write(message+"\n")

    return topicList

# used in LSA with gensim
def write_LSA(model_lsi, n_topics, n_words_per_topic, fileName):
    """ Writes the specified top topics and top words to the specified fileName.

        model_lsi.print_topics(n_topics) is called and each returned topic is
        converted to a string.  Words are extracted from that string by looking
        for the pattern *"word".

        Two files are written:
        - "raw_"+fileName: a "File: ..." header and the string of every topic,
        - fileName: a "File: ..." header and, per topic, up to
          n_words_per_topic words, each followed by a space.

        If a topic has fewer than n_words_per_topic words, only the available
        words are written; if fewer than n_topics topics are returned, only
        those are written.

        Returns:
            listOfTopics: list with one word string per topic written.
    """
    topicList = model_lsi.print_topics(n_topics)
    corpus_tfidf_and_lsa_fileName = "raw_" + fileName
    listOfTopics = []

    # 'with' closes both files even if writing a topic fails.
    with open(corpus_tfidf_and_lsa_fileName, 'w') as corpus_tfidf_and_lsa_file, \
         open(fileName, 'w') as outputFile:
        corpus_tfidf_and_lsa_file.write("File: " + corpus_tfidf_and_lsa_fileName + '\n\n')
        outputFile.write("File: " + fileName + '\n\n')

        for topicIndex in range(min(n_topics, len(topicList))):
            line = str(topicList[topicIndex])
            corpus_tfidf_and_lsa_file.write(line+'\n')

            topicString = ""
            startIndex = 0
            for _ in range(n_words_per_topic):
                markerIndex = line.find('*"', startIndex)
                if markerIndex == -1:
                    break  # no more words in this topic
                wordStart = markerIndex + 2
                closingQuote = line.find('"', wordStart)
                if closingQuote == -1:
                    break  # malformed topic string: unterminated word
                topicString += line[wordStart:closingQuote] + " "
                startIndex = closingQuote

            outputFile.write(topicString+"\n")
            listOfTopics.append(topicString)

    return listOfTopics


# create a vector stream to avoid loading the whole vector into memory at one time
class MyCorpus(object):
    """Iterable corpus that converts each document to a bag-of-words on demand.

    Iterating yields, for every string in the documents list, the result of
    dictionary.doc2bow() applied to the lower-cased, whitespace-split document.
    """

    def __init__(self, documentsList, dictionary):
        self.myDictionary = dictionary
        self._docsList = documentsList

    def __iter__(self):
        # one document per list item, tokens separated by whitespace
        for document in self._docsList:
            tokens = document.lower().split()
            yield self.myDictionary.doc2bow(tokens)


def createCorpusDictionary(documentsList, stoplist):
    """Build a gensim dictionary for the documents and a streaming corpus over them.

    The dictionary is built from the lower-cased, whitespace-split documents.
    Tokens whose document frequency is 1 are removed and the ids compacted.
    The stoplist argument is not used.

    Returns:
        (dictionary, corpus): the dictionary and a MyCorpus over documentsList.
    """
    # collect statistics about all tokens, i.e., words
    tokenizedDocs = (document.lower().split() for document in documentsList)
    dictionary = corpora.Dictionary(tokenizedDocs)

    # ids of tokens with a document frequency of exactly one
    singleDocIds = []
    for tokenId, docFreq in iteritems(dictionary.dfs):
        if docFreq == 1:
            singleDocIds.append(tokenId)

    dictionary.filter_tokens(singleDocIds)
    dictionary.compactify()  # close the id gaps left by the removed tokens

    # MyCorpus converts documents one at a time while it is iterated
    streamingCorpus = MyCorpus(documentsList, dictionary)
    return dictionary, streamingCorpus

## functions used in PyMallet_LDA
""" 
    Here we use the LDA implementation from GitHub PyMallet at:
    https://github.com/mimno/PyMallet
    The LDA code below is based on their lda_reference.py code written in Python
    The PyMallet project has an MIT License see below.

    INPUT FILES:
    Previously created preprocessed chat corpus from either:
    1) wholeChatsFilePOS_N_ADJ_V.csv -- preprocessing keeping nouns, adjectives, and verbs
    2) wholeChatsFilePOS_N_ADJ.csv -- preprocessing keeping nouns and adjectives
    3) wholeChatsFile.csv -- NO POS preprocessing so all parts of speech
    4) onlyQuestionsFile.csv -- Only initial question of chats

    OUTPUT FILES:
    1) "raw_" text (.txt) file listing topics with each word scored
    2) "PyMallet_LDA_" text (.txt) file containing only the text for the
       specified number of topics with the specified number of words per topic

MIT License

Copyright (c) 2019 mimno

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

"""

import re, sys, random, math
import numpy as np
from collections import Counter
from timeit import default_timer as timer

from time import time

def sample(documents, vocabulary_size, word_topics, topic_totals, word_counts, num_iterations, n_topics, doc_smoothing = 0.5, word_smoothing = 0.01):
    """Re-sample the topic of every token, num_iterations times, updating counts in place.

    For each token the current assignment is removed from the counts, a new
    topic is drawn with weight
        (doc_topic_counts + doc_smoothing) / (doc_length + n_topics * doc_smoothing)
        * (word_topic_counts + word_smoothing) / (topic_totals + word_smoothing * vocabulary_size)
    and the counts are updated with the new assignment.

    Args:
        documents: list of dicts with keys "topic_counts" (per-topic array for
            the document) and "token_topics" (list of {"word", "topic"} dicts).
        vocabulary_size: number of distinct words.
        word_topics: dict mapping each word to its per-topic count array.
        topic_totals: per-topic array of total token counts.
        word_counts: not used by this function.
        num_iterations: number of passes over all documents.
        n_topics: number of topics.
        doc_smoothing, word_smoothing: smoothing constants added to the counts.

    Returns:
        None; documents, word_topics and topic_totals are modified in place.
    """
    smoothing_times_vocab_size = word_smoothing * vocabulary_size

    for iteration in range(num_iterations):

        for document in documents:

            doc_topic_counts = document["topic_counts"]
            token_topics = document["token_topics"]
            doc_length = len(token_topics)
            for token_topic in token_topics:

                w = token_topic["word"]
                old_topic = token_topic["topic"]
                word_topic_counts = word_topics[w]

                ## erase the effect of this token
                word_topic_counts[old_topic] -= 1
                topic_totals[old_topic] -= 1
                doc_topic_counts[old_topic] -= 1

                ###
                ### SAMPLING DISTRIBUTION
                ###

                ## Does this topic occur often in the document?
                topic_probs = (doc_topic_counts + doc_smoothing) / (doc_length + n_topics * doc_smoothing)
                ## Does this word occur often in the topic?
                topic_probs *= (word_topic_counts + word_smoothing) / (topic_totals + smoothing_times_vocab_size)

                ## sample from an array that doesn't sum to 1.0
                draw = random.uniform(0, np.sum(topic_probs))

                ## Walk the cumulative weights.  The scan is capped at the last
                ## topic so floating-point rounding in the running subtraction
                ## can never index past the end of topic_probs.
                last_topic = len(topic_probs) - 1
                new_topic = 0
                while new_topic < last_topic and draw > topic_probs[new_topic]:
                    draw -= topic_probs[new_topic]
                    new_topic += 1

                ## add back in the effect of this token
                word_topic_counts[new_topic] += 1
                topic_totals[new_topic] += 1
                doc_topic_counts[new_topic] += 1

                token_topic["topic"] = new_topic

def entropy(p):
    """Return the base-2 entropy of p after normalizing it to sum to one.

    Entries that are not greater than zero contribute nothing.
    """
    ## rescale so the vector is a valid probability distribution
    probabilities = p / np.sum(p)

    total = 0.0
    for mass in probabilities:
        if not mass > 0.0:
            continue
        total -= mass * math.log2(mass)

    return total

def print_topic(topic):
    """Print up to 20 words with the highest count for the given topic.

    Each output line is the word's count for the topic, a tab, and the word.

    NOTE(review): `vocabulary` and `word_topics` are read as module-level
    names.  In the code visible here they only exist as local variables of
    PyMallet_LDA, so confirm they are defined globally before calling this.
    """
    sorted_words = sorted(vocabulary, key=lambda w: word_topics[w][topic], reverse=True)

    # Slicing instead of range(20) avoids an IndexError when the vocabulary
    # holds fewer than 20 words.
    for w in sorted_words[:20]:
        print("{}\t{}".format(word_topics[w][topic], w))

def print_all_topics():
    """Print one line per topic holding its (up to) 20 highest-count words.

    Topics 0 .. NUMBER_OF_TOPICS_PRINTED-1 are printed.
    NOTE(review): NUMBER_OF_TOPICS_PRINTED, `vocabulary` and `word_topics` are
    read as module-level names that are not defined in the code visible here -
    confirm they exist before calling this.
    """
    for topic in range(NUMBER_OF_TOPICS_PRINTED):
        def count_in_topic(word, topic=topic):
            return word_topics[word][topic]

        ranked_words = sorted(vocabulary, key=count_in_topic, reverse=True)
        top_words = ranked_words[:20]
        print(" ".join(top_words))


def PyMallet_LDA(docs, n_topics, stoplist=None, num_iterations=100,
                 doc_smoothing=0.5, word_smoothing=0.01):
    """ Builds the initial topic-model state from docs and runs sample() over it.

        Every document string is tokenized and filtered, each surviving token is
        given a random starting topic, and the word/topic/document counts are
        tallied.  The resulting structures are then handed to sample(), which
        updates them in place.

        docs -- iterable of document strings
        n_topics -- number of topics
        stoplist -- optional collection of words to drop (default: drop nothing)
        num_iterations -- number of iterations passed to sample() (default 100)
        doc_smoothing -- document-topic smoothing passed to sample() (default 0.5)
        word_smoothing -- topic-word smoothing passed to sample() (default 0.01)

        Returns (vocabulary, word_topics): the list of distinct kept words and a
        dictionary mapping each word to a numpy array of its per-topic counts.
    """
    if stoplist is None:
        stoplist = set()

    # Raw string: the pattern is unchanged, but "\w" / "\-" are invalid escape
    # sequences in an ordinary string literal.
    word_pattern = re.compile(r"\w[\w\-']*\w|\w")
    word_counts = Counter()

    documents = []
    word_topics = {}
    topic_totals = np.zeros(n_topics)

    for line in docs:
        tokens = word_pattern.findall(line)

        ## remove stopwords, words shorter than 3 characters, and words whose
        ## first character is upper case
        tokens = [w for w in tokens
                  if w not in stoplist and len(w) >= 3 and not w[0].isupper()]
        word_counts.update(tokens)

        doc_topic_counts = np.zeros(n_topics)
        token_topics = []

        for w in tokens:
            ## Generate a topic randomly
            topic = random.randrange(n_topics)
            token_topics.append({"word": w, "topic": topic})

            ## If we haven't seen this word before, initialize it
            if w not in word_topics:
                word_topics[w] = np.zeros(n_topics)

            ## Update counts:
            word_topics[w][topic] += 1
            topic_totals[topic] += 1
            doc_topic_counts[topic] += 1

        documents.append({"original": line, "token_topics": token_topics,
                          "topic_counts": doc_topic_counts})

    ## All documents are tokenized, so the vocabulary is now complete.
    vocabulary = list(word_counts)
    vocabulary_size = len(vocabulary)

    sample(documents, vocabulary_size, word_topics, topic_totals, word_counts,
           num_iterations, n_topics,
           doc_smoothing=doc_smoothing, word_smoothing=word_smoothing)

    return vocabulary, word_topics

def write_PyMallet_LDA(vocabulary, word_topics, n_topics, n_words_per_topic, fileName):
    """ Writes the results of PyMallet LDA to two files and returns the resulting
        topics as strings in topicList.

        fileName receives one line per topic (the top words separated by spaces).
        A second file, named "raw_" + the base name of fileName and placed in the
        same directory, receives the same topic line followed by one
        "count<TAB>word" line per top word.

        vocabulary -- iterable of words
        word_topics -- dictionary word -> per-topic counts
        n_topics -- number of topics to write
        n_words_per_topic -- maximum number of words per topic; fewer are written
                             when the vocabulary is smaller than this
        fileName -- path of the main output file
    """
    # Prefix only the base name so a fileName with a directory part still works.
    directory, baseName = os.path.split(fileName)
    rawFileName = os.path.join(directory, "raw_" + baseName)

    topicList = []
    # "with" guarantees both files are closed even if writing fails part-way.
    with open(fileName, 'w') as outputFile, open(rawFileName, 'w') as outputFileRaw:
        outputFile.write("File: " + fileName + "\n\n")
        outputFileRaw.write("File: " + rawFileName + "\n\n")

        for topic in range(n_topics):
            sorted_words = sorted(vocabulary, key=lambda w: word_topics[w][topic], reverse=True)
            top_words = sorted_words[:n_words_per_topic]
            topicStr = " ".join(top_words)
            topicList.append(topicStr)
            outputFile.write(topicStr + "\n")
            outputFileRaw.write(topicStr + "\n")
            # iterate the slice so a short vocabulary cannot raise IndexError
            for w in top_words:
                outputFileRaw.write("{}\t{}".format(word_topics[w][topic], w) + "\n")

    return topicList

## functions used in the topic coherence metric calculations
import math

# Small constant added to a co-occurrence probability before its logarithm is
# taken in calculatePMI and calculateLCP, so a word pair that never co-occurs
# yields a large negative value instead of log(0).
EPSILON = 0.000000001

def calculateTopicCoherenceMetrics(documentsList, topicsList, stopWordDict=None):
    """ Calculates and returns the topic coherence metrics: averagePMI, averageLCP, averageNZ
        for the set of topics in topicsList and the reference corpus in documentsList.

        The per-topic values and the three averages are also written to the file
        "TC_metrics_.txt" in the current directory.

        documentsList -- iterable of document strings (the reference corpus)
        topicsList -- list of topic strings (words separated by whitespace)
        stopWordDict -- accepted for backward compatibility; not used here

        Returns (averagePMI, averageLCP, averageNZ).  When topicsList is empty all
        three averages are reported as 0.0.
    """
    outputFileName = "TC_metrics_.txt"

    topicsList, topicsCoOccurrenceList, coOccurrenceDict, wordDict = \
        findcoOoccurrencesAndWordsInTopics(topicsList)
    numberOfTopics = len(topicsList)

    docCount = tallycoOoccurrencesAndWordsInDocs(documentsList, coOccurrenceDict, wordDict)
    makeProbabilities(docCount, coOccurrenceDict, wordDict)

    sumPMI = 0.0
    sumLCP = 0.0
    sumNZ = 0
    # "with" closes the report even if one of the metric calculations raises.
    with open(outputFileName, 'w') as outputFile:
        outputFile.write("File: "+outputFileName+"\n\n")

        for topicStr, topicCoOccurrence in zip(topicsList, topicsCoOccurrenceList):
            topicPMI = calculateTopicPMI(topicCoOccurrence, coOccurrenceDict, wordDict)
            topicLCP = calculateTopicLCP(topicCoOccurrence, coOccurrenceDict, wordDict)
            topicNZ = calculateTopicNZ(topicCoOccurrence, coOccurrenceDict)
            outputFile.write(topicStr+"\n")
            outputFile.write("PMI = %.3f  " % (topicPMI))
            outputFile.write("LCP = %.3f  " % (topicLCP))
            outputFile.write("NZ = %d\n" % (topicNZ))
            sumPMI += topicPMI
            sumLCP += topicLCP
            sumNZ += topicNZ

        if numberOfTopics > 0:
            averagePMI = sumPMI/numberOfTopics
            averageLCP = sumLCP/numberOfTopics
            averageNZ = sumNZ/numberOfTopics
        else:
            # no topics: avoid dividing by zero
            averagePMI = averageLCP = averageNZ = 0.0

        outputFile.write("\nAverage PMI of all topics: %.3f\n" % (averagePMI))
        outputFile.write("\nAverage LCP of all topics: %.3f\n" % (averageLCP))
        outputFile.write("\nAverage NZ of all topics: %.3f\n" % (averageNZ))

    return averagePMI, averageLCP, averageNZ

def makeProbabilities(docCount, coOccurrenceDict, wordDict):
    """ Converts, in place, the raw counts held in coOccurrenceDict and wordDict
        into probabilities by dividing every value by docCount.
    """
    for countTable in (coOccurrenceDict, wordDict):
        for key in countTable:
            countTable[key] /= float(docCount)

def calculateTopicPMI(topicCoOccurrenceList, coOccurrenceDict, wordDict):
    """ Calculates and returns a topic's mean PMI over its word pairs.

        topicCoOccurrenceList -- list of word-pair tuples belonging to the topic
        coOccurrenceDict -- word-pair tuple -> co-occurrence probability
        wordDict -- word -> probability

        Returns 0.0 for a topic that has no word pairs (fewer than two words).
    """
    if not topicCoOccurrenceList:
        # nothing to average -- avoid dividing by zero
        return 0.0

    sumPMI = 0.0
    for topicCoOccurrence in topicCoOccurrenceList:
        sumPMI += calculatePMI(topicCoOccurrence, coOccurrenceDict, wordDict)
    return sumPMI/len(topicCoOccurrenceList)

def calculateTopicLCP(topicCoOccurrenceList, coOccurrenceDict, wordDict):
    """ Calculates and returns a topic's mean LCP.

        Each word pair contributes two values, one per word of the pair, so the
        sum is divided by twice the number of pairs.

        topicCoOccurrenceList -- list of word-pair tuples belonging to the topic
        coOccurrenceDict -- word-pair tuple -> co-occurrence probability
        wordDict -- word -> probability

        Returns 0.0 for a topic that has no word pairs (fewer than two words).
    """
    if not topicCoOccurrenceList:
        # nothing to average -- avoid dividing by zero
        return 0.0

    sumLCP = 0.0
    for topicCoOccurrence in topicCoOccurrenceList:
        firstWord, secondWord = topicCoOccurrence
        sumLCP += calculateLCP(firstWord, topicCoOccurrence, coOccurrenceDict, wordDict)
        sumLCP += calculateLCP(secondWord, topicCoOccurrence, coOccurrenceDict, wordDict)
    return sumLCP/(2*len(topicCoOccurrenceList))

def calculateTopicNZ(topicCoOccurrenceList, coOccurrenceDict):
    """ Returns the number of the topic's word pairs whose value in
        coOccurrenceDict is zero.
    """
    return sum(
        1
        for wordPair in topicCoOccurrenceList
        if coOccurrenceDict[wordPair] == 0.0
    )

def calculatePMI(topicCoOccurrence, coOccurrenceDict, wordDict):
    """ Returns the base-10 PMI of the word pair in the topicCoOccurrence tuple:
        log10 of (pair probability + EPSILON) over the product of the two words'
        individual probabilities.
    """
    firstWord, secondWord = topicCoOccurrence
    smoothedJoint = coOccurrenceDict[topicCoOccurrence] + EPSILON
    independent = wordDict[firstWord] * wordDict[secondWord]
    return math.log(smoothedJoint / independent, 10)
        
        
def calculateLCP(word, topicCoOccurrence, coOccurrenceDict, wordDict):
    """ Returns the base-10 LCP of word with respect to the pair in the
        topicCoOccurrence tuple: log10 of (pair probability + EPSILON) over the
        probability of word.
    """
    smoothedJoint = coOccurrenceDict[topicCoOccurrence] + EPSILON
    return math.log(smoothedJoint / wordDict[word], 10)
                
def tallycoOoccurrencesAndWordsInDocs(documentsList, coOccurrenceDict, wordDict):
    """ Runs tallyCoOccurrencesInDoc on every document in documentsList, which
        updates coOccurrenceDict and wordDict in place, and returns how many of
        the documents were reported as non-empty.
    """
    nonEmptyDocs = 0
    for doc in documentsList:
        if tallyCoOccurrencesInDoc(doc, coOccurrenceDict, wordDict):
            continue   # empty document -- not counted
        nonEmptyDocs += 1
    return nonEmptyDocs

def tallyCoOccurrencesInDoc(document, coOccurrenceDict, wordDict):
    """ Tallies, for an individual document, the word-pair co-occurrences in
        coOccurrenceDict and the individual words in wordDict.

        Only keys already present in the two dictionaries are incremented, and
        each distinct word (or pair) is counted at most once per document.  Pairs
        are looked up as (word1, word2) tuples with word1 < word2.

        document -- the document as a whitespace-separated string
        Returns True when the document holds no words, otherwise False.
    """
    from itertools import combinations

    # distinct words in sorted order, so every pair comes out as (smaller, larger)
    uniqueWords = sorted(set(document.split()))
    if not uniqueWords:
        return True   # empty document

    for word in uniqueWords:
        if word in wordDict:
            wordDict[word] += 1

    for coOccurrenceTuple in combinations(uniqueWords, 2):
        if coOccurrenceTuple in coOccurrenceDict:
            coOccurrenceDict[coOccurrenceTuple] += 1

    return False   # not empty document

def findcoOoccurrencesAndWordsInTopics(topicsList):
    """ Breaks every topic string into its words and word pairs.  Returns:
        topicsList - the list passed in (one whole topic per string), unchanged,
        topicsCoOccurrenceList - a list-of-lists; each inner list holds the word pairs (as tuples,
            alphabetically ordered) of one topic,
        coOccurrenceDict - keys are the word-pair tuples found in any topic, every value 0,
        wordDict - keys are the words found in any topic, every value 0."""
    wordDict = {}
    coOccurrenceDict = {}
    topicsCoOccurrenceList = []

    for topicLine in topicsList:
        topicWords = sorted(topicLine.strip().split())
        pairsInTopic = []
        for position, earlierWord in enumerate(topicWords):
            wordDict[earlierWord] = 0
            # pair the word with every word that sorts after it
            for laterWord in topicWords[position + 1:]:
                wordPair = (earlierWord, laterWord)
                coOccurrenceDict[wordPair] = 0
                pairsInTopic.append(wordPair)
        topicsCoOccurrenceList.append(pairsInTopic)

    return topicsList, topicsCoOccurrenceList, coOccurrenceDict, wordDict

def tallyTriOccurrencesInWindow(document, windowSize, triOccurrenceDict, wordFreqDict, stopWordDict):
    """ Tallies the tri-occurrences of non-stop words inside a sliding window
        over one document.

        The first window covers the first windowSize words (or the whole document
        if it is shorter).  After that, every further word closes a window made of
        itself and the windowSize-1 words before it, and is combined with every
        pair of those earlier words.  A triple is counted only when its three
        words are distinct and none is in stopWordDict; it is stored under the
        alphabetically sorted tuple of the three words.

        document -- the document as a list of words
        windowSize -- number of consecutive words forming a window
        triOccurrenceDict -- updated in place: sorted 3-word tuple -> count
        wordFreqDict -- updated in place: word -> number of occurrences
        stopWordDict -- container of words to ignore when forming triples
    """
    wordList = document
    initialChunkSize = min(len(wordList), windowSize)

    def countWord(word):
        wordFreqDict[word] = wordFreqDict.get(word, 0) + 1

    def countTriple(wordA, wordB, wordC):
        if wordA == wordB or wordA == wordC or wordB == wordC:
            return
        if wordA in stopWordDict or wordB in stopWordDict or wordC in stopWordDict:
            return
        triOccurrenceTuple = tuple(sorted((wordA, wordB, wordC)))
        triOccurrenceDict[triOccurrenceTuple] = triOccurrenceDict.get(triOccurrenceTuple, 0) + 1

    # process initial window size or whole line if it is smaller than window size;
    # every word of the window is counted, including the last two, which can
    # never be the first member of a triple
    for first in range(initialChunkSize):
        countWord(wordList[first])
        for second in range(first+1, initialChunkSize):
            for third in range(second+1, initialChunkSize):
                countTriple(wordList[first], wordList[second], wordList[third])

    # slide the window down the whole length of the line
    for nextWordIndex in range(windowSize, len(wordList)):
        countWord(wordList[nextWordIndex])
        # oldest index still inside the window that ends at nextWordIndex
        windowStart = nextWordIndex - windowSize + 1
        # range() stops are exclusive, so second reaches windowStart+1 and third
        # reaches windowStart -- the same window tallyCoOccurrencesInWindow uses
        for second in range(nextWordIndex-1, windowStart, -1):
            for third in range(second-1, windowStart-1, -1):
                countTriple(wordList[nextWordIndex], wordList[second], wordList[third])

def tallyCoOccurrencesInWindow(document, windowSize, coOccurrenceDict, wordFreqDict, stopWordDict):
    """ Tallies the co-occurrences (word pairs) of non-stop words inside a
        sliding window over one document.

        document is a list of words.  wordFreqDict receives the number of
        occurrences of every word; coOccurrenceDict receives, under the
        alphabetically ordered pair, one count for each two different non-stop
        words that fall inside the same window of windowSize words.
    """
    words = document
    firstWindowEnd = min(len(words), windowSize)

    def countWord(word):
        wordFreqDict[word] = wordFreqDict.get(word, 0) + 1

    def countPair(wordA, wordB):
        if wordA == wordB or wordA in stopWordDict or wordB in stopWordDict:
            return
        pairKey = (wordA, wordB) if wordA < wordB else (wordB, wordA)
        coOccurrenceDict[pairKey] = coOccurrenceDict.get(pairKey, 0) + 1

    # first window: every pair of positions inside it
    for left in range(firstWindowEnd):
        countWord(words[left])
        for right in range(left + 1, firstWindowEnd):
            countPair(words[left], words[right])

    # each later word is paired with the windowSize-1 words preceding it
    for newest in range(windowSize, len(words)):
        countWord(words[newest])
        for earlier in range(newest - windowSize + 1, newest):
            countPair(words[newest], words[earlier])

In [None]:
""" File:  P2_unsupervised_topic_modeling.py  

    Description:  Loads a previously created preprocessed chat corpus, then performs
    topic modeling utilizing unsupervised techniques of:
    1) Latent Semantic Analysis (TF-IDF & LSA) using gensim
    2) probabilistic Latent Semantic Analysis (TF-IDF & pLSA) using scikit-learn's NMF
    3) Latent Dirichlet Allocation (LDA) using scikit-learn.org (sklearn) LDA module
    4) Latent Dirichlet Allocation (LDA), PyMallet
    Here we use the LDA implementation from GitHub PyMallet at:
    https://github.com/mimno/PyMallet
    The LDA code below is based on their lda_reference.py code written in Python
    The PyMallet project has an MIT License see below.
================================================================================
MIT License

Copyright (c) 2019 mimno

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
===========================================================================
    INPUT FILES:  User inputs file to process
    Previously created preprocessed chat corpus from P1_preprocess_data.py either:
    1) wholeChatsFilePOS_N_ADJ_V.txt -- preprocessing keeping nouns, adjectives, and verbs
    2) wholeChatsFilePOS_N_ADJ.txt -- preprocessing keeping nouns and adjectives
    3) wholeChatsFile.txt -- NO POS preprocessing so all parts of speech
    4) onlyQuestionsFile.txt -- Only initial question of chats

    OUTPUT FILES for each of the 4 unsupervised topic modeling techniques:
    1) "raw_" text (.txt) file listing topics with each word scored
    2) "LDA_" text (.txt) file containing only the text for the
       specified number of topics with the specified number of words per topic

    OUTPUT FILES to aid the semi-supervised topic modeling techniques of Phase 3:
    1) possible_2_word_anchors.txt most frequent 2-word occurrence across combined topics
       of all four unsupervised topic modeling techniques
    2) possible_3_word_anchors.txt most frequent 3-word occurrence across combined topics
       of all four unsupervised topic modeling techniques      
    
"""
import os.path
from pprint import pprint  # pretty-printer
from collections import defaultdict
from gensim import corpora
from gensim import models
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from time import time

#from P2_utility_functions import *

def main():
    """ Interactive driver for Phase 2.

        Prompts for the pre-processed chat file, the number of topics and the
        number of words per topic, runs each unsupervised technique in turn,
        prints its coherence metrics and topics, and finally writes the files of
        possible 2-word and 3-word anchors built from all the topics found.
    """
    print('Welcome to Phase 2 which runs the unsupervised topic modeling techniques.',
          '\n\nYou should have first run Phase 1 to pre-process your chat data.',
          '\nIt would generate cleaned chat files varying the parts of speech or question-only.',
          '\nFiles generated are: wholeChatsFile.txt, wholeChatsFilePOS_N_ADJ_V.txt,',
          '\nwholeChatsFilePOS_N_ADJ.txt, and onlyQuestionsFile.txt.\n')

    # Step 1: the corpus produced by Phase 1
    prompt = "\nStep 1. Please input the pre-processed (.txt) file." + \
             '\n(For example: "wholeChatsFile.txt"):'
    chats = readChatCorpusFile(getFileName(prompt))

    # technique name -> function(chats, n_topics, n_words_per_topic) returning topic strings
    modelDict = {'PyMallet LDA':run_PyMallet_LDA, 'LDA':runLDA,
                 'TF-IDF & LSA':run_TFIDF_LSA, 'TF-IDF & pLSA':run_TFIDF_pLSA}

    # Steps 2 and 3: size of the models
    n_topics = getPositiveInteger('\nStep 2. Please specify the number of topics. (suggested range 10-20)\n')
    n_words_per_topic = getPositiveInteger('\nStep 3. Please specify the number of words per topics. (suggested range 5-10)\n')

    combinedTopicsAcrossAllTechniques = []
    for modelName, runModel in modelDict.items():
        print("="*35)
        print("\nPerforming", modelName,"topic modeling -- please wait it might take a couple minutes!")
        topicList = runModel(chats, n_topics, n_words_per_topic)
        averagePMI, averageLCP, averageNZ = calculateTopicCoherenceMetrics(chats, topicList)
        print("\nResults for",modelName," TC-PMI %3.3f, TC-LCP %3.3f, TC-NZ %3.3f:" % (averagePMI, averageLCP, averageNZ))
        for topic in topicList:
            print(topic)
        combinedTopicsAcrossAllTechniques.extend(topicList)

    # generate files of possible anchors for semi-supervised topic modeling techniques
    coOccurrenceDict, triOccurrenceDict = generate_Co_and_Tri_occurrence_dictionary(
        combinedTopicsAcrossAllTechniques, n_words_per_topic)
    writeOccurrenceFile(2, coOccurrenceDict)
    writeOccurrenceFile(3, triOccurrenceDict)

def runLDA(documents,n_topics, n_words_per_topic, max_features=1000, stop_words='english'):
    """ Performs LDA topic modeling and return resulting topics as strings in topicList.

        documents -- iterable of document strings
        n_topics -- number of topics to fit
        n_words_per_topic -- number of words written per topic
        max_features -- vocabulary size limit given to CountVectorizer
        stop_words -- stop-word setting given to CountVectorizer

        The topics are written by write_file_top_words (defined elsewhere) to
        "LDA__<n_topics>topics_<n_words_per_topic>words.txt"; its return value is
        returned unchanged.
    """
    # LDA can only use raw term counts for LDA because it is a probabilistic graphical model
    tf_vectorizer = CountVectorizer(max_features=max_features, stop_words=stop_words)
    tf = tf_vectorizer.fit_transform(documents)

    # get_feature_names() is absent from newer scikit-learn releases; use its
    # replacement when the installed version provides it.
    if hasattr(tf_vectorizer, "get_feature_names_out"):
        tf_feature_names = list(tf_vectorizer.get_feature_names_out())
    else:
        tf_feature_names = tf_vectorizer.get_feature_names()

    # Fit the LDA model (the topic count is passed positionally, as before)
    lda_model = LatentDirichletAllocation(n_topics, max_iter=50, learning_method='online',
                                    learning_decay = 0.7,
                                    learning_offset=50.,
                                    random_state=0)
    lda_fit = lda_model.fit(tf)

    fileName = "LDA_"+"_"+str(n_topics)+"topics_"+str(n_words_per_topic)+"words.txt"
    topicList = write_file_top_words(lda_fit, tf_feature_names, n_words_per_topic, fileName)
    return topicList

def run_TFIDF_pLSA(documents,n_topics, n_words_per_topic, max_features=1000, stop_words='english'):
    """ Performs TF-IDF and pLSA topic modeling and return resulting topics as strings in topicList.

        The documents are vectorized with TF-IDF and factorized with NMF using
        the Kullback-Leibler beta loss.

        documents -- iterable of document strings
        n_topics -- number of NMF components (topics)
        n_words_per_topic -- number of words written per topic
        max_features -- accepted but not applied to the vectorizer
                        (NOTE(review): confirm whether it should be)
        stop_words -- stop-word setting given to TfidfVectorizer

        The topics are written by write_file_top_words (defined elsewhere) to
        "TFIDF_pLSA_<n_topics>topics_<n_words_per_topic>words.txt"; its return
        value is returned unchanged.
    """
    # Vectorize raw documents to tf-idf matrix:
    tfidf_vectorizer = TfidfVectorizer(stop_words=stop_words, use_idf=True, smooth_idf=True)
    tfidf = tfidf_vectorizer.fit_transform(documents)

    # NOTE(review): newer scikit-learn releases replace NMF's `alpha` argument
    # with `alpha_W` / `alpha_H`; confirm against the installed version.
    nmf = NMF(n_components=n_topics, random_state=1,
              beta_loss='kullback-leibler', solver='mu', max_iter=1000, alpha=.1,
              l1_ratio=.5).fit(tfidf)

    # get_feature_names() is absent from newer scikit-learn releases; use its
    # replacement when the installed version provides it.
    if hasattr(tfidf_vectorizer, "get_feature_names_out"):
        tfidf_feature_names = list(tfidf_vectorizer.get_feature_names_out())
    else:
        tfidf_feature_names = tfidf_vectorizer.get_feature_names()

    fileName = "TFIDF_pLSA_"+str(n_topics)+"topics_"+str(n_words_per_topic)+"words.txt"
    topicList = write_file_top_words(nmf, tfidf_feature_names, n_words_per_topic, fileName)
    return topicList

def run_PyMallet_LDA(documents, n_topics, n_words_per_topic, fileNameCorpus=""):
    """ Runs PyMallet_LDA on documents, writes the topics with write_PyMallet_LDA
        and returns them as strings in topicList.

        The output file is named
        "PyMallet_LDA_<fileNameCorpus>_<n_topics>topics_<n_words_per_topic>words.txt".
    """
    vocabulary, word_topics = PyMallet_LDA(documents, n_topics)

    nameParts = ["PyMallet_LDA_", fileNameCorpus, "_", str(n_topics),
                 "topics_", str(n_words_per_topic), "words.txt"]
    fileName = "".join(nameParts)

    return write_PyMallet_LDA(vocabulary, word_topics, n_topics, n_words_per_topic, fileName)

def run_TFIDF_LSA(documents,n_topics, n_words_per_topic, lsi_num_topics=300):
    """ Performs TF-IDF and LSA topic modeling and return resulting topics as strings in topicList.

        documents -- the chat corpus, passed to createCorpusDictionary (defined elsewhere)
        n_topics -- number of topics passed to write_LSA
        n_words_per_topic -- number of words per topic passed to write_LSA
        lsi_num_topics -- number of dimensions requested from the LSI model
                          (default 300; 200-500 recommended)

        The topics are written by write_LSA (defined elsewhere) to
        "TFIDF_LSA_<n_topics>topics_<n_words_per_topic>words.txt"; its return
        value is returned unchanged.
    """
    stoplist = set()  # preprocessing removed stop words already...
    dictionary, corpus = createCorpusDictionary(documents, stoplist)
    tfidf = models.TfidfModel(corpus)  # step 1 -- initialize a model

    # Apply a transformation to a whole corpus
    corpus_tfidf = tfidf[corpus]

    # Initialize an LSI transformation
    lsi = models.LsiModel(corpus_tfidf, id2word = dictionary, num_topics=lsi_num_topics)
    # create a double wrapper over chat corpus: bow -> tfidf -> fold-in-lsi
    # NOTE(review): the wrapped corpus is not used below; kept so the call into
    # the trained model still happens exactly as before -- confirm it can go.
    corpus_lsi = lsi[corpus_tfidf]
    fileName = "TFIDF_LSA_"+str(n_topics)+"topics_"+str(n_words_per_topic)+"words.txt"
    topicList = write_LSA(lsi, n_topics, n_words_per_topic, fileName)
    return topicList
    
def generate_Co_and_Tri_occurrence_dictionary(combinedTopicsAcrossAllTechniques,n_words_per_topic):
    """ To aid the semi-supervised topic modeling techniques of Phase 3, determines the
        co-occurrences (2-words) and tri-occurrences (3-words) across the combined
        topics of all the unsupervised topic modeling techniques.

        Every topic string is also written, one per line, to "combinedTopicsFile.txt"
        in the current directory.

        combinedTopicsAcrossAllTechniques -- iterable of topic strings
        n_words_per_topic -- used as the window size for the tallies

        Returns (coOccurrenceDict, triOccurrenceDict), each mapping a sorted word
        tuple to its count.
    """
    coOccurrenceDict = {}
    triOccurrenceDict = {}
    # The tally functions require a word-frequency dictionary each; the
    # frequencies themselves are not used afterwards.
    wordFreqDict = {}
    wordFreqDict2 = {}
    windowSize = n_words_per_topic
    stopWordDict = {}  #stop words previously removed

    # "with" closes the file even if one of the tallies raises.
    with open("combinedTopicsFile.txt", 'w') as combinedTopicsFile:
        for topic in combinedTopicsAcrossAllTechniques:
            document = topic.split()
            tallyTriOccurrencesInWindow(document, windowSize, triOccurrenceDict, wordFreqDict, stopWordDict)
            tallyCoOccurrencesInWindow(document, windowSize, coOccurrenceDict, wordFreqDict2, stopWordDict)
            combinedTopicsFile.write(topic+"\n")

    return coOccurrenceDict, triOccurrenceDict

def writeOccurrenceFile(occurrenceSize, occurrenceDict, maxEntries=50):
    """ Called twice to generate two files to aid Phase 3 semi-supervised topic modeling:
        1) possible_2_word_anchors.txt most frequent 2-word occurrence across combined
           topics of all four unsupervised topic modeling techniques, and
        2) possible_3_word_anchors.txt most frequent 3-word occurrence across combined
           topics of all four unsupervised topic modeling techniques.

        occurrenceSize -- number of words per tuple (used in the file name and header)
        occurrenceDict -- dictionary word tuple -> count
        maxEntries -- maximum number of tuples listed (default 50)

        Tuples are listed from highest to lowest count; equal counts are ordered
        by the word tuple, also descending.
    """
    # "with" closes the file even if writing fails part-way.
    with open("possible_"+str(occurrenceSize)+"_word_anchors.txt", 'w') as occurrencesFile:
        occurrencesFile.write("Possible "+str(occurrenceSize)+" word anchors for semi-supervised topic modeling.\n")
        # The techniques listed are the unsupervised ones of Phase 2.
        occurrencesFile.write("Found from most frequently occuring "+ str(occurrenceSize)+ "-word occurrences from\n" +
                              "all topics found by unsupervised topic modeling techniques:\n" +
                              "LDA, PyMallet_LDA, pLSA, and LSA\n\n")

        countList = sorted(((count, wordTuple) for wordTuple, count in occurrenceDict.items()),
                           reverse=True)
        for count, wordTuple in countList[:maxEntries]:
            occurrencesFile.write("tuple count: %d  words %s\n" % (count, str(wordTuple)))
    

# Start the interactive Phase 2 run as soon as this cell is executed.
main()   

Welcome to Phase 2 which runs the unsupervised topic modeling techniques. 

You should have first run Phase 1 to pre-process your chat data. 
It would generate cleaned chat files varying the parts of speech or question-only. 
Files generated are: wholeChatsFile.txt, wholeChatsFilePOS_N_ADJ_V.txt, 
wholeChatsFilePOS_N_ADJ.txt, and onlyQuestionsFile.txt.


Step 1. Please input the pre-processed (.txt) file.
(For example: "wholeChatsFile.txt"): /content/wholeChatsFile.txt

Step 2. Please specify the number of topics. (suggested range 10-20)
 10

Step 3. Please specify the number of words per topics. (suggested range 5-10)
 5

Performing PyMallet LDA topic modeling -- please wait it might take a couple minutes!

Results for PyMallet LDA  TC-PMI 0.000, TC-LCP 0.000, TC-NZ 0.000:
rod how book would link
rod book check article study
will day find check article
how search rod online work
what access article send answer
find article link moment would
rod how would link place
ill journal questio




Results for LDA  TC-PMI 0.000, TC-LCP 0.000, TC-NZ 0.000:
rod article check search book
rod article search book check
rod article book link wa
rod article book search check
rod article search link work
rod article search book check
rod article link check search
rod article book search check
rod book article link search
rod search book link article

Performing TF-IDF & LSA topic modeling -- please wait it might take a couple minutes!


ValueError: ignored

In [14]:
#!pip install corextopic
#!pip install lda

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting lda
  Downloading lda-2.0.0-cp37-cp37m-manylinux1_x86_64.whl (351 kB)
[K     |████████████████████████████████| 351 kB 3.4 MB/s 
Collecting pbr<4,>=0.6
  Downloading pbr-3.1.1-py2.py3-none-any.whl (99 kB)
[K     |████████████████████████████████| 99 kB 8.7 MB/s 
[?25hInstalling collected packages: pbr, lda
Successfully installed lda-2.0.0 pbr-3.1.1


In [18]:
#!pip show lda

#For Workaround: Drop the Python Files from GuidedLDA_WorkAround Git Repo in the lda folder

In [28]:
#P3 utility functions



""" File:  P3_utility_functions.py  

    Description:  Utility functions to perform
    semi-supervised topic modeling utilizing CorEx and GuidedLDA.

    Acknowledgements:

    Here we use the CorEx (Correlation Explanation) package available at GitHub:
    https://github.com/gregversteeg/corex_topic

    Here we use the GuidedLDA package available at GitHub:
    https://github.com/vi3k6i5/GuidedLDA
    NOTE:  We had difficulty installing GuidedLDA, but we were finally successful
    by following the work-around posted at:
    https://github.com/dex314/GuidedLDA_WorkAround

"""
import os.path
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from corextopic import corextopic as ct
import pandas as pd
import nltk
from time import time

import re, sys, random, math
import numpy as np
from lda import guidedlda as glda
from lda import glda_datasets as gldad

from collections import Counter
from timeit import default_timer as timer


def readAnchorsFile(fileName):
    """ Reads anchor/seeds from fileName and returns list-of-lists anchorList.

        Each non-blank line of the file becomes one inner list holding the line's
        whitespace-separated words; blank lines are skipped.
    """
    anchorList = []
    # "with" closes the file even if reading fails part-way.
    with open(fileName, 'r') as anchorFile:
        for line in anchorFile:
            wordList = line.split()
            if wordList:
                anchorList.append(wordList)

    return anchorList

def run_GuidedLDA(chats, anchorList, n_topics, n_words_per_topic,SEED_CONFIDENCE=0.75):
    """ Perform GuidedLDA on corpus from chats using anchorList.
        Returns topics as strings in topicList.

        chats -- iterable of document strings (words separated by whitespace);
                 documents without any word are skipped
        anchorList -- list-of-lists of seed words; the words of the i-th inner
                      list are seeded to topic i.  Seed words that do not occur
                      in chats are ignored.
        n_topics -- number of topics to fit
        n_words_per_topic -- number of top words kept per topic
        SEED_CONFIDENCE -- seed confidence passed to the model's fit()

        The topics are also written, one per line, to a file whose name starts
        with "GuidedLDA_seeds_".
    """
    # Build the vocabulary (word -> column index) and keep the tokens of every
    # non-empty document.
    word2id = {}
    wordList = []
    docs = []
    for documentLine in chats:
        tokens = documentLine.split()
        if not tokens:
            continue
        for word in tokens:
            if word not in word2id:
                word2id[word] = len(wordList)
                wordList.append(word)
        docs.append(tokens)
    vocab = tuple(wordList)

    # Document-term count matrix.  It must start at zero: np.ndarray(shape=...)
    # would hand back uninitialised memory for the += below to build on.
    X = np.zeros((len(docs), len(word2id)), dtype=int)
    for docIndex, tokens in enumerate(docs):
        for word in tokens:
            X[docIndex][word2id[word]] += 1

    seed_topic_list = anchorList
    model = glda.GuidedLDA(n_topics=n_topics, n_iter=100,
                           random_state=7, refresh=20)

    # word id -> seeded topic id; anchor words missing from the corpus have no
    # word id and are skipped (run_CorEx filters its anchors the same way)
    seed_topics = {}
    for t_id, st in enumerate(seed_topic_list):
        for word in st:
            if word in word2id:
                seed_topics[word2id[word]] = t_id

    model.fit(X, seed_topics=seed_topics, seed_confidence=SEED_CONFIDENCE)

    # Write to file the results of GuidedLDA
    fileName = "GuidedLDA_seeds_"+str(len(seed_topic_list))+"_confidence_"+ \
               str(SEED_CONFIDENCE)+"_"+str(n_topics) +"topics_"+str(n_words_per_topic)+"words.txt"
    topicList = []
    topic_word = model.topic_word_
    vocabArray = np.array(vocab)
    # "with" closes the file even if writing fails part-way.
    with open(fileName, 'w') as outputFile:
        outputFile.write("File: " + fileName +"\n\n")
        for topic_dist in topic_word:
            # highest-weighted words first
            topic_words = vocabArray[np.argsort(topic_dist)][:-(n_words_per_topic+1):-1]
            topicStr = '{}'.format(' '.join(topic_words))
            topicList.append(topicStr)
            outputFile.write(topicStr+"\n")
    return topicList

def _report_corex_topics(model, n_words_per_topic, fileName, heading):
    """ Prints the topics of a fitted CorEx model and writes them to fileName.
        The file starts with a "File: <name>" header followed by one topic per line.
        Returns the topics as a list of space-separated strings.
    """
    topicList = []
    # The with-statement guarantees the file is closed even if CorEx or a write raises.
    with open(fileName, 'w') as outputFile:
        outputFile.write("File: " + fileName + "\n\n")
        print(heading)
        for i, topic_ngrams in enumerate(model.get_topics(n_words=n_words_per_topic)):
            # Keep only the entries whose second field is positive; the first
            # field is the n-gram text that gets printed and saved.
            ngrams = [ngram[0] for ngram in topic_ngrams if ngram[1] > 0]
            topicStr = " ".join(ngrams)
            topicList.append(topicStr)
            print("Topic #{}: {}".format(i + 1, ", ".join(ngrams)))
            outputFile.write(topicStr + "\n")
    return topicList


def run_CorEx(documents, anchorList, n_topics, n_words_per_topic):
    """ Performs CorEx on corpus documents, first without anchors and then using anchorList.

        Parameters:
            documents - iterable of document strings handed to the vectorizer
            anchorList - list of anchor groups, each group a list of anchor words
            n_topics - number of CorEx topics (n_hidden)
            n_words_per_topic - number of words requested per topic

        Side effects:
            Prints both sets of topics and writes them to the text files
            "CorEx_no_anchors_..." and "CorEx_anchors_..." in the current directory.

        Returns the topics of the anchored model as strings in topicList.
    """
    # CorEx is fed a binary bag-of-words matrix: with binary=True, use_idf=False
    # and norm=None the TfidfVectorizer only records word presence per document.
    vectorizer = TfidfVectorizer(max_df=.5, min_df=10, max_features=None,
    ##    ngram_range=(1, 2),  for bi-grams
    ##    ngram_range=(1,3),   for bi-grams and tri-grams
        ngram_range=(1,1),     # for no bi-grams or tri-grams
        norm=None,
        binary=True,
        use_idf=False,
        sublinear_tf=False
    )

    # Fit the vectorizer to the chat corpus and vectorize it
    tfidf = vectorizer.fit_transform(documents)

    # get_feature_names() was removed from newer scikit-learn releases in favour
    # of get_feature_names_out(); fall back to the old name on old installations.
    if hasattr(vectorizer, "get_feature_names_out"):
        vocab = list(vectorizer.get_feature_names_out())
    else:
        vocab = vectorizer.get_feature_names()

    # Apply CorEx with no anchors for a comparison
    model = ct.Corex(n_hidden=n_topics, seed=42) # n_hidden specifies the # of topics
    model = model.fit(tfidf, words=vocab)

    # Display and write to file the results of CorEx with no anchors
    # NOTE: the "topoics" spelling is kept so existing output file names do not change.
    fileName = "CorEx_no_anchors_"+str(n_topics)+"topoics_"+str(n_words_per_topic)+"words.txt"
    _report_corex_topics(model, n_words_per_topic, fileName,
                         "\nCorEx Topics with no anchors:")

    ## remove anchor words that are not in the chat corpus
    ## (a group whose words are all missing stays in place as an empty list,
    ##  so the number and order of anchor groups is unchanged)
    vocabSet = set(vocab)   # constant-time membership tests
    anchors = [
        [a for a in topic if a in vocabSet]
        for topic in anchorList
    ]

    model = ct.Corex(n_hidden=n_topics, seed=42)
    model = model.fit(
        tfidf,
        words=vocab,
        anchors=anchors, # Pass the anchors in here
        anchor_strength=3 # Tell the model how much it should rely on the anchors
    )

    # Display and write to file the results of CorEx with anchors
    fileName = "CorEx_anchors_"+str(len(anchors))+"_"+str(n_topics) \
               +"topoics_"+str(n_words_per_topic)+"words.txt"
    return _report_corex_topics(model, n_words_per_topic, fileName,
                                "\nCorEx Topics with anchors:")

In [30]:
#P3

""" File:  P3_semi_supervised_topic_modeling.py  

    Description:  Loads a previously created pre-processed chat corpus, then performs
    semi-supervised topic modeling utilizing CorEx and GuidedLDA.

    INPUT FILES:
    0) anchors.txt - anchor/seed words each on their own line
    
    Previously created preprocessed chat corpus from either:
    1) wholeChatsFilePOS_N_ADJ_V.txt -- preprocessing keeping nouns, adjectives, and verbs
    2) wholeChatsFilePOS_N_ADJ.txt -- preprocessing keeping nouns and adjectives
    3) wholeChatsFile.txt -- NO POS preprocessing so all parts of speech
    4) onlyQuestionsFile.txt -- Only initial question of chats

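    OUTPUT FILES:
    1) "GuidedLDA_seeds_..." text (.txt) file containing the GuidedLDA topics, one per line
    2) "CorEx_no_anchors_..." text (.txt) file containing the CorEx topics found without anchors
    3) "CorEx_anchors_..." text (.txt) file containing the CorEx topics found with anchors
    Each file name also encodes the specified number of topics and number of words per topic.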

    Acknowledgements:

    Here we used the CorEx (Correlation Explanation) package, available on GitHub:
    https://github.com/gregversteeg/corex_topic

    Here we used the GuidedLDA package, available on GitHub:
    https://github.com/vi3k6i5/GuidedLDA
    NOTE:  We had difficulty installing GuidedLDA, but we were finally successful
    by following the work-around posted at:
    https://github.com/dex314/GuidedLDA_WorkAround

"""
import os.path
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from corextopic import corextopic as ct
import pandas as pd
import nltk
from time import time

import re, sys, random, math
import numpy as np
from lda import guidedlda as glda
from lda import glda_datasets as gldad

from collections import Counter
from timeit import default_timer as timer


# from P2_utility_functions import *
# from P3_utility_functions import *

def main():
    """ Interactive driver for Phase 3 (semi-supervised topic modeling).

        Prompts for the pre-processed chat corpus file, the anchors/seeds file,
        the number of topics and the number of words per topic, then runs each
        model in modelDict (GuidedLDA, then CorEx) on the corpus and prints the
        topic-coherence metrics and the topics each model produced.
    """
    print('Welcome to Phase 3 which runs the semi-supervised topic modeling techniques.',
          '\n\nYou should have first run Phase 1 to pre-process your chat data.',
          '\nIt would generate cleaned chat files varying the parts of speech or question-only.',
          '\nFiles generated are: wholeChatsFile.txt, wholeChatsFilePOS_N_ADJ_V.txt,',
          '\nwholeChatsFilePOS_N_ADJ.txt, and onlyQuestionsFile.txt.\n\n')
    print('\n\nYou could have also run Phase 2 to execute unsupervised topic modeling techniques.',
          '\nIt would generate files: possible_2_word_anchors.txt and possible_3_word_anchors.txt which',
          '\nyou might use to create a text-file (.txt) with anchors one per line.\n')

    # Step 1: the pre-processed chat corpus
    prompt = "\nStep 1. Please input the pre-processed (.txt) file." + \
             '\n(For example: "wholeChatsFile.txt"):'
    fileName = getFileName(prompt)
    chats = readChatCorpusFile(fileName)

    # Step 2: the anchor/seed words that guide both models
    prompt = "\nStep 2. Please input the anchors/seeds (.txt) file." + \
             '\n(For example: "anchors.txt"):'
    fileName = getFileName(prompt)
    anchorList = readAnchorsFile(fileName)

    # Every runner is called with the same (chats, anchorList, n_topics, n_words_per_topic)
    # arguments and its result is used as a list of topic strings.
    modelDict = {'GuidedLDA':run_GuidedLDA,'CorEx':run_CorEx}

    n_topics = getPositiveInteger('\nStep 3. Please specify the number of topics. (suggested range 10-20)\n')
    n_words_per_topic = getPositiveInteger('\nStep 4. Please specify the number of words per topics. (suggested range 5-10)\n')

    # Dicts keep insertion order, so GuidedLDA runs before CorEx.
    for model, runModel in modelDict.items():
        print("="*35)
        print("\nPerforming", model,"topic modeling -- please wait it might take a couple minutes!")
        topicList = runModel(chats, anchorList, n_topics, n_words_per_topic)
        averagePMI, averageLCP, averageNZ = calculateTopicCoherenceMetrics(chats, topicList)
        print("\nResults for",model," TC-PMI %3.3f, TC-LCP %3.3f, TC-NZ %3.3f:" % (averagePMI, averageLCP, averageNZ))
        for topic in topicList:
            print(topic)


# Only start the interactive session when this code is executed directly
# (script or notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()

Welcome to Phase 3 which runs the semi-supervised topic modeling techniques. 

You should have first run Phase 1 to pre-process your chat data. 
It would generate cleaned chat files varying the parts of speech or question-only. 
Files generated are: wholeChatsFile.txt, wholeChatsFilePOS_N_ADJ_V.txt, 
wholeChatsFilePOS_N_ADJ.txt, and onlyQuestionsFile.txt.




You could have also run Phase 2 to execute unsupervised topic modeling techniques. 
It would generate files: possible_2_word_anchors.txt and possible_3_word_anchors.txt which 
you might use to create a text-file (.txt) with anchors one per line.


Step 1. Please input the pre-processed (.txt) file.
(For example: "wholeChatsFile.txt"): /content/wholeChatsFile.txt

Step 2. Please input the anchors/seeds (.txt) file.
(For example: "anchors.txt"): /content/anchors.txt

Step 3. Please specify the number of topics. (suggested range 10-20)
 10

Step 4. Please specify the number of words per topics. (suggested range 5-10)
 5

Performing G



ValueError: ignored