In [1]:
import os, re, string
from nltk import FreqDist, sent_tokenize, word_tokenize, pos_tag, pos_tag_sents
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords, wordnet as wn
from nltk.data import load
import numpy as np
from gensim.corpora import Dictionary
from gensim.models import LdaModel, TfidfModel

In [2]:
class LDA_Model:
    """Preprocess a directory of text files and train a gensim LDA model.

    Constructing an instance runs the whole preprocessing pipeline: every
    file is read, lower-cased and tokenized, tokens are filtered by part of
    speech, stopwords are removed, tokens are filtered by shape, then
    stemmed/lemmatized, and finally a weighted bag-of-words corpus is built.

    ``self.params`` is a positional list. When it comes from a parameter
    file, line ``i`` supplies entry ``i`` (only the first whitespace-separated
    word of the line is used); a blank or missing line selects the default:

        0  number of topics                               (default 8)
        1  term weighting: 'T', 'B' or 't'                (default 'B')
        2  LDA ``alpha``                                  (default "auto")
        3  token filter: 'A', 'a', 'N' or 'n'             (default 'n')
        4  part-of-speech filter: 'F', 'N' or 'n';
           any other value keeps every tag                (default 'A')
        5  normalization: 'N' none, 'B' stem, 'L' lemma   (default 'L')
        6  stopwords: "none", "nltk" or a word-list path  (default 'nltk')
    """

    # Large enough to make ``show_topic`` return every term of the vocabulary.
    _ALL_TERMS = 10000000

    # constructor
    def __init__(self, directory, output, params):
        """Run the preprocessing pipeline.

        Args:
            directory: folder whose files make up the corpus.
            output: filename prefix used by ``save_model``.
            params: path of the parameter file, or "" to use the defaults.
        """
        # set class member variables
        self.directory = directory + '/'
        self.output = output
        self.params = self.__get_params(params)

        # initialize helpful variables
        self.docs = self.__get_documents(directory)
        self.num_docs = len(self.docs)
        self.swlist = self.__get_stopwords(self.params[6])

        # get tokens and sentences from corpus of documents
        self.all_tokens = []
        self.all_sentences = self.__read_files()
        # POS tagging runs on the untouched sentences; stopwords are removed
        # only afterwards, once before and once after lemmatizing/stemming.
        self.__filter_by_pos(self.params[4])
        self.__remove_stopwords(1)
        self.__filter_out_tokens(self.params[3])
        self.__lemmatize_or_stem(self.params[5])
        self.__remove_stopwords(2)

        # get vector model
        self.__get_vector_model(self.params[1])

    # private helper methods

    # read in and process parameters text file
    def __get_params(self, params_file_name):
        """Return the positional parameter list (see the class docstring).

        Args:
            params_file_name: path of the parameter file, or "" for defaults.

        Returns:
            A list of 7 values. The topic count is always an ``int``; alpha
            is a ``float`` when the file gives a number, otherwise the string
            as written (e.g. "auto").

        Raises:
            ValueError: if the topic count in the file is not an integer.
        """
        defaults = [8, 'B', "auto", 'n', 'A', 'L', 'nltk']

        if params_file_name == "":
            return defaults

        # read parameters file (closed as soon as the lines are in memory)
        with open(params_file_name, 'r') as params_file:
            params_lines = params_file.readlines()

        # Start from the defaults so short files and blank lines fall back
        # cleanly; lines beyond the known parameters are ignored.
        params = list(defaults)
        for idx, param_line in enumerate(params_lines[:len(defaults)]):
            param_vals = param_line.strip().split()
            if param_vals:
                params[idx] = param_vals[0]

        # Values read from the file are strings; normalize the numeric ones so
        # they have the same types as the defaults.
        params[0] = int(params[0])
        try:
            params[2] = float(params[2])
        except ValueError:
            pass  # symbolic alpha such as "auto" is passed through unchanged

        return params

    # get list of documents
    def __get_documents(self, path):
        """Return the names of the regular files in ``path``, sorted.

        Sub-directories are skipped because they cannot be read as documents;
        sorting makes the document order independent of the file system.
        """
        return sorted(
            name for name in os.listdir(path)
            if os.path.isfile(os.path.join(path, name))
        )

    # returns set of stopwords (empty, nltk default, or through text file)
    def __get_stopwords(self, stopword):
        """Build the stopword set.

        Args:
            stopword: "none" for no stopwords, "nltk" for NLTK's English
                list, or the path of a text file whose words are separated
                by any of ``; , . -``, newline or space.

        Returns:
            A set of lower-case stopwords containing only the letters a-z.
        """
        if stopword == "none":
            # return empty set if no stopwords provided
            return set()
        if stopword == "nltk":
            # return default nltk set of english stopwords
            return set(stopwords.words("english"))

        # populate stopword set from text file
        non_letters = re.compile('[^a-z]')
        separators = re.compile(r'[;,.\-\n ]')
        swlist = set()
        with open(stopword) as f:
            for line in f:
                # Lower-case first: corpus tokens are lower-cased, and the
                # a-z filter would otherwise drop capitals ("The" -> "he").
                for word in separators.split(line.lower()):
                    cleaned = non_letters.sub('', word)
                    if cleaned:
                        swlist.add(cleaned)
        return swlist

    # removes stopwords from list of tokens
    def __remove_stopwords(self, param):
        """Drop every token found in ``self.swlist``.

        Args:
            param: 1 while tokens are still grouped by sentence
                (doc -> sentence -> token); 2 once ``self.all_tokens`` is one
                flat token list per document. Other values do nothing.
        """
        swlist = self.swlist

        if param == 1:
            def strip_sentences(docs):
                return [
                    [[token for token in sent if token not in swlist] for sent in doc]
                    for doc in docs
                ]

            self.all_sentences = strip_sentences(self.all_sentences)
            # The pipeline continues from self.all_tokens, so the
            # sentence-form tokens must be filtered as well; otherwise
            # stopwords reach the stemmer and come out in a form the second
            # pass no longer recognizes.
            self.all_tokens = strip_sentences(self.all_tokens)
        elif param == 2:
            self.all_tokens = [
                [token for token in tokens if token not in swlist]
                for tokens in self.all_tokens
            ]

    # reads in text from files
    def __read_files(self):
        """Read every document and return it as tokenized sentences.

        Returns:
            One entry per name in ``self.docs``; each entry is a list of
            sentences and each sentence a list of lower-cased tokens.
        """
        tokenized_docs = []

        for file in self.docs:
            with open(os.path.join(self.directory, file), 'r') as in_file:
                text = in_file.read()
            sentences = sent_tokenize(text.lower())
            tokenized_docs.append([word_tokenize(sent) for sent in sentences])

        return tokenized_docs

    # filter by part of speech
    def __filter_by_pos(self, param):
        """Tag ``self.all_sentences`` and keep tokens with an allowed tag.

        Args:
            param: 'F' keeps nouns, verbs, adjectives and adverbs; 'N' keeps
                nouns and adjectives; 'n' keeps nouns; any other value keeps
                every tag listed in NLTK's Penn Treebank tagset.

        Sets ``self.all_tokens`` (doc -> sentence -> token) and
        ``self.all_sent_dicts`` (doc -> sentence -> {token: tag}).
        """
        adjective = ['JJ', 'JJR', 'JJS']
        verb = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
        noun = ['NN', 'NNP', 'NNPS', 'NNS']
        adverb = ['RB', 'RBR', 'RBS']

        if param == 'F':
            allowed = noun + verb + adjective + adverb
        elif param == 'N':
            allowed = noun + adjective
        elif param == 'n':
            allowed = noun
        else:
            # The full tagset is only loaded when it is actually needed.
            allowed = load('help/tagsets/upenn_tagset.pickle').keys()
        allowed_tags = set(allowed)

        tagged_docs = [pos_tag_sents(sentences) for sentences in self.all_sentences]

        all_new_tokens = []
        all_sent_dicts = []

        for doc in tagged_docs:
            doc_words = []
            doc_dicts = []

            for sentence in doc:
                doc_words.append([word for word, tag in sentence if tag in allowed_tags])
                # token -> tag lookup used later by the lemmatizer; when a
                # token repeats in a sentence the last tag wins
                doc_dicts.append(dict(sentence))

            all_new_tokens.append(doc_words)
            all_sent_dicts.append(doc_dicts)

        self.all_tokens = all_new_tokens  # currently in sentence form
        self.all_sent_dicts = all_sent_dicts

    # new preprocessing function
    def __filter_out_tokens(self, param):
        """Filter sentence-form tokens by their character content.

        Args:
            param:
                'A' keeps tokens that are alphanumeric or longer than one
                    character;
                'a' as 'A', then deletes every non-alphanumeric character
                    with code point below 256 and drops tokens left empty;
                'N' keeps alphanumeric tokens only;
                'n' keeps alphanumeric tokens that are not all digits.

        Raises:
            ValueError: for any other ``param``.
        """
        delchars = ''.join(c for c in map(chr, range(256)) if not c.isalnum())
        strip_table = str.maketrans('', '', delchars)

        def word_like(token):
            return token.isalnum() or len(token) > 1

        def keep_word_like(sentence):
            return [token for token in sentence if word_like(token)]

        def keep_and_strip(sentence):
            stripped = (token.translate(strip_table) for token in sentence if word_like(token))
            # a token made only of punctuation (e.g. "--") strips to ""
            return [token for token in stripped if token]

        def keep_alnum(sentence):
            return [token for token in sentence if token.isalnum()]

        def keep_alnum_non_digit(sentence):
            return [token for token in sentence if token.isalnum() and not token.isdigit()]

        filters = {
            'A': keep_word_like,
            'a': keep_and_strip,
            'N': keep_alnum,
            'n': keep_alnum_non_digit,
        }
        if param not in filters:
            raise ValueError("unknown token filter parameter: " + repr(param))
        select = filters[param]

        # Every sentence is kept, even when empty, so sentence positions stay
        # aligned with self.all_sent_dicts.
        self.all_tokens = [[select(sentence) for sentence in doc] for doc in self.all_tokens]

    # lemmatize or stem tokens
    def __lemmatize_or_stem(self, param):
        """Normalize tokens and flatten each document to one token list.

        Args:
            param: 'N' leaves tokens unchanged, 'B' applies the Porter
                stemmer, 'L' applies the WordNet lemmatizer using the POS
                tag recorded by ``__filter_by_pos``.

        Raises:
            ValueError: for any other ``param``.
        """
        if param == 'N':
            new_tokens = [
                [token for sentence in doc for token in sentence]
                for doc in self.all_tokens
            ]
        elif param == 'B':
            stemmer = PorterStemmer()
            new_tokens = [
                [stemmer.stem(token) for sentence in doc for token in sentence]
                for doc in self.all_tokens
            ]
        elif param == 'L':
            tag_map = {
                'CD': wn.NOUN,    # cardinal number (one, two)
                'EX': wn.ADV,     # existential 'there' (there)
                'IN': wn.ADV,     # preposition/sub-conj (of, in, by)
                'JJ': wn.ADJ,     # adjective (yellow)
                'JJR': wn.ADJ,    # adj., comparative (bigger)
                'JJS': wn.ADJ,    # adj., superlative (wildest)
                'NN': wn.NOUN,    # noun, sing. or mass (llama)
                'NNS': wn.NOUN,   # noun, plural (llamas)
                'NNP': wn.NOUN,   # proper noun, sing. (IBM)
                'NNPS': wn.NOUN,  # proper noun, plural (Carolinas)
                'PDT': wn.ADJ,    # predeterminer (all, both)
                'RB': wn.ADV,     # adverb (quickly, never)
                'RBR': wn.ADV,    # adverb, comparative (faster)
                'RBS': wn.ADV,    # adverb, superlative (fastest)
                'RP': wn.ADJ,     # particle (up, off)
                'VB': wn.VERB,    # verb base form (eat)
                'VBD': wn.VERB,   # verb past tense (ate)
                'VBG': wn.VERB,   # verb gerund (eating)
                'VBN': wn.VERB,   # verb past participle (eaten)
                'VBP': wn.VERB,   # verb non-3sg pres (eat)
                'VBZ': wn.VERB,   # verb 3sg pres (eats)
            }

            lemmatizer = WordNetLemmatizer()

            new_tokens = []
            for doc_idx, doc in enumerate(self.all_tokens):
                lemmatized_doc = []
                for sent_idx, sentence in enumerate(doc):
                    sent_tags = self.all_sent_dicts[doc_idx][sent_idx]
                    for token in sentence:
                        # .get(): token filter 'a' may have rewritten the
                        # token, in which case it has no recorded tag.
                        wn_pos = tag_map.get(sent_tags.get(token))
                        if wn_pos is None:
                            lemmatized_doc.append(lemmatizer.lemmatize(token))
                        else:
                            lemmatized_doc.append(lemmatizer.lemmatize(token, wn_pos))
                new_tokens.append(lemmatized_doc)
        else:
            raise ValueError("unknown lemmatize/stem parameter: " + repr(param))

        self.all_tokens = new_tokens  # now in document form

    def __get_vector_model(self, param):
        """Build the dictionary, bag-of-words corpus and weighted corpus.

        Args:
            param: selects the SMART scheme passed to gensim's ``TfidfModel``:
                'T' -> 'nfn', 'B' -> 'bnn', 't' -> 'nnn'.

        Raises:
            ValueError: for any other ``param``.

        Sets ``self.dictionary``, ``self.corpus`` and ``self.vector_model``.
        """
        smartirs_by_param = {'T': 'nfn', 'B': 'bnn', 't': 'nnn'}
        if param not in smartirs_by_param:
            raise ValueError("unknown term weighting parameter: " + repr(param))
        smartirs = smartirs_by_param[param]

        self.dictionary = Dictionary(self.all_tokens)
        self.corpus = [self.dictionary.doc2bow(doc) for doc in self.all_tokens]
        model = TfidfModel(corpus=self.corpus, id2word=self.dictionary, smartirs=smartirs)

        # one weighted vector per document, in the order of self.docs
        self.vector_model = [model[bow] for bow in self.corpus]

    # train the LDA model
    def generate_model(self, random_state=None):
        """Train an ``LdaModel`` on the weighted corpus.

        Args:
            random_state: forwarded to ``LdaModel``; pass an integer to make
                training repeatable. ``None`` keeps gensim's default.

        Returns:
            ``[corpus, model]``: the weighted corpus and the trained model.
        """
        # set training parameters.
        num_topics = self.params[0]
        alpha = self.params[2]
        corpus = self.vector_model
        dictionary = self.dictionary

        # create model
        model = LdaModel(
            corpus=corpus,
            id2word=dictionary,
            alpha=alpha,
            eta='auto',
            num_topics=num_topics,
            random_state=random_state
        )

        return [corpus, model]

    # save model to file(s)
    def save_model(self, model, corpus):
        """Write the model and its topic/document matrices to disk.

        Files written, all prefixed with ``self.output``:
            ``<output>.model``      the gensim model itself;
            ``<output>_<i>.topic``  one "term probability" line per term of
                                    topic ``i``, as returned by ``show_topic``;
            ``<output>.dt``         one line per document: the file name
                                    followed by one probability per topic
                                    (0.0 for topics gensim did not report).

        Args:
            model: the trained model returned by ``generate_model``.
            corpus: the corpus returned by ``generate_model``; entry ``i``
                must belong to ``self.docs[i]``.
        """
        # save entire model
        model.save(self.output + ".model")

        # save topic-word matrix
        total_topics = len(model.get_topics())

        for idx in range(total_topics):
            with open(self.output + '_' + str(idx) + ".topic", "w") as out_file:
                for term, prob in model.show_topic(idx, topn=self._ALL_TERMS):
                    out_file.write(term + ' ' + str(prob) + '\n')

        # save document-topic matrix
        with open(self.output + ".dt", "w") as out_file:
            for idx, doc_name in enumerate(self.docs):
                topic_probs = dict(model.get_document_topics(corpus[idx]))
                doc_probs = [topic_probs.get(i, 0.0) for i in range(total_topics)]
                line = doc_name + ' ' + ' '.join(str(doc_prob) for doc_prob in doc_probs) + '\n'
                out_file.write(line)

In [3]:
# Smoke run: preprocess the documents found in `directory` with the settings
# from the parameter file. `output` is the filename prefix save_model uses.
directory = "_test0"
output = "output1"
params = "params.txt"
# Constructing LDA_Model runs the full preprocessing pipeline immediately.
ldamodel = LDA_Model(directory, output, params)

In [4]:
# Train the LDA model; generate_model returns the weighted corpus and the model.
corpus, model = ldamodel.generate_model()

In [5]:
# Write the model plus its topic-term and document-topic files under the output prefix.
ldamodel.save_model(model, corpus)

In [None]:
### START OF NEW CODE

In [None]:
def compare_topic_sets(t_prime, u_prime):
    """Match each topic in ``t_prime`` to its most similar topic in ``u_prime``.

    Similarity is the Jaccard coefficient between the term sets of two
    topics. The function prints the initial assignment, the assignment after
    resolving collisions, the share of ``u_prime`` topics that were matched,
    and the sum of the remaining similarities.

    Args:
        t_prime: list of topics, each a sequence of ``(term, probability)``
            pairs.
        u_prime: list of topics in the same format.

    Returns:
        ``(topics_used, similarity_sum)``: the fraction of ``u_prime`` topics
        that at least one ``t_prime`` topic was matched to, and the sum of
        the similarities left after collision resolution.
    """
    # Term sets of the candidate topics are built once, not once per pairing.
    u_term_sets = [{token for token, _ in U} for U in u_prime]

    # do topic assignments between sets
    assigned = []      # index of the best u_prime topic, -1 when nothing overlaps
    similarity = []    # Jaccard coefficient of that best match
    for T in t_prime:
        t = {token for token, _ in T}
        best_idx, best_sim = -1, 0.0
        for idx2, u in enumerate(u_term_sets):
            # calculate jaccard coefficient between topics and store highest;
            # two empty topics have no terms in common, so they score 0
            union_size = len(t | u)
            jaccard = len(t & u) / union_size if union_size else 0.0
            if jaccard > best_sim:
                best_idx, best_sim = idx2, jaccard
        assigned.append(best_idx)
        similarity.append(best_sim)

    # Separate typed arrays: an unmatched topic must not turn the indices
    # into an object array that can be neither sorted nor converted to int.
    assigned = np.array(assigned, dtype=int)
    similarity = np.array(similarity, dtype=float)

    def print_assignment(title):
        print(title)
        for idx in range(len(assigned)):
            label = str(int(assigned[idx])) if assigned[idx] >= 0 else "None"
            print(str(idx) + ": " + label + ", " + str(similarity[idx]))
        print()

    print_assignment("Original Topic Assignment")

    # take care of imperfect matches: when several topics picked the same
    # u_prime topic, only the best one(s) keep their similarity
    matched = np.unique(assigned[assigned >= 0])
    for val in matched:
        same_topic = assigned == val
        if np.count_nonzero(same_topic) > 1:
            max_sim = np.amax(similarity[same_topic])
            similarity[same_topic & (similarity != max_sim)] = 0

    print_assignment("New Topic Assignment")

    topics_used = len(matched) / len(u_prime) if len(u_prime) else 0.0
    similarity_sum = np.sum(similarity)
    print("Topics Used: " + str(topics_used))
    print("Similarity Sum: " + str(similarity_sum))

    return topics_used, float(similarity_sum)

In [None]:
# EXPERIMENT 1
# Corpus folder passed to every LDA_Model constructed in the cells below.
directory = "_corpus2/"

In [None]:
# Two pipelines over the same corpus folder, each with its own parameter file
# and output prefix.
ldamodel_11 = LDA_Model(directory, "output_11", "params_11.txt")
ldamodel_12 = LDA_Model(directory, "output_12", "params_12.txt")

In [None]:
# Train one LDA model per pipeline; each call returns (weighted corpus, model).
corpus_11, model_11 = ldamodel_11.generate_model()
corpus_12, model_12 = ldamodel_12.generate_model()

In [None]:
# Number of top terms kept per topic when the topic sets are compared.
k = 10

In [None]:
# Top-k (term, probability) pairs for every topic of model 11.
topic_count_11 = int(ldamodel_11.params[0])
new_topic_terms_11 = [
    model_11.show_topic(topic_id, topn=k) for topic_id in range(topic_count_11)
]

In [None]:
# Post-hoc stopword filtering for model 12: rank every term of a topic, drop
# NLTK's English stopwords, then keep the k best remaining terms.
swlist = set(stopwords.words("english"))

new_topic_terms_12 = []
for topic_id in range(int(ldamodel_12.params[0])):
    ranked_terms = model_12.show_topic(topic_id, topn=10000000)
    content_terms = [pair for pair in ranked_terms if pair[0] not in swlist]
    new_topic_terms_12.append(content_terms[:k])

In [None]:
# Match each topic of model 11 to its most similar topic of model 12 (Jaccard on top-k terms).
compare_topic_sets(new_topic_terms_11, new_topic_terms_12)

In [None]:
# Same comparison in the opposite direction: model 12 topics matched against model 11.
compare_topic_sets(new_topic_terms_12, new_topic_terms_11)

In [None]:
# EXPERIMENT 2

In [None]:
# Two pipelines over the same corpus folder, each with its own parameter file
# and output prefix.
ldamodel_21 = LDA_Model(directory, "output_21", "params_21.txt")
ldamodel_22 = LDA_Model(directory, "output_22", "params_22.txt")

In [None]:
# Train one LDA model per pipeline; each call returns (weighted corpus, model).
corpus_21, model_21 = ldamodel_21.generate_model()
corpus_22, model_22 = ldamodel_22.generate_model()

In [None]:
# Number of top terms kept per topic when the topic sets are compared.
k = 10

In [None]:
# Top-k (term, probability) pairs for every topic of model 21.
topic_count_21 = int(ldamodel_21.params[0])
new_topic_terms_21 = [
    model_21.show_topic(topic_id, topn=k) for topic_id in range(topic_count_21)
]

In [None]:
# Penn Treebank tag groups accepted by the post-hoc part-of-speech filter.
adjective = ['JJ', 'JJR', 'JJS']
verb = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
noun = ['NN', 'NNP', 'NNPS', 'NNS']
adverb = ['RB', 'RBR', 'RBS']

allowed_tag_list = noun + verb + adjective + adverb

# For every topic of model 22: rank all terms, tag them, keep the terms whose
# tag is allowed, then keep the k best of those.
new_topic_terms_22 = []
for topic_id in range(int(ldamodel_22.params[0])):
    ranked_terms = model_22.show_topic(topic_id, topn=10000000)
    tagged_terms = pos_tag([term for term, _ in ranked_terms])
    kept_terms = [
        (term, weight)
        for (term, weight), (_, tag) in zip(ranked_terms, tagged_terms)
        if tag in allowed_tag_list
    ]
    new_topic_terms_22.append(kept_terms[:k])

In [None]:
# Match each topic of model 21 to its most similar topic of model 22 (Jaccard on top-k terms).
compare_topic_sets(new_topic_terms_21, new_topic_terms_22)

In [None]:
# Same comparison in the opposite direction: model 22 topics matched against model 21.
compare_topic_sets(new_topic_terms_22, new_topic_terms_21)

In [None]:
# EXPERIMENT 3

In [None]:
# Two pipelines over the same corpus folder, each with its own parameter file
# and output prefix.
ldamodel_31 = LDA_Model(directory, "output_31", "params_31.txt")
ldamodel_32 = LDA_Model(directory, "output_32", "params_32.txt")

In [None]:
# Train one LDA model per pipeline; each call returns (weighted corpus, model).
corpus_31, model_31 = ldamodel_31.generate_model()
corpus_32, model_32 = ldamodel_32.generate_model()

In [None]:
# Number of top terms kept per topic when the topic sets are compared.
k = 10

In [None]:
# Top-k (term, probability) pairs for every topic of model 31.
topic_count_31 = int(ldamodel_31.params[0])
new_topic_terms_31 = [
    model_31.show_topic(topic_id, topn=k) for topic_id in range(topic_count_31)
]

In [None]:
# Penn Treebank noun tags: the only tags accepted by this post-hoc filter.
noun = ['NN', 'NNP', 'NNPS', 'NNS']

allowed_tag_list = noun

# For every topic of model 32: rank all terms, tag them, keep the nouns, then
# keep the k best of those.
new_topic_terms_32 = []
for topic_id in range(int(ldamodel_32.params[0])):
    ranked_terms = model_32.show_topic(topic_id, topn=10000000)
    tagged_terms = pos_tag([term for term, _ in ranked_terms])
    kept_terms = [
        (term, weight)
        for (term, weight), (_, tag) in zip(ranked_terms, tagged_terms)
        if tag in allowed_tag_list
    ]
    new_topic_terms_32.append(kept_terms[:k])

In [None]:
# Match each topic of model 31 to its most similar topic of model 32 (Jaccard on top-k terms).
compare_topic_sets(new_topic_terms_31, new_topic_terms_32)

In [None]:
# Same comparison in the opposite direction: model 32 topics matched against model 31.
compare_topic_sets(new_topic_terms_32, new_topic_terms_31)