In [1]:
import os.path
from gensim import corpora
from gensim.models import LsiModel
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from gensim.models.coherencemodel import CoherenceModel
import matplotlib.pyplot as plt

In [None]:
import nltk
nltk.download('stopwords')

In [3]:
def load_data(path,file_name):
    """
    Input  : path and file_name
    Purpose: loading text file
    Output : list of paragraphs/documents and
             title(initial 100 words considred as title of document)
    """
    documents_list = []
    titles=[]
    with open( os.path.join(path, file_name) ,"r") as fin:
        for line in fin.readlines():
            text = line.strip()
            documents_list.append(text)
    print("Total Number of Documents:",len(documents_list))
    titles.append( text[0:min(len(text),100)] )
    return documents_list,titles

In [4]:
def preprocess_data(doc_set):
    """
    Input  : docuemnt list
    Purpose: preprocess text (tokenize, removing stopwords, and stemming)
    Output : preprocessed text
    """
    # initialize regex tokenizer
    tokenizer = RegexpTokenizer(r'\w+')
    # create English stop words list
    en_stop = set(stopwords.words('english'))
    # Create p_stemmer of class PorterStemmer
    p_stemmer = PorterStemmer()
    # list for tokenized documents in loop
    texts = []
    # loop through document list
    for i in doc_set:
        # clean and tokenize document string
        raw = i.lower()
        tokens = tokenizer.tokenize(raw)
        # remove stop words from tokens
        stopped_tokens = [i for i in tokens if not i in en_stop]
        # stem tokens
        stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
        # add tokens to list
        texts.append(stemmed_tokens)
    return texts

In [5]:
def prepare_corpus(doc_clean):
    """
    Input  : clean document
    Purpose: create term dictionary of our courpus and Converting list of documents (corpus) into Document Term Matrix
    Output : term dictionary and Document Term Matrix
    """
    # Creating the term dictionary of our courpus, where every unique term is assigned an index. dictionary = corpora.Dictionary(doc_clean)
    dictionary = corpora.Dictionary(doc_clean)
    # Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]
    # generate LDA model
    return dictionary,doc_term_matrix

In [52]:
def create_gensim_lsa_model(doc_clean,number_of_topics,words):
    """
    Input  : clean document, number of topics and number of words associated with each topic
    Purpose: create LSA model using gensim
    Output : return LSA model
    """
    dictionary,doc_term_matrix=prepare_corpus(doc_clean)
    # generate LSA model
    lsamodel = LsiModel(doc_term_matrix, num_topics=number_of_topics, id2word = dictionary)  # train model
    return lsamodel

In [9]:
document_list,titles=load_data("","../artifacts/text/corpus_all_in_one/corpus_all_in_one.txt")
clean_text=preprocess_data(document_list)

Total Number of Documents: 214315


In [53]:
number_of_topics=20
words=30000
model=create_gensim_lsa_model(clean_text,number_of_topics,words)
print('Done')

Done


# WordNet Term Expansion

In [27]:
from nltk.corpus import wordnet
from nltk.corpus.reader import Synset
import json

In [None]:
nltk.download('wordnet')

In [29]:
def get_parents(obj):
    """
    Get direct hypernyms
    """
    return obj.hypernyms() + obj.instance_hypernyms()

In [34]:
def _get_hierarchies(obj, level=-1):
    
    if level == 0:
        # stop if reached a certain number of levels.
        return [[]]
    
    parents = get_parents(obj)
    if not parents:
        return [[]]

    hierarchies = []
    for parent in parents:
        tmp = _get_hierarchies(parent, level-1)
        for hierarchy in tmp:
            hierarchy.append(parent)
        hierarchies = hierarchies + tmp
    return hierarchies


def get_hierarchies(word, level=-1):
    normalized = word.lower().replace(' ', '_')
    
    # a single word may have multiple synsets
    entries = wordnet.synsets(normalized, pos=wordnet.NOUN)
    filtered_entries = list(filter(lambda x: normalized == x.lemma_names()[0].lower(), entries))

    # only use filtered entries if something is left.
    if filtered_entries:
        entries = filtered_entries

    hierarchies = []
    for entry in entries:
        hierarchies_of_entry = _get_hierarchies(entry, level)
        for hierarchy in hierarchies_of_entry:
            hierarchy.append(entry)
            hierarchies.append(hierarchy)
    return hierarchies

In [98]:
# get hypernyms for sea within two levels
hierarchies = get_hierarchies('sea', 2)
hierarchies

[[Synset('thing.n.12'), Synset('body_of_water.n.01'), Synset('sea.n.01')],
 [Synset('flow.n.04'), Synset('turbulent_flow.n.01'), Synset('sea.n.03')]]

# Applying WordNet Expansion on words in LSA clusters

In [91]:
unique_words = set()
for i, _ in model.show_topics():
    topic = model.show_topic(i, topn=15000)
    for word, score in topic:
        unique_words.add(word)
        
print(f'number of unique words: {len(unique_words)}')

number of unique words: 33754


In [97]:
unique_words_expanded = set()
for word in unique_words:
    unique_words_expanded.add(word)
    
    # get wordnet hypernyms of two levels
    hierarchies = get_hierarchies(word, 2)
    
    if not hierarchies:
        # no expansion, append word as it is
        continue
        
    # for every item in the hierarchy except the original
    for hierarchy in hierarchies[:-1]:
        for item in hierarchy:
            for lemma_name in item.lemma_names():
                unique_words_expanded.add(lemma_name.lower().replace('_', ' '))
            

print(f'number of unique words after wordnet expansion {len(unique_words_expanded)}')

number of unique words after wordnet expansion 42603


# Compare against ontology

In [102]:
from evaluation.ontology import sparql, walk
from evaluation import ontology

In [131]:
def process_ontology(path: str, kind='OWL'):
    concept_set = set()
    g = ontology.sparql.graph_from(path)
    classes = ontology.sparql.get_classes(g, kind)
    
    for identifier, term in classes:
        concept_set.add(term.lower())
    return concept_set

In [206]:
ontology_file = 'sweet.owl'
ontology_vocabulary = process_ontology(f'../artifacts/ontologies/{ontology_file}')

In [207]:
print(len(ontology_vocabulary))

4524


In [208]:
true_positive = 0 # was in the ontology and in the corpus
false_positive = 0 # was not in the ontology, but was in the corpus
true_negative = 0  # always 0
false_negative = 0 # was in the ontology, but was not in the corpus


for corpus_word in unique_words_expanded:
    if corpus_word in ontology_vocabulary:
        # was in the ontology and in the corpus
        true_positive += 1
    else:
        # was not in the ontology, but was in the corpus
        false_positive += 1
    
for ontology_term in ontology_vocabulary:
    if ontology_term not in unique_words_expanded:
        # was in the ontology, but was not in the corpus
        false_negative += 1
        
print(f'{true_positive=}, {false_positive=}, {true_negative=}, {false_negative=}')

true_positive=1133, false_positive=41470, true_negative=0, false_negative=3279


In [209]:
precision = true_positive / (true_positive + false_positive) # fitness 
recall = true_positive / (true_positive + false_negative) # unneeded concepts

print(f'precision: {precision}, recall: {recall}')

precision: 0.02659437128840692, recall: 0.2567996373526745
