In [1]:
# Load the `autotime` IPython extension for the rest of the session.
# NOTE(review): presumably the third-party ipython-autotime package, which
# reports the run time of every cell -- confirm it is installed in the kernel.
%load_ext autotime



In [2]:

from rdflib import Graph, URIRef, Literal, OWL, RDF, RDFS, SKOS
import pandas as pd
import gensim
from sklearn.feature_extraction.text import CountVectorizer
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [3]:
# ew = pd.ExcelWriter('FIBO_topic_model.xlsx')

In [4]:
# RDF graph that collects the topic classes and the per-document statements
# produced by the cells below.
g = Graph(base=URIRef('https://www.industrialsemantics.com/fibo'))

# Prefix bindings, registered in the order listed: three project namespaces
# followed by the standard vocabularies imported from rdflib.
for prefix, namespace in (
    ('topics', URIRef('https://www.industrialsemantics.com/fibo/topics#')),
    ('inst', URIRef('https://www.industrialsemantics.com/fibo/topics/instance#')),
    ('class', URIRef('https://www.industrialsemantics.com/fibo/topics/class#')),
    ('owl', OWL),
    ('rdf', RDF),
    ('rdfs', RDFS),
    ('skos', SKOS),
):
    g.bind(prefix, namespace)

In [5]:
# Load the corpus and discard every row that has a missing value.
# Later cells read the `iri` and `text` columns of this frame.
documents = pd.read_csv('corpus.csv').dropna()

In [6]:
# Bag-of-words vocabulary built with CountVectorizer:
#   * tokens must be at least three word characters long (token_pattern),
#   * English stop words are removed,
#   * tokens found in fewer than 20 documents are dropped (min_df),
#   * tokens found in more than 20% of the documents are dropped (max_df).
vect = CountVectorizer(
    token_pattern='(?u)\\b\\w\\w\\w+\\b',
    stop_words='english',
    min_df=20,
    max_df=0.2,
)

# Learn the vocabulary and build the document-term matrix in one step.
X = vect.fit_transform(documents.text)

# Wrap the sparse matrix as a gensim corpus; each row of X is one document,
# hence documents_columns=False.
corpus = gensim.matutils.Sparse2Corpus(X, documents_columns=False)

# Invert the vectorizer vocabulary (word -> id) into id -> word, the form
# expected by the model's id2word parameter.
id_map = {word_id: word for word, word_id in vect.vocabulary_.items()}

# Estimate the LDA parameters on the corpus with the multicore trainer and
# keep the fitted model in `ldamodel` for the cells below.
ldamodel = gensim.models.LdaMulticore(
    corpus=corpus,
    id2word=id_map,
    num_topics=99,
    passes=2,
    random_state=5,
    workers=16,
)

In [7]:
# Workbook that receives the 'Topic List' and 'Topic Map' sheets written by
# later cells; it stays open until `ew.close()` is called further down.
ew = pd.ExcelWriter(path='FIBO-topic-map.xlsx')

In [8]:
# (topic index, topic description) pairs for every topic of the model.
topic_list = [(topic_id, topic_words)
              for topic_id, topic_words in ldamodel.print_topics(-1)]

# Declare one OWL class per topic: the label is the zero-padded topic index
# and the comment carries the topic description returned by the model.
for topic_id, topic_words in topic_list:
    topic_class = URIRef(
        'https://www.industrialsemantics.com/fibo/topics/class#Topic{i:06d}'.format(i=topic_id))
    topic_label = Literal('{i:06d}'.format(i=topic_id))
    g.add((topic_class, RDF.type, OWL.Class))
    g.add((topic_class, RDFS.label, topic_label))
    g.add((topic_class, RDFS.comment, Literal(topic_words)))

# Same information as a sheet of the workbook, indexed by topic number.
topic_list_df = pd.DataFrame(topic_list, columns=['Topic_Index', 'Words'])
topic_list_df = topic_list_df.set_index(['Topic_Index'])
topic_list_df.to_excel(ew, sheet_name='Topic List')


In [9]:
def topic_distribution(string_input):
    """Return the topic distribution the fitted LDA model assigns to one text.

    Parameters
    ----------
    string_input : str
        Raw text of a single document.

    Returns
    -------
    The model's result for that document: a sequence of
    ``(topic_index, weight)`` pairs, as consumed by ``topic_prediction``.

    Notes
    -----
    Relies on the notebook-level ``vect`` (already fitted vectorizer) and
    ``ldamodel``; the text is only transformed, nothing is re-fitted.
    """
    # The vectorizer works on batches, so pass a one-element batch.
    bag_of_words = vect.transform([string_input])

    # Same sparse-matrix -> gensim corpus conversion that was used for training.
    single_doc_corpus = gensim.matutils.Sparse2Corpus(bag_of_words,
                                                      documents_columns=False)

    # Indexing the model with a corpus yields one result per document;
    # there is exactly one document here.
    return list(ldamodel[single_doc_corpus])[0]

In [10]:
def topic_prediction(my_document):
    """Return the index of the highest-weighted topic for one text.

    The distribution is obtained from ``topic_distribution`` rather than
    repeating the vectorize / convert / infer steps here, so both helpers
    always follow the same inference path.

    Parameters
    ----------
    my_document : str
        Raw text of a single document.

    Returns
    -------
    The topic index (first element) of the ``(topic_index, weight)`` pair
    with the largest weight.

    Raises
    ------
    IndexError
        If the model returns no topics for the document.
    """
    distribution = topic_distribution(my_document)
    if not distribution:
        # Same exception type as indexing an empty result, with a clearer message.
        raise IndexError('LDA model returned no topics for the document')
    # max() keeps the first pair among equal weights, exactly like taking the
    # head of a stable descending sort, without sorting the whole list.
    return max(distribution, key=lambda pair: pair[1])[0]

In [11]:
# For every document: predict its topic, record the result for the
# spreadsheet and describe it in the RDF graph.
topics = []
for row_index, row in documents.iterrows():
    # Run the (expensive) inference once per helper and reuse the results.
    # Every call re-vectorizes the text and re-runs LDA inference, and repeated
    # calls are not guaranteed to return identical values, so reusing them also
    # keeps the spreadsheet and the graph consistent with each other.
    predicted_topic = topic_prediction(row.text)
    distribution = topic_distribution(row.text)
    topics.append((row.iri, row.text, predicted_topic, distribution))

    subject = URIRef(row.iri)
    topic_iri = URIRef(
        'https://www.industrialsemantics.com/fibo/topics/class#Topic{i:06d}'.format(i=predicted_topic))
    g.add((subject, RDF.type, topic_iri))
    g.add((subject, RDFS.label, Literal(row.text)))
    g.add((subject, RDFS.comment, Literal(distribution)))

    # Render the distribution as the SPARQL collection "(id weight) (id weight) ...".
    # Each element is formatted explicitly with str() instead of slicing the
    # repr of the whole list: list reprs use element repr(), which for NumPy
    # scalars is version dependent (e.g. "np.float32(0.5)" on NumPy 2) and
    # would produce an invalid update.
    list_text = ' '.join('({!s} {!s})'.format(topic_id, weight)
                         for topic_id, weight in distribution)
    update_query = '''
insert data {{ <{subject}> <{predicate}> ({list}) }}
'''.format(subject=row.iri,
           predicate=RDF.value,
           list=list_text)
    g.update(update_query)

topics_df = pd.DataFrame(topics,
                         columns=['IRI',
                                  'Text',
                                  'Predicated Topic',
                                  'Topic Distribution']).set_index(['IRI'])
# Drop rows whose text consists of digits only.
topics_df = topics_df[~topics_df.Text.str.contains('^[0-9]+$')]

In [12]:
# Write the complete graph (topic classes plus document statements) as Turtle.
g.serialize(destination='FIBO_topic_map.ttl', format='ttl')


In [13]:
def _escape_cell(cell):
    """Return `cell` with string values rewritten via Python's unicode_escape codec.

    Strings come back with backslash escape sequences in place of the
    characters that codec escapes; any non-string value is returned unchanged.
    """
    if isinstance(cell, str):
        return cell.encode('unicode_escape').decode('utf-8')
    return cell


# Escape every string cell before export.
# NOTE(review): presumably done so the Excel writer does not reject
# control or unusual characters -- confirm.
topics_df = topics_df.applymap(_escape_cell)

In [14]:
# Add the per-document results as a sheet of the workbook opened earlier,
# then close the writer.
topics_df.to_excel(excel_writer=ew, sheet_name='Topic Map')
ew.close()

In [15]:
# Print a completion marker once the preceding cells have run.
print('Done')

Done


In [16]:
# Additional CSV copy of the per-document topic table.
topics_df.to_csv(path_or_buf='temp.csv')