Modified from: wikipedia_topic_modeling notebook

For each model, create a database with columns: (1) article title, (2) url, (3) list of similar articles, (4) list of urls for similar articles, (5) list of similarity scores

These databases will be queried in the web application

Use default num_topics

In [1]:
import sys, os
from xml.dom import minidom
import nltk
import pandas as pd

from collections import defaultdict
from gensim import corpora, models, similarities



In [2]:
os.chdir('..\data')

In [3]:
def get_page_type(title):
    """
    from WikiPage.py: extract page type from article title
    """
    if 'Category:' in title:
        return 'category'
    elif 'Portal:' in title:
        return 'portal'
    elif 'List of' in title:
        return 'list'
    elif 'File:' in title:
        return 'file'
    else:
        return 'article'

def xml_to_df(xmlfile):
    """
    input: xml filename
    output: data frame with columns: article id, title, url, page_type, tokenized text
    
    filter out pages that are not articles
    """
    xmldoc = minidom.parse(xmlfile)
    idlist = xmldoc.getElementsByTagName('id')
    titlelist = xmldoc.getElementsByTagName('title')
    textlist = xmldoc.getElementsByTagName('text')
    
    titles = [title.childNodes[0].data for title in titlelist]
    urllist = ['https://en.wikipedia.org/wiki/%s' % (title.replace(' ', '_'))
              for title in titles]
    typelist = [get_page_type(title) for title in titles]
    
    tokenizer = nltk.RegexpTokenizer(r'\w+')
    
    word_data = [(id.childNodes[0].data, title, url, page_type,
                  tokenizer.tokenize(text.childNodes[0].data.lower()))
                for id, title, url, page_type, text in zip(idlist, titles, urllist, typelist, textlist)
                if page_type == 'article']
    word_data_df = pd.DataFrame(word_data, columns=['id', 'title', 'url', 'type', 'words'])
    #word_data_df.to_csv('word_data_df.csv')
    return(word_data_df)

In [4]:
xmlfile = 'Wikipedia-dog.xml'
df1 = xml_to_df(xmlfile)
#xmlfile = 'Wikipedia-fish.xml'
#df2 = xml_to_df(xmlfile)
word_data_df = df1 #.append(df2)
#word_data_df.head()

In [None]:
word_data_df

In [None]:
def make_dictionary(documents):
    """
    construct a dictionary, i.e. mapping btwn word ids and their freq of occurence in the whole corpus
    filter dictionary to remove stopwords and words occuring < min_count times
    
    input: documents is an iterable consisting of all the words in the corpus 
    output: filtered dictionary
    """
    dictionary = corpora.Dictionary(documents)

    stop_words = nltk.corpus.stopwords.words('english') 
    min_count = 2
    stop_ids = [dictionary.token2id[word] for word in stop_words
               if word in dictionary.token2id]
    rare_ids = [id for id, freq in dictionary.dfs.items()
                if freq < min_count]
    dictionary.filter_tokens(stop_ids + rare_ids)
    dictionary.compactify()
    return(dictionary)

def make_corpus(word_data_df):
    """
    """
    documents = word_data_df['words'].values
    dictionary = make_dictionary(documents)
    # convert corpus to vectors using bag-of-words representation, i.e. tuples of word indices and word counts
    corpus = [dictionary.doc2bow(words) for words in documents]
    return(corpus, dictionary)

def make_lsi_similarity_matrix(tfidf_corpus, dictionary):
    """
    construct LSI (latent semantic indexing) model on Tfidf-transformed corpus, print model topics, 
    return similarity matrix.
    """
    # construct model
    lsi = models.lsimodel.LsiModel(tfidf_corpus, id2word=dictionary, num_topics=200) 
    lsi.save('wiki.lsi')
    for i, topic in enumerate(lsi.print_topics(5)[:3]):
        print('Topic: ', format(i))
        print(str(topic).replace(' + ', '\n')) 
        print('') 
    # create similarity matrix
    matsim = similarities.MatrixSimilarity(lsi[tfidf_corpus], num_best=6)
    return(matsim)

def print_similar_articles(word_data_df, matsim, num_print):
    """http://localhost:8888/notebooks/CDIPS_Content_Rec/melanie/wikipedia_topic_modeling_to_db.ipynb#
    print titles of first num_print articles and their most similar articles and similarity scores.
    this is independent of model used.
    """
    titles = word_data_df['title']
    # for the first num_print articles, print most similar articles and their similarity scores
    for sims in list(matsim)[:num_print]:
        title_index = sims[0][0]
        print(titles[title_index]) 
        for other_title_index, score in sims[1:]:
            print('\t', titles[other_title_index], ' ', score) 

In [35]:
sim=list(lsi_matsim)
sim[:1]

[[(0, 0.99999988079071045),
  (54, 0.33596238493919373),
  (43, 0.27727752923965454),
  (6, 0.17250028252601624),
  (22, 0.13536316156387329),
  (34, 0.12215814739465714)]]

In [38]:
# make dataframe with (1) article title, (2) url, (3) list of similar articles, (4) list of urls for similar articles, 
# (5) list of similarity scores
titles = word_data_df[['title']]
urls = word_data_df[['url']
other_titles = [titles[title_ind] for title_ind, score in sims[1:] for sims in list(lsi_matsim)]

SyntaxError: invalid syntax (<ipython-input-38-29e259f0495a>, line 5)

In [7]:
corpus, dictionary = make_corpus(word_data_df)
tfidf = models.TfidfModel(corpus)
lsi_matsim = make_lsi_similarity_matrix(tfidf[corpus], dictionary)

Topic:  0
(0, '0.301*"cat"
0.219*"cats"
0.192*"dog"
0.155*"journal"
0.115*"meat"
0.094*"name"
0.091*"dogs"
0.090*"feral"
0.090*"doi"
0.089*"volume"')

Topic:  1
(1, '0.415*"cat"
0.345*"cafe"
-0.316*"dog"
0.250*"cats"
0.138*"café"
-0.132*"dogs"
-0.130*"breed"
-0.111*"wagging"
-0.111*"kennel"
-0.104*"breeds"')

Topic:  2
(2, '0.273*"meat"
-0.253*"bites"
0.224*"cafe"
-0.222*"bite"
-0.173*"rabies"
-0.161*"cdc"
0.144*"festival"
0.135*"china"
-0.127*"infection"
-0.114*"wagging"')



In [8]:
print_similar_articles(word_data_df, lsi_matsim, 10)

Kennel
	 Cattery   0.335962384939
	 Indian National Kennel Club   0.27727752924
	 Breed type (dog)   0.172500282526
	 Dog World (newspaper)   0.135363161564
	 Lists of dogs   0.122158147395
Cynology
	 Dog   0.143377020955
	 Breed type (dog)   0.127784788609
	 Cat training   0.124675229192
	 Pussy   0.117153279483
	 Felinology   0.108356624842
Pack (canine)
	 Canid hybrid   0.270516097546
	 Dog   0.224993467331
	 Origin of the domestic dog   0.217068359256
	 Canine reproduction   0.162874683738
	 Cat   0.104910813272
Rare breed (dog)
	 Breed type (dog)   0.329889953136
	 Lists of dogs   0.166610077024
	 Dog   0.16474558413
	 Dog bite   0.133017197251
	 Origin of the domestic dog   0.1178458184
Dogs in ancient China
	 Dogs in Mesoamerica   0.203201308846
	 Dog   0.200637221336
	 Origin of the domestic dog   0.200029179454
	 Dog meat   0.185686558485
	 Panhu   0.175255656242
Dog biscuit
	 Dog food   0.25327244401
	 Dog meat   0.155288115144
	 Dog daycare   0.128972351551
	 Cat meat   0.12

In [11]:
def make_rp_similarity_matrix(tfidf_corpus, dictionary):
    """
    construct RP (random projections) model on Tfidf-transformed corpus, print model topics, 
    return similarity matrix.
    """
    # construct model
    rp = models.RpModel(corpus=tfidf_corpus, id2word=dictionary, num_topics=200)
    rp.save('wiki.rp_model')
    # create similarity matrix
    matsim = similarities.MatrixSimilarity(rp[tfidf_corpus], num_best=6)
    return(matsim)

In [12]:
rp_matsim = make_rp_similarity_matrix(tfidf[corpus], dictionary)
print_similar_articles(word_data_df, rp_matsim, 10)

Kennel
	 Cattery   0.473478198051
	 Indian National Kennel Club   0.353589355946
	 Breed type (dog)   0.336074620485
	 Lists of dogs   0.215678781271
	 Dog bite   0.212287530303
Cynology
	 Dog meat   0.217460200191
	 Feline zoonosis   0.203232139349
	 Therapy cat   0.201697513461
	 Pet Check Technology   -0.145228013396
	 Rabies in Haiti   0.142572551966
Pack (canine)
	 Dogs in the American Revolutionary War   -0.173008412123
	 Dog   0.172464832664
	 Exotic felines as pets   0.167812645435
	 PDSA Certificate for Animal Bravery or Devotion   -0.164077565074
	 Canid hybrid   0.163395106792
Rare breed (dog)
	 Breed type (dog)   0.274982780218
	 Lists of dogs   0.228078782558
	 Dogs in religion   0.171975374222
	 Cat bite   0.17196701467
	 Cultural depictions of cats   0.156541839242
Dogs in ancient China
	 Origin of the domestic dog   0.270263642073
	 Dog   0.229469776154
	 Panhu   0.222705617547
	 Canid hybrid   0.198899462819
	 Dog meat   0.182176455855
Dog biscuit
	 Dog food   0.276397