Modified from: wikipedia_topic_modeling notebook

For each model, create a database with columns: (1) article title, (2) url, (3) list of similar articles, (4) list of urls for similar articles, (5) list of similarity scores

These databases will be queried in the web application

Use num_topics=200 for both models (200 is gensim's default for LsiModel; it is passed explicitly to RpModel as well).

In [1]:
import sys, os
from xml.dom import minidom
import nltk
import pandas as pd

from collections import defaultdict
from gensim import corpora, models, similarities



In [2]:
# Work from the sibling data directory. Building the path with os.path.join
# avoids the invalid '\d' escape sequence and the hard-coded Windows separator.
os.chdir(os.path.join('..', 'data'))

In [3]:
def get_page_type(title):
    """
    from WikiPage.py: classify a page by the first known marker found in its title

    input: page title (string)
    output: 'category', 'portal', 'list' or 'file' when the corresponding marker
            occurs anywhere in the title, otherwise 'article'
    """
    # Order matters: the first marker that matches decides the type.
    markers = (
        ('Category:', 'category'),
        ('Portal:', 'portal'),
        ('List of', 'list'),
        ('File:', 'file'),
    )
    for marker, page_type in markers:
        if marker in title:
            return page_type
    return 'article'

def xml_to_df(xmlfile):
    """
    Parse a Wikipedia XML file into a data frame of tokenized articles.

    input: xml filename
    output: data frame with columns: id, title, url, type, words
            (words is the lower-cased text split into runs of word characters)

    Pages that are not articles (see get_page_type) are filtered out.
    An article whose <text> element is empty is kept with an empty word list
    instead of raising IndexError.
    """
    xmldoc = minidom.parse(xmlfile)
    id_nodes = xmldoc.getElementsByTagName('id')
    title_nodes = xmldoc.getElementsByTagName('title')
    text_nodes = xmldoc.getElementsByTagName('text')

    # NOTE(review): ids, titles and texts are paired purely by document order,
    # which assumes exactly one <id>, <title> and <text> element per page --
    # confirm against the XML schema.
    titles = [node.childNodes[0].data for node in title_nodes]

    tokenizer = nltk.RegexpTokenizer(r'\w+')

    rows = []
    for id_node, title, text_node in zip(id_nodes, titles, text_nodes):
        page_type = get_page_type(title)
        if page_type != 'article':
            continue
        url = 'https://en.wikipedia.org/wiki/%s' % (title.replace(' ', '_'))
        # An empty <text/> element has no child node to read from.
        raw_text = text_node.childNodes[0].data if text_node.hasChildNodes() else ''
        rows.append((id_node.childNodes[0].data, title, url, page_type,
                     tokenizer.tokenize(raw_text.lower())))

    return pd.DataFrame(rows, columns=['id', 'title', 'url', 'type', 'words'])

In [4]:
# Parse the dog-related Wikipedia export into the article table.
xmlfile = 'Wikipedia-dog.xml'
df1 = xml_to_df(xmlfile)
# Optional second export (currently disabled):
#xmlfile = 'Wikipedia-fish.xml'
#df2 = xml_to_df(xmlfile)
# word_data_df is the table used by the later cells; only the dog export is
# included while the append below stays commented out.
word_data_df = df1 #.append(df2)
#word_data_df.head()

In [5]:
# Display the parsed article table (the notebook output follows).
word_data_df

Unnamed: 0,id,title,url,type,words
0,5957048,Kennel,https://en.wikipedia.org/wiki/Kennel,article,"[about, shelter, for, dogs, and, cats, for, th..."
1,729436,Cynology,https://en.wikipedia.org/wiki/Cynology,article,"[cynology, ipac, en, s, ᵻ, ˈ, n, ɒ, l, ə, dʒ, ..."
2,1764821,Pack (canine),https://en.wikipedia.org/wiki/Pack_(canine),article,"[other, uses, wolfpack, disambiguation, image,..."
3,547372987,Rare breed (dog),https://en.wikipedia.org/wiki/Rare_breed_(dog),article,"[for, a, list, of, rare, dog, breeds, category..."
4,547375119,Dogs in ancient China,https://en.wikipedia.org/wiki/Dogs_in_ancient_...,article,"[refimprove, date, december, 2008, originalres..."
5,6569922,Dog biscuit,https://en.wikipedia.org/wiki/Dog_biscuit,article,"[image, dog, biscuit, jpg, thumb, right, a, do..."
6,747942371,Breed type (dog),https://en.wikipedia.org/wiki/Breed_type_(dog),article,"[use, dmy, dates, date, july, 2013, breed, typ..."
7,1467938,Canid hybrid,https://en.wikipedia.org/wiki/Canid_hybrid,article,"[redirect2, dog, hybrid, hybrid, dog, all, dom..."
8,772563016,Canine physical therapy,https://en.wikipedia.org/wiki/Canine_physical_...,article,"[infobox, disease, name, canine, physical, the..."
9,13286072,Dogs in Mesoamerica,https://en.wikipedia.org/wiki/Dogs_in_Mesoamerica,article,"[no, footnotes, date, april, 2009, various, so..."


In [27]:
def make_dictionary(documents, min_count=2):
    """
    construct a gensim dictionary (word <-> integer id mapping) for the corpus,
    then remove English stopwords and rare words from it

    input: documents is an iterable of token lists, one per document
           min_count is the minimum number of documents a word must appear in
           to be kept (default 2)
    output: filtered dictionary

    Requires the nltk 'stopwords' corpus to be available.
    """
    dictionary = corpora.Dictionary(documents)

    stop_ids = [dictionary.token2id[word]
                for word in nltk.corpus.stopwords.words('english')
                if word in dictionary.token2id]
    # dictionary.dfs holds document frequencies (token id -> number of documents
    # containing the token), so the threshold counts documents, not occurrences.
    rare_ids = [token_id for token_id, doc_freq in dictionary.dfs.items()
                if doc_freq < min_count]
    dictionary.filter_tokens(stop_ids + rare_ids)
    # Reassign ids so they are contiguous after the removals.
    dictionary.compactify()
    return dictionary

def make_corpus(word_data_df):
    """
    build the bag-of-words corpus and its dictionary from the article table

    input: data frame with a 'words' column holding one token list per article
    output: (corpus, dictionary) where corpus is a list with one bag-of-words
            vector per article, in the same order as the rows
    """
    token_lists = word_data_df['words'].values
    vocab = make_dictionary(token_lists)
    # bag-of-words representation: each article becomes the result of
    # doc2bow, i.e. tuples of word index and word count
    bow_corpus = []
    for tokens in token_lists:
        bow_corpus.append(vocab.doc2bow(tokens))
    return bow_corpus, vocab

def make_lsi_similarity_matrix(word_data_df, tfidf_corpus, dictionary,
                               num_topics=200, num_best=6):
    """
    construct LSI (latent semantic indexing) model on Tfidf-transformed corpus,
    save it to 'wiki.lsi', print the first three topics, and return the
    similarity results.

    input: word_data_df is the article table ('title' and 'url' columns are used;
               rows must be in the same order as tfidf_corpus)
           tfidf_corpus is the Tfidf-transformed corpus
           dictionary is the id <-> word mapping used to build the corpus
           num_topics is the number of LSI topics (default 200)
           num_best is the number of hits requested per article, including the
               article itself (default 6); must be a positive integer
    output: (similarity_matrix, matsim)
            similarity_matrix maps each title to five lists:
                [[title], [url], similar titles, similar urls, similarity scores]
            matsim is the gensim MatrixSimilarity index

    If the same title occurs more than once, the last occurrence wins.
    """
    # construct model
    lsi = models.lsimodel.LsiModel(tfidf_corpus, id2word=dictionary, num_topics=num_topics)
    lsi.save('wiki.lsi')
    for i, topic in enumerate(lsi.print_topics(5)[:3]):
        print('Topic: ', format(i))
        print(str(topic).replace(' + ', '\n'))
        print('')
    # create similarity matrix
    matsim = similarities.MatrixSimilarity(lsi[tfidf_corpus], num_best=num_best)

    # Plain lists give positional lookup; indexing the Series directly is
    # label-based and breaks when the frame's index is not 0..n-1.
    titles = word_data_df['title'].tolist()
    urls = word_data_df['url'].tolist()

    # from Tonatiuh's similarity_matrix.py: save output in similarity_matrix array
    similarity_matrix = defaultdict(list)
    for doc_index, (title, url, sims) in enumerate(zip(titles, urls, matsim)):
        # Drop the article itself by index rather than assuming it is ranked
        # first, then keep at most len(sims) - 1 neighbours as before.
        neighbours = [(other, score) for other, score in sims if other != doc_index]
        neighbours = neighbours[:max(len(sims) - 1, 0)]
        similarity_matrix[title] = [
            [title],                                   # list of titles
            [url],                                     # list of urls
            [titles[other] for other, _ in neighbours],  # similar article titles
            [urls[other] for other, _ in neighbours],    # similar article urls
            [score for _, score in neighbours],          # similar article scores
        ]
    return similarity_matrix, matsim

def print_similar_articles(word_data_df, matsim, num_print):
    """
    print titles of first num_print articles and their most similar articles and similarity scores.
    this is independent of model used.

    input: word_data_df is the article table ('title' column is used; rows must
               be in the same order as the rows of matsim)
           matsim is an iterable yielding, per article, a list of
               (article index, score) hits
           num_print is the number of articles to print (non-negative integer,
               or None for all)
    output: None (prints to stdout)
    """
    from itertools import islice

    # A plain list gives positional lookup regardless of the frame's index.
    titles = word_data_df['title'].tolist()
    # islice stops after num_print rows instead of materializing every row.
    for doc_index, sims in enumerate(islice(matsim, num_print)):
        print(titles[doc_index])
        # Skip the article itself by index rather than assuming it is the first
        # hit; keep at most len(sims) - 1 neighbours as before.
        neighbours = [(other, score) for other, score in sims if other != doc_index]
        for other, score in neighbours[:max(len(sims) - 1, 0)]:
            print('\t', titles[other], ' ', score)

In [28]:
# Build the bag-of-words corpus and its dictionary from the article table.
corpus, dictionary = make_corpus(word_data_df)
# Fit a Tfidf model on the corpus; tfidf[corpus] below is the transformed corpus.
tfidf = models.TfidfModel(corpus)
# Fit the LSI model (prints the first topics) and collect the similarity results.
lsi_similarity_matrix,lsi_matsim = make_lsi_similarity_matrix(word_data_df, tfidf[corpus], dictionary)

Topic:  0
(0, '0.301*"cat"
0.219*"cats"
0.192*"dog"
0.155*"journal"
0.115*"meat"
0.094*"name"
0.091*"dogs"
0.090*"feral"
0.090*"doi"
0.089*"volume"')

Topic:  1
(1, '-0.415*"cat"
-0.345*"cafe"
0.316*"dog"
-0.250*"cats"
-0.138*"café"
0.132*"dogs"
0.130*"breed"
0.111*"wagging"
0.111*"kennel"
0.104*"breeds"')

Topic:  2
(2, '-0.273*"meat"
0.253*"bites"
-0.224*"cafe"
0.222*"bite"
0.173*"rabies"
0.161*"cdc"
-0.144*"festival"
-0.135*"china"
0.127*"infection"
0.114*"wagging"')



In [25]:
# Inspect the raw title -> similarity lists mapping.
# NOTE(review): this cell's execution count (25) is lower than that of the cells
# defining and calling make_lsi_similarity_matrix (27, 28), so the output shown
# below presumably comes from an earlier run -- re-run to refresh.
lsi_similarity_matrix

defaultdict(list,
            {'Ailurophobia': [['Ailurophobia',
               'Ailurophobia',
               'Ailurophobia',
               'Ailurophobia',
               'Ailurophobia'],
              ['https://en.wikipedia.org/wiki/Ailurophobia',
               'https://en.wikipedia.org/wiki/Ailurophobia',
               'https://en.wikipedia.org/wiki/Ailurophobia',
               'https://en.wikipedia.org/wiki/Ailurophobia',
               'https://en.wikipedia.org/wiki/Ailurophobia'],
              ['Cynophobia',
               'Cat',
               'Human interaction with cats',
               'Cat senses',
               'Dog'],
              ['https://en.wikipedia.org/wiki/Cynophobia',
               'https://en.wikipedia.org/wiki/Cat',
               'https://en.wikipedia.org/wiki/Human_interaction_with_cats',
               'https://en.wikipedia.org/wiki/Cat_senses',
               'https://en.wikipedia.org/wiki/Dog'],
              [0.30192595720291138,
               0.204

In [29]:
# Make a dataframe indexed by article title with one column per list stored in
# lsi_similarity_matrix: (1) article title, (2) url, (3) list of similar articles,
# (4) list of urls for similar articles, (5) list of similarity scores.
# Every cell holds a list; 'title' and 'url' are one-element lists.
df_sim = pd.DataFrame.from_dict(lsi_similarity_matrix, orient='index')
df_sim.columns = ['title','url','similar_titles', 'urls', 'scores']
df_sim.head()

Unnamed: 0,title,url,similar_titles,urls,scores
Kennel,[Kennel],[https://en.wikipedia.org/wiki/Kennel],"[Cattery, Indian National Kennel Club, Breed t...","[https://en.wikipedia.org/wiki/Cattery, https:...","[0.335962384939, 0.27727752924, 0.172500282526..."
Cynology,[Cynology],[https://en.wikipedia.org/wiki/Cynology],"[Dog, Breed type (dog), Cat training, Pussy, F...","[https://en.wikipedia.org/wiki/Dog, https://en...","[0.143377020955, 0.127784788609, 0.12467522919..."
Pack (canine),[Pack (canine)],[https://en.wikipedia.org/wiki/Pack_(canine)],"[Canid hybrid, Dog, Origin of the domestic dog...","[https://en.wikipedia.org/wiki/Canid_hybrid, h...","[0.270516097546, 0.224993467331, 0.21706835925..."
Rare breed (dog),[Rare breed (dog)],[https://en.wikipedia.org/wiki/Rare_breed_(dog)],"[Breed type (dog), Lists of dogs, Dog, Dog bit...",[https://en.wikipedia.org/wiki/Breed_type_(dog...,"[0.329889953136, 0.166610077024, 0.16474558413..."
Dogs in ancient China,[Dogs in ancient China],[https://en.wikipedia.org/wiki/Dogs_in_ancient...,"[Dogs in Mesoamerica, Dog, Origin of the domes...",[https://en.wikipedia.org/wiki/Dogs_in_Mesoame...,"[0.203201308846, 0.200637221336, 0.20002917945..."


In [30]:
# Write the LSI similarity table to the current working directory.
# NOTE(review): the cells hold Python lists, which are presumably written as
# their string form -- confirm the web application parses them back.
df_sim.to_csv('df_sim_lsi.csv')

In [None]:
# Print the first 10 articles with their most similar articles under the LSI model.
print_similar_articles(word_data_df, lsi_matsim, 10)

In [None]:
def make_rp_similarity_matrix(tfidf_corpus, dictionary, num_topics=200, num_best=6):
    """
    construct RP (random projections) model on Tfidf-transformed corpus,
    save it to 'wiki.rp_model', and return the similarity index.

    input: tfidf_corpus is the Tfidf-transformed corpus
           dictionary is the id <-> word mapping used to build the corpus
           num_topics is the number of projected dimensions (default 200)
           num_best is the number of hits requested per article (default 6)
    output: gensim MatrixSimilarity index over the RP-transformed corpus
    """
    # construct model
    rp = models.RpModel(corpus=tfidf_corpus, id2word=dictionary, num_topics=num_topics)
    rp.save('wiki.rp_model')
    # create similarity matrix
    matsim = similarities.MatrixSimilarity(rp[tfidf_corpus], num_best=num_best)
    return matsim

In [None]:
# Build the RP similarity index on the same Tfidf-transformed corpus and print
# the first 10 articles with their most similar articles.
rp_matsim = make_rp_similarity_matrix(tfidf[corpus], dictionary)
print_similar_articles(word_data_df, rp_matsim, 10)