In [1]:
###################################################
#  Tool to Get Q-A Pairs from     Support Page    #
###################################################
#                                                 #
#                                                 #
#        AbderRahman N. Sobh - 12/27/2018         #
#               All Rights Reserved.              #
###################################################

# Required Packages, install if anything is missing in your environment.

#!pip install gensim
#!pip install nltk
#!pip install pandas
#!pip install matplotlib
#!pip install beautifulsoup4

# Package Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim.summarization.textcleaner import get_sentences

from gensim.models import TfidfModel
from gensim.models.coherencemodel import CoherenceModel

import nltk
nltk.download('maxent_treebank_pos_tagger')
nltk.download('averaged_perceptron_tagger')

import urllib.request
from bs4 import BeautifulSoup

[nltk_data] Downloading package maxent_treebank_pos_tagger to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package maxent_treebank_pos_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
#################################################
# Parse all the Q-A URLs from the Support Pages #
#################################################
def get_qapages(support_url='URL HERE', levels=3):
    """Collect the URLs of the Question-Answer pages reachable from the support page.

    Starting from ``support_url``, the listing links on each page are followed
    ``levels`` times (support page -> sub-pages -> sub-sub-pages -> Q-A pages
    for the default of 3). The links found on the last level are returned.

    Parameters
    ----------
    support_url : str
        URL of the top-level support page to start from.
    levels : int
        Number of listing levels to descend before the links are treated as
        Question-Answer pages.

    Returns
    -------
    list of str
        The ``href`` values of the final-level listing links, in page order
        (duplicates are not removed).
    """
    pages = [support_url]
    for _ in range(levels):
        # Replace the current level with every listing link found on its pages
        pages = [link for page in pages for link in _get_listing_links(page)]
    return pages


def _get_listing_links(url):
    """Return the hrefs of every sub-page listing link found on ``url``."""
    # The context manager closes the HTTP response once the page is parsed,
    # so crawling many pages does not leave connections open.
    with urllib.request.urlopen(url) as response:
        soup = BeautifulSoup(response, 'html.parser')

    anchors = soup.find_all('a', class_='mt-listing-detailed-subpage-title internal')
    return [anchor['href'] for anchor in anchors]

In [3]:
#######################################################################
# Extract Question (Title) and Answer (Full Text) Pairs from each URL #
#######################################################################
def get_qapairs(qapages):
    """Download each Question-Answer page and extract its title and body text.

    Parameters
    ----------
    qapages : iterable of str
        URLs of the Question-Answer pages to fetch.

    Returns
    -------
    list of tuple
        One ``(title, fulltext, url)`` tuple per page, where ``title`` is the
        first child of the ``<h1 id="title">`` element and ``fulltext`` is the
        space-joined text of every ``<div class="mt-section">`` element.

    Raises
    ------
    IndexError
        If a page has no ``<h1 id="title">`` element.
    """
    qapairs = []

    for qapg in qapages:
        # Close the HTTP response as soon as the page has been parsed
        with urllib.request.urlopen(qapg) as response:
            soup = BeautifulSoup(response, 'html.parser')

        title = soup.find_all('h1', {'id' : 'title'})[0].contents[0].strip()

        # NOTE(review): find_all also returns nested mt-section divs, so text of
        # nested sections may be included more than once - confirm against the site markup.
        sections = soup.find_all('div', {'class' : 'mt-section'})
        fulltext = ' '.join(block.text for block in sections)

        qapairs.append((title, fulltext.strip(), qapg))

    return qapairs

In [4]:
############################################################################################
# Extract NLP data structures from any specified text column (i.e. 'Question' or 'Answer') #
############################################################################################
def tokenize(text):
    """Split ``text`` into gensim ``simple_preprocess`` tokens, dropping stopwords.

    Returns the tokens in their original order as a list.
    """
    kept = []
    for word in simple_preprocess(text):
        if word in STOPWORDS:
            continue
        kept.append(word)
    return kept

def get_NLP(df, column):
    """Derive NLP features from one text column of ``df``.

    The caller's DataFrame is left untouched; a copy with four extra columns
    is returned. For ``column == 'Question'`` the new columns are
    ``tokens_Question``, ``corpus_Question``, ``pos_tags_Question`` and
    ``tfidf_wordscores_Question``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the text column.
    column : str
        Name of the text column to process (i.e. 'Question' or 'Answer').

    Returns
    -------
    tuple
        ``(df, common_dictionary, common_corpus)``: the enriched copy of the
        frame, the gensim Dictionary built from the tokens, and the
        bag-of-words corpus (one ``doc2bow`` list per row).
    """
    # Work on a copy so the input frame does not silently gain columns
    df = df.copy()

    tokens_col = 'tokens_' + column
    corpus_col = 'corpus_' + column
    pos_tags_col = 'pos_tags_' + column
    tfidf_col = 'tfidf_wordscores_' + column

    # Tokens, dictionary and bag-of-words corpus for the chosen text column
    df[tokens_col] = df[column].apply(tokenize)
    common_dictionary = gensim.corpora.Dictionary(df[tokens_col])
    common_corpus = [common_dictionary.doc2bow(text) for text in df[tokens_col]]
    df[corpus_col] = common_corpus
    df[pos_tags_col] = df[tokens_col].apply(lambda x: nltk.pos_tag(x))

    # Fit a TF-IDF model and store (word, score) pairs for each row,
    # translating dictionary ids back to the words they stand for
    tmodel = TfidfModel(corpus=common_corpus)
    df[tfidf_col] = df[corpus_col].apply(
        lambda bow: [(common_dictionary.get(word_id), score) for (word_id, score) in tmodel[bow]])

    return df, common_dictionary, common_corpus
    

In [5]:
##################################################################
# Investigate Coherence Scores by iteratively fitting LDA models #
##################################################################

# Coherence models are used to estimate roughly how many topics are present in the dataset.
# This is an exploratory portion of the code which requires human judgement to select the best option,
# although the choice could also be automated by maximizing a combined score of both metrics (u_mass and c_v).

# Note that this portion of the code essentially creates a number of models with different topic selection counts
# and stores ALL of them as an array, allowing for easy testing of different model types on the fly.
# This is resource intensive!

def compute_coherence_values(dictionary, corpus, texts, start, limit, step, ctype):
    """Fit one LDA model per topic count and score each with a coherence metric.

    Topic counts are taken from ``range(start, limit, step)``. Every model is
    fitted with ``random_state=7`` and kept, so the returned lists are aligned:
    the i-th score belongs to the i-th model.

    Returns
    -------
    tuple
        ``(model_list, coherence_values)``.
    """
    fitted_models = []
    scores = []

    for topic_count in range(start, limit, step):
        lda = gensim.models.ldamodel.LdaModel(
            corpus=corpus,
            id2word=dictionary,
            num_topics=topic_count,
            random_state=7,
        )
        scorer = CoherenceModel(
            model=lda,
            dictionary=dictionary,
            corpus=corpus,
            texts=texts,
            coherence=ctype,
        )
        fitted_models.append(lda)
        scores.append(scorer.get_coherence())

    return fitted_models, scores


# Left off for the sake of time ... going off of human observation this time. See plotting function below.
#def maximize_coherence(cvals1, cvals2):

# Plots the standard-scaled coherence values for each metric, followed by their sum.
# The optimal topic count should perform well across both performance metrics.
def plot_coherence(dictionary, corpus, tokens, start=2, limit=40, step=1):
    """Plot standardized u_mass and c_v coherence scores, and their sum, per topic count.

    LDA models are fitted for every topic count in ``range(start, limit, step)``
    once per metric (via ``compute_coherence_values``), so this is resource
    intensive. Each score series is rescaled to zero mean and unit variance so
    the two metrics can be compared and summed. Three figures are shown:
    u_mass, c_v, and the sum of both.

    Parameters
    ----------
    dictionary : gensim dictionary passed through to the LDA / coherence models.
    corpus : bag-of-words corpus passed through to the LDA / coherence models.
    tokens : tokenized texts, used for the 'c_v' metric.
    start, limit, step : int
        Range of topic counts to evaluate (defaults reproduce 2..39).
    """
    # Compute u_mass and c_v coherence scores to compare results
    _, coherence_values1 = compute_coherence_values(
        dictionary=dictionary, corpus=corpus, texts=None,
        start=start, limit=limit, step=step, ctype='u_mass')

    _, coherence_values2 = compute_coherence_values(
        dictionary=dictionary, corpus=corpus, texts=tokens,
        start=start, limit=limit, step=step, ctype='c_v')

    # Put both metrics on the same scale for observation
    cvn1 = _standardize(coherence_values1)
    cvn2 = _standardize(coherence_values2)

    # Plot coherence scores
    x = range(start, limit, step)
    ticks = np.arange(start, limit, step=step)
    _plot_scores(x, cvn1, ticks, "Coherence score (u_mass)")
    _plot_scores(x, cvn2, ticks, "Coherence score (c_v)")
    _plot_scores(x, cvn1 + cvn2, ticks, "Sum of Metrics")


def _standardize(values):
    """Rescale ``values`` to zero mean and unit (population) standard deviation."""
    scores = np.asarray(values, dtype=float)
    spread = scores.std()
    if spread == 0:
        # A constant series cannot be scaled; only centre it
        spread = 1.0
    return (scores - scores.mean()) / spread


def _plot_scores(x, scores, ticks, ylabel):
    """Draw one labelled line plot of ``scores`` against the topic counts ``x``."""
    plt.figure(figsize=(10,5))
    plt.grid()
    # Label the line itself: passing the bare string to legend() would be
    # iterated character by character and label the line "c".
    plt.plot(x, scores, label="coherence_values")
    plt.xticks(ticks)
    plt.xlabel("Num Topics")
    plt.ylabel(ylabel)
    plt.legend(loc='best')
    plt.show()
    

In [6]:
##############################################
# The main function which controls this tool #
##############################################

def main(num_topics=9, topic_threshold=0.1, out_path='out.csv'):
    """Scrape the Q-A pages, enrich them with NLP features and write a CSV.

    For each of the 'Question' and 'Answer' columns this builds tokens, POS
    tags, TF-IDF word scores and an LDA topic model, then writes the selected
    schema columns to ``out_path``.

    Parameters
    ----------
    num_topics : int
        Number of LDA topics to fit for each text column. The default of 9 was
        determined by observation of the coherence plots.
    topic_threshold : float
        Minimum topic presence for a topic to be listed in 'topic/intent'.
    out_path : str
        Destination of the CSV file.

    Raises
    ------
    ValueError
        If no Question-Answer pairs were scraped.
    """
    # Generate the DataFrame
    qapages = get_qapages()
    qapairs = get_qapairs(qapages)
    if not qapairs:
        raise ValueError('No Question-Answer pairs were found; nothing to process.')
    df = pd.DataFrame(qapairs, columns=['Question', 'Answer', 'URL'])

    # Non-specific schema components
    df[''] = ''
    df['pre-processing'] = "Convert to lowercase, remove tokens < 2 chars or > 15 chars, remove stopwords (Stone, Denis, Kwantes (2010)), split on whitespace"

    # Generate the full schema and fill with values
    cdict = {}
    ccorpus = {}
    LDAmodel = {}
    model_topics = {}
    col_list = ['Question', 'Answer', '','pre-processing']
    ce = ['bow', 'pos_tags', 'keywords', 'context', 'LDA', 'topic/intent', 'topic_key']

    text_to_use = ['Question', 'Answer']

    for entry in text_to_use:
        # Enrich with NLP features, build Corpora and Dictionaries
        df, cdict[entry], ccorpus[entry] = get_NLP(df, entry)

        # If in notebook, consider generating the plots for observation:
        #%matplotlib inline
        #print('{} LDA Coherence Plots'.format(entry))
        #plot_coherence(cdict[entry],ccorpus[entry], df['tokens_{}'.format(entry)])

        # Fit the LDA model for this text column
        lda = gensim.models.ldamodel.LdaModel(
            corpus=ccorpus[entry], id2word=cdict[entry], num_topics=num_topics, random_state=7)
        LDAmodel[entry] = lda

        # Apply LDA to all documents
        topic_presence = df['corpus_{}'.format(entry)].apply(lambda bow: lda[bow])
        df['LDA_{}'.format(entry)] = topic_presence
        # Apply a threshold to topic presence for final interpretations
        df['topic/intent_{}'.format(entry)] = topic_presence.apply(
            lambda pairs: [i for i, v in pairs if v > topic_threshold])

        # The topic key (top words of every topic) is stored once, in the first
        # row only. The column is built as a whole rather than written with
        # chained indexing (df[col][0] = ...), which is not guaranteed to
        # update the frame.
        model_topics[entry] = [lda.show_topic(n) for n in range(num_topics)]
        topic_key = [''] * len(df)
        topic_key[0] = model_topics[entry]
        df['topic_key_{}'.format(entry)] = pd.Series(topic_key, index=df.index, dtype=object)

        # Apply expected schema components
        df['keywords_{}'.format(entry)] = df['tfidf_wordscores_{}'.format(entry)]

        # Context keeps only the tokens tagged as singular nouns ('NN')
        df['context_{}'.format(entry)] = df['pos_tags_{}'.format(entry)].apply(
            lambda tagged: [k for k, v in tagged if v == 'NN'])
        df = df.rename(columns={'tokens_{}'.format(entry): 'bow_{}'.format(entry)})

        col_list = col_list + [i+'_{}'.format(entry) for i in ce]

    fdf = df[col_list]
    fdf.to_csv(out_path)
    return


In [7]:
# Run the full pipeline: scrape the Q-A pages, build the NLP features and write out.csv.
main()