In [80]:
import os
import pandas as pd

import nltk
from nltk import word_tokenize
from nltk.chunk import RegexpParser

from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer

import collections

import wikipedia
import json
import random

# Extracting keywords with their abbreviations

In [30]:
# Load the term table; the next cell reads its "Parent Class", "Sub Class" and "Reference" columns.
df = pd.read_csv("bprml_updated.csv", encoding="utf-8")

In [31]:
# Single-column DataFrames for the three fields of the term table, plus its row count.
parent_class, sub_class, reference = (
    df[[column]] for column in ("Parent Class", "Sub Class", "Reference")
)
index_length = len(df)

In [32]:
# Number of rows in the term table.
print(index_length)

780


In [33]:
# Split the rows of the term table into two groups:
#   * parent class equal to sub class -> the term goes to `relevant_topics`, and
#     `topics_abbr` gets the reference when it differs from the term ("" otherwise);
#   * parent class different from sub class -> the [parent, sub] pair goes to
#     `different_terms`.
relevant_topics = []
topics_abbr = []

different_terms = []

for row in range(index_length):
    parent_term, sub_term, ref_term = (
        frame.iloc[row].values[0] for frame in (parent_class, sub_class, reference)
    )
    if parent_term != sub_term:
        different_terms.append([parent_term, sub_term])
        continue
    relevant_topics.append(parent_term)
    topics_abbr.append(ref_term if parent_term != ref_term else "")

In [35]:
# Number of abbreviation slots collected so far (one is appended per entry of relevant_topics).
print(len(topics_abbr))

647


# Keyphrase Extraction from Book Chapters

In [38]:
def extract_candidate_keywords(chunkGram, text):
    """Return the text of every 'PHRASE' chunk found in ``text``.

    The text is tokenized and POS-tagged with NLTK, then parsed with a
    ``RegexpParser`` built from ``chunkGram``. Each subtree labelled
    'PHRASE' is rendered as its leaf tokens joined by single spaces.

    Args:
        chunkGram: chunk grammar string handed to ``nltk.RegexpParser``.
        text: raw text to analyse.

    Returns:
        List of phrase strings, in the order the subtrees are visited.
    """
    parser = nltk.RegexpParser(chunkGram)
    tokens = nltk.word_tokenize(text)
    parse_tree = parser.parse(nltk.pos_tag(tokens))

    return [
        ' '.join(token for token, _tag in subtree.leaves())
        for subtree in parse_tree.subtrees()
        if subtree.label() == 'PHRASE'
    ]

def clean_phrase(phrase):
    """Stem each space-separated word of ``phrase`` with the module-level ``porter`` stemmer.

    Args:
        phrase: string whose words are separated by single spaces.

    Returns:
        The stemmed words joined back together with single spaces.
    """
    return " ".join(porter.stem(token) for token in phrase.split(" "))


# Stemmers; `porter` is the one used by clean_phrase.
porter = PorterStemmer()
lancaster = LancasterStemmer()

chapter_file = "chapter_text/bprml.txt"

# Read through a context manager so the file handle is always closed, and
# decode explicitly as UTF-8 (same encoding as the CSV load) instead of
# relying on the platform default.
with open(chapter_file, encoding="utf-8") as chapter_handle:
    raw = chapter_handle.read()

# Phrases are kept only when they have fewer than this many space-separated words.
max_length = 6

# Grammar used below: NBAR is a run of nouns/adjectives ending in a noun;
# PHRASE is a single NBAR or two NBARs joined by a preposition (IN).
chunkGram1 = r"""
    NBAR:
        {<NN.*|JJ>*<NN.*>}
        
    PHRASE:
        {<NBAR>}
        {<NBAR><IN><NBAR>}
"""

# Alternative single-rule grammar; defined but not used in this cell.
chunkGram2 = r""" PHRASE: 
                {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}
            """


candidate_keywords = extract_candidate_keywords(chunkGram1, raw)

candidate_keywords = [word.lower() for word in candidate_keywords]

# De-duplicate, then drop phrases that are too long.
candidate_keywords = set(candidate_keywords)
candidate_keywords = [w for w in candidate_keywords if len(w.split(' ')) < max_length]

print(len(candidate_keywords))

# Stem every phrase; distinct phrases can collapse to the same stemmed form,
# hence the second de-duplication.
filtered_keywords = [clean_phrase(w) for w in candidate_keywords]

filtered_keywords = set(filtered_keywords)

print(len(filtered_keywords))

17242
16025


# Matching Parent and Sub Class

In [39]:
def find_best_string(all_strings):
    """Return the most frequent item of ``all_strings``.

    Ties are resolved in favour of the item that appears first in
    ``all_strings``, so the result is reproducible from run to run
    (taking the maximum over a ``set`` of strings depends on hash
    ordering, which can differ between interpreter sessions).

    Args:
        all_strings: non-empty iterable of hashable items.

    Returns:
        The item with the highest number of occurrences.

    Raises:
        ValueError: if ``all_strings`` is empty.
    """
    # Counter keeps keys in first-seen order and max() returns the first
    # maximal key, which gives the deterministic tie-break in a single pass.
    occurrences = collections.Counter(all_strings)
    return max(occurrences, key=occurrences.__getitem__)


def matching_pattern(keyword1, keyword2, phrase):
    """Combine two keywords into one term when both occur in ``phrase``.

    Both keywords are stemmed with ``clean_phrase`` and looked up as plain
    substrings of ``phrase``; ``phrase`` itself is used as given (it is not
    stemmed here). Only the first occurrence of each stem is considered.
    The result is the keyword that occurs first, followed by the text of
    ``phrase`` lying between the two occurrences, followed by the other
    keyword.

    Occurrences that overlap (for example when one stem contains the other)
    are treated as no match; there is no text between them, and joining the
    keywords would yield strings such as "mixture modelmodel".

    Args:
        keyword1: first keyword (unstemmed).
        keyword2: second keyword (unstemmed).
        phrase: candidate phrase to search in.

    Returns:
        ``(1, combined_string)`` on a match, ``(0, "")`` otherwise.
    """
    stem1 = clean_phrase(keyword1)
    stem2 = clean_phrase(keyword2)
    if stem1 not in phrase or stem2 not in phrase:
        return 0, ""

    pos1 = phrase.index(stem1)
    pos2 = phrase.index(stem2)

    # Order the keywords by where their stems first appear in the phrase.
    if pos1 > pos2:
        leading, trailing = keyword2, keyword1
        gap_start, gap_end = pos2 + len(stem2), pos1
    else:
        leading, trailing = keyword1, keyword2
        gap_start, gap_end = pos1 + len(stem1), pos2

    # The leading stem runs into the trailing one: the two occurrences overlap.
    if gap_start > gap_end:
        return 0, ""

    return 1, leading + phrase[gap_start:gap_end] + trailing

# For every [parent, sub] pair, run matching_pattern against each stemmed
# candidate phrase. Pairs with at least one match contribute their most
# frequent combined string to `successful_phrases` (and are printed); the
# remaining pairs are kept unchanged in `unsuccessful_phrases`.
count = 0
successful_phrases = []
unsuccessful_phrases = []


for pair in different_terms:
    outcomes = [matching_pattern(pair[0], pair[1], phrase) for phrase in filtered_keywords]
    possible_strings = [combined for flag, combined in outcomes if flag != 0]
    match_count = len(possible_strings)
    if match_count == 0:
        unsuccessful_phrases.append(pair)
        continue
    main_keyword = find_best_string(possible_strings)
    successful_phrases.append(main_keyword)
    count += 1
    print(main_keyword)


between-class covariance
within-class covariance
partitioned covariance matrix
conditional entropy
differential entropy
relative entropy
functional derivative
conditional gaussian
gaussian marginal
gaussian mixture
directed graphical model
undirected graphical model
autoregressive hidden markov model
factorial hidden markov model
input-output hidden markov model
left-to-right hidden markov model
extended kalman ﬁlter
gaussian kernel function
fisher linear discriminant
linear regression problem
variational linear regression
bayesian logistic regression
logistic regression mixture model
multiclass logistic regression
margin error
soft margin
homogeneous markov chain
message passing schedule
variational message passing
conditional mixture model
logistic regression mixture model
convolutional neural network
perceptron convergence theorem
perceptron hardware
conjugate prior
consistent gaussian prior
improper prior
noninformative prior
bayesian probability
probability density
probability mas

In [40]:
# How many pairs produced a combined phrase, and how many did not.
print(len(successful_phrases))
print(len(unsuccessful_phrases))

45
88


In [42]:
# Fallback for unmatched pairs: second element of the pair, a space, then the first element.
combine_phrases = [pair[1] + " " + pair[0] for pair in unsuccessful_phrases]

In [45]:
# Add the matched phrases and then the fallback phrases to the topic list;
# each of them gets an empty abbreviation so both lists stay the same length.
for extra_topics in (successful_phrases, combine_phrases):
    relevant_topics.extend(extra_topics)
    topics_abbr.extend([""] * len(extra_topics))

In [49]:
# Show every topic next to its abbreviation.
for position, topic in enumerate(relevant_topics):
    print(topic, " ", topics_abbr[position])

acceptance criterion   
activation function   
active constraint   
adaboost   
adaline   
adaptive rejection sampling   
assumed density ﬁltering   
akaike information criterion   aic
akaike information criterion   
ancestral sampling   
annular ﬂow   
autoregressive model   ar model
arc   
automatic relevance determination   ard
autoregressive moving average   arma
assumed density ﬁltering   
autoassociative networks   
automatic relevance determination   
autoregressive hidden markov model   
autoregressive model   
autoregressive moving average   
back-tracking   
backgammon   
backpropagation   
bagging   
basis function   
batch training   
baum-welch algorithm   
bayes’ theorem   
bayes   
bayesian analysis   
bayesian information criterion   
bayesian model comparison   
bayesian network   
bayesian probability   
belief propagation   
bernoulli distribution   
bernoulli   
beta distribution   
beta recursion   
between-class covariance   
bias   
bias parameter   
bias-varianc

# Functions for extracting Wikipedia data

In [138]:
def contain_section(line):
    """Tell whether ``line`` looks like a wiki section heading.

    A heading is a line that, once surrounding whitespace is stripped, is
    longer than four characters and both starts and ends with "==".

    Args:
        line: one line of page text.

    Returns:
        bool: True for a heading line, False otherwise. Lines of four
        characters or fewer also yield False rather than None.
    """
    line = line.strip()
    return len(line) > 4 and line.startswith("==") and line.endswith("==")

def wiki_section_extract(content):
    """Collect the section titles found in wiki page text.

    Every line accepted by ``contain_section`` is reduced to its title by
    removing surrounding whitespace and all leading/trailing "=" markers.
    This handles headings of any depth ("== A ==", "=== B ===") and headings
    with or without spaces around the title; a fixed-width slice would leave
    stray "=" characters on deeper headings.

    Args:
        content: page text with lines separated by "\\n".

    Returns:
        The titles joined by newlines, in document order ("" when the text
        has no heading lines).
    """
    titles = [
        line.strip().strip("=").strip()
        for line in content.split("\n")
        if contain_section(line)
    ]
    return "\n".join(titles).strip()

def keyword_data(topic = "", abbr = "", wiki_title = "", wiki_summary = "",
                 wiki_content = "", wiki_html = "", wiki_links = "", wiki_sections = ""):
    """Bundle the fields describing one topic into a dict.

    Every argument defaults to "" and is stored under the key of the same
    name, except ``abbr``, which is stored under "abbreviation".

    Returns:
        dict with the keys topic, abbreviation, wiki_title, wiki_summary,
        wiki_content, wiki_html, wiki_links and wiki_sections, in that order.
    """
    return dict(
        topic=topic,
        abbreviation=abbr,
        wiki_title=wiki_title,
        wiki_summary=wiki_summary,
        wiki_content=wiki_content,
        wiki_html=wiki_html,
        wiki_links=wiki_links,
        wiki_sections=wiki_sections,
    )


def extract_data(topic, abbr = ""):
    """Look up ``topic`` on Wikipedia and package the result as a record.

    ``wikipedia.search`` serves as an existence probe: when it returns no
    hits, no page is requested. Otherwise the page for ``topic`` is fetched
    and its title, summary, content, HTML, links and section titles are
    stored. Any field that could not be retrieved stays "".

    Prints "blank" when the search has no hits or when the page lookup raises
    ``DisambiguationError`` or ``PageError``. Other exceptions propagate to
    the caller.

    Args:
        topic: term to look up.
        abbr: abbreviation stored alongside the topic; not used for the lookup.

    Returns:
        dict built by ``keyword_data``.
    """
    wiki_fields = {}
    try:
        # Only the existence of a first hit matters: indexing raises
        # IndexError when the search comes back empty.
        _first_hit = wikipedia.search(topic)[0]
        try:
            wiki_data = wikipedia.page(topic)
            page_text = wiki_data.content
            # Record the resolved page title as well; without this the
            # wiki_title field stays empty for every topic.
            wiki_fields["wiki_title"] = wiki_data.title
            wiki_fields["wiki_summary"] = wiki_data.summary
            wiki_fields["wiki_content"] = page_text
            wiki_fields["wiki_html"] = wiki_data.html()
            wiki_fields["wiki_links"] = wiki_data.links
            wiki_fields["wiki_sections"] = wiki_section_extract(page_text)

        except (wikipedia.exceptions.DisambiguationError,
                wikipedia.exceptions.PageError):
            print("blank")

    except IndexError:
        print("blank")

    return keyword_data(topic, abbr, **wiki_fields)


In [134]:
# Fetch the record for the single topic at index 70.
data = extract_data(relevant_topics[70], topics_abbr[70])

In [135]:
# Store the record fetched for index 70.
# NOTE(review): `all_keyword_data` is first assigned in a later cell of this
# notebook, so on a fresh kernel run top-to-bottom this line raises NameError,
# and that later assignment then discards the entry. TODO: move this after the
# dict is created, or drop it.
all_keyword_data[70] = data

# `relevant_topics`, `topics_abbr`, and all helper functions are defined by this point

In [123]:
# Total number of topics to fetch.
list_len = len(relevant_topics)
# Records keyed by topic index; running this cell again discards everything stored so far.
all_keyword_data = {}

In [141]:
# Fetch only the topics that have no stored record yet. The cell can be
# re-run after an interruption and also works from a fresh kernel; a
# hard-coded start offset would silently skip every earlier index whenever
# the dict does not already hold those records.
pending = [i for i in range(list_len) if i not in all_keyword_data]
complete = list_len - len(pending)  # number of topics already fetched
for i in pending:
    data = extract_data(relevant_topics[i], topics_abbr[i])
    all_keyword_data[i] = data
    print(i)

577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600




  lis = BeautifulSoup(html).find_all('li')


blank
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
blank
616
blank
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
blank
635
636
637
638
639
640
641
642
643
644
645
blank
646
647
blank
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
blank
670
blank
671
blank
672
673
674
675
676
677
678
blank
679
680
681
blank
682
683
blank
684
685
686
687
688
blank
689
690
691
692
693
694
695
696
697
698
699
blank
700
701
702
703
704
705
blank
706
blank
707
708
709
710
711
712
713
714
715
blank
716
717
718
719
720
721
722
723
724
725
726
727
728
729
blank
730
731
732
733
734
735
736
737
738
739
740
741
blank
742
743
744
745
746
747
748
blank
749
750
751
752
753
754
755
756
blank
757
758
759
760
761
762
blank
763
764
blank
765
blank
766
blank
767
768
769
770
771
772
773
774
blank
775
blank
776
blank
777
778
blank
779


In [142]:
# Number of stored records.
print(len(all_keyword_data))

780


## Saving data in JSON Format

In [143]:
# Write all records to topic_data.json. The dict is keyed by integer topic
# index; JSON object keys are strings, so the indices are written as strings.
with open('topic_data.json', 'w') as file:
    json.dump(all_keyword_data, file)

## Saving data in CSV format

In [144]:
# Build the frame in one step from the stored records. Growing it row by row
# with DataFrame.append is quadratic, and that method no longer exists in
# pandas 2.x. Iterating the sorted keys keeps the rows in topic-index order
# and does not assume the keys are exactly 0..n-1.
topic_columns = ['topic', 'abbreviation', 'wiki_title', 'wiki_summary', 'wiki_content', 'wiki_html', 'wiki_links', 'wiki_sections']
topic_records = [all_keyword_data[key] for key in sorted(all_keyword_data)]
df = pd.DataFrame(topic_records, columns=topic_columns)

df.to_csv("topic_data.csv")