In [1]:
import os
import pandas as pd

import nltk
from nltk import word_tokenize
from nltk.chunk import RegexpParser

from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer

import collections

import wikipedia
import json
import random

# Extracting keywords with their abbreviations

In [23]:
# Load the index-term table; the following cells read its "Parent Class",
# "Sub Class" and "Reference" columns.
df = pd.read_csv("required_data/prml_index_terms.csv", encoding="utf-8")

In [24]:
# Row count of the index table, plus each column of interest kept as its own
# single-column DataFrame.
index_length = len(df)
parent_class = df.loc[:, ["Parent Class"]]
sub_class = df.loc[:, ["Sub Class"]]
reference = df.loc[:, ["Reference"]]

In [25]:
# Sanity check: number of rows in the index table.
print(index_length)

780


In [26]:
# Split the index rows into two groups:
#   * parent class == sub class -> the row is a topic in its own right; its
#     reference is stored as the abbreviation unless it merely repeats the
#     topic name, in which case "" is stored;
#   * otherwise -> the [parent, sub] pair is set aside in `different_terms`.
relevant_topics = []
topics_abbr = []

different_terms = []

rows = zip(
    parent_class.values[:index_length, 0],
    sub_class.values[:index_length, 0],
    reference.values[:index_length, 0],
)
for parent, sub, ref in rows:
    if parent != sub:
        different_terms.append([parent, sub])
        continue
    relevant_topics.append(parent)
    topics_abbr.append(ref if parent != ref else "")

In [27]:
# Number of abbreviation slots collected so far (one per relevant topic).
print(len(topics_abbr))

647


# Keyphrase Extraction from Book Chapters

In [7]:
def extract_candidate_keywords(chunkGram, text):
    """Return every PHRASE chunk found in ``text`` as a space-joined string.

    The text is tokenized and POS-tagged with NLTK, then chunked with the
    grammar given in ``chunkGram``.  Each subtree labelled ``PHRASE``
    contributes one candidate: its leaf tokens joined by single spaces.
    Duplicates are kept; candidates appear in subtree-traversal order.
    """
    grammar_parser = nltk.RegexpParser(chunkGram)
    tokens = nltk.word_tokenize(text)
    parse_tree = grammar_parser.parse(nltk.pos_tag(tokens))

    return [
        ' '.join(token for token, _tag in subtree.leaves())
        for subtree in parse_tree.subtrees()
        if subtree.label() == 'PHRASE'
    ]

def clean_phrase(phrase):
    """Stem each space-separated word of ``phrase`` and re-join with spaces.

    Relies on the module-level ``porter`` stemmer.  The split is on single
    spaces, so the number of words (and separators) in the result matches
    the input exactly.
    """
    return " ".join(porter.stem(token) for token in phrase.split(" "))


# Stemmers. `porter` is the one clean_phrase() uses; `lancaster` is created
# as well but is not referenced by the cells shown in this notebook.
porter = PorterStemmer()
lancaster = LancasterStemmer()

chapter_file = "required_data/prml_booktext.txt"

# Read the book text with an explicit encoding (the index CSV is read as
# UTF-8 too) and let the context manager close the handle, instead of relying
# on the platform default encoding and on garbage collection.
with open(chapter_file, encoding="utf-8") as book_file:
    raw = book_file.read()

# Candidate phrases must contain fewer than this many space-separated words.
max_length = 6

# Grammar actually used below: NBAR is a run of nouns/adjectives ending in a
# noun; PHRASE is built from NBARs.
# NOTE(review): the rules of one stage are applied in order, so `{<NBAR>}`
# presumably chunks every NBAR before `{<NBAR><IN><NBAR>}` is tried, leaving
# that second rule nothing to match. Confirm against the NLTK RegexpParser
# documentation, and consider swapping the two rules if "X of Y" phrases are
# wanted (this would change the candidate set and the counts printed below).
chunkGram1 = r"""
    NBAR:
        {<NN.*|JJ>*<NN.*>}
        
    PHRASE:
        {<NBAR>}
        {<NBAR><IN><NBAR>}
"""

# Alternative single-rule grammar; defined but not used below.
chunkGram2 = r""" PHRASE: 
                {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}
            """


# Lower-case and de-duplicate the extracted phrases, then drop the long ones.
unique_candidates = {
    phrase.lower() for phrase in extract_candidate_keywords(chunkGram1, raw)
}
candidate_keywords = [
    w for w in unique_candidates if len(w.split(' ')) < max_length
]

print(len(candidate_keywords))

# Stem every candidate; different surface forms may collapse to the same
# stemmed phrase, hence the set.
filtered_keywords = {clean_phrase(w) for w in candidate_keywords}

print(len(filtered_keywords))

17241
16036


# Matching Parent and Sub Class

In [28]:
def find_best_string(all_strings):
    """Return the most frequent string in ``all_strings``.

    Ties are broken deterministically by taking the lexicographically
    smallest of the most frequent strings.  The previous implementation took
    ``max`` over a ``set``, so the winner of a tie depended on set iteration
    order, which varies between interpreter sessions for strings.

    Args:
        all_strings: non-empty list of candidate strings.

    Returns:
        The string with the highest occurrence count.

    Raises:
        ValueError: if ``all_strings`` is empty.
    """
    if not all_strings:
        raise ValueError("find_best_string() needs at least one candidate string")

    # One counting pass instead of calling list.count() per distinct value.
    counts = collections.Counter(all_strings)
    highest = max(counts.values())
    return min(text for text, seen in counts.items() if seen == highest)


def matching_pattern(keyword1, keyword2, phrase):
    """Try to build a combined keyword from two terms found in ``phrase``.

    Both keywords are stemmed with ``clean_phrase`` and searched for as
    substrings of ``phrase``.  ``phrase`` itself is used as given (the caller
    passes phrases that were already stemmed).

    Returns:
        ``(1, combined)`` when both stems occur in ``phrase``; ``combined`` is
        the keyword whose stem occurs first, followed by the text of
        ``phrase`` lying between the two stem matches, followed by the other
        keyword.  ``(0, "")`` when either stem is missing.
    """
    stem1, stem2 = clean_phrase(keyword1), clean_phrase(keyword2)
    if stem1 not in phrase or stem2 not in phrase:
        return 0, ""

    pos1 = phrase.index(stem1)
    pos2 = phrase.index(stem2)

    # Order the two terms by where their stems first occur in the phrase;
    # when both start at the same position, keyword1 goes first.
    if pos1 > pos2:
        leading, leading_stem, leading_pos = keyword2, stem2, pos2
        trailing, trailing_pos = keyword1, pos1
    else:
        leading, leading_stem, leading_pos = keyword1, stem1, pos1
        trailing, trailing_pos = keyword2, pos2

    # Text between the end of the first stem and the start of the second
    # (empty when the two matches touch or overlap).
    between = phrase[leading_pos + len(leading_stem):trailing_pos]
    return 1, leading + between + trailing

# For every [parent, sub] pair, collect the combined strings produced by all
# stemmed candidate phrases that contain both terms, then keep the most common
# one.  Pairs without any matching phrase are set aside for later handling.
count = 0
successful_phrases = []
unsuccessful_phrases = []


for pair in different_terms:
    outcomes = (
        matching_pattern(pair[0], pair[1], candidate)
        for candidate in filtered_keywords
    )
    possible_strings = [text for matched, text in outcomes if matched != 0]
    match_count = len(possible_strings)

    if match_count == 0:
        unsuccessful_phrases.append(pair)
        continue

    main_keyword = find_best_string(possible_strings)
    successful_phrases.append(main_keyword)
    count += 1
    print(main_keyword)


between-class covariance
within-class covariance
partitioned covariance matrix
conditional entropy
differential entropy
relative entropy
functional derivative
conditional gaussian
gaussian marginal
gaussian mixture
directed graphical model
undirected graphical model
autoregressive hidden markov model
factorial hidden markov model
input-output hidden markov model
left-to-right hidden markov model
gaussian kernel function
fisher linear discriminant
variational linear regression
bayesian logistic regression
multiclass logistic regression
margin error
soft margin
homogeneous markov chain
message passing schedule
variational message passing
conditional mixture model
convolutional neural network
perceptron convergence theorem
perceptron hardware
conjugate prior
consistent gaussian prior
improper prior
noninformative prior
bayesian probability
probability density
probability mass function
prior probability
probability sum rule
probability theory
tikhonov regularization


In [30]:
# How many [parent, sub] pairs produced a matched phrase, and how many did not.
print(len(successful_phrases))
print(len(unsuccessful_phrases))

41
92


In [31]:
# Fallback for pairs with no matching phrase: join them as "<sub> <parent>".
combine_phrases = [
    unmatched[1] + " " + unmatched[0] for unmatched in unsuccessful_phrases
]

In [32]:
# Add the matched phrases first, then the fallback combinations, to the topic
# list; none of them has an abbreviation, so each gets an empty slot.
# Note: this extends the lists in place, so running it twice adds the entries
# twice.
for extra_topics in (successful_phrases, combine_phrases):
    relevant_topics.extend(extra_topics)
    topics_abbr.extend([""] * len(extra_topics))

In [15]:
# List every topic next to its abbreviation slot.
for position, topic in enumerate(relevant_topics):
    print(topic, " ", topics_abbr[position])

acceptance criterion   
activation function   
active constraint   
adaboost   
adaline   
adaptive rejection sampling   
assumed density ﬁltering   
akaike information criterion   aic
akaike information criterion   
ancestral sampling   
annular ﬂow   
autoregressive model   ar model
arc   
automatic relevance determination   ard
autoregressive moving average   arma
assumed density ﬁltering   
autoassociative networks   
automatic relevance determination   
autoregressive hidden markov model   
autoregressive model   
autoregressive moving average   
back-tracking   
backgammon   
backpropagation   
bagging   
basis function   
batch training   
baum-welch algorithm   
bayes’ theorem   
bayes   
bayesian analysis   
bayesian information criterion   
bayesian model comparison   
bayesian network   
bayesian probability   
belief propagation   
bernoulli distribution   
bernoulli   
beta distribution   
beta recursion   
between-class covariance   
bias   
bias parameter   
bias-varianc

## Saving Extracted Relevant Keywords

In [34]:
# Total number of topics that will be saved.
print(len(relevant_topics))

780


In [35]:
# Map each position to a {"keyword", "abbreviation"} record.
main_keywords = {
    position: {
        "keyword": topic,
        "abbreviation": topics_abbr[position],
    }
    for position, topic in enumerate(relevant_topics)
}

In [37]:
# Build the keyword table in a single step from the records, in key order
# 0..n-1.  The previous row-by-row `DataFrame.append` copied the frame on
# every iteration and no longer exists in pandas 2.x.
# Note: this rebinds `df`, which earlier in the notebook held the index-term
# table read from CSV.
records = [main_keywords[i] for i in range(len(main_keywords))]
df = pd.DataFrame(records, columns=['keyword', 'abbreviation'])

# Create the output directory on a fresh checkout so the save cannot fail
# merely because the folder is missing.
os.makedirs("output_data", exist_ok=True)
df.to_csv("output_data/keyword_list.csv")