In [1]:
import os
import pandas as pd

import nltk
from nltk import word_tokenize
from nltk.chunk import RegexpParser

from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer

import collections

In [2]:
index_file = "index_text/bprml.txt"
chapter_file = "chapter_text/bprml.txt"

# index_file = "index_text/iandl.txt"
# chapter_file = "chapter_text/iandl.txt"

# The following functions are used for extracting data from the text files

In [3]:
def toggle_state(class_indexing):
    """Return the logical negation of ``class_indexing`` as a bool."""
    return not class_indexing
   

def line_split(line):
    """Split one index line into ``[term]`` or ``[term, reference, ...]``.

    The line is split on commas. The first field (stripped) is the index
    term. Every later field that starts with the cross-reference marker
    ``"see "`` contributes the text after that marker; all other fields
    are ignored.

    Only a leading ``"see "`` counts as a cross-reference. A field that
    merely contains the letters "see" (e.g. ``"unseen data"``) is no longer
    mistaken for one and truncated.
    """
    marker = "see "
    fields = line.split(",")
    keyword = [fields[0].strip()]
    for field in fields[1:]:
        field = field.strip()
        if field.startswith(marker):
            keyword.append(field[len(marker):])
    return keyword


def data_append(parent_class, keyword):
    """Build one record dict from a parent class and a ``line_split`` result.

    ``keyword[0]`` becomes the sub class; ``keyword[1]`` (when present)
    becomes the reference, otherwise the reference is the empty string.
    """
    reference = keyword[1] if len(keyword) > 1 else ""
    return {
        'Parent Class': parent_class,
        'Sub Class': keyword[0],
        'Reference': reference,
    }


def line_operation(line, parent_class, class_indexing):
    """Turn an index line into a record, updating the parent class if needed.

    When ``class_indexing`` is falsy the line's own term becomes the new
    parent class; otherwise the incoming ``parent_class`` is kept.
    Returns ``(parent_class, record_dict)``.
    """
    keyword = line_split(line)
    if not class_indexing:
        parent_class = keyword[0]
    return (parent_class, data_append(parent_class, keyword))

    
def data_extraction(line, parent_class, class_indexing):
    """Classify one raw line of the index file and extract its record.

    Returns ``(state, parent_class, class_indexing, data)``:

    * line starts with ``,`` or an ASCII digit: skipped, ``state`` 0;
    * line starts with a newline: ``class_indexing`` is toggled, ``state`` 0;
    * anything else: a record is built, ``state`` 1.

    ``data`` is the empty string whenever ``state`` is 0.
    """
    line = line.lower()
    first_char = line[0]

    if first_char == ',' or '0' <= first_char <= '9':
        return (0, parent_class, class_indexing, "")

    if first_char == '\n':
        return (0, parent_class, toggle_state(class_indexing), "")

    parent_class, data = line_operation(line, parent_class, class_indexing)
    return (1, parent_class, class_indexing, data)

def print_processed_data(df):
    """Print every row as ``sub class , reference``.

    Rows whose sub class differs from their parent class are prefixed
    with a tab so they appear nested under the parent entry.
    """
    rows = zip(df["Parent Class"], df["Sub Class"], df["Reference"])
    for parent, sub, ref in rows:
        if parent == sub:
            print(sub, ",", ref)
        else:
            print("\t", sub, ",", ref)

# Data extraction from index file

In [52]:
def extract_index_data(file_location, df):
    """Parse an index text file and add its records to ``df``.

    Each line is passed through ``data_extraction``; lines that yield a
    record are collected and concatenated onto ``df`` in one step. The
    resulting frame is also written as CSV to ``<base name>.csv`` (relative
    to the current working directory), where the base name is the file
    name in ``file_location`` up to its first dot.

    Parameters
    ----------
    file_location : str
        Path of the index text file, using ``/`` as separator.
    df : pandas.DataFrame
        Frame the extracted rows are added to (not modified in place).

    Returns
    -------
    pandas.DataFrame
        ``df`` followed by the extracted rows, with a fresh integer index
        whenever at least one row was extracted.
    """
    parent_class = ""
    class_indexing = False
    records = []

    # The context manager closes the file even if parsing raises.
    with open(file_location) as file:
        for line in file:
            state, parent_class, class_indexing, data = data_extraction(
                line, parent_class, class_indexing
            )
            if state:
                records.append(data)

    # DataFrame.append was removed in pandas 2.0 and copied the whole frame
    # on every call; build the new rows once and concatenate a single time.
    if records:
        df = pd.concat([df, pd.DataFrame(records)], ignore_index=True)

    save_file = file_location.split("/")[-1].split(".")[0] + ".csv"
    df.to_csv(save_file)

    return df

In [53]:
df = pd.DataFrame(columns=['Parent Class', 'Sub Class', 'Reference'])
df = extract_index_data(index_file, df)

# maximum length of keyword

In [6]:
parent_class = df[["Parent Class"]]
sub_class = df[["Sub Class"]]
index_length = df.shape[0]

In [8]:
max_length = 1

def cal_max_length(n1, n2):
    """Return the larger of ``n1`` and ``n2`` (``n2`` when they compare equal)."""
    return n1 if n1 > n2 else n2

# Longest index entry measured in words: a stand-alone term counts its own
# words, a sub entry counts its words plus those of its parent term.
for row in range(index_length):
    parent_term = parent_class.iat[row, 0]
    sub_term = sub_class.iat[row, 0]
    word_count = len(sub_term.split(" "))
    if parent_term != sub_term:
        word_count += len(parent_term.split(" "))
    max_length = cal_max_length(max_length, word_count)

In [54]:
max_length

6

# keywords extraction from CSV files

In [10]:
# Stand-alone index terms (parent class equals sub class).
singular_terms = []
# [parent, sub] pairs for entries nested under a parent term.
different_terms = []

for row in range(index_length):
    parent_term = parent_class.iat[row, 0]
    sub_term = sub_class.iat[row, 0]
    if parent_term == sub_term:
        singular_terms.append(sub_term)
    else:
        different_terms.append([parent_term, sub_term])

print(len(singular_terms))
print(len(different_terms))

648
133


# Keyphrase Extraction from Book Chapters

In [11]:
def extract_candidate_keywords(chunkGram, text):
    """Return the text of every ``PHRASE`` chunk found in ``text``.

    ``text`` is tokenized and POS-tagged with NLTK, then chunked with a
    ``RegexpParser`` built from ``chunkGram``. Each subtree labelled
    ``PHRASE`` contributes its leaf tokens joined by single spaces, in the
    order the subtrees are visited.
    """
    parser = nltk.RegexpParser(chunkGram)
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(text))
    parse_tree = parser.parse(tagged_tokens)

    return [
        ' '.join(token for token, _tag in subtree.leaves())
        for subtree in parse_tree.subtrees()
        if subtree.label() == 'PHRASE'
    ]

# Useful Data

In [12]:
def clean_phrase(phrase):
    """Stem each space-separated word of ``phrase`` with the module-level
    ``porter`` stemmer and re-join the stems with single spaces."""
    return " ".join(porter.stem(word) for word in phrase.split(" "))

In [13]:
# Stemmers used by the matching cells (clean_phrase reads `porter`).
porter = PorterStemmer()
lancaster = LancasterStemmer()

# Read the chapter text through a context manager so the file handle is
# closed deterministically instead of being left to garbage collection.
with open(chapter_file) as chapter_handle:
    raw = chapter_handle.read()

chunkGram1 = r"""
    NBAR:
        {<NN.*|JJ>*<NN.*>}
        
    PHRASE:
        {<NBAR>}
        {<NBAR><IN><NBAR>}
"""

chunkGram2 = r""" PHRASE: 
                {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}
            """

In [14]:
candidate_keywords = extract_candidate_keywords(chunkGram1, raw)

In [15]:
candidate_keywords = [word.lower() for word in candidate_keywords]

In [16]:
# Dump one candidate keyword per line. The `with` block guarantees the file
# is flushed and closed even if a write raises part-way through.
with open("candidate_keyword.txt", 'w+') as f:
    for word in candidate_keywords:
        f.write(word)
        f.write("\n")

In [17]:
# De-duplicate, then keep only phrases with fewer than `max_length` words.
# NOTE(review): the strict `<` drops candidates with exactly `max_length`
# words, so the longest index entry can never match - confirm whether `<=`
# was intended.
unique_candidates = set(candidate_keywords)
candidate_keywords = [
    phrase for phrase in unique_candidates
    if len(phrase.split(' ')) < max_length
]

print(len(candidate_keywords))

17242


In [18]:
# Stem every candidate phrase; the set collapses phrases that share a stem.
filtered_keywords = {clean_phrase(phrase) for phrase in candidate_keywords}

print(len(filtered_keywords))

16025


# Matching Singular Keywords from Extracted List

In [19]:
# An index term counts as matched when its stemmed form occurs as a
# substring of at least one stemmed candidate phrase.
matched_words = []
unmatched_words = []
for w in singular_terms:
    # Stem the term once per term, not once per candidate: the stem does
    # not depend on the candidate being compared.
    stemmed_term = clean_phrase(w)
    # any() stops at the first hit; only "at least one match" matters here.
    if any(stemmed_term in candidate for candidate in filtered_keywords):
        matched_words.append(w)
    else:
        unmatched_words.append(w)

In [20]:
print("matched_words: ", len(set(matched_words)))
print("not matched: ", len(unmatched_words))

matched_words:  524
not matched:  124


In [21]:
print(matched_words)

['acceptance criterion', 'activation function', 'active constraint', 'adaboost', 'adaline', 'adaptive rejection sampling', 'adf', 'aic', 'akaike information criterion', 'ancestral sampling', 'ar model', 'arc', 'ard', 'arma', 'automatic relevance determination', 'autoregressive hidden markov model', 'back-tracking', 'backgammon', 'backpropagation', 'bagging', 'basis function', 'baum-welch algorithm', 'bayes', 'bayesian analysis', 'bayesian information criterion', 'bayesian model comparison', 'bayesian network', 'bayesian probability', 'belief propagation', 'bernoulli distribution', 'bernoulli', 'beta distribution', 'beta recursion', 'between-class covariance', 'bias', 'bias parameter', 'bias-variance trade-off', 'bic', 'binary entropy', 'binomial distribution', 'biological sequence', 'bits', 'boltzmann distribution', 'boltzmann', 'boolean logic', 'boosting', 'bootstrap', 'bootstrap ﬁlter', 'box constraints', 'box-muller method', 'canonical link function', 'cart', 'cauchy distribution', 

# Matching Parent and Sub Class

In [22]:
def find_best_string(all_strings):
    """Return the most frequent string in ``all_strings``.

    Ties are resolved in favour of the string that appears first in
    ``all_strings``, so the result is reproducible between runs (the
    previous ``max(set(...))`` form depended on set iteration order).
    Counting is done in a single pass.

    Raises
    ------
    ValueError
        If ``all_strings`` is empty.
    """
    counts = collections.Counter(all_strings)
    # Counter keeps first-seen order and max() returns the first maximal
    # key, which gives the deterministic tie-break described above.
    return max(counts, key=counts.get)


def matching_pattern(keyword1, keyword2, phrase):
    """Check whether both keywords occur (in stemmed form) inside ``phrase``.

    Both keywords are stemmed with ``clean_phrase`` and searched for as
    substrings of ``phrase``; ``phrase`` itself is used as given
    (presumably already stemmed by the caller - verify).

    Returns ``(1, combined)`` when both stems are found, where ``combined``
    is the keyword whose stem occurs first, then the text of ``phrase``
    between the two stems, then the other keyword. Returns ``(0, "")``
    otherwise.
    """
    stem1 = clean_phrase(keyword1)
    stem2 = clean_phrase(keyword2)
    if stem1 not in phrase or stem2 not in phrase:
        return 0, ""

    pos1 = phrase.index(stem1)
    pos2 = phrase.index(stem2)

    if pos1 > pos2:
        # keyword2's stem comes first in the phrase.
        between = phrase[pos2 + len(stem2):pos1]
        return 1, keyword2 + between + keyword1

    between = phrase[pos1 + len(stem1):pos2]
    return 1, keyword1 + between + keyword2

# For every [parent, sub] pair, collect the combined strings produced by all
# candidate phrases containing both terms and keep the most frequent one.
count = 0
successful_phrases = []
unsuccessful_phrases = []

for pair in different_terms:
    outcomes = (
        matching_pattern(pair[0], pair[1], candidate)
        for candidate in filtered_keywords
    )
    possible_strings = [text for found, text in outcomes if found != 0]

    if not possible_strings:
        unsuccessful_phrases.append(pair)
        continue

    main_keyword = find_best_string(possible_strings)
    successful_phrases.append(main_keyword)
    count += 1
    print(main_keyword)


between-class covariance
within-class covariance
partitioned covariance matrix
conditional entropy
differential entropy
relative entropy
functional derivative
conditional gaussian
gaussian marginal
gaussian mixture
directed graphical model
undirected graphical model
autoregressive hidden markov model
factorial hidden markov model
input-output hidden markov model
left-to-right hidden markov model
extended kalman ﬁlter
gaussian kernel function
fisher linear discriminant
linear regression problem
variational linear regression
bayesian logistic regression
logistic regression mixture model
multiclass logistic regression
margin error
soft margin
homogeneous markov chain
message passing schedule
variational message passing
conditional mixture model
logistic regression mixture model
neural network input imag convolutional
perceptron convergence theorem
perceptron hardware
conjugate prior
consistent gaussian prior
improper prior
noninformative prior
bayesian probability
probability density
prob

In [23]:
print(len(successful_phrases))
print(len(unsuccessful_phrases))

45
88


In [27]:
for x in unsuccessful_phrases:
    print(x)

['bayesian analysis', 'hierarchical']
['bayesian analysis', 'model averaging']
['bernoulli distribution', 'mixture model']
['covariance matrix', 'diagonal']
['covariance matrix', 'isotropic']
['covariance matrix', 'positive deﬁnite']
['expectation maximization', 'gaussian mixture']
['expectation maximization', 'generalized']
['expectation maximization', 'sampling methods']
['factor analysis', 'mixture model']
['gaussian', 'maximum likelihood']
['gaussian', 'sequential estimation']
['gaussian', 'sufﬁcient statistics']
['gaussian', 'wrapped']
['generative topographic mapping', 'directional curvature']
['generative topographic mapping', 'magniﬁcation factor']
['gibbs sampling', 'blocking']
['graphical model', 'bipartite']
['graphical model', 'factorization']
['graphical model', 'fully connected']
['graphical model', 'inference']
['graphical model', 'tree']
['graphical model', 'treewidth']
['graphical model', 'triangulated']
['hessian matrix', 'diagonal approximation']
['hessian matrix', '

# Working with Wikipedia Data

In [26]:
import wikipedia

In [32]:
df = pd.DataFrame(columns=['Parent Class', 'Sub Class', 'Reference'])
df = extract_index_data(index_file, df)

parent_class = df[["Parent Class"]]
sub_class = df[["Sub Class"]]
reference = df[["Reference"]]
index_length = df.shape[0]

In [None]:
def find_abb_acr(x1, x2):
    """Order a (single-word, multi-word) pair as ``(small, large, 1)``.

    Exactly one of the two strings must be a single word (no space) and
    the other must contain more than one space-separated word; otherwise
    ``("", "", 0)`` is returned.
    """
    x1_single = len(x1.split(" ")) == 1
    x2_single = len(x2.split(" ")) == 1

    if x1_single and not x2_single:
        return (x1, x2, 1)
    if x2_single and not x1_single:
        return (x2, x1, 1)
    return ("", "", 0)

def abbr_matching(x1, x2):
    """Unfinished abbreviation matcher.

    Returns ``("", 0)`` when ``find_abb_acr`` reports that the pair is not
    a (single-word, multi-word) combination. For every other input the
    function currently returns ``None``.
    """
    _small, _large, state = find_abb_acr(x1, x2)
    if state == 0:
        return ("", 0)
    # NOTE(review): the matching branch was never written, so a valid
    # (small, large) pair falls through to None - confirm intended result.
    return None

In [None]:
singular_terms = []
different_terms = []

for i in range(index_length):
    x1 = parent_class.iloc[i].values[0]
    x2 = sub_class.iloc[i].values[0]
    x3 = reference.iloc[i].values[0]
    if x1 == x2:
        if x3 != "":
            # abbr_matching returns a tuple or None, never a plain string,
            # so appending its raw result would put non-string items into
            # singular_terms and break the later stemming / search cells.
            # Only swap in a replacement when a non-empty string comes back.
            # NOTE(review): assumes element 0 of the tuple is the replacement
            # term - confirm once abbr_matching is completed.
            abbr_result = abbr_matching(x1, x3)
            replacement = abbr_result[0] if abbr_result else ""
            if replacement:
                x1 = replacement
        singular_terms.append(x1)
    else:
        terms = [x1, x2]
        different_terms.append(terms)

In [None]:
print(len(singular_terms))
print(len(different_terms))

In [42]:
print(len(singular_terms))

648


In [44]:
def clean_phrase(phrase):
    """Normalise ``phrase`` for comparison.

    Hyphens become spaces, the text is lower-cased, and every
    space-separated word is stemmed with the module-level ``porter``
    stemmer; the stems are re-joined with single spaces.
    """
    words = phrase.replace("-", " ").lower().split(" ")
    return " ".join(porter.stem(word) for word in words)

def match_phrase(phrase1, phrase2):
    """Return True when the cleaned ``phrase1`` is a substring of the
    cleaned ``phrase2`` (both normalised with ``clean_phrase``)."""
    return clean_phrase(phrase1) in clean_phrase(phrase2)

In [51]:
wikipedia.search("laplace approximation")

["Laplace's method",
 'Pierre-Simon Laplace',
 "Stirling's approximation",
 'Heaviside step function',
 'LaplacesDemon',
 'Least squares',
 'Finite difference method',
 'Marginal likelihood',
 'Discrete Laplace operator',
 'Binomial distribution']

In [45]:
phrase_wiki = []
phrase_non_wiki = []

In [46]:
# Classify each index term by whether the top Wikipedia search hit contains
# it (after stemming). Unmatched terms are printed.
for phrase in singular_terms:
    wiki_search = wikipedia.search(phrase)
    # A search can come back empty; treat that as "no match" instead of
    # failing with IndexError on wiki_search[0].
    if wiki_search and match_phrase(phrase, wiki_search[0]):
        phrase_wiki.append(phrase)
    else:
        phrase_non_wiki.append(phrase)
        print(phrase)

acceptance criterion
active constraint
adaptive rejection sampling
aic
ancestral sampling
annular ﬂow
ar model
assumed density ﬁltering
autoassociative networks
automatic relevance determination
autoregressive hidden markov model
autoregressive moving average
back-tracking
batch training
baum-welch algorithm
bayes’ theorem
bayesian analysis
bayesian model comparison
beta recursion
between-class covariance
bias parameter
bias-variance trade-off
biological sequence
blind source separation
blocked path
boolean logic
bootstrap ﬁlter
box constraints
box-muller method
canonical correlation analysis
canonical link function
central differences
chain graph
chapman-kolmogorov equations
child node
circular normal
classical probability
classiﬁcation
classiﬁcation and regression trees
clutter problem
co-parents
code-book vectors
combining models
complete data set
conditional mixture model
convex duality
correlation matrix
cox’s axioms
credit assignment
cross-entropy error function
curve ﬁtting
dags

KeyboardInterrupt: 

In [40]:
print(clean_phrase("akaike information criterion"))

akaik inform criterion


In [37]:
print(wikipedia.search("akaike information criterion"))

['Akaike information criterion', 'Bayesian information criterion', 'Watanabe–Akaike information criterion', 'Hirotugu Akaike', 'Deviance information criterion', 'Stepwise regression', 'Hannan–Quinn information criterion', 'Focused information criterion', 'Model selection', "Mallows's Cp"]


In [30]:
# Print every index term whose top Wikipedia search hit is not exactly the
# term itself.
# NOTE(review): the comparison is case-sensitive while the index terms are
# lower-cased, so "akaike information criterion" is printed even though the
# top hit is "Akaike information criterion" - confirm this is intended.
for phrase in singular_terms:
    results = wikipedia.search(phrase)
    # An empty result list counts as "no exact hit" rather than raising
    # IndexError on results[0].
    if not results or phrase != results[0]:
        print(phrase)

acceptance criterion
activation function
active constraint
adaboost
adaline
adaptive rejection sampling
adf
aic
akaike information criterion
ancestral sampling
annular ﬂow
ar model
arc
ard
arma
assumed density ﬁltering
autoassociative networks
automatic relevance determination
autoregressive hidden markov model
autoregressive model
autoregressive moving average
back-tracking
backgammon
backpropagation
bagging
basis function
batch training
baum-welch algorithm
bayes’ theorem
bayes
bayesian analysis
bayesian information criterion
bayesian model comparison
bayesian network
bayesian probability
belief propagation
bernoulli distribution
bernoulli
beta distribution
beta recursion
between-class covariance
bias
bias parameter
bias-variance trade-off
bic
binary entropy
binomial distribution
biological sequence
bipartite graph
bits
blind source separation
blocked path
boltzmann distribution
boltzmann
boolean logic
boosting
bootstrap
bootstrap ﬁlter
box constraints
box-muller method
calculus of v

KeyboardInterrupt: 