In [282]:
import os
import pandas as pd

import nltk
from nltk import word_tokenize
from nltk.chunk import RegexpParser

from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer

import collections

In [301]:
# index_file = "index_text/bprml.txt"
# chapter_file = "chapter_text/bprml.txt"

index_file = "index_text/iandl.txt"
chapter_file = "chapter_text/iandl.txt"

# Following functions are use for extracting text file data

In [284]:
def toggle_state(class_indexing):
    if class_indexing: class_indexing = False
    else: class_indexing = True
    return class_indexing
   

def line_split(line):
    tokens = line.split(",")
    tokens_length = len(tokens)
    keyword = []
    keyword.append(tokens[0].strip())
    for i in range(tokens_length - 1):
        token = tokens[i+1].strip()
        if "see" in token:
            keyword.append(token[4:])
    return keyword


def data_append(parent_class, keyword):
    size = len(keyword)
    sub_class = keyword[0]
    if size > 1: reference = keyword[1]
    else: reference = ""
    data = {
        'Parent Class': parent_class,
        'Sub Class': sub_class,
        'Reference': reference
    }
    return (data)


def line_operation(line, parent_class, class_indexing):
    keyword = line_split(line)
    if class_indexing:
        data = data_append(parent_class, keyword)
    else:
        parent_class = keyword[0]
        data = data_append(parent_class, keyword)
    return (parent_class, data)

    
def data_extraction(line, parent_class, class_indexing):
    line = line.lower()
    
    if line[0] == ',' or (line[0] >= '0' and line[0] <= '9'):
        return (0, parent_class, class_indexing, "")
    
    elif line[0] == '\n':
        class_indexing = toggle_state(class_indexing)
        return (0, parent_class, class_indexing, "")
    
    else:
        parent_class, data = line_operation(line, parent_class, class_indexing)
        return (1, parent_class, class_indexing, data)

def print_processed_data(df):
    parent_class = df[["Parent Class"]]
    sub_class = df[["Sub Class"]]
    reference = df[["Reference"]]

    for i in range(df.shape[0]):
        if parent_class.iloc[i].values[0] == sub_class.iloc[i].values[0]:
            print(sub_class.iloc[i].values[0], ",", reference.iloc[i].values[0])
        else:
            print("\t", sub_class.iloc[i].values[0], ",", reference.iloc[i].values[0])

# Data extraction from index file

In [285]:
def extract_index_data(file_location, df):
#     df = pd.DataFrame(columns=['Parent Class', 'Sub Class', 'Reference'])
    parent_class = ""
    class_indexing = False
    
    file = open(file_location)
    for line in file:
        state, parent_class, class_indexing, data = data_extraction(line, parent_class, class_indexing)
        if state:
            df = df.append(data, ignore_index=True)
    
    save_file = file_location.split("/")[-1].split(".")[0] + ".csv"
    df.to_csv(save_file)
    
    file.close()
    
    return df

In [286]:
df = pd.DataFrame(columns=['Parent Class', 'Sub Class', 'Reference'])
df = extract_index_data(index_file, df)

# maximum length of keyword

In [287]:
parent_class = df[["Parent Class"]]
sub_class = df[["Sub Class"]]
index_length = df.shape[0]

In [288]:
max_length = 1

def cal_max_length(n1, n2):
    if n1 > n2: return n1
    else: return n2

for i in range(index_length):
    x1 = parent_class.iloc[i].values[0]
    x2 = sub_class.iloc[i].values[0]
    if x1 == x2:
        max_length = cal_max_length(max_length, len(x2.split(" ")))
    else:
        max_length = cal_max_length(max_length, len(x2.split(" ")) + len(x1.split(" ")))

# keywords extraction from CSV files

In [289]:
singular_terms = []
different_terms = []

for i in range(index_length):
    x1 = parent_class.iloc[i].values[0]
    x2 = sub_class.iloc[i].values[0]
    if x1 == x2:
        singular_terms.append(x2)
    else:
        terms = [x1, x2]
        different_terms.append(terms)

print(len(singular_terms))
print(len(different_terms))

581
0


# Keyphrase Extraction from Book Chapters

In [290]:
def extract_candidate_keywords(chunkGram, text):
    chunkParser = nltk.RegexpParser(chunkGram)
    tagged = nltk.pos_tag(nltk.word_tokenize(text))

    chunked = chunkParser.parse(tagged)
    candidate_keywords = []

    for tree in chunked.subtrees():
        if tree.label() == 'PHRASE':
            candidate_keyword = ' '.join([x for x,y in tree.leaves()])
            candidate_keywords.append(candidate_keyword)
    
    return candidate_keywords

# Useful Data

In [291]:
def clean_phrase(phrase):
    new_string = ""
    for word in phrase.split(" "):
        word = porter.stem(word)
        new_string += word + " "
    return new_string[:-1]

In [302]:
porter = PorterStemmer()
lancaster = LancasterStemmer()

raw = open(chapter_file).read()

chunkGram1 = r"""
    NBAR:
        {<NN.*|JJ>*<NN.*>}
        
    PHRASE:
        {<NBAR>}
        {<NBAR><IN><NBAR>}
"""

chunkGram2 = r""" PHRASE: 
                {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}
            """

In [303]:
candidate_keywords = extract_candidate_keywords(chunkGram1, raw)

In [304]:
candidate_keywords = [word.lower() for word in candidate_keywords]

In [305]:
f = open("candidate_keyword.txt", 'w+')

for word in candidate_keywords:
    f.write(word)
    f.write("\n")

f.close()

In [306]:
candidate_keywords = set(candidate_keywords)
candidate_keywords = [w for w in candidate_keywords if len(w.split(' ')) < max_length]

print(len(candidate_keywords))

304


In [307]:
filtered_keywords = [clean_phrase(w) for w in candidate_keywords]

filtered_keywords = set(filtered_keywords)

print(len(filtered_keywords))

290


# Matching Singular Keywords from Extracted List

In [308]:
matched_words = []
unmatched_words = []
for w in singular_terms:
    count = 0
    for w1 in filtered_keywords:
        if clean_phrase(w) in w1:
            count += 1
    if count == 0:
        unmatched_words.append(w)
    else:
        matched_words.append(w)

In [309]:
print("matched_words: ", len(set(matched_words)))
print("not matched: ", len(unmatched_words))

matched_words:  26
not matched:  555


In [310]:
print(matched_words)

['adam', 'ais', 'broadcasting', 'contrast', 'deep learning', 'dot product', 'element-wise product', 'example', 'generalization', 'graph', 'hadamard product', 'identity matrix', 'main diagonal', 'matrix', 'matrix inverse', 'matrix product', 'nat', 'operation', 'precision', 'regularization', 'regularizer', 'scalar', 'set', 'tensor', 'transpose', 'vector']


# Matching Parent and Sub Class

In [277]:
def find_best_string(all_strings):
    return max(set(all_strings), key = all_strings.count)


def matching_pattern(keyword1, keyword2, phrase):
#     phrase = clean_phrase(phrase)
    stem1 = clean_phrase(keyword1)
    stem2 = clean_phrase(keyword2)
    if stem1 in phrase and stem2 in phrase:
        len1 = phrase.index(stem1)
        len2 = phrase.index(stem2)
        
        if len1 > len2:
            start = len2 + len(stem2)
            end = len1
            string = keyword2 + phrase[start:end] + keyword1
        else:
            start = len1 + len(stem1)
            end = len2
            string = keyword1 + phrase[start:end] + keyword2
        return 1, string
    else: return 0, ""

count = 0
successful_phrases = []
unsuccessful_phrases = []


for pair in different_terms:
    match_count = 0
    possible_strings = []
    for phrase in filtered_keywords:
        result, string = matching_pattern(pair[0], pair[1], phrase)
        if result != 0:
            possible_strings.append(string)
            match_count += 1
    if match_count > 0:
        main_keyword = find_best_string(possible_strings)
        successful_phrases.append(main_keyword)
        count += 1
        print(main_keyword)
    else:
        unsuccessful_phrases.append(pair)


between-class covariance
within-class covariance
partitioned covariance matrix
conditional entropy
differential entropy
relative entropy
functional derivative
conditional gaussian
gaussian marginal
gaussian mixture
directed graphical model
undirected graphical model
autoregressive hidden markov model
factorial hidden markov model
input-output hidden markov model
left-to-right hidden markov model
extended kalman ﬁlter
gaussian kernel function
fisher linear discriminant
linear regression problem
variational linear regression
bayesian logistic regression
logistic regression mixture model
multiclass logistic regression
margin error
soft margin
homogeneous markov chain
message passing schedule
variational message passing
conditional mixture model
logistic regression mixture model
neural network input imag convolutional
perceptron convergence theorem
perceptron hardware
conjugate prior
consistent gaussian prior
improper prior
noninformative prior
bayesian probability
probability density
prob

In [278]:
print(len(successful_phrases))
print(len(unsuccessful_phrases))

45
88
