# Read Data

In [2]:
import pandas as pd
import re
import operator


In [3]:


def is_number(s):
    """
    Utility function to tell whether a string parses as a number.
    Strings containing a '.' are parsed with float(), all others with int().
    @param s The string to test.
    @return bool True if parsing succeeds, False if it raises ValueError.
    """
    converter = float if '.' in s else int
    try:
        converter(s)
    except ValueError:
        return False
    return True


In [4]:

def load_stop_words(stop_word_file):
    """
    Utility function to load stop words from a file and return as a list of words.
    Lines whose first non-blank character is '#' are treated as comments and skipped;
    every other line may hold several whitespace-separated stop words.
    @param stop_word_file Path and file name of a file containing stop words.
    @return list A list of stop words, in file order.
    """
    stop_words = []
    # The context manager closes the file even if reading fails part-way through.
    with open(stop_word_file) as stop_file:
        for line in stop_file:
            if line.lstrip().startswith("#"):
                continue
            stop_words.extend(line.split())  # in case more than one per line
    return stop_words


In [5]:
def separate_words(text, min_word_return_size):
    """
    Utility function to return a list of all words that have a length greater than a specified number of characters.
    The text is split on every character other than ASCII letters, digits, '_', '+', '-' and '/';
    each piece is stripped and lower-cased before it is tested.
    @param text The text that must be split in to words.
    @param min_word_return_size The minimum no of characters a word must exceed to be included.
    @return list The qualifying words, in order of appearance.
    """
    splitter = re.compile('[^a-zA-Z0-9_\\+\\-/]')
    candidates = (piece.strip().lower() for piece in splitter.split(text))
    # Numbers stay inside their phrase but are not returned as words, since they
    # tend to invalidate the scores of the phrases they appear in.
    return [
        token
        for token in candidates
        if len(token) > min_word_return_size and token != '' and not is_number(token)
    ]



In [6]:
def split_sentences(text):
    """
    Utility function to return a list of sentences.
    The text is cut at punctuation marks, tabs, backslashes, quotes, parentheses,
    and at a hyphen that has whitespace on both sides.
    @param text The text that must be split in to sentences.
    @return list The fragments between delimiters (may include empty strings).
    """
    return re.split(u'[.!?,;:\t\\\\"\\(\\)\\\'\u2019\u2013]|\\s\\-\\s', text)

In [7]:
def build_stop_word_regex(stop_word_file_path):
    """
    Utility function to compile one case-insensitive pattern that matches any stop word.
    Each stop word must start at a word boundary and must not be followed by a
    word character or a hyphen, so hyphenated terms are not broken apart.
    @param stop_word_file_path Path and file name of a file containing stop words.
    @return A compiled regular expression matching any single stop word.
    """
    stop_word_list = load_stop_words(stop_word_file_path)
    if not stop_word_list:
        # Joining zero alternatives yields '', which matches at every position and
        # would shred the text into single characters; '(?!)' never matches.
        return re.compile(r'(?!)')
    # re.escape keeps stop words containing regex metacharacters literal.
    stop_word_regex_list = [
        r'\b' + re.escape(word) + r'(?![\w-])'  # look ahead also rejects a hyphen
        for word in stop_word_list
    ]
    return re.compile('|'.join(stop_word_regex_list), re.IGNORECASE)

In [8]:
def generate_candidate_keywords(sentence_list, stopword_pattern):
    """
    Utility function to cut sentences into candidate phrases at every stop word.
    Each stop-word match is replaced by '|' and the sentence is then split on '|',
    so a literal '|' in the text also acts as a phrase boundary.
    @param sentence_list The sentences to process.
    @param stopword_pattern Regular expression matching the stop words.
    @return list The non-empty phrases, stripped and lower-cased, in order of appearance.
    """
    candidates = []
    for sentence in sentence_list:
        marked = re.sub(stopword_pattern, '|', sentence.strip())
        chunks = (chunk.strip().lower() for chunk in marked.split("|"))
        candidates.extend(chunk for chunk in chunks if chunk != "")
    return candidates


In [9]:
def calculate_word_scores(phraseList):
    """
    Utility function to score every word as deg(w) / freq(w).
    freq(w) counts the occurrences of the word over all phrases; deg(w) adds, for
    each occurrence, the number of other words in that phrase, plus freq(w) itself.
    @param phraseList The candidate phrases.
    @return dict Mapping of word to its float score, keyed in order of first appearance.
    """
    frequency = {}
    degree = {}
    for phrase in phraseList:
        words = separate_words(phrase, 0)
        co_occurrences = len(words) - 1
        for word in words:
            frequency[word] = frequency.get(word, 0) + 1
            degree[word] = degree.get(word, 0) + co_occurrences

    # Word score = (co-occurrence degree + frequency) / frequency
    return {
        word: (degree[word] + count) / float(count)
        for word, count in frequency.items()
    }

In [10]:
def generate_candidate_keyword_scores(phrase_list, word_score):
    """
    Utility function to score each candidate phrase as the sum of its word scores.
    @param phrase_list The candidate phrases (duplicates collapse into one entry).
    @param word_score Mapping of word to score.
    @return dict Mapping of phrase to its total score; a phrase with no scorable words gets 0.
    """
    return {
        phrase: sum(word_score[word] for word in separate_words(phrase, 0))
        for phrase in phrase_list
    }

In [68]:

class Rake(object):
    """
    Keyword extractor that runs the sentence split, candidate generation and
    scoring steps with a stop-word pattern compiled once at construction.
    """

    def __init__(self, stop_words_path):
        """
        @param stop_words_path Path and file name of a file containing stop words.
        """
        self.stop_words_path = stop_words_path
        self.__stop_words_pattern = build_stop_word_regex(stop_words_path)

    def run(self, text):
        """
        Extract and rank the candidate phrases of a text.
        @param text The text to analyse.
        @return list (phrase, score) tuples ordered from highest to lowest score.
        """
        phrases = generate_candidate_keywords(split_sentences(text), self.__stop_words_pattern)
        phrase_scores = generate_candidate_keyword_scores(phrases, calculate_word_scores(phrases))

        ranked = list(phrase_scores.items())
        ranked.sort(key=operator.itemgetter(1), reverse=True)
        return ranked


In [27]:
# if test:
#     text = "Compatibility of systems of linear constraints over the set of natural numbers. Criteria of compatibility of a system of linear Diophantine equations, strict inequations, and nonstrict inequations are considered. Upper bounds for components of a minimal set of solutions and algorithms of construction of minimal generating sets of solutions for all types of systems are given. These criteria and the corresponding algorithms for constructing a minimal supporting set of solutions can be used in solving all the considered types of systems and systems of mixed types."

#     # Split text into sentences
#     sentenceList = split_sentences(text)
#     #stoppath = "FoxStoplist.txt" #Fox stoplist contains "numbers", so it will not find "natural numbers" like in Table 1.1
#     stoppath = "SmartStoplist.txt"  #SMART stoplist misses some of the lower-scoring keywords in Figure 1.5, which means that the top 1/3 cuts off one of the 4.0 score words in Table 1.1
#     stopwordpattern = build_stop_word_regex(stoppath)

#     # generate candidate keywords
#     phraseList = generate_candidate_keywords(sentenceList, stopwordpattern)

#     # calculate individual word scores
#     wordscores = calculate_word_scores(phraseList)

#     # generate candidate keyword scores
#     keywordcandidates = generate_candidate_keyword_scores(phraseList, wordscores)
#     if debug: print(keywordcandidates)

#     # sortedKeywords = sorted(keywordcandidates.iteritems(), key=operator.itemgetter(1), reverse=True)
#     sortedKeywords = sorted(keywordcandidates, key=operator.itemgetter(1), reverse=True)

#     if debug: print(sortedKeywords)

#     totalKeywords = len(sortedKeywords)
#     if debug: print (totalKeywords)

#     # print (sortedKeywords[0:(totalKeywords / 3)])

#     rake = Rake("SmartStoplist.txt")
#     keywords = rake.run(text)
#     print (keywords)

In [163]:
# Load the grouped dataset from its CSV file under data/dataClean.
df_grouped = pd.read_csv('data/dataClean/grouped_df.csv')

In [164]:
df_grouped

Unnamed: 0,ID,Title,Keywords,FirstLevel
0,1,Integration of Communities into Process-Orient...,"cooperative knowledge generation,knowledge co...",['H']
1,3,Small Groups Learning Synchronously Online at ...,"professional training,workplace learning,compu...",['H' 'J']
2,4,Using Weblogs for Knowledge Sharing and Learni...,"Experience-based Information System,wiki,weblo...",['A' 'D' 'H' 'J' 'K']
3,5,Modelling and Implementing Pre-built Informati...,"modelling method,introduction method,context-a...",['H' 'I' 'J']
4,6,Tube Map Visualization: Evaluation of a Novel ...,"knowledge visualization,information visualiza...",['H']
...,...,...,...,...
1097,1473,Eduquito: Virtual Environment for Digital Incl...,"virtual environment,accessibility,persons with...",['K']
1098,1474,Development of a Web Application for Managemen...,"learning styles,Web-based application",['K']
1099,1475,Semantic Spiral Timelines Used as Support for ...,"visualization,e-learning,timeline,spiral,Moodle",['L' 'M']
1100,1476,Visualization of Syntax Trees for Language Pro...,"syntax trees,visualization,educational software",['D' 'K']


In [165]:
# Pull out the two text columns that are passed to getWordsOfRakeAlgorithm below.
c_grouped_t = df_grouped['Title']
c_grouped_k = df_grouped['Keywords']

In [166]:
# debug switches on the diagnostic prints inside getWordsOfRakeAlgorithm.
debug = False
# NOTE(review): in the visible cells `test` is only referenced by commented-out code.
test = True

In [167]:
def getWordsOfRakeAlgorithm(c_grouped):
    """
    Run RAKE keyword extraction on every text of an iterable.

    One Rake instance is shared by all texts, so the stop list is read and its
    pattern compiled once instead of twice per text, and each text goes through
    the pipeline once. The instance is created on the first text, so an empty
    input never touches the stop-list file.

    When the module-level `debug` flag is true, the running index, the text,
    the candidate scores (in ranked order), the ranked list and its length are
    printed for each text.

    @param c_grouped Iterable of text strings.
    @return list One entry per text: the list of (phrase, score) tuples
            returned by Rake.run, highest score first.
    """
    # The Fox stoplist contains "numbers", so it will not find "natural numbers"
    # like in Table 1.1; the SMART stoplist misses some of the lower-scoring
    # keywords in Figure 1.5.
    stoppath = "SmartStoplist.txt"
    rake = None

    keywordsList = []
    for i, text in enumerate(c_grouped, start=1):
        if debug: print(i)
        if debug: print(text)

        if rake is None:
            rake = Rake(stoppath)
        keywords = rake.run(text)

        if debug:
            print("Keyword Candidates: ", dict(keywords))
            print("Sorted Keywords: ", keywords)
            print("Total Keywords: ", len(keywords))

        keywordsList.append(keywords)

    return keywordsList

### Cleaned Title using Rake Algorithm 

In [168]:
# Rank the candidate phrases of every title; each entry is a list of (phrase, score) pairs.
rakeTitle = getWordsOfRakeAlgorithm(list(c_grouped_t))

### Cleaned Keyword using Rake Algorithm

In [169]:
# Rank the candidate phrases of every keyword string; each entry is a list of (phrase, score) pairs.
rakeKeyword = getWordsOfRakeAlgorithm(list(c_grouped_k))

In [None]:
rakeKeyword

In [170]:
def setRakeInPreWords(rakeList):
    """
    Reduce each ranked keyword list to its top-ranked phrase.

    Each selected phrase is also printed. A text that produced no candidate
    phrases (an empty list) contributes an empty string instead of raising
    IndexError, so the result always has one entry per input entry.

    @param rakeList List of ranked lists of (phrase, score) tuples.
    @return list The first phrase of every ranked list, or '' where the list is empty.
    """
    preWords = []
    for ranked in rakeList:
        top_phrase = ranked[0][0] if ranked else ''
        print(top_phrase)
        preWords.append(top_phrase)
    return preWords

### Rake Title

In [171]:
# Keep only the top-ranked phrase of each title (setRakeInPreWords also prints it).
rakeTitle = setRakeInPreWords(rakeTitle)

process-oriented structures
small groups learning synchronously online
knowledge sharing
implementing pre-built information spaces
tube map visualization
activity-based knowledge management approach
integrate technological
improving knowledge-intensive business processes
knowledge management solutions
modeling knowledge work
knowledge infrastructure hierarchy model
process oriented knowledge management
non-blocking concurrent queue algorithm
investigating atomicity
asynchronous communication mechanism
compensating business transactions
atomic broadcast
atomic manifesto
software development
first-class system provision
semi-automatic visual subgroup mining
visualizing recommendation flow
high-dimensional data
ubiquitous data mining
visual data exploration
uncertain dependencies
explore highly structured temporal data
scalable visual data exploration
physically locating wireless intruders
security level evaluation
reduced distortion lsb coding
cryptographic protocol intrusion detection
s

In [172]:
rakeTitle

['process-oriented structures',
 'small groups learning synchronously online',
 'knowledge sharing',
 'implementing pre-built information spaces',
 'tube map visualization',
 'activity-based knowledge management approach',
 'integrate technological',
 'improving knowledge-intensive business processes',
 'knowledge management solutions',
 'modeling knowledge work',
 'knowledge infrastructure hierarchy model',
 'process oriented knowledge management',
 'non-blocking concurrent queue algorithm',
 'investigating atomicity',
 'asynchronous communication mechanism',
 'compensating business transactions',
 'atomic broadcast',
 'atomic manifesto',
 'software development',
 'first-class system provision',
 'semi-automatic visual subgroup mining',
 'visualizing recommendation flow',
 'high-dimensional data',
 'ubiquitous data mining',
 'visual data exploration',
 'uncertain dependencies',
 'explore highly structured temporal data',
 'scalable visual data exploration',
 'physically locating wirel

### Rake Keyword

In [173]:
# Keep only the top-ranked phrase of each keyword string (setRakeInPreWords also prints it).
rakeKeyword = setRakeInPreWords(rakeKeyword)

cooperative knowledge generation
computer-supported cooperative learning
micro-didactical learning arrangement
modelling method
knowledge visualization
knowledge management
inter-organizational networked businesses
knowledge modeling description language
knowledge management
knowledge management instrument
knowledge management
knowledge management service
formal proof
formal development method
asynchronous communication
ing transactions
atomic broadcast
formal methods
software development
transactional memory
subgroup mining
visualizing information flow
grand tour methods
data mining
visual data exploration
data mining
interactive information visualization
visual data exploration
intrusion detection system
public key infrastructure
audio steganography
cryptographic protocol abuse
ad hoc network
evolutionary computation
certified delivery
multi-agent systems
digital  forensics
data protection
multimedia information system
distributed databases
discrete event dynamic systems
software lif

In [147]:
rakeKeyword

['generation community cooperative knowledge wiki structure process',
 'cooperative learning workplace training assurance study empirical professional quality',
 'information system agent learning weblog pedagogical space wiki arrangement',
 'filtering collaborative management business method introduction modelling knowledge ontology information process retrieval',
 'communication visualization visual storytelling management metaphor project knowledge information',
 'knowledge management workflow',
 'networked collaborative management business network knowledge',
 'management business knowledge modeling description language process',
 'research market management business knowledge software tool enterprise process',
 'activity management business stance instrument knowledge theory modeling work infrastructure process',
 'management system knowledge data base information',
 'service management knowledge process',
 'refinement prover formal atomicity proof concurrency',
 'observability re

## Joining Title and Keywords

In [174]:
def joinTitleKeyword(titleArr, keywordArr):
    """
    Concatenate title and keyword strings pairwise, separated by one space.
    @param titleArr Sequence of title strings.
    @param keywordArr Sequence of keyword strings.
    @return list One "title keyword" string per pair; pairing stops at the shorter input.
    """
    return [title + " " + keyword for title, keyword in zip(titleArr, keywordArr)]

In [175]:
# Combine the top title phrase and the top keyword phrase of each record into one string.
# joinArr = joinTitleKeyword(titleArrClean, rakeKeyword)
joinArr = joinTitleKeyword(rakeTitle, rakeKeyword)

In [176]:
# for i in range(len(titleArrClean)):
#     print("words title: ",titleArrClean[i], "\nwords keyword: ", rakeKeyword[i], "\nwords joined: ", joinArr[i])

# Show each title phrase, keyword phrase and their concatenation for a visual check.
for idx, title_words in enumerate(rakeTitle):
    print("words title: ", title_words, "\nwords keyword: ", rakeKeyword[idx], "\nwords joined: ", joinArr[idx])

words title:  process-oriented structures 
words keyword:  cooperative knowledge generation 
words joined:  process-oriented structures cooperative knowledge generation
words title:  small groups learning synchronously online 
words keyword:  computer-supported cooperative learning 
words joined:  small groups learning synchronously online computer-supported cooperative learning
words title:  knowledge sharing 
words keyword:  micro-didactical learning arrangement 
words joined:  knowledge sharing micro-didactical learning arrangement
words title:  implementing pre-built information spaces 
words keyword:  modelling method 
words joined:  implementing pre-built information spaces modelling method
words title:  tube map visualization 
words keyword:  knowledge visualization 
words joined:  tube map visualization knowledge visualization
words title:  activity-based knowledge management approach 
words keyword:  knowledge management 
words joined:  activity-based knowledge management appr

## Set New Title and Keyword Processing

In [177]:
# Build the output frame from an independent copy of the source columns we keep.
columns_to_copy = ['ID','Title','Keywords','FirstLevel']
df_data = df_grouped[columns_to_copy].copy()


In [178]:
df_data

Unnamed: 0,ID,Title,Keywords,FirstLevel
0,1,Integration of Communities into Process-Orient...,"cooperative knowledge generation,knowledge co...",['H']
1,3,Small Groups Learning Synchronously Online at ...,"professional training,workplace learning,compu...",['H' 'J']
2,4,Using Weblogs for Knowledge Sharing and Learni...,"Experience-based Information System,wiki,weblo...",['A' 'D' 'H' 'J' 'K']
3,5,Modelling and Implementing Pre-built Informati...,"modelling method,introduction method,context-a...",['H' 'I' 'J']
4,6,Tube Map Visualization: Evaluation of a Novel ...,"knowledge visualization,information visualiza...",['H']
...,...,...,...,...
1097,1473,Eduquito: Virtual Environment for Digital Incl...,"virtual environment,accessibility,persons with...",['K']
1098,1474,Development of a Web Application for Managemen...,"learning styles,Web-based application",['K']
1099,1475,Semantic Spiral Timelines Used as Support for ...,"visualization,e-learning,timeline,spiral,Moodle",['L' 'M']
1100,1476,Visualization of Syntax Trees for Language Pro...,"syntax trees,visualization,educational software",['D' 'K']


In [179]:
# df_data['Processed_Title'] = titleArrClean
# df_data['Processed_Keyword'] = rakeKeyword
# df_data['Processed_T_K'] = joinArr

# Attach the top title phrase, the top keyword phrase and their concatenation as new columns.
df_data['Processed_Title'] = rakeTitle
df_data['Processed_Keyword'] = rakeKeyword
df_data['Processed_T_K'] = joinArr

In [180]:
df_data

Unnamed: 0,ID,Title,Keywords,FirstLevel,Processed_Title,Processed_Keyword,Processed_T_K
0,1,Integration of Communities into Process-Orient...,"cooperative knowledge generation,knowledge co...",['H'],process-oriented structures,cooperative knowledge generation,process-oriented structures cooperative knowle...
1,3,Small Groups Learning Synchronously Online at ...,"professional training,workplace learning,compu...",['H' 'J'],small groups learning synchronously online,computer-supported cooperative learning,small groups learning synchronously online com...
2,4,Using Weblogs for Knowledge Sharing and Learni...,"Experience-based Information System,wiki,weblo...",['A' 'D' 'H' 'J' 'K'],knowledge sharing,micro-didactical learning arrangement,knowledge sharing micro-didactical learning ar...
3,5,Modelling and Implementing Pre-built Informati...,"modelling method,introduction method,context-a...",['H' 'I' 'J'],implementing pre-built information spaces,modelling method,implementing pre-built information spaces mode...
4,6,Tube Map Visualization: Evaluation of a Novel ...,"knowledge visualization,information visualiza...",['H'],tube map visualization,knowledge visualization,tube map visualization knowledge visualization
...,...,...,...,...,...,...,...
1097,1473,Eduquito: Virtual Environment for Digital Incl...,"virtual environment,accessibility,persons with...",['K'],virtual environment,virtual environment,virtual environment virtual environment
1098,1474,Development of a Web Application for Managemen...,"learning styles,Web-based application",['K'],web application,learning styles,web application learning styles
1099,1475,Semantic Spiral Timelines Used as Support for ...,"visualization,e-learning,timeline,spiral,Moodle",['L' 'M'],semantic spiral timelines,visualization,semantic spiral timelines visualization
1100,1476,Visualization of Syntax Trees for Language Pro...,"syntax trees,visualization,educational software",['D' 'K'],language processing courses,syntax trees,language processing courses syntax trees


# Save to CSV and Reload

In [181]:
import os

In [182]:
# Write the processed frame to CSV without the row index.
output = os.path.join("data/dataClean/", "df_Processed_Data.csv")
df_data.to_csv(output, index=False)

In [183]:
# Read the file just written back in and display it.
df_Processed_Data = pd.read_csv('data/dataClean/df_Processed_Data.csv')
df_Processed_Data

Unnamed: 0,ID,Title,Keywords,FirstLevel,Processed_Title,Processed_Keyword,Processed_T_K
0,1,Integration of Communities into Process-Orient...,"cooperative knowledge generation,knowledge co...",['H'],process-oriented structures,cooperative knowledge generation,process-oriented structures cooperative knowle...
1,3,Small Groups Learning Synchronously Online at ...,"professional training,workplace learning,compu...",['H' 'J'],small groups learning synchronously online,computer-supported cooperative learning,small groups learning synchronously online com...
2,4,Using Weblogs for Knowledge Sharing and Learni...,"Experience-based Information System,wiki,weblo...",['A' 'D' 'H' 'J' 'K'],knowledge sharing,micro-didactical learning arrangement,knowledge sharing micro-didactical learning ar...
3,5,Modelling and Implementing Pre-built Informati...,"modelling method,introduction method,context-a...",['H' 'I' 'J'],implementing pre-built information spaces,modelling method,implementing pre-built information spaces mode...
4,6,Tube Map Visualization: Evaluation of a Novel ...,"knowledge visualization,information visualiza...",['H'],tube map visualization,knowledge visualization,tube map visualization knowledge visualization
...,...,...,...,...,...,...,...
1097,1473,Eduquito: Virtual Environment for Digital Incl...,"virtual environment,accessibility,persons with...",['K'],virtual environment,virtual environment,virtual environment virtual environment
1098,1474,Development of a Web Application for Managemen...,"learning styles,Web-based application",['K'],web application,learning styles,web application learning styles
1099,1475,Semantic Spiral Timelines Used as Support for ...,"visualization,e-learning,timeline,spiral,Moodle",['L' 'M'],semantic spiral timelines,visualization,semantic spiral timelines visualization
1100,1476,Visualization of Syntax Trees for Language Pro...,"syntax trees,visualization,educational software",['D' 'K'],language processing courses,syntax trees,language processing courses syntax trees


In [187]:
# Spot-check one stored record: print its raw and processed text fields.
sample_row = df_Processed_Data.iloc[1097]
for field in ('Title', 'Keywords', 'Processed_Title', 'Processed_Keyword', 'Processed_T_K'):
    print(sample_row[field])

Eduquito: Virtual Environment for Digital Inclusion of People with Special Educational Needs
virtual environment,accessibility,persons with disabilities
virtual environment
virtual environment
virtual environment virtual environment
