# Read Data

In [35]:
import pandas as pd
import re
import operator


In [36]:


def is_number(s):
    """
    Utility function to test whether a string parses as a number.
    @param s The string to test.
    @return bool True if s parses as a float (when it contains a '.') or as an int (otherwise); False if parsing raises ValueError.
    """
    # Pick the parser first: a '.' means the string is tried as a float, anything else as an int.
    parse = float if '.' in s else int
    try:
        parse(s)
    except ValueError:
        return False
    return True


In [37]:

def load_stop_words(stop_word_file):
    """
    Utility function to load stop words from a file and return as a list of words
    @param stop_word_file Path and file name of a file containing stop words.
    @return list A list of stop words, in file order (duplicates are kept).
    """
    stop_words = []
    # The context manager closes the file handle even if reading fails part-way.
    with open(stop_word_file) as stop_word_source:
        for line in stop_word_source:
            # Lines whose first non-blank character is '#' are comments.
            if line.strip()[0:1] != "#":
                stop_words.extend(line.split())  # in case more than one per line
    return stop_words


In [38]:
def separate_words(text, min_word_return_size):
    """
    Utility function to return a list of all words that have a length greater than a specified number of characters.
    @param text The text that must be split in to words.
    @param min_word_return_size Words are kept only if they are strictly longer than this many characters.
    @return list The lower-cased words, in order of appearance, excluding empty strings and numbers.
    """
    # Any character other than letters, digits, '_', '+', '-' and '/' separates words.
    tokenizer = re.compile('[^a-zA-Z0-9_\\+\\-/]')
    normalized = (chunk.strip().lower() for chunk in tokenizer.split(text))
    # Numbers stay in the phrase text but are not counted as words,
    # since they tend to invalidate the scores of their phrases.
    return [
        token
        for token in normalized
        if len(token) > min_word_return_size and token != '' and not is_number(token)
    ]



In [39]:
def split_sentences(text):
    """
    Utility function to break a text into sentence-like fragments.
    @param text The text that must be split in to sentences.
    @return list The fragments found between delimiters: any of . ! ? , ; : tab, backslash, double or single quote, parentheses, \u2019, \u2013, or a hyphen with whitespace on both sides.
    """
    return re.split(u'[.!?,;:\t\\\\"\\(\\)\\\'\u2019\u2013]|\\s\\-\\s', text)

In [40]:
def build_stop_word_regex(stop_word_file_path):
    """
    Utility function to build one regex that matches any stop word listed in a file.
    @param stop_word_file_path Path and file name of a file containing stop words.
    @return A compiled pattern. It matches each stop word literally and case-insensitively,
            starting at a word boundary and not followed by a word character or a hyphen.
            If the file lists no stop words, the pattern never matches.
    """
    stop_word_list = load_stop_words(stop_word_file_path)
    if not stop_word_list:
        # Joining nothing gives '', which matches the empty string at every
        # position and would cut the text between every character. An empty
        # negative look-ahead never matches, so the text stays unsplit.
        return re.compile(r'(?!)')
    stop_word_regex_list = [
        # re.escape keeps stop words containing regex metacharacters
        # (e.g. '.', '+', '(') from being read as pattern syntax.
        r'\b' + re.escape(word) + r'(?![\w-])'  # added look ahead for hyphen
        for word in stop_word_list
    ]
    return re.compile('|'.join(stop_word_regex_list), re.IGNORECASE)

In [41]:
def generate_candidate_keywords(sentence_list, stopword_pattern):
    """
    Utility function to cut sentences into candidate phrases at every stop word.
    @param sentence_list The sentences (strings) to process.
    @param stopword_pattern Regex matching the stop words that delimit phrases.
    @return list The non-empty phrases, stripped and lower-cased, in order of appearance.
    """
    candidates = []
    for sentence in sentence_list:
        # Every stop word becomes a '|' marker, and the sentence is then cut at the markers.
        marked = re.sub(stopword_pattern, '|', sentence.strip())
        fragments = (piece.strip().lower() for piece in marked.split("|"))
        candidates.extend(fragment for fragment in fragments if fragment != "")
    return candidates


In [42]:
def calculate_word_scores(phraseList):
    """
    Utility function to score every word found in a list of phrases.
    @param phraseList The candidate phrases (strings).
    @return dict Maps each word to deg(w) / freq(w), where freq(w) counts the word's
            occurrences and deg(w) is freq(w) plus, for each occurrence, the number
            of other words in the same phrase.
    """
    frequency = {}
    degree = {}
    for phrase in phraseList:
        tokens = separate_words(phrase, 0)
        # Each occurrence co-occurs with every other word of its phrase.
        co_occurrences = len(tokens) - 1
        for token in tokens:
            frequency[token] = frequency.get(token, 0) + 1
            degree[token] = degree.get(token, 0) + co_occurrences

    # Word score = deg(w) / freq(w); the degree also counts the word itself once per occurrence.
    return {
        token: (degree[token] + count) / (count * 1.0)
        for token, count in frequency.items()
    }

In [43]:
def generate_candidate_keyword_scores(phrase_list, word_score):
    """
    Utility function to score each candidate phrase as the sum of its word scores.
    @param phrase_list The candidate phrases (strings).
    @param word_score Dict mapping each word to its score; a word missing from it raises KeyError.
    @return dict Maps each distinct phrase to its score.
    """
    return {
        phrase: sum(word_score[token] for token in separate_words(phrase, 0))
        for phrase in phrase_list
    }

In [44]:

class Rake(object):
    """RAKE keyword extractor bound to a single stop word file."""

    def __init__(self, stop_words_path):
        """
        Build the stop word pattern once so that it is reused by every run() call.
        @param stop_words_path Path and file name of a file containing stop words.
        """
        self.stop_words_path = stop_words_path
        self.__stop_words_pattern = build_stop_word_regex(stop_words_path)

    def run(self, text):
        """
        Extract and rank the candidate key phrases of a text.
        @param text The text to analyse.
        @return list (phrase, score) tuples sorted by descending score.
        """
        phrases = generate_candidate_keywords(split_sentences(text), self.__stop_words_pattern)
        phrase_scores = generate_candidate_keyword_scores(phrases, calculate_word_scores(phrases))
        return sorted(phrase_scores.items(), key=lambda entry: entry[1], reverse=True)


In [45]:
# if test:
#     text = "Compatibility of systems of linear constraints over the set of natural numbers. Criteria of compatibility of a system of linear Diophantine equations, strict inequations, and nonstrict inequations are considered. Upper bounds for components of a minimal set of solutions and algorithms of construction of minimal generating sets of solutions for all types of systems are given. These criteria and the corresponding algorithms for constructing a minimal supporting set of solutions can be used in solving all the considered types of systems and systems of mixed types."

#     # Split text into sentences
#     sentenceList = split_sentences(text)
#     #stoppath = "FoxStoplist.txt" #Fox stoplist contains "numbers", so it will not find "natural numbers" like in Table 1.1
#     stoppath = "SmartStoplist.txt"  #SMART stoplist misses some of the lower-scoring keywords in Figure 1.5, which means that the top 1/3 cuts off one of the 4.0 score words in Table 1.1
#     stopwordpattern = build_stop_word_regex(stoppath)

#     # generate candidate keywords
#     phraseList = generate_candidate_keywords(sentenceList, stopwordpattern)

#     # calculate individual word scores
#     wordscores = calculate_word_scores(phraseList)

#     # generate candidate keyword scores
#     keywordcandidates = generate_candidate_keyword_scores(phraseList, wordscores)
#     if debug: print(keywordcandidates)

#     # sortedKeywords = sorted(keywordcandidates.iteritems(), key=operator.itemgetter(1), reverse=True)
#     sortedKeywords = sorted(keywordcandidates, key=operator.itemgetter(1), reverse=True)

#     if debug: print(sortedKeywords)

#     totalKeywords = len(sortedKeywords)
#     if debug: print (totalKeywords)

#     # print (sortedKeywords[0:(totalKeywords / 3)])

#     rake = Rake("SmartStoplist.txt")
#     keywords = rake.run(text)
#     print (keywords)

In [46]:
# Load the grouped dataset from CSV (the path is relative to the current working directory).
df_grouped = pd.read_csv('data/dataClean/newgrouped_df.csv')

In [47]:
df_grouped

Unnamed: 0,ID,Title,Keywords,FirstLevel,titleCleanedLem,keywordCleanedLem,titleCleanedStem,keywordCleanedStem,titleRelevantLem,keywordRelevantLem
0,1,Integration of Communities into Process-Orient...,"cooperative knowledge generation,knowledge co...",['H'],integration community process-oriented structure,generation community cooperative process-orien...,process-ori commun structur integr,knowledg knowledge-intens gener wiki process-o...,integration community structure,generation community cooperative knowledge wik...
1,3,Small Groups Learning Synchronously Online at ...,"professional training,workplace learning,compu...",['H' 'J'],outcome synchronously interaction small learni...,cooperative learning workplace computer-suppor...,outcom interact small factor synchron learn on...,assur profession studi empir learn train compu...,outcome synchronously interaction small learni...,cooperative learning workplace training assura...
2,4,Using Weblogs for Knowledge Sharing and Learni...,"Experience-based Information System,wiki,weblo...",['A' 'D' 'H' 'J' 'K'],weblogs sharing learning knowledge space using...,information system agent learning experience-b...,knowledg share learn weblog space use inform,experience-bas system agent pedagog arrang lea...,weblogs sharing learning knowledge space using...,information system agent learning weblog pedag...
3,5,Modelling and Implementing Pre-built Informati...,"modelling method,introduction method,context-a...",['H' 'I' 'J'],management oriented pre-built implementing met...,filtering collaborative management business me...,manag knowledg architectur pre-built implement...,manag filter knowledg inform collabor method i...,management oriented implementing method modell...,filtering collaborative management business me...
4,6,Tube Map Visualization: Evaluation of a Novel ...,"knowledge visualization,information visualiza...",['H'],novel visualization transfer project map evalu...,communication visualization visual storytellin...,novel evalu visual transfer knowledg project m...,manag visual knowledg inform metaphor project ...,novel visualization transfer project map evalu...,communication visualization visual storytellin...
...,...,...,...,...,...,...,...,...,...,...
1097,1473,Eduquito: Virtual Environment for Digital Incl...,"virtual environment,accessibility,persons with...",['K'],need environment people virtual educational sp...,environment person virtual disability accessib...,educ need digit inclus environ peopl virtual s...,environ person virtual disabl access,need environment people virtual educational sp...,environment person virtual disability accessib...
1098,1474,Development of a Web Application for Managemen...,"learning styles,Web-based application",['K'],management learning web style development appl...,application learning web-based style,manag develop web learn style applic,applic web-bas style learn,management learning web style development appl...,application learning style
1099,1475,Semantic Spiral Timelines Used as Support for ...,"visualization,e-learning,timeline,spiral,Moodle",['L' 'M'],used spiral timeline support e-learning semantic,visualization spiral timeline e-learning moodle,spiral timelin support semant use e-learn,moodl visual spiral timelin e-learn,used spiral timeline support semantic,visualization spiral timeline moodle
1100,1476,Visualization of Syntax Trees for Language Pro...,"syntax trees,visualization,educational software",['D' 'K'],visualization tree course syntax language proc...,visualization tree educational software syntax,visual tree languag syntax cours process,educ visual tree syntax softwar,visualization tree course syntax language proc...,visualization tree educational software syntax


In [48]:
# Select the title and keyword text columns (the *CleanedStem variants) that are passed to RAKE below.
c_grouped_t = df_grouped['titleCleanedStem']
c_grouped_k = df_grouped['keywordCleanedStem']

In [49]:
# Read as a global by getWordsOfRakeAlgorithm: when True, intermediate RAKE results are printed.
debug = False
# Only referenced by commented-out code in the visible part of this notebook.
test = True

In [50]:
def _print_rake_debug_trace(text, stopwordpattern):
    """Print the intermediate RAKE results for one text: phrase scores, ranked phrases, phrase count."""
    # Split text into sentences
    sentenceList = split_sentences(text)

    # generate candidate keywords
    phraseList = generate_candidate_keywords(sentenceList, stopwordpattern)

    # calculate individual word scores
    wordscores = calculate_word_scores(phraseList)

    # generate candidate keyword scores
    keywordcandidates = generate_candidate_keyword_scores(phraseList, wordscores)
    print(keywordcandidates)

    sortedKeywords = sorted(keywordcandidates.items(), key=operator.itemgetter(1), reverse=True)
    print(sortedKeywords)

    print(len(sortedKeywords))


def getWordsOfRakeAlgorithm(c_grouped):
    """
    Run RAKE on every text and collect the ranked key phrases.

    The stop word file is read and its regex compiled once, before the loop,
    instead of once (or more) per text; the file is therefore opened even when
    c_grouped is empty. The step-by-step pipeline is only executed when the
    module-level `debug` flag is set, because its results are used for debug
    output only.

    @param c_grouped Iterable of texts (strings) to analyse.
    @return list One entry per text, each a list of (phrase, score) tuples
            sorted by descending score, as returned by Rake.run.
    """
    #stoppath = "FoxStoplist.txt" #Fox stoplist contains "numbers", so it will not find "natural numbers" like in Table 1.1
    stoppath = "SmartStoplist.txt"  #SMART stoplist misses some of the lower-scoring keywords in Figure 1.5, which means that the top 1/3 cuts off one of the 4.0 score words in Table 1.1
    rake = Rake(stoppath)
    # The debug trace needs its own pattern because the one held by Rake is private.
    stopwordpattern = build_stop_word_regex(stoppath) if debug else None

    keywordsList = []
    for text in c_grouped:
        if debug:
            _print_rake_debug_trace(text, stopwordpattern)
        keywordsList.append(rake.run(text))

    return keywordsList

### Cleaned Title using Rake Algorithm 

In [51]:
# One ranked list of (phrase, score) tuples per title.
rakeTitle = getWordsOfRakeAlgorithm(list(c_grouped_t))

### Cleaned Keyword using Rake Algorithm

In [52]:
# One ranked list of (phrase, score) tuples per keyword string.
rakeKeyword = getWordsOfRakeAlgorithm(list(c_grouped_k))

In [58]:
def setRakeInPreWords(rakeList):
    """
    Keep only the top-ranked phrase of every RAKE result, printing each one.
    @param rakeList List with one entry per text; each entry is a list of
           (phrase, score) tuples with the best phrase first (the shape shown
           in this notebook's RAKE output).
    @return list One phrase per entry, in input order; '' for an entry that
            holds no phrases.
    """
    preWords = []
    for r_title in rakeList:
        # A text made only of stop words or delimiters produces an empty
        # result. Emit '' instead of raising IndexError so the output keeps
        # one item per input text.
        top_phrase = r_title[0][0] if r_title else ""
        print(top_phrase)
        preWords.append(top_phrase)
    return preWords

In [54]:
rakeKeyword

[[('knowledg knowledge-intens gener wiki process-ori commun structur process cooper',
   81.0)],
 [('assur profession studi empir learn train computer-support qualiti workplac cooper',
   100.0)],
 [('experience-bas system agent pedagog arrang learn weblog micro-didact space wiki inform',
   121.0)],
 [('manag filter knowledg inform collabor method introduct retriev ontolog context-awar busi process model',
   169.0)],
 [('manag visual knowledg inform metaphor project commun storytel', 64.0)],
 [('manag knowledg workflow', 9.0)],
 [('manag knowledg inter-organiz network collabor busi', 36.0)],
 [('manag knowledg knowledge-intens descript k-model languag process-ori busi process model',
   100.0)],
 [('knowledg manag research market enterpris tool softwar busi process',
   81.0)],
 [('manag knowledg theori busi instrument activ work infrastructur stanc process model',
   121.0)],
 [('manag knowledg system data base inform', 36.0)],
 [('process manag knowledg servic', 16.0)],
 [('prover 

### Rake Title

In [59]:
# Reduce each title's RAKE result to its top-ranked phrase.
# NOTE(review): this rebinds rakeTitle from RAKE results to plain strings, so
# re-running this cell alone would operate on strings instead of RAKE results.
rakeTitle = setRakeInPreWords(rakeTitle)

process-ori commun structur integr
outcom interact small factor synchron learn onlin accept workplac determin group
knowledg share learn weblog space
manag knowledg architectur pre-built implement method space orient inform process model
evalu visual transfer knowledg project map long-term applic tube
manag knowledg activity-bas system reconcil approach workflow
human toolkit knowledge-intens organis design network factor methodolog km technolog integr
knowledge-intens process kmdl improv analys busi captur
knowledg manag enterpris role solut busi process
knowledg design work infrastructur model
knowledg call-centr hierarchi infrastructur process model
manag knowledg servic orient approach base process
non-block concurr algorithm queue formal construct
observ investig atom
asynchron consid verifi relax atom case mechan commun correct
compens precis transact bpel applic busi model
understand system replic broadcast atom advantag quorum
manifesto atom
develop softwar atom
first-class ato

In [60]:
rakeTitle

['process-ori commun structur integr',
 'outcom interact small factor synchron learn onlin accept workplac determin group',
 'knowledg share learn weblog space',
 'manag knowledg architectur pre-built implement method space orient inform process model',
 'evalu visual transfer knowledg project map long-term applic tube',
 'manag knowledg activity-bas system reconcil approach workflow',
 'human toolkit knowledge-intens organis design network factor methodolog km technolog integr',
 'knowledge-intens process kmdl improv analys busi captur',
 'knowledg manag enterpris role solut busi process',
 'knowledg design work infrastructur model',
 'knowledg call-centr hierarchi infrastructur process model',
 'manag knowledg servic orient approach base process',
 'non-block concurr algorithm queue formal construct',
 'observ investig atom',
 'asynchron consid verifi relax atom case mechan commun correct',
 'compens precis transact bpel applic busi model',
 'understand system replic broadcast atom a

### Rake Keyword

In [61]:
# Reduce each keyword string's RAKE result to its top-ranked phrase.
# NOTE(review): this rebinds rakeKeyword from RAKE results to plain strings, so
# re-running this cell alone would operate on strings instead of RAKE results.
rakeKeyword = setRakeInPreWords(rakeKeyword)

knowledg knowledge-intens gener wiki process-ori commun structur process cooper
assur profession studi empir learn train computer-support qualiti workplac cooper
experience-bas system agent pedagog arrang learn weblog micro-didact space wiki inform
manag filter knowledg inform collabor method introduct retriev ontolog context-awar busi process model
manag visual knowledg inform metaphor project commun storytel
manag knowledg workflow
manag knowledg inter-organiz network collabor busi
manag knowledg knowledge-intens descript k-model languag process-ori busi process model
knowledg manag research market enterpris tool softwar busi process
manag knowledg theori busi instrument activ work infrastructur stanc process model
manag knowledg system data base inform
process manag knowledg servic
prover formal concurr proof atom refin
formal develop method atom observ refin
asynchron relax verif atom commun process algebra
formal compens ing method languag transact long-runn semant specif
system i

In [62]:
rakeKeyword

['knowledg knowledge-intens gener wiki process-ori commun structur process cooper',
 'assur profession studi empir learn train computer-support qualiti workplac cooper',
 'experience-bas system agent pedagog arrang learn weblog micro-didact space wiki inform',
 'manag filter knowledg inform collabor method introduct retriev ontolog context-awar busi process model',
 'manag visual knowledg inform metaphor project commun storytel',
 'manag knowledg workflow',
 'manag knowledg inter-organiz network collabor busi',
 'manag knowledg knowledge-intens descript k-model languag process-ori busi process model',
 'knowledg manag research market enterpris tool softwar busi process',
 'manag knowledg theori busi instrument activ work infrastructur stanc process model',
 'manag knowledg system data base inform',
 'process manag knowledg servic',
 'prover formal concurr proof atom refin',
 'formal develop method atom observ refin',
 'asynchron relax verif atom commun process algebra',
 'formal compen

## Search for Words Repeated in the Title and Keywords

In [None]:
# import difflib


In [None]:
# def findSimilarOrIntersecting(lista1, lista2):
#     palabras_similares = []
    
#     for palabra1 in lista1:
#         for palabra2 in lista2:
#             s = difflib.SequenceMatcher(None, palabra1, palabra2)
#             if s.ratio() > 0.8:  # The threshold can be adjusted as needed
#                 palabras_similares.append(palabra1)
#                 break  # Leave the inner loop once a similar word has been found
    
#     return palabras_similares

# def searchRepeatedWord(rakeTitle,rakeKeyword):

#     similarityList = []
#     for i in range(len(rakeTitle)):
#         list1 = rakeTitle[i].split()
#         list2 = rakeKeyword[i].split()
#         similarity = findSimilarOrIntersecting(list1, list2)
#         similarityList.append(similarity)
#     return similarityList


    

In [None]:
# similarityList = searchRepeatedWord(rakeTitle,rakeKeyword)

In [None]:
# for i in range(len(rakeTitle)):
#     print("words title: ",rakeTitle[i], "\nwords keyword: ", rakeKeyword[i], "\nwords repeated: ", similarityList[i])

words title:  integration community structure 
words keyword:  generation community cooperative knowledge wiki structure process 
words repeated:  ['community', 'structure']
words title:  outcome synchronously interaction small learning workplace determining factor online acceptance group 
words keyword:  cooperative learning workplace training assurance study empirical professional quality 
words repeated:  ['learning', 'workplace']
words title:  information weblogs sharing learning knowledge space 
words keyword:  information system agent learning weblog pedagogical space wiki arrangement 
words repeated:  ['information', 'weblogs', 'learning', 'space']
words title:  management oriented implementing method modelling knowledge space architecture information process 
words keyword:  filtering collaborative management business method introduction modelling knowledge ontology information process retrieval 
words repeated:  ['management', 'method', 'modelling', 'knowledge', 'information',

## Removing Repeated Words

In [None]:
# def removeWordFromString(titleArr, keywordArr, interceptArr):
    
#     auxArr = titleArr.copy()
#     titleArrClean = []
#     for item1, item2 in zip(auxArr, interceptArr):
        
#         item1 = item1.split()

#         for wordRep in item2:
#             idx = item1.index(wordRep)
#             deleted_word = item1.pop(idx)
        
#         r = ' '.join(item1)
#         titleArrClean.append(r)

#     return titleArrClean
        
#     # return result_string

In [None]:
# titleArrClean = removeWordFromString(rakeTitle,rakeKeyword,similarityList)

In [None]:
# len(titleArrClean)

1102

In [None]:
# for i in range(len(titleArrClean)):
#     print("words title: ",titleArrClean[i], "\nwords keyword: ", rakeKeyword[i], "\nwords repeated: ", similarityList[i])

words title:  integration 
words keyword:  generation community cooperative knowledge wiki structure process 
words repeated:  ['community', 'structure']
words title:  outcome synchronously interaction small determining factor online acceptance group 
words keyword:  cooperative learning workplace training assurance study empirical professional quality 
words repeated:  ['learning', 'workplace']
words title:  sharing knowledge 
words keyword:  information system agent learning weblog pedagogical space wiki arrangement 
words repeated:  ['information', 'weblogs', 'learning', 'space']
words title:  oriented implementing space architecture 
words keyword:  filtering collaborative management business method introduction modelling knowledge ontology information process retrieval 
words repeated:  ['management', 'method', 'modelling', 'knowledge', 'information', 'process']
words title:  transfer map evaluation application tube 
words keyword:  communication visualization visual storytelling 

## Splitting the Words

In [None]:
# ff = 0
# for i in range(len(titleArrClean)):
#     if len(titleArrClean[i]) == 0:
#         listK = rakeKeyword[i].split()
#         half_length = len(listK) // 2
#         first_half = listK[:half_length]
#         second_half = listK[half_length:]
#         first_half = ' '.join(first_half)
#         second_half = ' '.join(second_half)

#         titleArrClean[i] = first_half
#         rakeKeyword[i] = second_half
#         print(ff)
#         print(listK)
#         print(first_half)
#         print(second_half)
#         ff+=1


0
['atomicity', 'software', 'development', 'transaction', 'monitor', 'ocl', 'uml', 'concurrency']
atomicity software development transaction
monitor ocl uml concurrency
1
['generation', 'testing', 'algorithm', 'genetic', 'flow', 'software', 'automatic', 'data', 'test']
generation testing algorithm genetic
flow software automatic data test
2
['computing', 'mining', 'clustering', 'data', 'stream']
computing mining
clustering data stream
3
['root', 'polynomial', 'positive', 'equation', 'bound']
root polynomial
positive equation bound
4
['randomness', 'set', 'algorithmic', 'binary', 'theory', 'sequence', 'information', 'random']
randomness set algorithmic binary
theory sequence information random
5
['principle', 'set', 'partial', 'constructive', 'realizability', 'combinatory', 'theory', 'brouwerian', 'algebra']
principle set partial constructive
realizability combinatory theory brouwerian algebra
6
['continuity', 'constructive', 'mathematics']
continuity
constructive mathematics
7
['comput

## Joining Title and Keywords

In [63]:
def joinTitleKeyword(titleArr, keywordArr):
    """
    Concatenate titles and keywords pairwise, separated by a single space.
    @param titleArr Sequence of title strings.
    @param keywordArr Sequence of keyword strings, paired with titleArr by position.
    @return list One "title keyword" string per pair; extra items of the longer input are ignored.
    """
    return [title + " " + keyword for title, keyword in zip(titleArr, keywordArr)]

In [64]:
# Earlier variant, kept for reference: joined the de-duplicated titles (titleArrClean) instead.
# joinArr = joinTitleKeyword(titleArrClean, rakeKeyword)
# Current variant: join the top RAKE phrase of each title with the top RAKE phrase of its keywords.
joinArr = joinTitleKeyword(rakeTitle, rakeKeyword)

In [65]:
# for i in range(len(titleArrClean)):
#     print("words title: ",titleArrClean[i], "\nwords keyword: ", rakeKeyword[i], "\nwords joined: ", joinArr[i])

# Show each title phrase, its keyword phrase and their concatenation together.
# The three lists are walked in lockstep rather than by index.
for title_words, keyword_words, joined_words in zip(rakeTitle, rakeKeyword, joinArr):
    print("words title: ",title_words, "\nwords keyword: ", keyword_words, "\nwords joined: ", joined_words)

words title:  process-ori commun structur integr 
words keyword:  knowledg knowledge-intens gener wiki process-ori commun structur process cooper 
words joined:  process-ori commun structur integr knowledg knowledge-intens gener wiki process-ori commun structur process cooper
words title:  outcom interact small factor synchron learn onlin accept workplac determin group 
words keyword:  assur profession studi empir learn train computer-support qualiti workplac cooper 
words joined:  outcom interact small factor synchron learn onlin accept workplac determin group assur profession studi empir learn train computer-support qualiti workplac cooper
words title:  knowledg share learn weblog space 
words keyword:  experience-bas system agent pedagog arrang learn weblog micro-didact space wiki inform 
words joined:  knowledg share learn weblog space experience-bas system agent pedagog arrang learn weblog micro-didact space wiki inform
words title:  manag knowledg architectur pre-built implement 

In [66]:
# Columns carried over unchanged from the grouped dataset.
columns_to_copy = ['ID','Title','Keywords','FirstLevel']

# Select the columns in one step and copy them, so that columns added to
# df_data later do not touch df_grouped.
df_data = df_grouped[columns_to_copy].copy()


In [67]:
df_data

Unnamed: 0,ID,Title,Keywords,FirstLevel
0,1,Integration of Communities into Process-Orient...,"cooperative knowledge generation,knowledge co...",['H']
1,3,Small Groups Learning Synchronously Online at ...,"professional training,workplace learning,compu...",['H' 'J']
2,4,Using Weblogs for Knowledge Sharing and Learni...,"Experience-based Information System,wiki,weblo...",['A' 'D' 'H' 'J' 'K']
3,5,Modelling and Implementing Pre-built Informati...,"modelling method,introduction method,context-a...",['H' 'I' 'J']
4,6,Tube Map Visualization: Evaluation of a Novel ...,"knowledge visualization,information visualiza...",['H']
...,...,...,...,...
1097,1473,Eduquito: Virtual Environment for Digital Incl...,"virtual environment,accessibility,persons with...",['K']
1098,1474,Development of a Web Application for Managemen...,"learning styles,Web-based application",['K']
1099,1475,Semantic Spiral Timelines Used as Support for ...,"visualization,e-learning,timeline,spiral,Moodle",['L' 'M']
1100,1476,Visualization of Syntax Trees for Language Pro...,"syntax trees,visualization,educational software",['D' 'K']


In [68]:
# df_data['Processed_Title'] = titleArrClean
# df_data['Processed_Keyword'] = rakeKeyword
# df_data['Processed_T_K'] = joinArr

# Attach the RAKE outputs as new columns: title phrase, keyword phrase, and the two joined.
df_data['Processed_Title'] = rakeTitle
df_data['Processed_Keyword'] = rakeKeyword
df_data['Processed_T_K'] = joinArr

In [69]:
df_data

Unnamed: 0,ID,Title,Keywords,FirstLevel,Processed_Title,Processed_Keyword,Processed_T_K
0,1,Integration of Communities into Process-Orient...,"cooperative knowledge generation,knowledge co...",['H'],process-ori commun structur integr,knowledg knowledge-intens gener wiki process-o...,process-ori commun structur integr knowledg kn...
1,3,Small Groups Learning Synchronously Online at ...,"professional training,workplace learning,compu...",['H' 'J'],outcom interact small factor synchron learn on...,assur profession studi empir learn train compu...,outcom interact small factor synchron learn on...
2,4,Using Weblogs for Knowledge Sharing and Learni...,"Experience-based Information System,wiki,weblo...",['A' 'D' 'H' 'J' 'K'],knowledg share learn weblog space,experience-bas system agent pedagog arrang lea...,knowledg share learn weblog space experience-b...
3,5,Modelling and Implementing Pre-built Informati...,"modelling method,introduction method,context-a...",['H' 'I' 'J'],manag knowledg architectur pre-built implement...,manag filter knowledg inform collabor method i...,manag knowledg architectur pre-built implement...
4,6,Tube Map Visualization: Evaluation of a Novel ...,"knowledge visualization,information visualiza...",['H'],evalu visual transfer knowledg project map lon...,manag visual knowledg inform metaphor project ...,evalu visual transfer knowledg project map lon...
...,...,...,...,...,...,...,...
1097,1473,Eduquito: Virtual Environment for Digital Incl...,"virtual environment,accessibility,persons with...",['K'],digit inclus environ peopl virtual special edu...,environ person virtual disabl access,digit inclus environ peopl virtual special edu...
1098,1474,Development of a Web Application for Managemen...,"learning styles,Web-based application",['K'],manag develop web learn style applic,applic web-bas style learn,manag develop web learn style applic applic we...
1099,1475,Semantic Spiral Timelines Used as Support for ...,"visualization,e-learning,timeline,spiral,Moodle",['L' 'M'],spiral timelin support semant,moodl visual spiral timelin e-learn,spiral timelin support semant moodl visual spi...
1100,1476,Visualization of Syntax Trees for Language Pro...,"syntax trees,visualization,educational software",['D' 'K'],visual tree languag syntax cours process,educ visual tree syntax softwar,visual tree languag syntax cours process educ ...


# Save to CSV and Reload

In [70]:
import os

In [71]:
# Write the processed frame to disk; index=False leaves the row index out of the file.
output = os.path.join("data/dataClean/", "df_Processed_Data.csv")
df_data.to_csv(output, index=False)

In [72]:
# Read the file back and display it to check what was written.
df_Processed_Data = pd.read_csv('data/dataClean/df_Processed_Data.csv')
df_Processed_Data

Unnamed: 0,ID,Title,Keywords,FirstLevel,Processed_Title,Processed_Keyword,Processed_T_K
0,1,Integration of Communities into Process-Orient...,"cooperative knowledge generation,knowledge co...",['H'],process-ori commun structur integr,knowledg knowledge-intens gener wiki process-o...,process-ori commun structur integr knowledg kn...
1,3,Small Groups Learning Synchronously Online at ...,"professional training,workplace learning,compu...",['H' 'J'],outcom interact small factor synchron learn on...,assur profession studi empir learn train compu...,outcom interact small factor synchron learn on...
2,4,Using Weblogs for Knowledge Sharing and Learni...,"Experience-based Information System,wiki,weblo...",['A' 'D' 'H' 'J' 'K'],knowledg share learn weblog space,experience-bas system agent pedagog arrang lea...,knowledg share learn weblog space experience-b...
3,5,Modelling and Implementing Pre-built Informati...,"modelling method,introduction method,context-a...",['H' 'I' 'J'],manag knowledg architectur pre-built implement...,manag filter knowledg inform collabor method i...,manag knowledg architectur pre-built implement...
4,6,Tube Map Visualization: Evaluation of a Novel ...,"knowledge visualization,information visualiza...",['H'],evalu visual transfer knowledg project map lon...,manag visual knowledg inform metaphor project ...,evalu visual transfer knowledg project map lon...
...,...,...,...,...,...,...,...
1097,1473,Eduquito: Virtual Environment for Digital Incl...,"virtual environment,accessibility,persons with...",['K'],digit inclus environ peopl virtual special edu...,environ person virtual disabl access,digit inclus environ peopl virtual special edu...
1098,1474,Development of a Web Application for Managemen...,"learning styles,Web-based application",['K'],manag develop web learn style applic,applic web-bas style learn,manag develop web learn style applic applic we...
1099,1475,Semantic Spiral Timelines Used as Support for ...,"visualization,e-learning,timeline,spiral,Moodle",['L' 'M'],spiral timelin support semant,moodl visual spiral timelin e-learn,spiral timelin support semant moodl visual spi...
1100,1476,Visualization of Syntax Trees for Language Pro...,"syntax trees,visualization,educational software",['D' 'K'],visual tree languag syntax cours process,educ visual tree syntax softwar,visual tree languag syntax cours process educ ...
