In [127]:
import glob, os, re, string, itertools

from nltk.stem.porter import *
from collections import defaultdict
from nltk import word_tokenize, pos_tag, pos_tag_sents
from nltk.corpus import wordnet, stopwords
from nltk.chunk.regexp import RegexpParser
from nltk.chunk import tree2conlltags
from pandas import DataFrame
from itertools import chain
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_extraction import text
import numpy as np
from scipy.sparse.csr import csr_matrix

In [128]:
def clean(input_list):
    result=[]
    #remove unwanted character per line
    for line in input_list:
        clean=re.sub("(\.)?\n",'', line) #remove \n
        clean=re.sub("'s",'', clean) #remove [2]
        clean=re.sub("\[([0-9]{1,2}\,?\s?)+\]",'', clean) #remove [2]
        clean=re.sub("\(([0-9]{1,2}\,?\s?)+\)",'', clean) #remove (2)
        #remove fig. 2 etc, need improvement to catch the sentence after it
        clean=re.sub("([Ff]ig.|[Ff]igure|[Tt]ab.|[Tt]able)\s?[0-9]{1,2}",'', clean) #remove fig. 2 etc
        result.append(clean.lower())
    return result

In [129]:
def calculate_ngram_tfidf(corpus):
    stemmer=PorterStemmer()
    class NewTfidfVectorizer(TfidfVectorizer):
        def _word_ngrams(self, tokens, stop_words=None):
            tokens = super(TfidfVectorizer, self)._word_ngrams(tokens, None)
            if stop_words is not None:
                new_tokens=[]
                for token in tokens:
                    split_words = token.split(' ')
                    # Only check the first and last word for stop words
                    if len(token)>2 and split_words[0] not in stop_words and split_words[-1] not in stop_words:
                        #stem every word in token
                        if len(split_words)==1 and len(split_words[0])>2:
                            new_tokens.append(stemmer.stem(token))
                        elif len(split_words[0])<3 and len(split_words[1])<3:
                            del(token)
                        else:
                            new_tokens.append(' '.join(list(stemmer.stem(word) for word in word_tokenize(token))))
                return new_tokens
            return tokens
    stop_words=text.ENGLISH_STOP_WORDS
    tfidf=NewTfidfVectorizer(ngram_range=(1,5), stop_words=stop_words,
                                token_pattern=r"(?u)\b[A-Za-z-]+\b")
    matrix=tfidf.fit_transform(corpus)
    vocab_perdoc=tfidf.inverse_transform(matrix)
    return vocab_perdoc


In [158]:
def calculate_custom_ngram_tfidf(corpus,voc):
    stemmer=PorterStemmer()
    class StemmedTfidfVectorizer(TfidfVectorizer):
        def build_tokenizer(self):
            tokenizer=super(TfidfVectorizer, self).build_tokenizer()
            return lambda doc: (stemmer.stem(token) for token in tokenizer(doc))
    #class StemmedTfidfVectorizer(TfidfVectorizer):
    #    def build_analyzer(self):
    #        analyzer=super(TfidfVectorizer, self).build_analyzer()
    #        return lambda doc: (stemmer.stem(word) for word in analyzer(doc))
    stop_words=text.ENGLISH_STOP_WORDS
    tfidf=StemmedTfidfVectorizer(ngram_range=(1,5), stop_words=stop_words, vocabulary=voc)
    matrix=tfidf.fit_transform(corpus)
    vocab_perdoc=tfidf.inverse_transform(matrix)
    return vocab_perdoc

In [159]:
#generate noun phrases based on corpus, just only noun phrase, not the keywords
#search to find sentence boundary
def vocabulary_nounphrases(raw_data):
    stemmer=PorterStemmer()
    #from http://bdewilde.github.io/blog/2014/09/23/intro-to-automatic-keyphrase-extraction/
    #grammar=r'NP: {(<JJ.*>* <NN.*>+ <IN>)? <JJ.*>* <NN.*>+}' #only detect noun phrases that contain specific pattern, hypen word is counted as one NN
    grammar=r'NP: {(<NN.*>|<JJ.*>|<VBN>|<NN> <IN>|<NNS> <IN>)* (<NN.*>|<VBG>)}' 
    punct = set(string.punctuation) #list of punctuation
    stop_words=text.ENGLISH_STOP_WORDS
    chunker = RegexpParser(grammar) #chunker from nltk
    def lambda_unpack(f):
        return lambda args:f(*args)
    postag_sents = pos_tag_sents(word_tokenize(sent) for sent in raw_data) #tokenize and create pos tag per sentenc
    noun_phrases = list(chain.from_iterable(tree2conlltags(chunker.parse(tagged_sent)) for tagged_sent in postag_sents)) 
    merged_nounphrase = [' '.join(stemmer.stem(word) for word, pos, chunk in group if re.search(r"(?u)\b[A-Za-z-]+\b", word)).lower() for key, group in
                    itertools.groupby(noun_phrases, lambda_unpack(lambda word, pos, chunk: chunk != 'O')) if key]
    #merged_nounphrase = [' '.join(stemmer.stem(word) for word, pos, chunk in group if re.search(r"(?u)\b[A-Za-z-]+\b", word)).lower() for key, group in
    #                itertools.groupby(noun_phrases, lambda_unpack(lambda word, pos, chunk: chunk != 'O')) if key]
    all_nounphrases=[cand for cand in merged_nounphrase
            if len(cand)>2 and cand not in stop_words and not all(char in punct for char in cand)]
    vocabulary=(list(set(all_nounphrases)))
    return vocabulary

In [160]:
def calculate_nounphrase_tfidf(corpus, voc):
    stemmer=PorterStemmer()
    class StemmedTfidfVectorizer(TfidfVectorizer):
        def build_analyzer(self):
            analyzer=super(TfidfVectorizer, self).build_analyzer()
            return lambda doc: (stemmer.stem(word) for word in analyzer(doc))
    tfidf=StemmedTfidfVectorizer(vocabulary=voc, ngram_range=(1,5)) 
    matrix=tfidf.fit_transform(corpus)
    feature_names=tfidf.get_feature_names()
    candidates=[]
    for doc in range(0,len(corpus)):
        feature_index=matrix[doc,:].nonzero()[1]
        tfidf_doc=zip(feature_index, [matrix[doc, x] for x in feature_index])
        names_tfidf=[(w, s) for w, s in [(feature_names[i], s) for (i, s) in tfidf_doc]]
        candidates.append(names_tfidf)
    return candidates

In [162]:
#corpus=['A challenging problem faced by researchers and developers of distributed real-time and embedded (DRE) systems is devising and implementing effective adaptive resource management strategies that can meet end-to-end quality of service(QoS) requirements in varying operational conditions.',
#        'HyARM is based on hybrid control theoretic techniques [8],which provide a theoretical framework for designing control of complex system with both continuous and discrete dynamics.']
 
corpus=["Today's video is about the animal. Do not stay in Colchester for longer period!",
       "Machine learning is one of major studies in Computer Science. I love it Colchester",
       "Cat is an animal. Major implication from studies is money#"]

s_corpus=["Today video is about the anim. Do not stay in Colchest for longer period!",
       "Machine learn is one of major studi in Comput Scienc. I love it Colchest",
       "Cat is an anim. Major implic from studi is money#"]

#if document is not be stemmed or follow the rule from vocabulary, they cant match each other
vngram=['video','colchest','stay in colchest','longer period','comput scienc','anim major','major implic']
c_corpus=clean(corpus)
voc=vocabulary_nounphrases(c_corpus)
ngrams=calculate_custom_ngram_tfidf(c_corpus, vngram)
#print(ngrams)
print(voc)
phrases=calculate_custom_ngram_tfidf(c_corpus,voc)
print(phrases)

['major implic from studi', 'today video', 'machin learn', 'money', 'major studi in comput scienc', 'colchest for longer period', 'colchest cat', 'anim']
[array(['anim', 'today video'], dtype='<U28'), array(['machin learn'], dtype='<U28'), array(['anim', 'money'], dtype='<U28')]
