In [2]:
from eigen_tech_project.nlp_models import nltk_pipe

In [None]:
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

## Load in the data:

In [163]:
from os import listdir
from os.path import isfile, join, abspath
import pandas as pd
import re
import nltk

class DataReader:
    def __init__(self, path):
        self.path = path
        
    def __repr__(self):
        '''Returns representation of the DataLoader object'''
        return("{}({!r})".format(self.__class__.__name__, self.path))
    
    @property
    def file_names(self):
        '''Returns representation of the DataLoader object'''
        return [f for f in listdir(self.path) if isfile(join(self.path, f))]
    
    @property
    def raw_data(self):
        '''Returns representation of the DataLoader object'''
        return [{"file": int(re.sub("[^0-9]", "", f)), 
                 "body": open(abspath(join(self.path, f)), 'r').read()} for f in self.file_names]
    
class DataParser:
    def __init__(self, raw_data):
        self.raw_data = raw_data
        self.splitter = nltk.data.load('tokenizers/punkt/english.pickle')
        
    def __repr__(self):
        '''Returns representation of the DataLoader object'''
        return("{}({!r})".format(self.__class__.__name__, self.raw_data, self.splitter))
        
    @property
    def sentences(self):
        '''Returns representation of the DataLoader object'''  
        data = []
        for file in self.raw_data:
            # split each file in sentences:
            sentences = self.splitter.tokenize(file['body'])
            # list of sentence level dictionaries:
            data.extend([{'file': file['file'], 
                          'sentence': sentence} for sentence in sentences])
        # add an index for each sentence:
        data = [dict(item, **{'index': x}) for x, item in enumerate(data)]
        return data

In [492]:
x._sentence_mapper(y[3][2])

['And the Kiev story is heading in the right direction - while we were in Ukraine, Dick, through his tireless and personal intervention, was able to achieve a breakthrough with that government, bringing that facility and others under the Cooperative Threat Reduction program.',
 'We have been told that progress is just around the corner, and that when the Iraqis stand up, we will be able to stand down.',
 'And all the troops in the world won’t be able to force Shia, Sunni, and Kurd to sit down at a table, resolve their differences, and forge a lasting peace.',
 "That's why we were able to reform a death penalty system that was broken.",
 "That's why we were able to give health insurance to children in need.",
 'Every single person willing to work should be able to get job training that leads to a job, and earn a living wage that can pay the bills, and afford child care so their kids have a safe place to go when they work.',
 'In the face of that young student who sleeps just three hours

In [481]:
test = [1,2,3,4,5,6,7,8]

In [484]:
test[[1,2]]

TypeError: list indices must be integers or slices, not list

In [480]:
pd.DataFrame(x, columns=["doc", "sentence", "processed_sentence"])

Unnamed: 0,doc,sentence,processed_sentence
0,abandon,1,[247]
1,abandonment,1,[882]
2,ability,4,"[72, 226, 340, 394]"
3,able,8,"[48, 124, 173, 440, 441, 498, 776, 823]"
4,abortion,1,[876]
...,...,...,...
2495,yo,1,[564]
2496,york,1,[139]
2497,young,14,"[403, 409, 427, 516, 566, 587, 716, 775, 776, ..."
2498,zealous,1,[53]


In [445]:
x = (1,2,3)

In [449]:
x.index

<function tuple.index(value, start=0, stop=9223372036854775807, /)>

## Document->Sentences->Tokens->POS->Lemmas

In [1]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import requests

class SentenceProcessor:
    """
    split the document into sentences and tokenize each sentence
    """
    def __init__(self, sentence):
        self.sentence = sentence
        self.tokenizer = nltk.RegexpTokenizer(r"\w+")
        self.lemmatizer = Lemmatizer()
        self.stopwords = nltk.corpus.stopwords.words('english')
        self.common_words_url = "https://gist.githubusercontent.com/deekayen/4148741/raw/98d35708fa344717d8eee15d11987de6c8e26d7d/1-1000.txt"

    def __repr__(self):
        '''Returns representation of the DataLoader object'''
        return ("{}({!r})".format(self.__class__.__name__, self.sentence))
    
    @property
    def tokenized_sentence(self):
        """
        Return a list of sublists with tokens. 
        """
        # tokenization in each sentences
        return self.tokenizer.tokenize(self.sentence.lower())
    
    @property
    def lemmatized_sentence(self):
        """
        Return a list of sublists with lemmas. 
        """
        return self.lemmatizer.lemmas(self.tokenized_sentence)
    
    def remove_stopwords(self, text):
        """
        return WORDNET POS compliance to WORDNET lemmatization (a,n,r,v) 
        """
        common_words = requests.get(self.common_words_url).text.split()
        noise = self.stopwords + common_words
        return [w for w in text if w not in noise and w.isalpha()]
    
    @property
    def lemmatized_sentence_no_stop(self):
        """
        Return a list of sublists with interesting lemmas. 
        Ergo: stopword and non-alphabetical removal. 
        """
        return self.remove_stopwords(self.lemmatized_sentence)
    
    @property
    def processed_sentence(self):
        """
        Return a single sentence.
        """
        return " ".join(self.lemmatized_sentence_no_stop)

class Lemmatizer:
    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()
    
    def __repr__(self):
        '''Returns representation of the DataLoader object'''
        return("{}({!r})".format(self.__class__.__name__))
    
    def get_wordnet_pos(self, treebank_tag):
        """
        return WORDNET POS compliance to WORDNET lemmatization (a,n,r,v) 
        """
        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        elif treebank_tag.startswith('V'):
            return wordnet.VERB
        elif treebank_tag.startswith('N'):
            return wordnet.NOUN
        elif treebank_tag.startswith('R'):
            return wordnet.ADV
        else:
            # As default pos in lemmatization is Noun
            return wordnet.NOUN
        
    def get_lemma(self, word_postag_combo):
        """
        return WORDNET POS compliance to WORDNET lemmatization (a,n,r,v) 
        """
        return self.lemmatizer.lemmatize(word_postag_combo[0], self.get_wordnet_pos(word_postag_combo[1]))

    def lemmas(self, tokenized_sentence):
        """
        return WORDNET POS compliance to WORDNET lemmatization (a,n,r,v) 
        """
        # find the pos tagging for each tokens [('What', 'WP'), ('can', 'MD'), ('I', 'PRP') ....
        pos_tokens = nltk.pos_tag(tokenized_sentence)
        # lemmatization using pos tags
        lemmas = [self.get_lemma(word_tag_combo) for word_tag_combo in pos_tokens]
        return lemmas
    

In [36]:
from os import listdir
from os.path import isfile, join, abspath
from operator import add
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import re
import nltk

class InvertedIndex:
    def __init__(self, path):
        self.path = path
        self.sentence_splitter = nltk.data.load('tokenizers/punkt/english.pickle')
        self.sentence_processor = SentenceProcessor
        
    def __repr__(self):
        '''Returns representation of the DataLoader object'''
        return("{}({!r})".format(self.__class__.__name__, self.path))
    
    @property
    def file_names(self):
        '''Returns representation of the DataLoader object'''
        return [f for f in listdir(self.path) if isfile(join(self.path, f))]
    
    @property
    def raw_data(self):
        '''Returns representation of the DataLoader object'''
        
        return [(int(re.sub("[^0-9]", "", f)), 
                 open(abspath(join(self.path, f)), 'r').read()) for f in self.file_names]
    
    @property
    def sentences(self):
        '''Returns representation of the DataLoader object'''  
        data = []
        for file in self.raw_data:
            # split each file in sentences:
            sentences = [x for x in self.sentence_splitter.tokenize(file[1]) if len(x) > 1]
            # list of sentence level dictionaries:
            data.extend([(file[0], sentence) for sentence in sentences])
            
        # add index:
        ix = [(x,) for x in range(0,len(data))]
        return list(map(add, ix, data))
    
    @property
    def processed_sentences(self):
        '''Returns representation of the DataLoader object'''
        return [sentence + (self.sentence_processor(sentence[2]).processed_sentence,) for sentence in self.sentences]
    
    @property
    def inverted_index(self):
        """Return list with the top x most occuring interesting words, with following elements: (feature_id, occurence)."""
        data = [x[3] for x in self.processed_sentences]
        countvectorizer = CountVectorizer()

        vectorizer = countvectorizer.fit(data)
        doc_term_matrix = vectorizer.transform(data)

        lemmas = vectorizer.get_feature_names()
        frequencies = doc_term_matrix.sum(axis=0).tolist()[0]   
        occurences = [doc_term_matrix[:,i].nonzero()[0].tolist() for i in range(0,len(lemmas))]
        
        return list(zip(lemmas, frequencies, occurences))
    
#     def _sentence_mapper(self, sentence_ids):
#         """Return list with the top x most occuring interesting words, with following elements: (feature_id, occurence)."""
#         return [z for (x,y,z) in self.sentences if x.isin(sentence_ids)]
        
#     def _document_mapper(self, sentence_ids):
#         """Return list with the top x most occuring interesting words, with following elements: (feature_id, occurence)."""
#         return [y for (x,y,z) in self.sentences if x in sentence_ids]
    
    @property
    def solution(self):
        df_input = pd.DataFrame(self.sentences, columns= ["index", "document", "sentence"])
        df_output = pd.DataFrame(self.inverted_index, columns = ["lemma", "frequency", "sentences"])
        
        # map sentence ids to documents:
        df_output["documents"] = [set(df_input.iloc[x].document) for x in df_output.sentences]
        
        # map sentence ids to strings:
        df_output["sentences"] = [df_input.iloc[x].sentence.tolist() for x in df_output.sentences]
        
        df_output = df_output.sort_values("frequency", ascending=False).reset_index(drop=True)
        df_output.to_csv('output.csv', index=False)
        
        return df_output
        

In [21]:
SentenceProcessor("").processed_sentence

''

In [1]:
x = InvertedIndex("data/")
y = x.solution

NameError: name 'InvertedIndex' is not defined

In [2]:
x.inverted_index

NameError: name 'x' is not defined

In [39]:
y.sentences[1]

['A few Tuesdays ago, the American people embraced this seriousness with regards to America’s policy in Iraq.',
 'Iraq is descending into chaos based on ethnic divisions that were around long before American troops arrived.',
 'And a report by our own intelligence agencies has concluded that al Qaeda is successfully using the war in Iraq to recruit a new generation of terrorists for its war on America.',
 'These are serious times for our country, and with their votes two weeks ago, Americans demanded a feasible strategy with defined goals in Iraq ?',
 'The notion that Iraq would quickly and easily become a bulwark of flourishing democracy in the Middle East was not a plan for victory, but an ideological fantasy.',
 'I said then and believe now that Saddam Hussein was a ruthless dictator who craved weapons of mass destruction but posed no imminent threat to the United States; that a war in Iraq would harm, not help, our efforts to defeat al Qaeda and finish the job in Afghanistan; and t

In [54]:
pd.DataFrame(y)

Unnamed: 0,0,1,2
0,abandon,1,[246]
1,abandonment,1,[881]
2,ability,4,"[71, 225, 339, 393]"
3,able,8,"[48, 123, 172, 439, 440, 497, 775, 822]"
4,abortion,1,[875]
...,...,...,...
2495,yo,1,[563]
2496,york,1,[138]
2497,young,14,"[402, 408, 426, 515, 565, 586, 715, 774, 775, ..."
2498,zealous,1,[53]


In [57]:
x._document_mapper([1,2,3])

[6, 6, 6]

In [36]:
ix = [(x,) for x in range(0,10)]

In [37]:
ix

[(0,), (1,), (2,), (3,), (4,), (5,), (6,), (7,), (8,), (9,)]

In [19]:
min([len(z[1]) for z in y if len(z[1])>1])

9

In [20]:
[z[1] for z in y if len(z[1])==9]

['I get it.']

In [11]:
x._document_mapper(y[3][2])

[6, 5, 5, 1, 1, 1, 2, 2]

## Construct the Inverted Index

In [436]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

def calc_inverted_index(data):
    """Return list with the top x most occuring interesting words, with following elements: (feature_id, occurence)."""
    countvectorizer = CountVectorizer()

    vectorizer = countvectorizer.fit(data)
    doc_term_matrix = vectorizer.transform(data)
    
    df = pd.DataFrame()
    
    df["lemma"] = vectorizer.get_feature_names()
    df["frequency"] = doc_term_matrix.sum(axis=0).tolist()[0]   
    df["occurence"] = [doc_term_matrix[:,i].nonzero()[0].tolist() for i in df.index]

    return df.sort_values('frequency', ascending=False).to_dict(orient='records')


In [442]:
def ProblemSolver(path):
    # load the data:
#     raw_data = DataReader(path).raw_data
#     # parse the data in sentences:
#     sentences = DataParser(raw_data).sentences
    
#     df_sentences = pd.DataFrame(sentences)
#     # process the sentences:
#     df_sentences["processed"] = [Sentence(sentence).processed_sentence for sentence in df_sentences.sentence]

    x = InvertedIndex("data/").processed_sentences
    
    # create the inverted index:
    dict_inverted_index = calc_inverted_index(df_sentences.processed)
    
    # map the sentence ids back to sentences and documents:
    df_inverted_index = pd.DataFrame(dict_inverted_index)
    df_inverted_index["documents"] = [set(df_input.iloc[x].file) for x in df_inverted_index.occurence]
    df_inverted_index["occurence"] = [df_input.iloc[x].sentence.tolist() for x in df_inverted_index.occurence]
    
    # df_inverted_index.to_csv('result.csv', index=False)
    return df_inverted_index

In [443]:
x = ProblemSolver("data/")

In [434]:
x.to_csv('test.csv', index=False)

In [444]:
x

Unnamed: 0,lemma,frequency,occurence,documents
0,american,73,"[But, when I think about what is at stake I am...","{1, 2, 3, 4, 5, 6}"
1,people,69,"[Now, few people understand these challenges b...","{1, 2, 3, 4, 5, 6}"
2,country,68,"[For years, Nunn-Lugar programs have been effe...","{1, 2, 3, 4, 5, 6}"
3,make,67,"[As some of you know, Senator Lugar and I rece...","{1, 2, 3, 4, 5, 6}"
4,time,67,[But this is one story that shows our job is f...,"{1, 2, 3, 4, 5, 6}"
...,...,...,...,...
2495,heroism,1,[The history of Africa is a history of ancient...,{4}
2496,herd,1,[I began to understand and appreciate the dist...,{4}
2497,herculean,1,[Certainly it is not due to lack of effort on ...,{4}
2498,helpless,1,"[As the Avian Influenza outbreak demonstrates,...",{6}


In [402]:
L = [y for y in zip(*x.nonzero())]

In [403]:
b = [(k, [x for x, _ in g]) for k, g in groupby(L, itemgetter(0))]

In [404]:
b

[(0, [0, 0]),
 (1, [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]),
 (2, [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]),
 (3, [3, 3, 3, 3, 3]),
 (4, [4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4]),
 (5, [5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5]),
 (6, [6, 6, 6, 6, 6, 6, 6]),
 (7, [7, 7, 7, 7, 7, 7, 7]),
 (8, [8, 8, 8, 8, 8, 8, 8]),
 (9, [9, 9, 9, 9]),
 (10, [10, 10]),
 (11, [11, 11, 11, 11, 11, 11]),
 (12, [12, 12]),
 (13, [13, 13, 13, 13, 13, 13, 13, 13]),
 (14, [14, 14]),
 (15,
  [15,
   15,
   15,
   15,
   15,
   15,
   15,
   15,
   15,
   15,
   15,
   15,
   15,
   15,
   15,
   15,
   15,
   15,
   15,
   15,
   15,
   15,
   15,
   15,
   15,
   15,
   15,
   15]),
 (16, [16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16]),
 (17, [17, 17, 17, 17, 17, 17, 17, 17, 17, 17]),
 (18, [18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18]),
 (19, [19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19]),
 (20,
  [20,
   20,
   20

In [296]:
%%timeit

[x[:,i].nonzero()[0].tolist() for i in range(0,2500)]

401 ms ± 14.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [293]:
%%timeit

x.sum(axis=0).tolist()[0]

142 µs ± 9.69 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [309]:
x.nonzero()

(array([  0,   0,   1, ..., 936, 936, 936], dtype=int32),
 array([ 987, 1453,   69, ..., 2114, 2231, 2368], dtype=int32))

In [311]:
rows, cols = 

In [386]:
indices = x.nonzero()

In [317]:
import itertools
import operator

L = [x for x in indices]

def accumulate(l):
    it = itertools.groupby(l, operator.itemgetter(0))
    for key, subiter in it:
        yield key, sum(item[1] for item in subiter)

In [387]:
L = [x for x in indices]

In [389]:
indices

(array([  0,   0,   1, ..., 936, 936, 936], dtype=int32),
 array([ 987, 1453,   69, ..., 2114, 2231, 2368], dtype=int32))

In [374]:
from operator import itemgetter
from itertools import groupby

In [None]:
import requests

class TextParser:
    """
    split the document into sentences and tokenize each sentence
    """
    def __init__(self, raw_data):
        self.raw_data = raw_data
        self.splitter = nltk.data.load('tokenizers/punkt/english.pickle')
        self.tokenizer = nltk.RegexpTokenizer(r"\w+")
        self.lemmatizer = LemmatizationWithPOSTagger()
        self.stopwords = nltk.corpus.stopwords.words('english')
        self.common_words_url = "https://gist.githubusercontent.com/deekayen/4148741/raw/98d35708fa344717d8eee15d11987de6c8e26d7d/1-1000.txt"

    def __repr__(self):
        '''Returns representation of the DataLoader object'''
        return ("{}({!r})".format(self.__class__.__name__,self.raw_data))
    
    @property
    def sentences(self):
        """
        Return a list of sentences. 
        """
        data = []
        for file in self.raw_data:
            sentences = self.splitter.tokenize(file['body'])
            data.extend([{'sentence': sentence, 'file': file['file']} for sentence in sentences])
        return data
    
    @property
    def tokenized_sentences(self):
        """
        Return a list of sublists with tokens. 
        """
        return [dict(item, **{'tokens': self.tokenizer.tokenize(item['sentence'].lower())}) for item in self.sentences]
    
    @property
    def lemmatized_sentences(self):
        """
        Return a list of sublists with lemmas. 
        """
        return [dict(item, **{'tokens': self.lemmatizer.lemmas(item['tokens'])}) for item in self.tokenized_sentences]
    
    def remove_stopwords(self, text):
        """
        return WORDNET POS compliance to WORDNET lemmatization (a,n,r,v) 
        """
        return [w for w in text if w not in self.stopwords and w.isalpha()]
    
    @property
    def lemmatized_sentences_no_stop(self):
        """
        Return a list of sublists with interesting lemmas. 
        Ergo: stopword and non-alphabetical removal. 
        """
        return [dict(item, **{'tokens': self.remove_stopwords(item['tokens'])}) for item in self.lemmatized_sentences]
    
    @property
    def clean_sentences(self):
        """
        Return a single sentence.
        """
        return [dict(item, **{'clean': " ".join(item['tokens'])}) for item in self.lemmatized_sentences_no_stop]