In [2]:
from eigen_tech_project.nlp_models import nltk_pipe

In [None]:
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

## Load in the data:

In [1]:
from os import listdir
from os.path import isfile, join, abspath
import pandas as pd
import re


class DataReaderParser:
    def __init__(self, path):
        self.path = path
        self.splitter = nltk.data.load('tokenizers/punkt/english.pickle')
        
    def __repr__(self):
        '''Returns representation of the DataLoader object'''
        return("{}({!r})".format(self.__class__.__name__, self.path))
    
    @property
    def file_names(self):
        '''Returns representation of the DataLoader object'''
        return [f for f in listdir(self.path) if isfile(join(self.path, f))]
    
    @property
    def raw_data(self):
        '''Returns representation of the DataLoader object'''
        return [{"file": int(re.sub("[^0-9]", "", f)), 
                 "body": open(abspath(join(self.path, f)), 'r').read()} for f in self.file_names]
    
    @property
    def sentences(self):
        '''Returns representation of the DataLoader object'''  
        data = []
        for file in self.raw_data:
            sentences = self.splitter.tokenize(file['body'])
            data.extend([{'sentence': sentence, 'file': file['file']} for sentence in sentences])
            
        return data
    
    

## Document->Sentences->Tokens->POS->Lemmas

In [2]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

#example text text = 'What can I say about this place. The staff of these restaurants is nice and the eggplant is not bad'

class LemmatizationWithPOSTagger:
    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()
    
    def __repr__(self):
        '''Returns representation of the DataLoader object'''
        return("{}({!r})".format(self.__class__.__name__))
    
    def get_wordnet_pos(self, treebank_tag):
        """
        return WORDNET POS compliance to WORDNET lemmatization (a,n,r,v) 
        """
        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        elif treebank_tag.startswith('V'):
            return wordnet.VERB
        elif treebank_tag.startswith('N'):
            return wordnet.NOUN
        elif treebank_tag.startswith('R'):
            return wordnet.ADV
        else:
            # As default pos in lemmatization is Noun
            return wordnet.NOUN
        
    def get_lemma(self, word_postag_combo):
        """
        return WORDNET POS compliance to WORDNET lemmatization (a,n,r,v) 
        """
        return self.lemmatizer.lemmatize(word_postag_combo[0], self.get_wordnet_pos(word_postag_combo[1]))

    def lemmas(self, tokenized_sentence):
        """
        return WORDNET POS compliance to WORDNET lemmatization (a,n,r,v) 
        """
        # find the pos tagging for each tokens [('What', 'WP'), ('can', 'MD'), ('I', 'PRP') ....
        pos_tokens = nltk.pos_tag(tokenized_sentence)
        # lemmatization using pos tags
        # convert into feature set of [('What', 'What', ['WP']), ('can', 'can', ['MD']), ... ie [original WORD, Lemmatized word, POS tag]
        lemmas = [self.get_lemma(word_tag_combo) for word_tag_combo in pos_tokens]
        return lemmas
    

In [33]:

import requests

class Sentence:
    """
    split the document into sentences and tokenize each sentence
    """
    def __init__(self, sentence):
        self.sentence = sentence
#         self.tokenizer = nltk.tokenize.TreebankWordTokenizer()
#         self.tokenizer = nltk.tokenize.punkt.PunktSentenceTokenizer()
        # following tokenizer also removes punctuation:
        self.tokenizer = nltk.RegexpTokenizer(r"\w+")
        self.lemmatizer = LemmatizationWithPOSTagger()
        self.stopwords = nltk.corpus.stopwords.words('english')
        self.common_words_url = "https://gist.githubusercontent.com/deekayen/4148741/raw/98d35708fa344717d8eee15d11987de6c8e26d7d/1-1000.txt"

    def __repr__(self):
        '''Returns representation of the DataLoader object'''
        return ("{}({!r})".format(self.__class__.__name__, self.sentence))
    
    @property
    def tokenized_sentence(self):
        """
        Return a list of sublists with tokens. 
        """
        # tokenization in each sentences
        return self.tokenizer.tokenize(self.sentence.lower())
    
    @property
    def lemmatized_sentence(self):
        """
        Return a list of sublists with lemmas. 
        """
        return self.lemmatizer.lemmas(self.tokenized_sentence)
    
    def remove_stopwords(self, text):
        """
        return WORDNET POS compliance to WORDNET lemmatization (a,n,r,v) 
        """
#         common_words = requests.get(self.common_words_url).text.split()
#         noise = self.stopwords + common_words
        return [w for w in text if w not in self.stopwords and w.isalpha()]
    
    @property
    def lemmatized_sentence_no_stop(self):
        """
        Return a list of sublists with interesting lemmas. 
        Ergo: stopword and non-alphabetical removal. 
        """
        return self.remove_stopwords(self.lemmatized_sentence)
    
    @property
    def clean_sentence(self):
        """
        Return a single sentence.
        """
        return " ".join(self.lemmatized_sentence_no_stop)

In [35]:
common_words_url = "https://gist.githubusercontent.com/deekayen/4148741/raw/98d35708fa344717d8eee15d11987de6c8e26d7d/1-1000.txt"
common_words = requests.get(common_words_url).text.split()

In [411]:
df['clean_sentence'] = df.sentence.apply(lambda x: Sentence(x).clean_sentence)

## Construct the countvectorizer

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

class InvertedIndex:
    def __init__(self, corpus):
        self.corpus = corpus
        self.countvectorizer = CountVectorizer()
    
    def __repr__(self):
        '''Returns representation of the DataLoader object'''
        return("{}({!r})".format(self.__class__.__name__, self.corpus))
    
    @property
    def inverted_index(self):
        """Return list with the top x most occuring interesting words, with following elements: (feature_id, occurence)."""
        
        vectorizer = self.countvectorizer.fit(self.corpus)
        document_term_matrix = vectorizer.transform(self.corpus)
        
        lemmas = vectorizer.get_feature_names()
        indices = range(0,len(lemmas))
        frequencies = np.asarray(document_term_matrix.toarray().sum(axis=0))
        occurences = [document_term_matrix[:,i].nonzero()[0].tolist() for i in indices]
        
        return [{'lemma': lemmas[i], 
                 'frequency': frequencies[i], 
                 'occurences': occurences[i]} for i in indices]
    
    
    

In [34]:
x = DataReaderParser('data/').sentences
y = [dict(item, **{'clean': Sentence(item["sentence"]).clean_sentence}) for item in x]

df_input = pd.DataFrame(y).reset_index()
inverted_index = InvertedIndex(df_input.clean).inverted_index
df_ii = pd.DataFrame(inverted_index).sort_values('frequency', ascending=False)

df_ii["sentences"] = df_ii.occurences.apply(lambda x: df_input.iloc[x].sentence.tolist())
df_ii["documents"] = df_ii.occurences.apply(lambda x: list(set(df_input.iloc[x].file)))
df_ii = df_ii.drop(["occurences"], axis=1).reset_index(drop=True)

In [27]:
df_ii[:20]

Unnamed: 0,lemma,frequency,sentences,documents
0,american,73,"[But, when I think about what is at stake I am...","[1, 2, 3, 4, 5, 6]"
1,iraq,64,"[A few Tuesdays ago, the American people embra...","[1, 2, 3, 5]"
2,america,54,"[A few Tuesdays ago, the American people embra...","[1, 2, 3, 4, 5]"
3,government,47,[And the Kiev story is heading in the right di...,"[1, 2, 3, 4, 5, 6]"
4,promise,40,[One statistic powerfully describes this unful...,"[1, 2, 3, 4]"
5,threat,35,"[Now, few people understand these challenges b...","[2, 3, 5, 6]"
6,iraqi,34,[We have been told that progress is just aroun...,"[1, 2, 5]"
7,weapon,31,"[As some of you know, Senator Lugar and I rece...","[1, 2, 4, 5, 6]"
8,kenya,31,"[The first time I came to Kenya was in 1987., ...","[2, 3, 4]"
9,troop,30,[Besides the devastation they can cause to a c...,"[1, 2, 3, 5, 6]"


In [442]:
x = CountVectorizer()



y = x.fit(corpus)

z = y.transform(corpus)

In [448]:
z.get_feature_names()

AttributeError: get_feature_names not found

In [21]:
import requests


x = 

In [285]:
x = DataLoader("data/")

df = x.df()

In [287]:
y = TextParser(text=df.body[0])

In [290]:
len(y.tokenized_sentences)

91

In [291]:
len(y.lemmatized_sentences)

91