In [3]:
import pandas as pd
from keras.preprocessing.text import text_to_word_sequence
from keras.preprocessing.text import Tokenizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ianleefmans/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [4]:
class text_process:
    """
    Class for preprocessing text.
    
    """
    def __init__(self,text):
        """
        Args:
            text : a pd column of 'documents' to be vectorized 
            
            """
        self.text = text
        
    
    def pd_col_to_list(self):
        """
        
            Converts pandas column to a LIST of comma-separated text ('documents') before it is to be vectorized.
            For example, index 0 of the list = the first question (or answer), and so on.
        
        """
        
        doc_list = []
        for i in self.text:
            doc_list.append(i)
            
        return doc_list
        
        
    def remove_stopwords(self, doc_list):    
        """
        Args:
            doc_list: output from pd_col_to_list, a list of comma separated text ('documents')
        
            Removes stop_words, according to the stop words nltk corpus, before the list is to be vectorized.
            
        """
        
        stop_words = stopwords.words('english')
        for j in range(len(doc_list)):
            doc_list[j] = text_to_word_sequence(doc_list[j])
            for i in stop_words:
                count=0
                while count==0:
                    try:
                        doc_list[j].remove(i)
                    except:
                        count+=1
                    finally:
                        pass
        
        return doc_list
        

    def text_to_vec(self, doc_list):
        """
        Args:
            doc_list: output from remove_stopwords(), a list of comma separated text ('documents') where stop words
                       have been removed
        
        
            Converts list of comma-separated 'documents' to a vector, using tf-idf. This is done after stop words 
            have been removed (with remove_stopwords()). Utilizes the keras tokenizer, which separates on spaces, 
            makes all tokens lowercase and removes punctuation. Function then stems the tokens, using the 
            PorterStemmer from nltk. Finally, it returns the preprocessed tokens in vector form.
            
        """

        
        ## Keras tokenizer
        
        tok = Tokenizer()
        tok.fit_on_texts(doc_list)
        
        tokens = list(tok.word_counts)
        
        ## Stem words 
        
        ps = PorterStemmer()
        for i in range(len(tokens)): #i = question index (i.e. question1, question2, etc...)
            tokens[i] = (ps.stem(tokens[i]))
        
        ## Return tf-idf matrix
        
        matrx = tok.texts_to_matrix(tokens, mode='tfidf')
        
        return matrx, tok.get_config()
    
    
    def text_preprocess(self):

        list_form = text_process.pd_col_to_list(self)
        no_stops = text_process.remove_stopwords(self,list_form)
        final_vec = text_process.text_to_vec(self,no_stops)
        
        return final_vec
        
        
        


### Example Use 

(Making example questions)

In [5]:
import pandas as pd

In [6]:
questions = ['What year was pathology founded?', 'What does pathology mean?', 'Name five viral diseases',
             'Where is the liver located?', 'What does the liver do?']

### pd_col_to_list

This function may be irrelavant - can likely edit to use on pd column or delete.

In [7]:
cleantxt = text_process(questions)

In [8]:
ques_list = cleantxt.pd_col_to_list()
ques_list

['What year was pathology founded?',
 'What does pathology mean?',
 'Name five viral diseases',
 'Where is the liver located?',
 'What does the liver do?']

### remove_stopwords

Notice all stopwords have been removed and it is now a nested list of comma-separated words.

In [9]:
no_stops = cleantxt.remove_stopwords(ques_list)
no_stops

[['year', 'pathology', 'founded'],
 ['pathology', 'mean'],
 ['name', 'five', 'viral', 'diseases'],
 ['liver', 'located'],
 ['liver']]

### text_to_vec

Converts the above to a matrix, using the term-frequency inverse-document frequency (tfidf) approach. This function also stems the words before returning the matrix.

In [10]:
vec = cleantxt.text_to_vec(no_stops)
vec

(array([[0.        , 0.        , 0.        , 1.25276297, 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , 0.        , 0.        ,
         1.25276297, 0.        , 0.        , 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 1.25276297, 0.        , 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 1.25276297, 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.     

Matrix only:

In [15]:
vec[0]

array([[0.        , 0.        , 0.        , 1.25276297, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        1.25276297, 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 1.25276297, 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 1.25276297, 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.

Dictionary of all counts/values of matrix:

In [130]:
vec[1]

{'num_words': None,
 'filters': '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
 'lower': True,
 'split': ' ',
 'char_level': False,
 'oov_token': None,
 'document_count': 5,
 'word_counts': '{"year": 1, "pathology": 2, "founded": 1, "mean": 1, "name": 1, "five": 1, "viral": 1, "diseases": 1, "liver": 2, "located": 1}',
 'word_docs': '{"founded": 1, "pathology": 2, "year": 1, "mean": 1, "five": 1, "viral": 1, "diseases": 1, "name": 1, "located": 1, "liver": 2}',
 'index_docs': '{"4": 1, "1": 2, "3": 1, "5": 1, "7": 1, "8": 1, "9": 1, "6": 1, "10": 1, "2": 2}',
 'index_word': '{"1": "pathology", "2": "liver", "3": "year", "4": "founded", "5": "mean", "6": "name", "7": "five", "8": "viral", "9": "diseases", "10": "located"}',
 'word_index': '{"pathology": 1, "liver": 2, "year": 3, "founded": 4, "mean": 5, "name": 6, "five": 7, "viral": 8, "diseases": 9, "located": 10}'}

### Alternatively, you can just use text_preprocess in one step:

Same result as above, it just returns the final matrix.

In [13]:
cleantxt.text_preprocess()

(array([[0.        , 0.        , 0.        , 1.25276297, 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , 0.        , 0.        ,
         1.25276297, 0.        , 0.        , 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 1.25276297, 0.        , 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 1.25276297, 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.     