In [4]:
import pandas as pd
from keras.preprocessing.text import text_to_word_sequence
from keras.preprocessing.text import Tokenizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

Using TensorFlow backend.


In [5]:
class text_process:
    """
    Class for preprocessing text.
    
    """
    def __init__(self,text):
        """
        Args:
            text : a pd column of 'documents' to be vectorized 
            
            """
        self.text = text
        
    
    def pd_col_to_list(self):
        """
        
            Converts pandas column to a LIST of comma-separated text ('documents') before it is to be vectorized.
            For example, index 0 of the list = the first question (or answer), and so on.
        
        """
        
        doc_list = []
        for i in self.text:
            doc_list.append(i)
            
        return doc_list
        
        
    def remove_stopwords(self, doc_list):    
        """
        Args:
            doc_list: output from pd_col_to_list, a list of comma separated text ('documents')
        
            Removes stop_words, according to the stop words nltk corpus, before the list is to be vectorized.
            
        """
        
        stop_words = stopwords.words('english')
        for j in range(len(doc_list)):
            doc_list[j] = text_to_word_sequence(doc_list[j])
            for i in stop_words:
                count=0
                while count==0:
                    try:
                        doc_list[j].remove(i)
                    except:
                        count+=1
                    finally:
                        pass
        
        return doc_list
        

    def text_to_vec(self, doc_list):
        """
        Args:
            doc_list: output from remove_stopwords(), a list of comma separated text ('documents') where stop words
                       have been removed
        
        
            Converts list of comma-separated 'documents' to a vector, using tf-idf. This is done after stop words 
            have been removed (with remove_stopwords()). Utilizes the keras tokenizer, which separates on spaces, 
            makes all tokens lowercase and removes punctuation. Function then stems the tokens, using the 
            PorterStemmer from nltk. Finally, it returns the preprocessed tokens in vector form.
            
        """

        
        ## Keras tokenizer
        
        tok = Tokenizer()
        tok.fit_on_texts(doc_list)
        
        tokens = list(tok.word_counts)
        
        ## Stem words 
        
        ps = PorterStemmer()
        for i in range(len(tokens)): #i = question index (i.e. question1, question2, etc...)
            tokens[i] = (ps.stem(tokens[i]))
        
        ## Return tf-idf matrix
        
        matrx = tok.texts_to_matrix(tokens, mode='tfidf')
        
        return matrx
    
    
    def text_preprocess(self):

        list_form = text_process.pd_col_to_list(self)
        no_stops = text_process.remove_stopwords(self,list_form)
        final_vec = text_process.text_to_vec(self,no_stops)
        
        return final_vec
        
        
        


### NEED TO FIGURE OUT STOP WORDS PART (HASHED OUT ABOVE)

Need a method to transform pandas df columns to one big list before doing preprocessing

In [69]:
fin = text_process(tsting)

In [56]:
fin.text_preprocess()

array([[0.        , 0.91629073, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.91629073, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.91629073,
        0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.91629073, 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        ]])

In [70]:
mylist = fin.pd_col_to_list()

In [71]:
mylist2 = fin.remove_stopwords(mylist)

In [77]:
mat =fin.text_to_vec(mylist2)

In [78]:
mat

array([[0.        , 0.91629073, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.91629073, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.91629073,
        0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.91629073, 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        ]])

In [73]:
mylist2

[['first', 'question'],
 ['testing', 'token', 'splitting'],
 ['list', 'documents', 'counting']]

In [80]:
tok.get_config()

{'num_words': None,
 'filters': '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
 'lower': True,
 'split': ' ',
 'char_level': False,
 'oov_token': None,
 'document_count': 13,
 'word_counts': '{"question1": 2, "question2": 2, "question3": 2, "question4": 2, "question5": 2, "this": 1, "is": 2, "the": 1, "first": 1, "question": 1, "i": 2, "am": 2, "testing": 1, "for": 1, "token": 1, "splitting": 1, "here": 1, "my": 1, "list": 1, "of": 1, "documents": 1, "counting": 1}',
 'word_docs': '{"question1": 2, "question2": 2, "question3": 2, "question4": 2, "question5": 2, "the": 1, "question": 1, "is": 2, "first": 1, "this": 1, "for": 1, "splitting": 1, "token": 1, "testing": 1, "am": 2, "i": 2, "list": 1, "counting": 1, "documents": 1, "here": 1, "my": 1, "of": 1}',
 'index_docs': '{"1": 2, "2": 2, "3": 2, "4": 2, "5": 2, "10": 1, "12": 1, "6": 2, "11": 1, "9": 1, "14": 1, "16": 1, "15": 1, "13": 1, "8": 2, "7": 2, "19": 1, "22": 1, "21": 1, "17": 1, "18": 1, "20": 1}',
 'index_word': '{"1": "question1

In [57]:
unq = set(tsting)

In [58]:
unq

{'here is my list of documents I am counting',
 'i am testing for token splitting',
 'this is the first question'}

In [59]:
len(unq)

3

In [4]:
tok = Tokenizer()

In [7]:
tok.fit_on_texts(qu)

In [12]:
lst = list(tok.word_counts)

In [13]:
lst

['question1', 'question2', 'question3', 'question4', 'question5']

In [68]:
tsting = ['this is the first question','i am testing for token splitting','here is my list of documents I am counting']

In [15]:
tok.fit_on_texts(tsting)

In [16]:
lst2 = list(tok.word_counts)

In [17]:
lst2

['question1',
 'question2',
 'question3',
 'question4',
 'question5',
 'this',
 'is',
 'the',
 'first',
 'question',
 'i',
 'am',
 'testing',
 'for',
 'token',
 'splitting',
 'here',
 'my',
 'list',
 'of',
 'documents',
 'counting']

In [20]:
for i in range(len(lst2)): #i = question index (i.e. question1, question2, etc...)
            lst2[i] = (ps.stem(lst2[i]))

In [19]:
ps = PorterStemmer()

In [21]:
lst2

['question1',
 'question2',
 'question3',
 'question4',
 'question5',
 'thi',
 'is',
 'the',
 'first',
 'question',
 'i',
 'am',
 'test',
 'for',
 'token',
 'split',
 'here',
 'my',
 'list',
 'of',
 'document',
 'count']

In [5]:
qu = ['question1','question2','question3','question4','question5']

In [12]:
an = ['answer1','answer2','answer3','answer4','answer5']

In [13]:
cols = ['Questions','Answers']

In [16]:
ex = pd.DataFrame(zip(qu,an),columns=cols)

In [17]:
ex

Unnamed: 0,Questions,Answers
0,question1,answer1
1,question2,answer2
2,question3,answer3
3,question4,answer4
4,question5,answer5


In [27]:
ex['Questions']

0    question1
1    question2
2    question3
3    question4
4    question5
Name: Questions, dtype: object

In [28]:
ex['Answers']

0    answer1
1    answer2
2    answer3
3    answer4
4    answer5
Name: Answers, dtype: object

In [30]:
doc_list = []
for i in ex['Questions']:
    doc_list.append(i)

In [32]:
doc_list[0]

'question1'

In [57]:
my = text_process(ex['Questions'])

In [58]:
my.pd_col_to_list(ex['Questions'])

['question1', 'question2', 'question3', 'question4', 'question5']

In [49]:
my2 = text_process(ex['Answers'])

In [50]:
my2.pd_col_to_list(ex['Answers'])

['answer1', 'answer2', 'answer3', 'answer4', 'answer5']

In [59]:
my.text_to_vec()

(array([[0.        , 1.25276297, 0.        , 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 1.25276297, 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , 1.25276297, 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , 0.        , 1.25276297,
         0.        ],
        [0.        , 0.        , 0.        , 0.        , 0.        ,
         1.25276297]]),
 OrderedDict([('question1', 1),
              ('question2', 1),
              ('question3', 1),
              ('question4', 1),
              ('question5', 1)]),
 5,
 {'question1': 1,
  'question2': 2,
  'question3': 3,
  'question4': 4,
  'question5': 5},
 defaultdict(int,
             {'question1': 1,
              'question2': 1,
              'question3': 1,
              'question4': 1,
              'question5': 1}))

In [54]:
my2.text_to_vec()

array([[0.        , 1.25276297, 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 1.25276297, 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , 1.25276297, 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , 0.        , 1.25276297,
        0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        1.25276297]])

In [62]:
stop_words = stopwords.words('english')

In [63]:
stop_words

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [3]:
tst = my2.pd_col_to_list(ex['Answers'])

NameError: name 'my2' is not defined

In [180]:
tst2 = ['i me me myself this is the first question ours','this is the second','here we go','last question']

In [138]:
tst2

['i me me myself this is the first question ours',
 'this is the second',
 'here we go',
 'last question']

In [139]:
text_to_word_sequence(tst2[0])

['i', 'me', 'me', 'myself', 'this', 'is', 'the', 'first', 'question', 'ours']

In [142]:
lis = [1,2,3,4]
lis.remove(4)
lis[0] = 3
lis


[3, 2, 3]

In [177]:
########### Ian's test
for j in range(len(tst2)):
    tst2[j] = text_to_word_sequence(tst2[j])
    for i in stop_words:
        count=0
        while count==0:
            try:
                tst2[j].remove(i)
            except:
                count+=1
            finally:
                pass
        
        
            
    

###########

In [146]:
'myself' in stop_words

True

In [148]:
tst2[0][0]

'first'

In [174]:
for i in range(len(tst2)):  ## loop through questions
    tst2[i] = text_to_word_sequence(tst2[i])
    for j in range(len(stop_words)): ## loop thru stop words
        for p in range(len(tst2[i])): ## loop thru every word in every question
            if stop_words[j] != tst2[i]:
                tst2[i][p] = tst2[i][p]

In [175]:
tst2

[['i', 'me', 'me', 'myself', 'this', 'is', 'the', 'first', 'question', 'ours'],
 ['this', 'is', 'the', 'second'],
 ['here', 'we', 'go'],
 ['last', 'question']]

In [None]:
for i in range(len(docs)):
    doc_tokns = text_to_word_sequence(tst2[i])

In [134]:
stop_words[0] not in tkns[0]

False

In [136]:
tkns[0]

'this'

In [161]:
tst2[0]

'i me me myself this is the first question ours'

In [164]:
toks

['last', 'question']

In [165]:
toks = text_to_word_sequence(tst2[2])

In [166]:
toks

['here', 'we', 'go']

In [167]:
tst2[2] = text_to_word_sequence(tst2[2])

In [169]:
tst2[2]

'here'

In [172]:
tst2[0]

'i me me myself this is the first question ours'

In [178]:
'i' in 'this'

True

In [191]:
tst2 = ['i me me myself this is the first question ours','this is the second','here we go','last question']

In [190]:
for w in range(len(tst2)): #w = review index
    tst2[w] = text_to_word_sequence(tst2[w])
    count=list(range(len(tst2[w])))
    for i in count: #i = word index
        if tst2[w][i].lower() in stop_words:
            tst2[w].remove(tst2[w][i])
            del count[-1]
        else:
            pass

IndexError: list index out of range

In [194]:
len(tst2[0])

46

In [187]:
len(tst2[0])

5

In [129]:
tkns

['this', 'is', 'the', 'second']

In [122]:
len(stop_words)

179

In [127]:
len(tstn2)

9

In [131]:
tkns

['this', 'is', 'the', 'second']

In [79]:
tst2[0] in stop_words

False

In [81]:
stop_words[0] in tst2[0]

True

In [93]:
stop_words[1]

'me'

In [98]:
tst2[0].index('is')

14

In [101]:
tsting = list(tst2[0].split(" "))

In [102]:
tsting

['i', 'me', 'myself', 'this', 'is', 'the', 'first', 'question', 'ours']

In [110]:
tstn2 = text_to_word_sequence(tst2[0])

In [111]:
tstn2

['i', 'me', 'me', 'myself', 'this', 'is', 'the', 'first', 'question', 'ours']

In [112]:
stop_words[1] in tstn2

True

In [113]:
tstn2.index(stop_words[1])

1

In [115]:
tstn2.remove(stop_words[1])
tstn2.index(stop_words[1])

1

In [116]:
tstn2

['i', 'me', 'myself', 'this', 'is', 'the', 'first', 'question', 'ours']

In [120]:
tstn2[1] == stop_words[1]

True

In [None]:
my_tst = text_process(tst)

In [130]:
text_process1 = text_process(tst)

In [153]:
text_process1.text_to_vec()[0]

array([0.        , 1.8601123 , 0.        , 0.        , 0.        ,
       1.09861229, 1.09861229, 1.09861229, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        ])

In [144]:
lent = set(tst)

In [146]:
lent

{'glorifying, glory,glory-hole, glori',
 'hello i am nyla running jumping walking lovely gorg',
 'hi hello beautifully beautiful beautifi',
 'hi there young lady! where ya running?'}

In [141]:
len(tst[0])

35

In [55]:
tst = ['glorifying, glory,glory-hole, gloriful','hi hello beautifully beautiful beautifying','hello i am nyla running jumping walking lovely gorgeous', 'hi there young lady! where ya running?']


In [56]:
ps.stem(tst[0])

'glorifying, glory,glory-hole, glori'

In [12]:
for i in range(len(tst)):
    print(tst[i], i)

hi 0
hello i am nyla 1
hi there youg lady! 2


In [10]:
list(range(len(tst[i])))

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18]

In [62]:
len(tst)

4

In [63]:
tst[0]

'glorifying, glory,glory-hole, gloriful'

In [31]:
ps = PorterStemmer()

In [32]:
ps.stem(tst[1])

'hello i am nyla running jumping walking lovely gorg'

In [40]:
import pandas as pd

In [41]:
tst = pd.DataFrame(tst)

In [44]:
tst[0]

0                             hi hello hey now hey now
1    hello i am nyla running jumping walking lovely...
2               hi there young lady! where ya running?
Name: 0, dtype: object

In [64]:
for i in range(len(tst)): #i = question index (i.e. question1, question2, etc...)
    tst[i] = (ps.stem(tst[i]))

In [73]:
tst[0]

'glorifying, glory,glory-hole, glori'

In [76]:
stop_words = stopwords.words('english')

In [85]:
tst[2] in stop_words

False

In [86]:
tst[2][0]

'h'

In [88]:
stop_words[0] in tst[2]

True

In [91]:
stop_words[0]

'i'

In [94]:
tst[2].remove(stop_words[0])

AttributeError: 'str' object has no attribute 'remove'

In [98]:
tst[2].replace('i','')

'hello  am nyla runnng jumpng walkng lovely gorg'

In [103]:
stop_words[0] in tst[2]

True

In [101]:
stop_words[0]

'i'

In [111]:
list(tst[2]).remove(stop_words[0])

In [112]:
tst[2]

'hello i am nyla running jumping walking lovely gorg'

In [113]:
stop_words[0]

'i'

In [114]:
tst

['glorifying, glory,glory-hole, glori',
 'hi hello beautifully beautiful beautifi',
 'hello i am nyla running jumping walking lovely gorg',
 'hi there young lady! where ya running?']

In [119]:
text_to_vec(tst)

NameError: name 'text_to_vec' is not defined