In [160]:
from nltk.tokenize import sent_tokenize, word_tokenize, WordPunctTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer

In [161]:
wp_tokenizer = WordPunctTokenizer()
porter = PorterStemmer()
lancaster = LancasterStemmer()
snowball = SnowballStemmer('english')
lemmatizer = WordNetLemmatizer()
stemmer_names = ['PORTER', 'LANCASTER', 'SNOWBALL', 'N LEMMATIZER', 'V LEMMATIZER']

In [162]:
formatted_text = '{:>15}' * (len(stemmer_names)+1)

In [163]:
print('\n', formatted_text.format('INPUT WORD', *stemmer_names), 
     '\n', '='*85)


      INPUT WORD         PORTER      LANCASTER       SNOWBALL   N LEMMATIZER   V LEMMATIZER 


In [164]:
input_words=['hello', 'found', 'trying', 'ran', 'called', 'code']
for word in input_words:
    output = [word, porter.stem(word), lancaster.stem(word), snowball.stem(word),
             lemmatizer.lemmatize(word, pos='n'), lemmatizer.lemmatize(word, pos='v')]
    print(formatted_text.format(*output))

          hello          hello          hello          hello          hello          hello
          found          found          found          found          found           find
         trying            tri            try            tri         trying            try
            ran            ran            ran            ran            ran            run
         called           call            cal           call         called           call
           code           code            cod           code           code           code


## Chunking

In [165]:
import numpy as np
from nltk.corpus import brown

In [203]:
def chunker(input_data, N):
    input_words = input_data.split(' ')
    output=[]
    cur_chunk = []
    count=0
    for word in input_words:
        cur_chunk.append(word)
        count += 1
        if count == N:
            output.append(' '.join(cur_chunk))
            count, cur_chunk = 0, []
    if len(cur_chunk) > 0:
        output.append(' '.join(cur_chunk))
    return output

In [204]:
input_data  = ' '.join(brown.words()[:5000])

In [205]:
chunk_size = 1000
text_chunks = chunker(input_data, chunk_size)
#print(text_chunks[:1])

## Bag of words model

In [206]:
from sklearn.feature_extraction.text import CountVectorizer
chunks=[]
for count, chunk in enumerate(text_chunks):
    print(count)
    d = {'index': count, 'text': chunk}
    chunks.append(d)
print('total words', len(chunks))
print(chunks[-1])

0
1
2
3
4
total words 5
{'index': 4, 'text': "the Education courses . Fifty-three of the 150 representatives immediately joined Grover as co-signers of the proposal . Paris , Texas ( sp. ) -- The board of regents of Paris Junior College has named Dr. Clarence Charles Clark of Hays , Kan. as the school's new president . Dr. Clark will succeed Dr. J. R. McLemore , who will retire at the close of the present school term . Dr. Clark holds an earned Doctor of Education degree from the University of Oklahoma . He also received a Master of Science degree from Texas A & I College and a Bachelor of Science degree from Southwestern State College , Weatherford , Okla. . In addition , Dr. Clark has studied at Rhode Island State College and Massachusetts Institute of Technology . During his college career , Dr. Clark was captain of his basketball team and was a football letterman . Dr. Clark has served as teacher and principal in Oklahoma high schools , as teacher and athletic director at Raymondvi

In [207]:
count_vectorizer = CountVectorizer(min_df=5, max_df=25)
document_term_matrix = count_vectorizer.fit_transform([chunk['text'] for chunk in chunks])
vocabulary = np.array(count_vectorizer.get_feature_names())
print('\nVocabulary:', vocabulary.shape)


Vocabulary: (39,)


In [208]:
print('document shape', document_term_matrix.shape)
print(type(document_term_matrix))

document shape (5, 39)
<class 'scipy.sparse.csr.csr_matrix'>


In [209]:
chunk_names =[]
for i in range(len(text_chunks)):
    chunk_names.append('C' + str(i+1))
formatted_text = '{:>12}' * (len(chunk_names) + 1)

print('\n', formatted_text.format('Word', *chunk_names))
for word, item in zip(vocabulary,document_term_matrix.T):
    # item is a csr_matrix
    #print('item shape', item.shape)
    #print('item data shape', item.data.shape)
    output = [word] + [str(freq) for freq in item.data]
    print(formatted_text.format(*output))


         Word          C1          C2          C3          C4          C5
        also           2           1           2           1           1
          an           4           1           3           4           3
         and          27           7          16          11          21
         are           3           2           1           2           3
          as           7           6           2           2          11
          at           4           1           6           1           7
          be           7          13           7           7           2
          by           4           4           4          18           7
      county           6           7           4           2           2
         for          10          12           8          11           7
        from           3           1           2           3           4
         had           2           1           1           1           1
         has           6           4           1 

## Category Predictor

In [210]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

In [211]:
category_map={'talk.politics.misc':'Politics',
             'rec.autos': 'Autos',
             'rec.sport.hockey': 'Hockey',
              'sci.electronics': 'Electronics',
              'sci.med': 'Mecicine'
             }

In [212]:
training_data = fetch_20newsgroups(subset='train',
                                  categories=category_map.keys(),
                                  shuffle=True, random_state=5)

In [213]:
count_vectorizer=CountVectorizer()
train_tc=count_vectorizer.fit_transform(training_data.data)
print('traing data shape', train_tc.shape)
print(set(training_data.target))

traing data shape (2844, 40321)
{0, 1, 2, 3, 4}


In [214]:
tfidf = TfidfTransformer()
train_tfidf = tfidf.fit_transform(train_tc)

In [215]:
classifier = MultinomialNB().fit(train_tfidf, training_data.target)

In [216]:
input_data = [
    'Mobile computing is everywhere',
    'Players work hard to win the game',
    'A president needs to pay attention to people need',
    'When driving on a high way, be careful!'
]

In [155]:
input_tc = count_vectorizer.transform(input_data)
input_tfidf = tfidf.transform(input_tc)
predictions = classifier.predict(input_tfidf)

In [156]:
for sent, category in zip(input_data, predictions):
    print('Input:', sent, '\nPreditect category:',
         category_map[training_data.target_names[category]])

Input: Mobile computing is everywhere 
Preditect category: Electronics
Input: Players work hard to win the game 
Preditect category: Hockey
Input: A president needs to pay attention to people need 
Preditect category: Politics
Input: When driving on a high way, be careful! 
Preditect category: Autos
