In [0]:
text_data = ["Last day for Christmas delivery is today or this week, depending on where you shop online, according to our Christmas shipping deadlines explainer below.",
            "Walmart is also offering free 2-day shipping on a large selection of items and they just released deals on gifts that you can pickup in-store. You can also place orders online up until 4 PM Sunday and pick those items up in-store until 5 PM on Christmas Eve. "]

In [134]:
import nltk
import nltk.tokenize 
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [136]:
print(nltk.tokenize.wordpunct_tokenize(text_data[1]))
print(nltk.tokenize.sent_tokenize(text_data[1]))


['Walmart', 'is', 'also', 'offering', 'free', '2', '-', 'day', 'shipping', 'on', 'a', 'large', 'selection', 'of', 'items', 'and', 'they', 'just', 'released', 'deals', 'on', 'gifts', 'that', 'you', 'can', 'pickup', 'in', '-', 'store', '.', 'You', 'can', 'also', 'place', 'orders', 'online', 'up', 'until', '4', 'PM', 'Sunday', 'and', 'pick', 'those', 'items', 'up', 'in', '-', 'store', 'until', '5', 'PM', 'on', 'Christmas', 'Eve', '.']
['Walmart is also offering free 2-day shipping on a large selection of items and they just released deals on gifts that you can pickup in-store.', 'You can also place orders online up until 4 PM Sunday and pick those items up in-store until 5 PM on Christmas Eve.']


In [0]:

from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

In [0]:
lm = WordNetLemmatizer()

In [138]:
print(list(map(lm.lemmatize, nltk.tokenize.wordpunct_tokenize(text_data[1]))))


['Walmart', 'is', 'also', 'offering', 'free', '2', '-', 'day', 'shipping', 'on', 'a', 'large', 'selection', 'of', 'item', 'and', 'they', 'just', 'released', 'deal', 'on', 'gift', 'that', 'you', 'can', 'pickup', 'in', '-', 'store', '.', 'You', 'can', 'also', 'place', 'order', 'online', 'up', 'until', '4', 'PM', 'Sunday', 'and', 'pick', 'those', 'item', 'up', 'in', '-', 'store', 'until', '5', 'PM', 'on', 'Christmas', 'Eve', '.']


In [97]:
sm = PorterStemmer()
print(list(map(sm.stem, nltk.tokenize.wordpunct_tokenize(text_data[1]))))

['walmart', 'is', 'also', 'offer', 'free', '2', '-', 'day', 'ship', 'on', 'a', 'larg', 'select', 'of', 'item', 'and', 'they', 'just', 'releas', 'deal', 'on', 'gift', 'that', 'you', 'can', 'pickup', 'in', '-', 'store', '.', 'you', 'can', 'also', 'place', 'order', 'onlin', 'up', 'until', '4', 'PM', 'sunday', 'and', 'pick', 'those', 'item', 'up', 'in', '-', 'store', 'until', '5', 'PM', 'on', 'christma', 'eve', '.']


In [139]:
from nltk.corpus import stopwords
import string
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
print(stop_words)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
{'so', 'being', 'myself', 're', "wasn't", 'just', 'shan', "you'd", 'her', 'hers', 'most', 'while', 'at', 'again', "shouldn't", "won't", 'the', 'his', 'my', 'their', 'will', 'ain', "she's", 'all', 'd', 'o', "mustn't", 'ourselves', 'wasn', 'been', 'shouldn', 'him', 'our', 'are', 'having', 'an', 'themselves', 'he', 'himself', 'now', 've', 'herself', 'in', 'you', 'below', "wouldn't", 'with', "isn't", 'couldn', "mightn't", 'yourself', "doesn't", 'no', 'which', 'about', 'have', 'only', 'ours', "you're", 'had', 'against', 'over', 'not', 'that', 'me', 'these', 'haven', 'am', 'after', 'we', 'when', "hadn't", 'on', 't', 'nor', 'why', "needn't", 'did', 'as', 'few', "you've", "weren't", 'doing', 'm', 'each', 'more', 'needn', 'has', 'then', 'any', 'isn', "you'll", 'a', 'but', 'hasn', 'yourselves', 'too', 'mustn', 'other', "should've", 'those', 'by', "didn't", 'them', 'from', 'some'

In [0]:
docs = list(map(nltk.tokenize.wordpunct_tokenize, text_data))
docs_filtered = [list(map(sm.stem, filter(lambda x: (x not in string.punctuation) and (x not in stop_words), doc))) for doc in docs]
# words = [list(map(nltk.tokenize.wordpunct_tokenize, doc)) for doc in docs_sent]

In [141]:
docs_filtered

[['last',
  'day',
  'christma',
  'deliveri',
  'today',
  'week',
  'depend',
  'shop',
  'onlin',
  'accord',
  'christma',
  'ship',
  'deadlin',
  'explain'],
 ['walmart',
  'also',
  'offer',
  'free',
  '2',
  'day',
  'ship',
  'larg',
  'select',
  'item',
  'releas',
  'deal',
  'gift',
  'pickup',
  'store',
  'you',
  'also',
  'place',
  'order',
  'onlin',
  '4',
  'PM',
  'sunday',
  'pick',
  'item',
  'store',
  '5',
  'PM',
  'christma',
  'eve']]

In [0]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [0]:
def my_tokenizer(doc):
    print(doc)
    tokenized = nltk.tokenize.wordpunct_tokenize(doc)
    print(tokenized)
    res = list(map(sm.stem, filter(lambda x: (x not in string.punctuation) and (x not in stop_words), tokenized)))
    print(res)
    return res

In [0]:
vect = TfidfVectorizer(analyzer='word', tokenizer=my_tokenizer)

In [152]:
freqs = vect.fit_transform(text_data)

last day for christmas delivery is today or this week, depending on where you shop online, according to our christmas shipping deadlines explainer below.
['last', 'day', 'for', 'christmas', 'delivery', 'is', 'today', 'or', 'this', 'week', ',', 'depending', 'on', 'where', 'you', 'shop', 'online', ',', 'according', 'to', 'our', 'christmas', 'shipping', 'deadlines', 'explainer', 'below', '.']
['last', 'day', 'christma', 'deliveri', 'today', 'week', 'depend', 'shop', 'onlin', 'accord', 'christma', 'ship', 'deadlin', 'explain']
walmart is also offering free 2-day shipping on a large selection of items and they just released deals on gifts that you can pickup in-store. you can also place orders online up until 4 pm sunday and pick those items up in-store until 5 pm on christmas eve. 
['walmart', 'is', 'also', 'offering', 'free', '2', '-', 'day', 'shipping', 'on', 'a', 'large', 'selection', 'of', 'items', 'and', 'they', 'just', 'released', 'deals', 'on', 'gifts', 'that', 'you', 'can', 'pickup

In [153]:
freqs.toarray()

array([[0.        , 0.        , 0.        , 0.28234951, 0.        ,
        0.401788  , 0.200894  , 0.28234951, 0.        , 0.28234951,
        0.28234951, 0.        , 0.28234951, 0.        , 0.        ,
        0.        , 0.        , 0.28234951, 0.        , 0.200894  ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.200894  , 0.28234951, 0.        ,
        0.        , 0.28234951, 0.        , 0.28234951],
       [0.16897057, 0.16897057, 0.16897057, 0.        , 0.33794115,
        0.12022395, 0.12022395, 0.        , 0.16897057, 0.        ,
        0.        , 0.16897057, 0.        , 0.16897057, 0.16897057,
        0.33794115, 0.16897057, 0.        , 0.16897057, 0.12022395,
        0.16897057, 0.16897057, 0.16897057, 0.16897057, 0.33794115,
        0.16897057, 0.16897057, 0.12022395, 0.        , 0.33794115,
        0.16897057, 0.        , 0.16897057, 0.        ]])

In [0]:
import gensim