In [6]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""

@author: jacobjohn

"""
import nltk
from nltk.corpus import stopwords
from nltk.corpus import inaugural
from nltk.corpus import wordnet as wn
from nltk.tokenize import TweetTokenizer
import tweepy

##Using Obama's inaugural speech
Obama = inaugural.words(fileids = '2009-Obama.txt')

##stopword removal
stop_words = set(stopwords.words('english')) 
filtered_sentence = [w for w in Obama if not w in stop_words]
print("After stopword removal: ", Obama)

After stopword removal:  ['My', 'fellow', 'citizens', ':', 'I', 'stand', 'here', ...]


In [28]:
##CMU wordlist
entries = nltk.corpus.cmudict.entries()
print("Number of entries: ", len(entries))
for entry in entries[10000:10025]:
    print("CMU word list: ", entry)

Number of entries:  133737
CMU word list:  ('belford', ['B', 'EH1', 'L', 'F', 'ER0', 'D'])
CMU word list:  ('belfry', ['B', 'EH1', 'L', 'F', 'R', 'IY0'])
CMU word list:  ('belgacom', ['B', 'EH1', 'L', 'G', 'AH0', 'K', 'AA0', 'M'])
CMU word list:  ('belgacom', ['B', 'EH1', 'L', 'JH', 'AH0', 'K', 'AA0', 'M'])
CMU word list:  ('belgard', ['B', 'EH0', 'L', 'G', 'AA1', 'R', 'D'])
CMU word list:  ('belgarde', ['B', 'EH0', 'L', 'G', 'AA1', 'R', 'D', 'IY0'])
CMU word list:  ('belge', ['B', 'EH1', 'L', 'JH', 'IY0'])
CMU word list:  ('belger', ['B', 'EH1', 'L', 'G', 'ER0'])
CMU word list:  ('belgian', ['B', 'EH1', 'L', 'JH', 'AH0', 'N'])
CMU word list:  ('belgians', ['B', 'EH1', 'L', 'JH', 'AH0', 'N', 'Z'])
CMU word list:  ('belgique', ['B', 'EH0', 'L', 'ZH', 'IY1', 'K'])
CMU word list:  ("belgique's", ['B', 'EH0', 'L', 'JH', 'IY1', 'K', 'S'])
CMU word list:  ('belgium', ['B', 'EH1', 'L', 'JH', 'AH0', 'M'])
CMU word list:  ("belgium's", ['B', 'EH1', 'L', 'JH', 'AH0', 'M', 'Z'])
CMU word list:  (

In [17]:
##Wordnet
id = wn.synsets('motorcar') #you get an id for subsets
id[0].lemma_names() #head words/lemmas in the subset

['car', 'auto', 'automobile', 'machine', 'motorcar']

In [18]:
##NLTK pipeline

texts = ["""This is a sentence. So is this one."""] #paste text after the three quotes, organize into lines

for text in texts:
    sentences = nltk.sent_tokenize(text)
    for sentence in sentences:
        words = nltk.word_tokenize(sentence)
        tagged_words = nltk.pos_tag(words)
        print(tagged_words)
    

[('This', 'DT'), ('is', 'VBZ'), ('a', 'DT'), ('sentence', 'NN'), ('.', '.')]
[('So', 'RB'), ('is', 'VBZ'), ('this', 'DT'), ('one', 'NN'), ('.', '.')]


In [7]:
##Implementing tokenization
#Twitter aware tokenizer

auth = tweepy.OAuthHandler("---", "---")
auth.set_access_token("---", "---")

api = tweepy.API(auth)

public_tweets = api.home_timeline()
tknzr = TweetTokenizer()
for tweet in public_tweets:
    print("Tweet: ",tweet.text)
    sent = nltk.sent_tokenize(tweet.text)
    print("Sentence tokenization: ", sent)
    word = nltk.word_tokenize(tweet.text)
    print("Word tokenization: ", word)
    tweett = tknzr.tokenize(tweet.text)
    print("Tweet tokenized: ",tweett)
    print("\n")

Tweet:  Andy Murray meets fellow injury sufferer James Duckworth at US Open https://t.co/Lds2CiW2V2
Sentence tokenization:  ['Andy Murray meets fellow injury sufferer James Duckworth at US Open https://t.co/Lds2CiW2V2']
Word tokenization:  ['Andy', 'Murray', 'meets', 'fellow', 'injury', 'sufferer', 'James', 'Duckworth', 'at', 'US', 'Open', 'https', ':', '//t.co/Lds2CiW2V2']
Tweet tokenized:  ['Andy', 'Murray', 'meets', 'fellow', 'injury', 'sufferer', 'James', 'Duckworth', 'at', 'US', 'Open', 'https://t.co/Lds2CiW2V2']


Tweet:  Srinagar hotel case: Major Leetul Gogoi found guilty, to face court martial https://t.co/ExJeEEMwIR https://t.co/2zcMUnuUfn
Sentence tokenization:  ['Srinagar hotel case: Major Leetul Gogoi found guilty, to face court martial https://t.co/ExJeEEMwIR https://t.co/2zcMUnuUfn']
Word tokenization:  ['Srinagar', 'hotel', 'case', ':', 'Major', 'Leetul', 'Gogoi', 'found', 'guilty', ',', 'to', 'face', 'court', 'martial', 'https', ':', '//t.co/ExJeEEMwIR', 'https', ':', 