In [1]:
# open the new dataset

import codecs, nltk

article = codecs.open("../datasets/CleanedArticles/15.txt","r","utf-8")
article = article.read()

print (article)

Advertisement By JACEY FORTINDEC. 31, 2017  In Sydney, rainbow fireworks sparkled off the Harbour Bridge in celebration of Australia’s recent legalization of gay marriage. (Sydney was among the first major cities to celebrate with fireworks at the stroke of midnight.) In Japan, people paraded in fox masks to attend the first prayer of the year at a Shinto shrine in Tokyo. In the Philippines, revelers gathered — phones in hand — at the Eastwood Mall in Manila to watch balloons and confetti rain down at midnight. Big pots of tea were prepared for New Year’s Eve celebrations in Beijing. The country will also celebrate the Lunar New Year, in February. It was raining in Singapore, but New Year’s Eve celebrants sheltered under umbrellas and raincoats as fireworks sparkled overhead. Tourists donned party hats to watch fireworks in front of the famous Petronas Twin Towers in Kuala Lumpur, Malaysia. Hundreds of couples got married at a mass wedding in Jakarta on New Year’s Eve. We’re interested

In [11]:
# split into sentences
sentences = nltk.sent_tokenize(article) 

# take one single sentence 

sentence = sentences[1]

print (sentence) # type = string (text)

<class 'str'>


In [12]:
# word tokenizer
tokenized_sentence = nltk.word_tokenize(sentence)

print (tokenized_sentence) # type = list; each item is now a token, which is our unit of analysis. Numbers are seen as strings, so must tell it that they are numbers

<class 'list'>


In [13]:
# lowering words

lowercased_sentence = [word.lower() for word in tokenized_sentence]
# Why useful? Depends on what you want to do...if you want to keep Bush and bush separate, don't lowercase.
print (lowercased_sentence)

['31', ',', '2017', 'in', 'sydney', ',', 'rainbow', 'fireworks', 'sparkled', 'off', 'the', 'harbour', 'bridge', 'in', 'celebration', 'of', 'australia', '’', 's', 'recent', 'legalization', 'of', 'gay', 'marriage', '.']


In [5]:
# without punctuation

import string

# defining punctuation to be removed
punctuation = list(string.punctuation)

print (punctuation)


['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~']


In [14]:
without_punct_sentence = [token for token in lowercased_sentence if token not in punctuation]
# be careful not to remove punctuation that you care about, e.g. @ symbol for symbols or other things important for the type of text you're analyzing
print (without_punct_sentence)

# missing apostrophe from punctuation string list...

['31', '2017', 'in', 'sydney', 'rainbow', 'fireworks', 'sparkled', 'off', 'the', 'harbour', 'bridge', 'in', 'celebration', 'of', 'australia', '’', 's', 'recent', 'legalization', 'of', 'gay', 'marriage']


In [18]:
# getting stopword list from nltk
# check here for the download: https://stackoverflow.com/questions/41610543/corpora-stopwords-not-found-when-import-nltk-library

from nltk.corpus import stopwords

stop_word_list = stopwords.words('english')

print (stop_word_list)

# stop words depend on you and what you want to do; if in your context you care about these words, define your own manual stop words or don't remove stop words
# make sure to clarify in paper you write about what the stop words are; don't just use frequent words, be careful how you do this

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [19]:
# removing stopwords

without_stopwords_sentence = [word for word in without_punct_sentence if word not in stop_word_list]

print (without_stopwords_sentence)


['31', '2017', 'sydney', 'rainbow', 'fireworks', 'sparkled', 'harbour', 'bridge', 'celebration', 'australia', '’', 'recent', 'legalization', 'gay', 'marriage']


In [9]:
# keeping words (alpha is a "word" not a number)

only_words_sentence = [word for word in without_stopwords_sentence if word.isalpha()]

print (only_words_sentence)


['sydney', 'rainbow', 'fireworks', 'sparkled', 'harbour', 'bridge', 'celebration', 'australia', 'recent', 'legalization', 'gay', 'marriage']


In [30]:
# stemming

# import the library
from nltk.stem import SnowballStemmer

snowball_stemmer = SnowballStemmer("english")

#print(type(snowball_stemmer))

# list comprehension command
# take each word that we have, take list and one after the other, teach each token and stem them

stem_sentence = [snowball_stemmer.stem(word) for word in only_words_sentence]

print (stem_sentence)

['sydney', 'rainbow', 'firework', 'sparkl', 'harbour', 'bridg', 'celebr', 'australia', 'recent', 'legal', 'gay', 'marriag']


In [37]:
# lemmatization

#from nltk.stem.wordnet import WordNetLemmatizer

import nltk
nltk.download('wordnet')

wordnet_lemmatizer = WordNetLemmatizer()

lemma_sent = [wordnet_lemmatizer.lemmatize(word) for word in only_words_sentence]

print (lemma_sent)

[nltk_data] Downloading package wordnet to /Users/Jess/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
['sydney', 'rainbow', 'firework', 'sparkled', 'harbour', 'bridge', 'celebration', 'australia', 'recent', 'legalization', 'gay', 'marriage']


In [38]:
#POS tagging
# http://www.nltk.org/book/ch05.html

# we need to use the original sentence as a list of tokens; NO PREPROCCESSING AT ALL! Don't touch or remove anything when you do this

tokenized_sentence = nltk.word_tokenize(sentence)
print(tokenized_sentence)

['31', ',', '2017', 'In', 'Sydney', ',', 'rainbow', 'fireworks', 'sparkled', 'off', 'the', 'Harbour', 'Bridge', 'in', 'celebration', 'of', 'Australia', '’', 's', 'recent', 'legalization', 'of', 'gay', 'marriage', '.']


In [45]:
# you use the pos-tagger (it gives you back a list of tuples (word,pos = part of speech))
import nltk
nltk.download('averaged_perceptron_tagger')

pos_sentence = nltk.pos_tag(tokenized_sentence)

print (pos_sentence)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/Jess/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[('31', 'CD'), (',', ','), ('2017', 'CD'), ('In', 'IN'), ('Sydney', 'NNP'), (',', ','), ('rainbow', 'NN'), ('fireworks', 'NNS'), ('sparkled', 'VBD'), ('off', 'RP'), ('the', 'DT'), ('Harbour', 'NNP'), ('Bridge', 'NNP'), ('in', 'IN'), ('celebration', 'NN'), ('of', 'IN'), ('Australia', 'NNP'), ('’', 'NNP'), ('s', 'VBD'), ('recent', 'JJ'), ('legalization', 'NN'), ('of', 'IN'), ('gay', 'JJ'), ('marriage', 'NN'), ('.', '.')]


In [None]:
# combining lemmatization and pos tagging

lemmas = []

for word,pos in pos_sentence:
    # we lower-case the word (for lemmatization)
    
    # if it's a verb - then we tell that to the lemmatizer
    if 
        lemma =
    else:
    # otherwise, work as usual
        lemma =
    # we append the results
    lemmas.append(lemma)

print (lemmas)


#homework - try to write this with a list comprehension