In [2]:
# open the new dataset

import codecs, nltk

# Read the whole article as UTF-8 text. The with-block guarantees the file
# handle is closed once the text is in memory, and `article` is only ever
# bound to the decoded string (never to the file object).
with codecs.open("../datasets/CleanedArticles/15.txt","r","utf-8") as article_file:
    article = article_file.read()

print (article)

Advertisement By JACEY FORTINDEC. 31, 2017  In Sydney, rainbow fireworks sparkled off the Harbour Bridge in celebration of Australia’s recent legalization of gay marriage. (Sydney was among the first major cities to celebrate with fireworks at the stroke of midnight.) In Japan, people paraded in fox masks to attend the first prayer of the year at a Shinto shrine in Tokyo. In the Philippines, revelers gathered — phones in hand — at the Eastwood Mall in Manila to watch balloons and confetti rain down at midnight. Big pots of tea were prepared for New Year’s Eve celebrations in Beijing. The country will also celebrate the Lunar New Year, in February. It was raining in Singapore, but New Year’s Eve celebrants sheltered under umbrellas and raincoats as fireworks sparkled overhead. Tourists donned party hats to watch fireworks in front of the famous Petronas Twin Towers in Kuala Lumpur, Malaysia. Hundreds of couples got married at a mass wedding in Jakarta on New Year’s Eve. We’re interested

In [3]:
# Break the article into sentences, then keep the second one (index 1)
# as the running example for the cells that follow.
# Note: indexing with 1 raises IndexError if fewer than two sentences are found.
sentences = nltk.sent_tokenize(article)
sentence = sentences[1]

print(sentence)

31, 2017  In Sydney, rainbow fireworks sparkled off the Harbour Bridge in celebration of Australia’s recent legalization of gay marriage.


In [4]:
# Split the example sentence into word and punctuation tokens.
tokenized_sentence = nltk.tokenize.word_tokenize(sentence)

print(tokenized_sentence)

['31', ',', '2017', 'In', 'Sydney', ',', 'rainbow', 'fireworks', 'sparkled', 'off', 'the', 'Harbour', 'Bridge', 'in', 'celebration', 'of', 'Australia', '’', 's', 'recent', 'legalization', 'of', 'gay', 'marriage', '.']


In [5]:
# Lower-case every token so that later comparisons (stopwords, etc.)
# do not depend on capitalisation.
lowercased_sentence = list(map(str.lower, tokenized_sentence))

print(lowercased_sentence)

['31', ',', '2017', 'in', 'sydney', ',', 'rainbow', 'fireworks', 'sparkled', 'off', 'the', 'harbour', 'bridge', 'in', 'celebration', 'of', 'australia', '’', 's', 'recent', 'legalization', 'of', 'gay', 'marriage', '.']


In [6]:
# without punctuation

import string

# Build the list of punctuation characters to drop: one list entry per
# character of string.punctuation.
punctuation = [symbol for symbol in string.punctuation]

print(punctuation)


['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~']


In [7]:
# Keep only the tokens that are not one of the characters in `punctuation`.
# That list comes from string.punctuation, so typographic marks such as
# the curly apostrophe (’) are not in it and stay in the result.
without_punct_sentence = list(
    filter(lambda token: token not in punctuation, lowercased_sentence)
)

print(without_punct_sentence)

['31', '2017', 'in', 'sydney', 'rainbow', 'fireworks', 'sparkled', 'off', 'the', 'harbour', 'bridge', 'in', 'celebration', 'of', 'australia', '’', 's', 'recent', 'legalization', 'of', 'gay', 'marriage']


In [8]:
from nltk.corpus import stopwords

# Load NLTK's English stopword list (a plain list of lower-case strings).
stop_word_list = stopwords.words("english")

print(stop_word_list)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [9]:
# Remove stopwords.
# stop_word_list is a Python list, so `word not in stop_word_list` scans the
# whole list for every token. Build a set once instead: the filtered result
# is identical, but each membership test is a constant-time hash lookup.
stop_word_set = set(stop_word_list)

without_stopwords_sentence = [word for word in without_punct_sentence if word not in stop_word_set]

print (without_stopwords_sentence)


['31', '2017', 'sydney', 'rainbow', 'fireworks', 'sparkled', 'harbour', 'bridge', 'celebration', 'australia', '’', 'recent', 'legalization', 'gay', 'marriage']


In [10]:
# Keep only purely alphabetic tokens: str.isalpha() is False for numbers
# and for any token containing a non-letter character, so those are dropped.
only_words_sentence = list(filter(str.isalpha, without_stopwords_sentence))

print(only_words_sentence)


['sydney', 'rainbow', 'fireworks', 'sparkled', 'harbour', 'bridge', 'celebration', 'australia', 'recent', 'legalization', 'gay', 'marriage']


In [11]:
# stemming

# import the library
from nltk.stem import SnowballStemmer
# Create an English Snowball stemmer and apply it to every remaining word.
snowball_stemmer = SnowballStemmer("english")
stem_sentence = list(map(snowball_stemmer.stem, only_words_sentence))

print(stem_sentence)

['sydney', 'rainbow', 'firework', 'sparkl', 'harbour', 'bridg', 'celebr', 'australia', 'recent', 'legal', 'gay', 'marriag']


In [12]:
# lemmatization

from nltk.stem.wordnet import WordNetLemmatizer

wordnet_lemmatizer = WordNetLemmatizer()

# Without part-of-speech information every token is lemmatized as a noun
# ("n" is the lemmatizer's default), which is spelled out explicitly here.
lemma_sent = [wordnet_lemmatizer.lemmatize(token, pos="n") for token in only_words_sentence]

print(lemma_sent)

['sydney', 'rainbow', 'firework', 'sparkled', 'harbour', 'bridge', 'celebration', 'australia', 'recent', 'legalization', 'gay', 'marriage']


In [13]:
#POS tagging
# http://www.nltk.org/book/ch05.html

# Start again from the untouched sentence (original casing and punctuation)
# rather than from the lowercased / filtered token lists built above.
tokenized_sentence = nltk.tokenize.word_tokenize(sentence)


In [14]:
# Run the POS tagger over the tokens; the result is a list of
# (word, tag) tuples, one per token.
pos_sentence = nltk.tag.pos_tag(tokenized_sentence)

print(pos_sentence)

[('31', 'CD'), (',', ','), ('2017', 'CD'), ('In', 'IN'), ('Sydney', 'NNP'), (',', ','), ('rainbow', 'NN'), ('fireworks', 'NNS'), ('sparkled', 'VBD'), ('off', 'RP'), ('the', 'DT'), ('Harbour', 'NNP'), ('Bridge', 'NNP'), ('in', 'IN'), ('celebration', 'NN'), ('of', 'IN'), ('Australia', 'NNP'), ('’', 'NNP'), ('s', 'VBD'), ('recent', 'JJ'), ('legalization', 'NN'), ('of', 'IN'), ('gay', 'JJ'), ('marriage', 'NN'), ('.', '.')]


In [15]:
# Lemmatization guided by the POS tags: verbs are lemmatized as verbs,
# everything else falls back to the lemmatizer's default behaviour.

lemma_word = []

for word, pos in pos_sentence:
    # the lemmatizer is given the lower-cased form of the token
    word = word.lower()

    if pos[0] != "V":
        # not a verb tag: plain lemmatization
        lemma = wordnet_lemmatizer.lemmatize(word)
    else:
        # verb tags all start with "V": tell the lemmatizer it is a verb
        lemma = wordnet_lemmatizer.lemmatize(word, "v")

    # collect the lemma, preserving token order
    lemma_word.append(lemma)
print(lemma_word)


#homework - try to write this with a list comprehension

['31', ',', '2017', 'in', 'sydney', ',', 'rainbow', 'firework', 'sparkle', 'off', 'the', 'harbour', 'bridge', 'in', 'celebration', 'of', 'australia', '’', 's', 'recent', 'legalization', 'of', 'gay', 'marriage', '.']
