In [2]:
# open the new dataset

import codecs, nltk

article = codecs.open("../datasets/CleanedArticles/15.txt","r","utf-8")
article = article.read()

# split into sentences
sentences = nltk.sent_tokenize(article) 

# take one single sentence 

sentence = sentences[1]

#tokenize it

tokenized_sentence = nltk.word_tokenize(sentence)

# you use the pos-tagger (it gives you back a list of tuples (word,pos))
pos_sentence = nltk.pos_tag(tokenized_sentence)

print (pos_sentence)

[('31', 'CD'), (',', ','), ('2017', 'CD'), ('In', 'IN'), ('Sydney', 'NNP'), (',', ','), ('rainbow', 'NN'), ('fireworks', 'NNS'), ('sparkled', 'VBD'), ('off', 'RP'), ('the', 'DT'), ('Harbour', 'NNP'), ('Bridge', 'NNP'), ('in', 'IN'), ('celebration', 'NN'), ('of', 'IN'), ('Australia', 'NNP'), ('’', 'NNP'), ('s', 'VBD'), ('recent', 'JJ'), ('legalization', 'NN'), ('of', 'IN'), ('gay', 'JJ'), ('marriage', 'NN'), ('.', '.')]


In [3]:
# combining lemmatization and pos tagging
from nltk.stem.wordnet import WordNetLemmatizer

wordnet_lemmatizer = WordNetLemmatizer()

lemma_words = []

for word,pos in pos_sentence:
    
    # if it's a verb - then we tell that to the lemmatizer
    if pos[0] == "V":
        lemma = wordnet_lemmatizer.lemmatize(word,"v")
    else:
    # otherwise, work as usual
        lemma = wordnet_lemmatizer.lemmatize(word)
    # we append the results
    lemma_words.append(lemma)
    
print (lemma_words)

['31', ',', '2017', 'In', 'Sydney', ',', 'rainbow', 'firework', 'sparkle', 'off', 'the', 'Harbour', 'Bridge', 'in', 'celebration', 'of', 'Australia', '’', 's', 'recent', 'legalization', 'of', 'gay', 'marriage', '.']


In [4]:
# let's now define a function that does all we need
from nltk.corpus import stopwords

stop_word_list = stopwords.words('english')

# input should be a string
def nlp_pipeline(text):
    
    # if you want you can split in sentences - i'm usually skipping this step
    # text = nltk.sent_tokenize(text) 
    
    #tokenize words for each sentence
    text = nltk.word_tokenize(text)
    
    # pos tagger
    text = nltk.pos_tag(text)

    # lemmatizer
    text = [wordnet_lemmatizer.lemmatize(token.lower(),"v") if pos[0] == "V" else wordnet_lemmatizer.lemmatize(token.lower()) for token,pos in text]
    
    # remove punctuation and numbers
    text = [token for token in text if token.isalpha()]
    
    # remove stopwords - be careful with this step    
    text = [token for token in text if token not in stop_word_list]

    return text

In [5]:
clean_sentence = nlp_pipeline(sentence)
print (clean_sentence)

['sydney', 'rainbow', 'firework', 'sparkle', 'harbour', 'bridge', 'celebration', 'australia', 'recent', 'legalization', 'gay', 'marriage']


In [6]:
# let's take an entire article and use our pipeline!

clean_article = nlp_pipeline(article)
print (clean_article)

['advertisement', 'jacey', 'fortindec', 'sydney', 'rainbow', 'firework', 'sparkle', 'harbour', 'bridge', 'celebration', 'australia', 'recent', 'legalization', 'gay', 'marriage', 'sydney', 'among', 'first', 'major', 'city', 'celebrate', 'firework', 'stroke', 'midnight', 'japan', 'people', 'parade', 'fox', 'mask', 'attend', 'first', 'prayer', 'year', 'shinto', 'shrine', 'tokyo', 'philippine', 'reveler', 'gather', 'phone', 'hand', 'eastwood', 'mall', 'manila', 'watch', 'balloon', 'confetti', 'rain', 'midnight', 'big', 'pot', 'tea', 'prepare', 'new', 'year', 'eve', 'celebration', 'beijing', 'country', 'also', 'celebrate', 'lunar', 'new', 'year', 'february', 'rain', 'singapore', 'new', 'year', 'eve', 'celebrant', 'shelter', 'umbrella', 'raincoat', 'firework', 'sparkle', 'overhead', 'tourist', 'party', 'hat', 'watch', 'firework', 'front', 'famous', 'petronas', 'twin', 'tower', 'kuala', 'lumpur', 'malaysia', 'hundred', 'couple', 'get', 'marry', 'mass', 'wedding', 'jakarta', 'new', 'year', 'ev

In [9]:
print(type(test))
test = set(clean_article)
print(test)

<class 'set'>
{'australia', 'sydney', 'sparkle', 'marriage', 'mask', 'front', 'eve', 'marry', 'go', 'prayer', 'singapore', 'harbour', 'watch', 'reveler', 'celebrate', 'big', 'gather', 'gay', 'jacey', 'feedback', 'page', 'balloon', 'phone', 'philippine', 'tourist', 'recent', 'kuala', 'party', 'among', 'rain', 'fortindec', 'get', 'japan', 'umbrella', 'hat', 'celebrant', 'year', 'eastwood', 'u', 'manila', 'advertisement', 'february', 'wedding', 'parade', 'prepare', 'shrine', 'shinto', 'lunar', 'interested', 'jakarta', 'new', 'celebration', 'home', 'tokyo', 'overhead', 'midnight', 'bridge', 'fox', 'confetti', 'mass', 'hundred', 'twin', 'think', 'malaysia', 'also', 'mall', 'city', 'stroke', 'attend', 'petronas', 'couple', 'legalization', 'beijing', 'tea', 'lumpur', 'rainbow', 'country', 'pot', 'shelter', 'tower', 'first', 'raincoat', 'tell', 'hand', 'firework', 'people', 'famous', 'major'}


In [10]:
# word sense disambiguation

# check documentation: http://www.nltk.org/howto/wordnet.html

from nltk.corpus import wordnet as wn

# let's isolate each word - you do this using a set (another type of object in python)

unique_words = set(clean_article)

# let's check how many senses each word has
for word in unique_words:
    print (word, len(wn.synsets(word)))


australia 2
sydney 1
sparkle 7
marriage 4
mask 9
front 13
eve 4
marry 2
go 35
prayer 5
singapore 3
harbour 6
watch 13
reveler 1
celebrate 3
big 17
gather 11
gay 7
jacey 0
feedback 2
page 9
balloon 4
phone 4
philippine 2
tourist 1
recent 3
kuala 0
party 6
among 0
rain 4
fortindec 0
get 37
japan 5
umbrella 4
hat 4
celebrant 2
year 4
eastwood 0
u 4
manila 2
advertisement 1
february 1
wedding 5
parade 5
prepare 8
shrine 2
shinto 3
lunar 1
interested 5
jakarta 1
new 12
celebration 3
home 17
tokyo 1
overhead 9
midnight 1
bridge 12
fox 10
confetti 1
mass 11
hundred 2
twin 9
think 14
malaysia 1
also 1
mall 2
city 3
stroke 16
attend 5
petronas 0
couple 9
legalization 1
beijing 1
tea 5
lumpur 0
rainbow 2
country 5
pot 10
shelter 7
tower 4
first 16
raincoat 1
tell 9
hand 16
firework 1
people 6
famous 1
major 13


In [11]:
word = "loan"

senses = wn.synsets(word)

for sense in senses:
    # get definition of sense
    print ("\ndefinition")
    print(sense.definition())
    
    # get a textual example
    print ("\nexample")
    print(sense.examples())
    
    # get hypernymy
    print ("\nhypernymy")
    print(sense.hypernyms())

    # get hyponyms
    print ("\nhyponyms")
    print(sense.hyponyms())
        
    # this is a way of getting synonyms - there are others
    print ("\nsynonyms")
    print (sense.lemma_names())
    
    # this is for getting antonyms - works especially with adjectives 
    print ("\nantonyms")
    print (sense.lemmas()[0].antonyms())
    
    print ("\n\n")



definition
the temporary provision of money (usually at interest)

example
[]

hypernymy
[Synset('debt.n.02')]

hyponyms
[Synset('bank_loan.n.01'), Synset('call_loan.n.01'), Synset('direct_loan.n.01'), Synset('home_loan.n.01'), Synset('installment_credit.n.01'), Synset('participation_loan.n.01'), Synset('personal_loan.n.01'), Synset('point.n.18'), Synset('real_estate_loan.n.01'), Synset('time_loan.n.01')]

synonyms
['loan']

antonyms
[]




definition
a word borrowed from another language; e.g. `blitz' is a German word borrowed into modern English

example
[]

hypernymy
[Synset('word.n.01')]

hyponyms
[Synset('gallicism.n.01'), Synset('latinism.n.01')]

synonyms
['loanword', 'loan']

antonyms
[]




definition
give temporarily; let have for a limited time

example
['I will lend you my car', 'loan me some money']

hypernymy
[Synset('give.v.03')]

hyponyms
[Synset('advance.v.10'), Synset('hire_out.v.01'), Synset('trust.v.06')]

synonyms
['lend', 'loan']

antonyms
[Lemma('borrow.v.01.bor

In [12]:
# let's consider two sentences where "cell" is mentioned

sent1 = "The terrorist cell was neutralized near the southern Russian city of Makhachkala, the capital of the Republic of Dagestan."

sent2 = "The molecule, which uses light energy to move protons across a somatic cell membrane, proved unsuitable for crystallography."

# you clean the sentences using our pipeline
clean_sent1 = nlp_pipeline(sent1)

clean_sent2 = nlp_pipeline(sent2)

print ("clean sent 1:", clean_sent1)
print ("clean sent 2:", clean_sent2)
print (" ")

# for each possible sense of "cell" you can, for instance, check the overlap between the definition and the sentence

clean sent 1: ['terrorist', 'cell', 'neutralize', 'near', 'southern', 'russian', 'city', 'makhachkala', 'capital', 'republic', 'dagestan']
clean sent 2: ['molecule', 'use', 'light', 'energy', 'move', 'proton', 'across', 'somatic', 'cell', 'membrane', 'prove', 'unsuitable', 'crystallography']
 


In [30]:
word = "cell"

senses = wn.synsets(word)

for sense in senses:
    
    # get definition of sense
    definition =  sense.definition()
    
    # get hypernymy
    hypernyms = sense.hypernyms()

    # get hyponyms
    hyponyms = sense.hyponyms()
    
    print(hypernyms)
    #print(hyponyms)
    print(hypernyms.definition())
    break
    
    # you clean the definition with our pipeline
    #clean_definition = nlp_pipeline(definition)
    
    # you check the intersection of the two sentences
   # inters_1 = set(clean_sent1).intersection(clean_definition)
    #inters_2 = set(clean_sent2).intersection(clean_definition)
    
    #print (definition)
    #print ("clean definition:", clean_definition)
    #print ("intersection with sent 1:", inters_1)
    #print ("intersection with sent 2:", inters_2)
    #print (len(inters_1),len(inters_2))
    #print (" ")

[Synset('compartment.n.01')]


AttributeError: 'list' object has no attribute 'definition'

In [None]:
word = "cell"

senses = wn.synsets(word)

for sense in senses:
    
    # get definition of sense
    definition =  sense.definition()
    
    # you clean the definition with our pipeline
    clean_definition = nlp_pipeline(definition)
    
    # you check the intersection of the two sentences
    inters_1 = set(clean_sent1).intersection(clean_definition)
    inters_2 = set(clean_sent2).intersection(clean_definition)
    
    print (definition)
    print ("clean definition:", clean_definition)
    print ("intersection with sent 1:", inters_1)
    print ("intersection with sent 2:", inters_2)
    print (len(inters_1),len(inters_2))
    print (" ")

In [19]:
# homework: find the best sense - implement your version of the Lesk algorithm: https://en.wikipedia.org/wiki/Lesk_algorithm

word = "cell"

senses = wn.synsets(word)

for sense in senses:
    
    # get definition of sense
    definition =  sense.definition()
    
    # you clean the definition with our pipeline
    clean_definition = nlp_pipeline(definition)
    
    # you check the intersection of the two sentences
    inters_1 = set(clean_sent1).intersection(clean_definition)
    inters_2 = set(clean_sent2).intersection(clean_definition)
    
    print (definition)
    print ("clean definition:", clean_definition)
    print ("intersection with sent 1:", inters_1)
    print ("intersection with sent 2:", inters_2)
    print (len(inters_1),len(inters_2))
    print (" ")

homework 2: get the json file with the tweets from Donald Trump and improve his vocabulary by changing his poor choice of adjectives with more sophisticated synonyms (e.g. "bad ratings on the Emmys last night" -> "substandard ratings on the Emmys last night") 
 
or

make his tweets nicer by changing adjectives with related antonyms (e.g. "bad ratings on the Emmys last night" -> "excellent ratings on the Emmys last night") 

to do you need to combine:

- text processing (POS tagging + WordNet)
- and to find a solution for knowing if a word is "more sophisticated" than another one


In [37]:
import json

with open('../datasets/trump.json') as f:
    tweets = json.load(f)

print (len(tweets))
print(type(tweets))

35018
<class 'list'>


In [46]:
for tweet in tweets:
    if "worst" in tweet["text"]:
        clean_tweet = nlp_pipeline(tweet["text"])
        print(clean_tweet)
        break

['happy', 'labor', 'day', 'country', 'better', 'ever', 'unemployment', 'setting', 'record', 'low', 'tremendous', 'upside', 'potential', 'go', 'fix', 'worst', 'trade', 'deal', 'ever', 'make', 'country', 'world', 'big', 'progress', 'make']


In [83]:
worst = wn.synset('worst.a.01')
syn = worst.lemmas()[0].antonyms()
    