In [5]:
# open the new dataset

import codecs, nltk

article = codecs.open("/Users/rebeccaweiss/Desktop/Comp-Text-Analysis-2018-19-master/datasets/CleanedArticles/15.txt","r","utf-8")
article = article.read()

# split into sentences
sentences = nltk.sent_tokenize(article) 

# take one single sentence 

sentence = sentences[1]

#tokenize it

tokenized_sentence = nltk.word_tokenize(sentence)

# you use the pos-tagger (it gives you back a list of tuples (word,pos))
pos_sentence = nltk.pos_tag(tokenized_sentence)

print (pos_sentence)
#need entire sentence, shouldn't/don't remove stopwords, need things like "the" to help recognize nouns

#if want to be able to read, so NNP = proper noun, look at documentation

[('31', 'CD'), (',', ','), ('2017', 'CD'), ('In', 'IN'), ('Sydney', 'NNP'), (',', ','), ('rainbow', 'NN'), ('fireworks', 'NNS'), ('sparkled', 'VBD'), ('off', 'RP'), ('the', 'DT'), ('Harbour', 'NNP'), ('Bridge', 'NNP'), ('in', 'IN'), ('celebration', 'NN'), ('of', 'IN'), ('Australia', 'NNP'), ('’', 'NNP'), ('s', 'VBD'), ('recent', 'JJ'), ('legalization', 'NN'), ('of', 'IN'), ('gay', 'JJ'), ('marriage', 'NN'), ('.', '.')]


In [10]:
#for thing like sparkled, need POS before lemmatizer, or else won't know what to do with it
# combining lemmatization and pos tagging
from nltk.stem.wordnet import WordNetLemmatizer #define lemmatizer

wordnet_lemmatizer = WordNetLemmatizer() #this is our lemmatizer, need to write this way because later
#will write a command

lemma_words = [] #define an empty list

for word,pos in pos_sentence: #loop over part of speech tag, each element in list consists of token 
    #and part of speech
    
    #could be word or a noun, common thing/disambiguity
    # if it's a verb - then we tell that to the lemmatizer
    if pos[0] == "V": #if part of speech starts with a V
        lemma = wordnet_lemmatizer.lemmatize(word,"v") #then lemmatize it as a word, just say word, "v"
        #there is a difference in capital and lower case, lemmatizer wants lower case of everything
        #lemmatizer assumes things are noun if things are undecided, so if something is a verb, need to
        #say it
        
        #if have disambuity with adjectives, then do it with adjectives
        #but most problems when  dealing with a verb
    else:
    # otherwise, work as usual
        lemma = wordnet_lemmatizer.lemmatize(word) #lemmatizer gives you back a string
    # we append the results
    lemma_words.append(lemma) #puts the string into a list
    
print (lemma_words)

#Note: this can be very slow if lemmatizing all speeches in a corpus

#if want to keep sparkled in current form, then can keep as is above in POS, so list of words and
#the part of speech they have, if want that information

#always depends on what you want to do


['31', ',', '2017', 'In', 'Sydney', ',', 'rainbow', 'firework', 'sparkle', 'off', 'the', 'Harbour', 'Bridge', 'in', 'celebration', 'of', 'Australia', '’', 's', 'recent', 'legalization', 'of', 'gay', 'marriage', '.']


In [12]:
# let's now define a function that does all we need
from nltk.corpus import stopwords

stop_word_list = stopwords.words('english') #call stopword list if want to remove stopwords

# input should be a string
def nlp_pipeline(text): #this function takes text as a string as an input
    #everytime write a function, write somewhere what type of object the function takes
    
    #put all the commands in a function, don't have to write a for loop every time
    #nice way to code and present your code, and can easily check bugs
    
    # if you want you can split in sentences - i'm usually skipping this step
    # text = nltk.sent_tokenize(text) 
    
    #tokenize words for each sentence
    text = nltk.word_tokenize(text)
    
    # pos tagger
    text = nltk.pos_tag(text)

    # lemmatizer
    text = [wordnet_lemmatizer.lemmatize(token,"v") if pos[0] == "V" else wordnet_lemmatizer.lemmatize(token) for token,pos in text]
    
    # remove punctuation and numbers
    text = [token for token in text if token.isalpha()]
    
    # remove stopwords - be careful with this step    
    text = [token for token in text if token not in stop_word_list]
    #can keep using the same name for text over and over again, just need to make sure not missing
    #any steps
    return text

#can also put a counter that tells you how long it will take to process something, esp if 
#something is really long

In [16]:
print(sentence)
clean_sentence = nlp_pipeline(sentence) #use it this way
#function name(what you want to put into the function)
print (clean_sentence)

#always double check, check over and over to make sure does what want, then use it for everything
#then be very clear on what you did and how you did it

31, 2017  In Sydney, rainbow fireworks sparkled off the Harbour Bridge in celebration of Australia’s recent legalization of gay marriage.
['In', 'Sydney', 'rainbow', 'firework', 'sparkle', 'Harbour', 'Bridge', 'celebration', 'Australia', 'recent', 'legalization', 'gay', 'marriage']


In [14]:
# let's take an entire article and use our pipeline!

clean_article = nlp_pipeline(article)
print (clean_article)
#for something like topic detection, this is way more than fine
#not super useful if want to do something more fine grained

#not really great for tweets

['Advertisement', 'By', 'JACEY', 'FORTINDEC', 'In', 'Sydney', 'rainbow', 'firework', 'sparkle', 'Harbour', 'Bridge', 'celebration', 'Australia', 'recent', 'legalization', 'gay', 'marriage', 'Sydney', 'among', 'first', 'major', 'city', 'celebrate', 'firework', 'stroke', 'midnight', 'In', 'Japan', 'people', 'parade', 'fox', 'mask', 'attend', 'first', 'prayer', 'year', 'Shinto', 'shrine', 'Tokyo', 'In', 'Philippines', 'reveler', 'gather', 'phone', 'hand', 'Eastwood', 'Mall', 'Manila', 'watch', 'balloon', 'confetti', 'rain', 'midnight', 'Big', 'pot', 'tea', 'prepare', 'New', 'Year', 'Eve', 'celebration', 'Beijing', 'The', 'country', 'also', 'celebrate', 'Lunar', 'New', 'Year', 'February', 'It', 'rain', 'Singapore', 'New', 'Year', 'Eve', 'celebrant', 'shelter', 'umbrella', 'raincoat', 'firework', 'sparkle', 'overhead', 'Tourists', 'party', 'hat', 'watch', 'firework', 'front', 'famous', 'Petronas', 'Twin', 'Towers', 'Kuala', 'Lumpur', 'Malaysia', 'Hundreds', 'couple', 'get', 'marry', 'mass',

In [18]:
# word sense disambiguation

# check documentation: http://www.nltk.org/howto/wordnet.html

from nltk.corpus import wordnet as wn #how to use wordnet library in nltk

#can get info on different sense of each token you have

#want to get all the words in an article for wordnet, but want to get unique words

# let's isolate each word - you do this using a set (another type of object in python)

unique_words = set(clean_article) #one instance for each word - might want to lower case, or else
#some inconsistencies

#using a set, get one single instance of each word

#print(unique_words)

# let's check how many senses each word has
for word in unique_words:
    print (word, len(wn.synsets(word))) #for each word, how many sense it has
    #synset is a list of sense, and each sense has different synonyms 

year 4
prayer 5
raincoat 1
party 6
also 1
JACEY 0
Mall 2
Eastwood 0
Australia 2
hand 16
Tourists 1
New 12
recent 3
pot 10
By 2
legalization 1
u 4
gay 7
shrine 2
attend 5
stroke 16
marriage 4
Sydney 1
midnight 1
country 5
Go 35
Home 17
prepare 8
overhead 9
famous 1
It 1
city 3
among 0
Page 9
confetti 1
Petronas 0
Bridge 12
feedback 2
We 0
Year 4
rain 4
mask 9
celebration 3
Kuala 0
Twin 9
reveler 1
celebrant 2
Jakarta 1
first 16
Philippines 3
parade 5
balloon 4
Lumpur 0
Shinto 3
get 37
people 6
celebrate 3
February 1
FORTINDEC 0
In 7
shelter 7
hat 4
gather 11
tea 5
couple 9
rainbow 2
fox 10
Towers 4
Tell 9
Eve 4
interested 5
Beijing 1
Hundreds 1
page 9
wedding 5
major 13
Harbour 6
Advertisement 1
Manila 2
firework 1
umbrella 4
Malaysia 1
Japan 5
Tokyo 1
marry 2
Singapore 3
watch 13
sparkle 7
The 0
phone 4
front 13
mass 11
think 14
Lunar 1
Big 17


In [21]:
word = "cell" #can also use other things aside from wordnets, like thesaurus, sometimes useful

#can also search it on wordnetweb website, see if it gives you stuff you care about

senses = wn.synsets(word)

for sense in senses:
    # get definition of sense
    print ("\ndefinition")
    print(sense.definition())
    
    # get a textual example
    print ("\nexample")
    print(sense.examples())
    
    # get hypernymy
    print ("\nhypernymy")
    print(sense.hypernyms())

    # get hyponyms
    print ("\nhyponyms")
    print(sense.hyponyms())
        
    # this is a way of getting synonyms - there are others
    print ("\nsynonyms")
    print (sense.lemma_names())
    
    # this is for getting antonyms - works especially with adjectives 
    print ("\nantonyms")
    print (sense.lemmas()[0].antonyms())
    
    print ("\n\n")


definition
any small compartment

example
['the cells of a honeycomb']

hypernymy
[Synset('compartment.n.01')]

hyponyms
[]

synonyms
['cell']

antonyms
[]




definition
(biology) the basic structural and functional unit of all organisms; they may exist as independent units of life (as in monads) or may form colonies or tissues as in higher plants and animals

example
[]

hypernymy
[Synset('living_thing.n.01')]

hyponyms
[Synset('akaryocyte.n.01'), Synset('archespore.n.01'), Synset('arthrospore.n.01'), Synset('arthrospore.n.02'), Synset('beta_cell.n.01'), Synset('blastema.n.01'), Synset('blastomere.n.01'), Synset('daughter_cell.n.01'), Synset('embryonic_cell.n.01'), Synset('fiber.n.03'), Synset('flagellated_cell.n.01'), Synset('gametocyte.n.01'), Synset('kupffer's_cell.n.01'), Synset('leydig_cell.n.01'), Synset('mother_cell.n.01'), Synset('parthenote.n.01'), Synset('plant_cell.n.01'), Synset('polar_body.n.01'), Synset('recombinant.n.01'), Synset('reproductive_cell.n.01'), Synset('ser

In [20]:
# let's consider two sentences where "cell" is mentioned

sent1 = "The terrorist cell was neutralized near the southern Russian city of Makhachkala, the capital of the Republic of Dagestan."

sent2 = "The molecule, which uses light energy to move protons across a somatic cell membrane, proved unsuitable for crystallography."

# you clean the sentences using our pipeline
clean_sent1 = nlp_pipeline(sent1)

clean_sent2 = nlp_pipeline(sent2)

print ("clean sent 1:", clean_sent1)
print ("clean sent 2:", clean_sent2)
print (" ")

#compare the two sentences, and find most similar one, do this for homework
# for each possible sense of "cell" you can, for instance, check the overlap between 
#the definition and the sentence 

clean sent 1: ['The', 'terrorist', 'cell', 'neutralize', 'near', 'southern', 'Russian', 'city', 'Makhachkala', 'capital', 'Republic', 'Dagestan']
clean sent 2: ['The', 'molecule', 'use', 'light', 'energy', 'move', 'proton', 'across', 'somatic', 'cell', 'membrane', 'prove', 'unsuitable', 'crystallography']
 


In [None]:
#try to write function
#compare the definition with the context, and try to find a way of counting how many words in 
#definition are in common with the context, counting how many words they have in common
word = "cell"

senses = wn.synsets(word)

for sense in senses:

In [None]:
# homework: find the best sense - 
#implement your version of the Lesk algorithm: https://en.wikipedia.org/wiki/Lesk_algorithm

In [None]:
#potential homework 2: get the json fie with the tweets from donald trump and
#improve his vocab by changing his poor choice of adjectives with
#more sophisticated synonyms

#or

#make his tweets nicer by changing adjectives related to antonyms

#or combine