# Named Entity Recognition
Named Entity Recognition (NER) is the process of identifying and extracting named entities from text, such as person names, organization names, locations, and dates.

In [8]:
import numpy
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer
from nltk.tokenize import wordpunct_tokenize
from nltk.tokenize import sent_tokenize, word_tokenize

In [None]:
# Two State of the Union addresses: the 2005 speech is used to train the
# sentence tokenizer, the 2006 speech is the text that gets analysed.
train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

# Train Punkt on train_text rather than on the text being tokenized, so the
# training corpus loaded above is actually used and kept separate from the
# sample.
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)

# Split the 2006 speech into a list of sentence strings.
tokenized = custom_sent_tokenizer.tokenize(sample_text)

def process_content():
    """Run named-entity chunking on every sentence in ``tokenized`` and draw it.

    Reads the module-level ``tokenized`` list of sentences. Each sentence is
    split into words, tagged with parts of speech, chunked into a
    named-entity tree and displayed with ``draw()``.

    Any exception raised while processing stops the loop; its message is
    printed instead of being propagated.
    """
    try:
        for sentence in tokenized:
            # Word-level tokens, each paired with its part-of-speech tag.
            pos_tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
            # binary=True marks every entity with a single generic "NE" label
            # instead of distinguishing types such as PERSON or ORGANIZATION.
            entity_tree = nltk.ne_chunk(pos_tagged, binary=True)
            # Shows the tree graphically, one sentence at a time.
            entity_tree.draw()
    except Exception as err:
        print(str(err))


process_content()

# Lemmatizing
Lemmatizing is a natural language processing technique that involves reducing words to their base or dictionary form, which is called a "lemma".
Lemmatizing is similar to stemming, but it involves using a dictionary or knowledge of the language to properly transform words to their base form. 

# Why Lemmatizing?
Lemmatizing is useful in natural language processing tasks such as text classification, sentiment analysis, and information retrieval.

In [5]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

# With no pos argument the lemmatizer treats the word as a noun ("n").
for plural_noun in ("cats", "cacti", "geese", "rocks", "python"):
    print(lemmatizer.lemmatize(plural_noun))

# An explicit part of speech changes the lookup: "a" = adjective, "v" = verb.
for term, part_of_speech in (("better", "a"), ("run", "v")):
    print(lemmatizer.lemmatize(term, pos=part_of_speech))




cat
cactus
goose
rock
python
good
run


# Corpora
In natural language processing, a corpus (plural: corpora) refers to a large and structured set of text or speech data that is used for language analysis and research. A corpus can include any type of written or spoken language, such as books, newspapers, blogs, social media posts, speeches, and more.

# Why Corpora?
Corpora are important in natural language processing because they provide a large and diverse set of data that can be used to train machine learning algorithms, test hypotheses about language use and structure, and build language models. Corpora can also be used to study patterns of language use across different contexts, such as historical periods or geographic regions.

In [10]:
# Load the King James Bible from NLTK's Gutenberg corpus as one raw string.
from nltk.corpus import gutenberg

sample = gutenberg.raw("bible-kjv.txt")

# Split the text into sentences and preview ten of them (indices 5 through 14).
tok = sent_tokenize(sample)
print(tok[5:15])

['1:5 And God called the light Day, and the darkness he called Night.', 'And the evening and the morning were the first day.', '1:6 And God said, Let there be a firmament in the midst of the waters,\nand let it divide the waters from the waters.', '1:7 And God made the firmament, and divided the waters which were\nunder the firmament from the waters which were above the firmament:\nand it was so.', '1:8 And God called the firmament Heaven.', 'And the evening and the\nmorning were the second day.', '1:9 And God said, Let the waters under the heaven be gathered together\nunto one place, and let the dry land appear: and it was so.', '1:10 And God called the dry land Earth; and the gathering together of\nthe waters called he Seas: and God saw that it was good.', '1:11 And God said, Let the earth bring forth grass, the herb yielding\nseed, and the fruit tree yielding fruit after his kind, whose seed is\nin itself, upon the earth: and it was so.', '1:12 And the earth brought forth grass, and

# WordNet
WordNet is a large lexical database of English words and their semantic relationships, organized into sets of synonyms called "synsets".
It is a powerful tool for identifying relationships between words and concepts, and can help improve the accuracy and effectiveness of natural language processing algorithms.

In [15]:
from nltk.corpus import wordnet

# Every synset (sense) WordNet knows for the word "program".
syns = wordnet.synsets("program")
print(syns)

# Look more closely at the first sense: the synset itself, its lemmas,
# and the name of its first lemma.
first_synset = syns[0]
first_synset_lemmas = first_synset.lemmas()
print(first_synset)
print(first_synset_lemmas)
print(first_synset_lemmas[0].name())

[Synset('plan.n.01'), Synset('program.n.02'), Synset('broadcast.n.02'), Synset('platform.n.02'), Synset('program.n.05'), Synset('course_of_study.n.01'), Synset('program.n.07'), Synset('program.n.08'), Synset('program.v.01'), Synset('program.v.02')]
Synset('plan.n.01')
[Lemma('plan.n.01.plan'), Lemma('plan.n.01.program'), Lemma('plan.n.01.programme')]
plan


In [17]:
# Show the dictionary-style definition of the first synset, followed by
# its example sentences.
leading_synset = syns[0]
print(leading_synset.definition())
print(leading_synset.examples())


a series of steps to be carried out or goals to be accomplished
['they drew up a six-step plan', 'they discussed plans for a new bond issue']


In [20]:
# Flatten every lemma of every synset of "good" into one list, keeping the
# order in which WordNet returns them.
good_lemmas = [
    lemma
    for synset in wordnet.synsets("good")
    for lemma in synset.lemmas()
]

# Each lemma name counts as a synonym; for lemmas that have antonyms, only
# the first antonym is recorded.
synonyms = [lemma.name() for lemma in good_lemmas]
antonyms = [
    lemma.antonyms()[0].name()
    for lemma in good_lemmas
    if lemma.antonyms()
]

# Sets remove the duplicates collected across synsets.
print(set(synonyms))
print(set(antonyms))

{'dependable', 'upright', 'good', 'goodness', 'unspoilt', 'ripe', 'unspoiled', 'adept', 'salutary', 'skilful', 'expert', 'trade_good', 'well', 'honest', 'respectable', 'just', 'proficient', 'soundly', 'estimable', 'secure', 'sound', 'thoroughly', 'effective', 'near', 'full', 'right', 'beneficial', 'honorable', 'serious', 'dear', 'commodity', 'practiced', 'safe', 'skillful', 'in_effect', 'undecomposed', 'in_force'}
{'bad', 'ill', 'evilness', 'badness', 'evil'}


# Similarity

In [21]:
# Wu-Palmer (wup) similarity scores how closely two synsets are related in
# WordNet's hypernym hierarchy; a higher score means more similar.
w1, w2 = wordnet.synset("ship.n.01"), wordnet.synset("boat.n.01")

ship_boat_similarity = w1.wup_similarity(w2)
print(ship_boat_similarity)

0.9090909090909091


In [22]:
# Wu-Palmer (wup) similarity between "ship" and "car"; a higher score means
# the two synsets are more closely related in WordNet's hierarchy.
w1, w2 = wordnet.synset("ship.n.01"), wordnet.synset("car.n.01")

ship_car_similarity = w1.wup_similarity(w2)
print(ship_car_similarity)

0.6956521739130435


In [23]:
# Wu-Palmer (wup) similarity between "ship" and "cat"; a higher score means
# the two synsets are more closely related in WordNet's hierarchy.
w1, w2 = wordnet.synset("ship.n.01"), wordnet.synset("cat.n.01")

ship_cat_similarity = w1.wup_similarity(w2)
print(ship_cat_similarity)

0.32
