### Tokenizing

In [1]:
import nltk

In [2]:
# Read the whole article into memory. The context manager closes the file
# handle as soon as the read finishes, even if read() raises.
with open('wordfile4.txt', encoding="utf-8") as f:
    text = f.read()
text

'Yesterday (Sept. 25), a stark new climate report came out that showed climate change is progressing much faster than anticipated, but it\'s not too late for humans to make changes.\n\nThe report, written and released by the United Nations-led Intergovernmental Panel on Climate Change (IPCC), and formally known as the IPCC Special Report on the Ocean and Cryosphere in a Changing Climate, details the most up-to-date understanding of climate change, its causes, how it will continue to impact us on Earth and what we can do about it. The report looks to 2100 to see both where we will be if we continue on as we currently are, or if major changes are made to mitigate contributing factors like carbon dioxide emissions.\n\nMain takeaway from the report? "We\'re seeing that climate change impacts are already happening, were seeing that they\'re happening at a faster pace than before," Ben Orlove, a professor of public policy at Columbia University and a lead author on the report, said to Space.

In [3]:
# Split the raw article text into a flat list of word and punctuation tokens.
tokens = nltk.word_tokenize(text)

In [4]:
# Print the complete token list for inspection.
print(tokens)

['Yesterday', '(', 'Sept.', '25', ')', ',', 'a', 'stark', 'new', 'climate', 'report', 'came', 'out', 'that', 'showed', 'climate', 'change', 'is', 'progressing', 'much', 'faster', 'than', 'anticipated', ',', 'but', 'it', "'s", 'not', 'too', 'late', 'for', 'humans', 'to', 'make', 'changes', '.', 'The', 'report', ',', 'written', 'and', 'released', 'by', 'the', 'United', 'Nations-led', 'Intergovernmental', 'Panel', 'on', 'Climate', 'Change', '(', 'IPCC', ')', ',', 'and', 'formally', 'known', 'as', 'the', 'IPCC', 'Special', 'Report', 'on', 'the', 'Ocean', 'and', 'Cryosphere', 'in', 'a', 'Changing', 'Climate', ',', 'details', 'the', 'most', 'up-to-date', 'understanding', 'of', 'climate', 'change', ',', 'its', 'causes', ',', 'how', 'it', 'will', 'continue', 'to', 'impact', 'us', 'on', 'Earth', 'and', 'what', 'we', 'can', 'do', 'about', 'it', '.', 'The', 'report', 'looks', 'to', '2100', 'to', 'see', 'both', 'where', 'we', 'will', 'be', 'if', 'we', 'continue', 'on', 'as', 'we', 'currently', 'ar

### Stemming

In [5]:
from nltk.stem import PorterStemmer

In [6]:
# One stemmer instance, reused for every token in the next cell.
stemmer = PorterStemmer()

In [7]:
# Run every token through the Porter stemmer and print the resulting list.
print(list(map(stemmer.stem, tokens)))

['yesterday', '(', 'sept.', '25', ')', ',', 'a', 'stark', 'new', 'climat', 'report', 'came', 'out', 'that', 'show', 'climat', 'chang', 'is', 'progress', 'much', 'faster', 'than', 'anticip', ',', 'but', 'it', "'s", 'not', 'too', 'late', 'for', 'human', 'to', 'make', 'chang', '.', 'the', 'report', ',', 'written', 'and', 'releas', 'by', 'the', 'unit', 'nations-l', 'intergovernment', 'panel', 'on', 'climat', 'chang', '(', 'ipcc', ')', ',', 'and', 'formal', 'known', 'as', 'the', 'ipcc', 'special', 'report', 'on', 'the', 'ocean', 'and', 'cryospher', 'in', 'a', 'chang', 'climat', ',', 'detail', 'the', 'most', 'up-to-d', 'understand', 'of', 'climat', 'chang', ',', 'it', 'caus', ',', 'how', 'it', 'will', 'continu', 'to', 'impact', 'us', 'on', 'earth', 'and', 'what', 'we', 'can', 'do', 'about', 'it', '.', 'the', 'report', 'look', 'to', '2100', 'to', 'see', 'both', 'where', 'we', 'will', 'be', 'if', 'we', 'continu', 'on', 'as', 'we', 'current', 'are', ',', 'or', 'if', 'major', 'chang', 'are', 'ma

### POS Tagging

In [8]:
# Tag each token with its part of speech; the result is a list of
# (token, tag) pairs.
pos_text = nltk.pos_tag(tokens)

In [9]:
# Print every (token, tag) pair.
print(pos_text)

[('Yesterday', 'NN'), ('(', '('), ('Sept.', 'NNP'), ('25', 'CD'), (')', ')'), (',', ','), ('a', 'DT'), ('stark', 'JJ'), ('new', 'JJ'), ('climate', 'NN'), ('report', 'NN'), ('came', 'VBD'), ('out', 'RP'), ('that', 'IN'), ('showed', 'VBD'), ('climate', 'JJ'), ('change', 'NN'), ('is', 'VBZ'), ('progressing', 'VBG'), ('much', 'RB'), ('faster', 'RBR'), ('than', 'IN'), ('anticipated', 'VBN'), (',', ','), ('but', 'CC'), ('it', 'PRP'), ("'s", 'VBZ'), ('not', 'RB'), ('too', 'RB'), ('late', 'JJ'), ('for', 'IN'), ('humans', 'NNS'), ('to', 'TO'), ('make', 'VB'), ('changes', 'NNS'), ('.', '.'), ('The', 'DT'), ('report', 'NN'), (',', ','), ('written', 'VBN'), ('and', 'CC'), ('released', 'VBN'), ('by', 'IN'), ('the', 'DT'), ('United', 'NNP'), ('Nations-led', 'JJ'), ('Intergovernmental', 'NNP'), ('Panel', 'NNP'), ('on', 'IN'), ('Climate', 'NNP'), ('Change', 'NNP'), ('(', '('), ('IPCC', 'NNP'), (')', ')'), (',', ','), ('and', 'CC'), ('formally', 'RB'), ('known', 'VBN'), ('as', 'IN'), ('the', 'DT'), ('I

### Lemmatization

In [17]:
def convert_tags_to_wordnet(tag):
    """Map a POS tag (e.g. 'VBD', 'JJ', 'RB', 'NN') to a one-letter POS code.

    Only the leading character of ``tag`` decides the result:

    * ``V...`` -> ``'v'``
    * ``J...`` -> ``'a'``
    * ``R...`` -> ``'r'`` (every tag starting with ``R``, so ``'RP'`` too)
    * ``N...`` -> ``'n'``

    Returns ``None`` for any other tag, including the empty string.
    """
    # Checked in order; the prefixes are mutually exclusive, so the first
    # match is the only possible match.
    prefix_to_pos = (
        ('V', 'v'),
        ('J', 'a'),
        ('R', 'r'),
        ('N', 'n'),
    )
    for prefix, pos in prefix_to_pos:
        if tag.startswith(prefix):
            return pos
    return None

In [18]:
# Lemmatizer instance shared by the lemmatization loop that follows.
lemma = nltk.WordNetLemmatizer()
# Two independent empty lists; nothing in the visible cells fills them yet.
w, l = [], []

In [19]:
# Lemmatize each (token, tag) pair whose tag maps to a POS code and print
# "<token> :<lemma> - <pos>"; pairs with no mapping are skipped silently.
for word in pos_text:
    converted_tag = convert_tags_to_wordnet(word[1])
    if converted_tag is None:
        continue
    out = lemma.lemmatize(word[0], converted_tag)
    print('{} :{} - {}'.format(word[0], out, converted_tag))

Yesterday :Yesterday - n
Sept. :Sept. - n
stark :stark - a
new :new - a
climate :climate - n
report :report - n
came :come - v
out :out - r
showed :show - v
climate :climate - a
change :change - n
is :be - v
progressing :progress - v
much :much - r
faster :faster - r
anticipated :anticipate - v
's :'s - v
not :not - r
too :too - r
late :late - a
humans :human - n
make :make - v
changes :change - n
report :report - n
written :write - v
released :release - v
United :United - n
Nations-led :Nations-led - a
Intergovernmental :Intergovernmental - n
Panel :Panel - n
Climate :Climate - n
Change :Change - n
IPCC :IPCC - n
formally :formally - r
known :know - v
IPCC :IPCC - n
Special :Special - n
Report :Report - n
Ocean :Ocean - n
Cryosphere :Cryosphere - n
Changing :Changing - n
Climate :Climate - n
details :detail - n
most :most - r
up-to-date :up-to-date - a
understanding :understanding - n
climate :climate - n
change :change - n
causes :cause - n
continue :continue - v
impact :impact - v
E