In [1]:
import nltk

def format_sentence(sent):
    return {word: True for word in nltk.word_tokenize(sent)}
    
pos = []
with open("./pos.txt") as f:
    for i in f: 
        pos.append([format_sentence(i.decode('utf-8')), 'pos'])
        
neg = []
with open("./neg.txt") as f:
    for i in f: 
        neg.append([format_sentence(i.decode('utf-8')), 'neg'])

In [2]:
training = pos[:int((.9)*len(pos))] + neg[:int((.9)*len(neg))]

In [3]:
test = pos[int((.1)*len(pos)):] + neg[int((.1)*len(neg)):]

In [4]:
from nltk.classify import NaiveBayesClassifier

classifier = NaiveBayesClassifier.train(training)

In [5]:
classifier.show_most_informative_features()

Most Informative Features
                      no = True              neg : pos    =     20.6 : 1.0
                 awesome = True              pos : neg    =     18.7 : 1.0
                headache = True              neg : pos    =     18.0 : 1.0
               beautiful = True              pos : neg    =     14.2 : 1.0
                    love = True              pos : neg    =     14.2 : 1.0
                      Hi = True              pos : neg    =     12.7 : 1.0
                    glad = True              pos : neg    =      9.7 : 1.0
                   Thank = True              pos : neg    =      9.7 : 1.0
                     fan = True              pos : neg    =      9.7 : 1.0
                    lost = True              neg : pos    =      9.3 : 1.0


In [6]:
example1 = "this workshop is awesome."

print classifier.classify(format_sentence(example1))

pos


In [7]:
example1 = "this workshop is aweful."

print classifier.classify(format_sentence(example1))

neg


In [8]:
from nltk.classify.util import accuracy
print accuracy(classifier, test)

0.956232686981


In [9]:
import nltk 

text = nltk.word_tokenize("Python is an awesome language!")
nltk.pos_tag(text)

[('Python', 'NNP'),
 ('is', 'VBZ'),
 ('an', 'DT'),
 ('awesome', 'JJ'),
 ('language', 'NN'),
 ('!', '.')]

In [10]:
nltk.help.upenn_tagset('JJ')

JJ: adjective or numeral, ordinal
    third ill-mannered pre-war regrettable oiled calamitous first separable
    ectoplasmic battery-powered participatory fourth still-to-be-named
    multilingual multi-disciplinary ...


In [11]:
from nltk.corpus import brown

brown_tagged_sents = brown.tagged_sents(categories='news')
brown_sents = brown.sents(categories='news')
unigram_tagger = nltk.UnigramTagger(brown_tagged_sents)
unigram_tagger.tag(brown_sents[2007])


[(u'Various', u'JJ'),
 (u'of', u'IN'),
 (u'the', u'AT'),
 (u'apartments', u'NNS'),
 (u'are', u'BER'),
 (u'of', u'IN'),
 (u'the', u'AT'),
 (u'terrace', u'NN'),
 (u'type', u'NN'),
 (u',', u','),
 (u'being', u'BEG'),
 (u'on', u'IN'),
 (u'the', u'AT'),
 (u'ground', u'NN'),
 (u'floor', u'NN'),
 (u'so', u'QL'),
 (u'that', u'CS'),
 (u'entrance', u'NN'),
 (u'is', u'BEZ'),
 (u'direct', u'JJ'),
 (u'.', u'.')]

In [12]:
bigram_tagger = nltk.BigramTagger(brown_tagged_sents)
bigram_tagger.tag(brown_sents[2007])

[(u'Various', u'JJ'),
 (u'of', u'IN'),
 (u'the', u'AT'),
 (u'apartments', u'NNS'),
 (u'are', u'BER'),
 (u'of', u'IN'),
 (u'the', u'AT'),
 (u'terrace', u'NN'),
 (u'type', u'NN'),
 (u',', u','),
 (u'being', u'BEG'),
 (u'on', u'IN'),
 (u'the', u'AT'),
 (u'ground', u'NN'),
 (u'floor', u'NN'),
 (u'so', u'CS'),
 (u'that', u'CS'),
 (u'entrance', u'NN'),
 (u'is', u'BEZ'),
 (u'direct', u'JJ'),
 (u'.', u'.')]

In [13]:
raw = "OMG, Natural Language Processing is SO cool and I'm really enjoying this workshop!"
tokens = nltk.word_tokenize(raw)
tokens = [i.lower() for i in tokens]

In [15]:
lancaster = nltk.LancasterStemmer()
stems = [lancaster.stem(i) for i in tokens]
stems

['omg',
 ',',
 'nat',
 'langu',
 'process',
 'is',
 'so',
 'cool',
 'and',
 'i',
 "'m",
 'real',
 'enjoy',
 'thi',
 'workshop',
 '!']

In [17]:
porter = nltk.PorterStemmer()
stem = [porter.stem(i) for i in tokens]
stem

[u'omg',
 u',',
 u'natur',
 u'languag',
 u'process',
 u'is',
 u'so',
 u'cool',
 u'and',
 u'i',
 u"'m",
 u'realli',
 u'enjoy',
 u'thi',
 u'workshop',
 u'!']

In [19]:
from nltk import WordNetLemmatizer

lemma = nltk.WordNetLemmatizer()
text = "Women in technology are amazing at coding"
ex = [i.lower() for i in text.split()]

lemmas = [lemma.lemmatize(i) for i in ex]
lemmas

[u'woman', 'in', 'technology', 'are', 'amazing', 'at', 'coding']