In [8]:
%config InteractiveShell.ast_node_interactivity = 'all'

# Delitev na stavke

In [29]:
from nltk.tokenize import sent_tokenize

EXAMPLE_TEXT = "Hello Mr. Smith, how are you doing today? The weather is great, and Python is awesome. The sky is pinkish-blue. You shouldn't eat cardboard."

In [27]:
print(sent_tokenize(EXAMPLE_TEXT))

['Hello Mr. Smith, how are you doing today?', 'The weather is great, and Python is awesome.', 'The sky is pinkish-blue.', "You shouldn't eat cardboard."]


# Delitev na besede

In [30]:
from nltk.tokenize import word_tokenize

EXAMPLE_TEXT = "Hello Mr. Smith, how are you doing today? The weather is great, and Python is awesome. The sky is pinkish-blue. You shouldn't eat cardboard."

In [31]:
print(word_tokenize(EXAMPLE_TEXT))

['Hello', 'Mr.', 'Smith', ',', 'how', 'are', 'you', 'doing', 'today', '?', 'The', 'weather', 'is', 'great', ',', 'and', 'Python', 'is', 'awesome', '.', 'The', 'sky', 'is', 'pinkish-blue', '.', 'You', 'should', "n't", 'eat', 'cardboard', '.']


# Ngrami nad besedami

In [21]:
from nltk import ngrams

string = "I really like python, it's pretty awesome."

sentence = 'this is a foo bar sentences and i want to ngramize it'
n = 3
ngrams = ngrams(sentence.split(), n)
for grams in ngrams:
    print (grams)

('this', 'is', 'a')
('is', 'a', 'foo')
('a', 'foo', 'bar')
('foo', 'bar', 'sentences')
('bar', 'sentences', 'and')
('sentences', 'and', 'i')
('and', 'i', 'want')
('i', 'want', 'to')
('want', 'to', 'ngramize')
('to', 'ngramize', 'it')


# Ngrami nad crkami

In [22]:
def word2ngrams(text, n=3, exact=True):
    return ["".join(j) for j in zip(*[text[i:] for i in range(n)])]

word2ngrams(' Hello ')

[' He', 'Hel', 'ell', 'llo', 'lo ']

In [23]:
from nltk.tokenize import word_tokenize

def zip_ngrams(text, n=3, exact=True):
    return ["".join(j) for j in zip(*[text[i:] for i in range(n)])]

def nozip_ngrams(text, n=3):
    return [text[i:i+n] for i in range(len(text)-n+1)]

sentence = 'this is a foo bar sentences and i want to ngramize it'
words = word_tokenize(sentence)

x = [zip_ngrams(w) for w in words]
y = [nozip_ngrams(w) for w in words]

print ('%s\n%s' % (x, y))

[['thi', 'his'], [], [], ['foo'], ['bar'], ['sen', 'ent', 'nte', 'ten', 'enc', 'nce', 'ces'], ['and'], [], ['wan', 'ant'], [], ['ngr', 'gra', 'ram', 'ami', 'miz', 'ize'], []]
[['thi', 'his'], [], [], ['foo'], ['bar'], ['sen', 'ent', 'nte', 'ten', 'enc', 'nce', 'ces'], ['and'], [], ['wan', 'ant'], [], ['ngr', 'gra', 'ram', 'ami', 'miz', 'ize'], []]


# Stop words

In [32]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [33]:
example_sent = "This is a sample sentence, showing off the stop words filtration."
stop_words = set(stopwords.words('english'))
word_tokens = word_tokenize(example_sent)
filtered_sentence = [w for w in word_tokens if not w in stop_words]
filtered_sentence = []
for w in word_tokens:
    if w not in stop_words:
        filtered_sentence.append(w)
print(word_tokens)
print(filtered_sentence)

['This', 'is', 'a', 'sample', 'sentence', ',', 'showing', 'off', 'the', 'stop', 'words', 'filtration', '.']
['This', 'sample', 'sentence', ',', 'showing', 'stop', 'words', 'filtration', '.']


# Stemming

In [34]:
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

ps = PorterStemmer()

In [35]:
example_words = ["python","pythoner","pythoning","pythoned","pythonly"]

for w in example_words:
    print(ps.stem(w))

python
python
python
python
pythonli


In [36]:
new_text = "It is important to by very pythonly while you are pythoning with python. All pythoners have pythoned poorly at least once."

words = word_tokenize(new_text)

for w in words:
    print(ps.stem(w))

It
is
import
to
by
veri
pythonli
while
you
are
python
with
python
.
all
python
have
python
poorli
at
least
onc
.


# Part of Speech Tagging

In [18]:
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

In [38]:
train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

custom_sent_tokenizer = PunktSentenceTokenizer(train_text)

tokenized = custom_sent_tokenizer.tokenize(sample_text)

def process_content():
    try:
        for i in tokenized[:5]:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            print (tagged)
    except Exception as e:
        print(str(e))

process_content()

0.04020979020979021 0.07357859531772576 0.03836930455635491 5720 299 230 22
0.002972027972027972 0.0033444816053511705 0.002951484965873455 5720 299 17 1
0.0008741258741258741 0.006688963210702341 0.0005534034311012728 5720 299 5 2
0.0024475524475524478 0.016722408026755852 0.0016602102933038186 5720 299 14 5
0.0013986013986013986 0.0033444816053511705 0.0012912746725696365 5720 299 8 1
0.0034965034965034965 0.0033444816053511705 0.003504888396974728 5720 299 20 1
0.017132867132867134 0.04013377926421405 0.01586423169156982 5720 299 98 12
0.005244755244755245 0.006688963210702341 0.005165098690278546 5720 299 30 2
0.005944055944055944 0.026755852842809364 0.004796163069544364 5720 299 34 8
0.04493006993006993 0.043478260869565216 0.04501014572957019 5720 299 257 13
0.00017482517482517483 0.0033444816053511705 0.0 5720 299 1 1
0.016258741258741258 0.0802675585284281 0.012728278915329275 5720 299 93 24
0.001048951048951049 0.0033444816053511705 0.0009223390518354548 5720 299 6 1
0.029545

# Chunking

In [40]:
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

import nltk.draw

train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

custom_sent_tokenizer = PunktSentenceTokenizer(train_text)

tokenized = custom_sent_tokenizer.tokenize(sample_text)

def process_content():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            chunkGram = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""
            chunkParser = nltk.RegexpParser(chunkGram)
            chunked = chunkParser.parse(tagged)
            chunked.draw()     

    except Exception as e:
        print(str(e))

process_content()

0.04020979020979021 0.07357859531772576 0.03836930455635491 5720 299 230 22
0.002972027972027972 0.0033444816053511705 0.002951484965873455 5720 299 17 1
0.0008741258741258741 0.006688963210702341 0.0005534034311012728 5720 299 5 2
0.0024475524475524478 0.016722408026755852 0.0016602102933038186 5720 299 14 5
0.0013986013986013986 0.0033444816053511705 0.0012912746725696365 5720 299 8 1
0.0034965034965034965 0.0033444816053511705 0.003504888396974728 5720 299 20 1
0.017132867132867134 0.04013377926421405 0.01586423169156982 5720 299 98 12
0.005244755244755245 0.006688963210702341 0.005165098690278546 5720 299 30 2
0.005944055944055944 0.026755852842809364 0.004796163069544364 5720 299 34 8
0.04493006993006993 0.043478260869565216 0.04501014572957019 5720 299 257 13
0.00017482517482517483 0.0033444816053511705 0.0 5720 299 1 1
0.016258741258741258 0.0802675585284281 0.012728278915329275 5720 299 93 24
0.001048951048951049 0.0033444816053511705 0.0009223390518354548 5720 299 6 1
0.029545

In [41]:
def process_content():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            chunkGram = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""
            chunkParser = nltk.RegexpParser(chunkGram)
            chunked = chunkParser.parse(tagged)
            
            print(chunked)
            for subtree in chunked.subtrees(filter=lambda t: t.label() == 'Chunk'):
                print(subtree)

            chunked.draw()

    except Exception as e:
        print(str(e))

process_content()

(S
  (Chunk PRESIDENT/NNP GEORGE/NNP W./NNP BUSH/NNP)
  'S/POS
  (Chunk ADDRESS/NNP)
  BEFORE/IN
  (Chunk A/NNP JOINT/NNP SESSION/NNP)
  OF/IN
  (Chunk THE/NNP CONGRESS/NNP ON/NNP THE/NNP STATE/NNP)
  OF/IN
  (Chunk THE/NNP UNION/NNP January/NNP)
  31/CD
  ,/,
  2006/CD
  (Chunk THE/NNP PRESIDENT/NNP)
  :/:
  (Chunk Thank/NNP)
  you/PRP
  all/DT
  ./.)
(Chunk PRESIDENT/NNP GEORGE/NNP W./NNP BUSH/NNP)
(Chunk ADDRESS/NNP)
(Chunk A/NNP JOINT/NNP SESSION/NNP)
(Chunk THE/NNP CONGRESS/NNP ON/NNP THE/NNP STATE/NNP)
(Chunk THE/NNP UNION/NNP January/NNP)
(Chunk THE/NNP PRESIDENT/NNP)
(Chunk Thank/NNP)
libtk8.6.so: cannot open shared object file: No such file or directory


In [42]:
def process_content():
    try:
        for i in tokenized[5:]:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)

            chunkGram = r"""Chunk: {<.*>+}
                                    }<VB.?|IN|DT|TO>+{"""

            chunkParser = nltk.RegexpParser(chunkGram)
            chunked = chunkParser.parse(tagged)

            chunked.draw()

    except Exception as e:
        print(str(e))

process_content()

libtk8.6.so: cannot open shared object file: No such file or directory


In [43]:
def process_content():
    try:
        for i in tokenized[5:]:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            namedEnt = nltk.ne_chunk(tagged, binary=True)
            namedEnt.draw()
    except Exception as e:
        print(str(e))

process_content()

libtk8.6.so: cannot open shared object file: No such file or directory


# Lemmatizing

In [44]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

print(lemmatizer.lemmatize("cats"))
print(lemmatizer.lemmatize("cacti"))
print(lemmatizer.lemmatize("geese"))
print(lemmatizer.lemmatize("rocks"))
print(lemmatizer.lemmatize("python"))
print(lemmatizer.lemmatize("better", pos="a"))
print(lemmatizer.lemmatize("best", pos="a"))
print(lemmatizer.lemmatize("run"))
print(lemmatizer.lemmatize("run",'v'))

cat
cactus
goose
rock
python
good
best
run
run


# Wordnet

In [45]:
from nltk.corpus import wordnet

In [47]:
syns = wordnet.synsets("program")
print(syns[0].name())
print(syns[0].lemmas()[0].name())
print(syns[0].definition())
print(syns[0].examples())

plan.n.01
plan
a series of steps to be carried out or goals to be accomplished
['they drew up a six-step plan', 'they discussed plans for a new bond issue']


In [48]:
synonyms = []
antonyms = []

for syn in wordnet.synsets("good"):
    for l in syn.lemmas():
        synonyms.append(l.name())
        if l.antonyms():
            antonyms.append(l.antonyms()[0].name())

print(set(synonyms))
print(set(antonyms))

{'dear', 'just', 'thoroughly', 'full', 'goodness', 'beneficial', 'respectable', 'serious', 'unspoilt', 'safe', 'upright', 'good', 'skillful', 'right', 'in_effect', 'expert', 'unspoiled', 'trade_good', 'honest', 'ripe', 'undecomposed', 'secure', 'adept', 'soundly', 'salutary', 'proficient', 'skilful', 'commodity', 'estimable', 'effective', 'dependable', 'in_force', 'practiced', 'sound', 'honorable', 'well', 'near'}
{'badness', 'evilness', 'evil', 'bad', 'ill'}


In [49]:
w1 = wordnet.synset('ship.n.01')
w2 = wordnet.synset('boat.n.01')
print(w1.wup_similarity(w2))

0.9090909090909091


## Ngrami nad besedami

In [21]:
from nltk import ngrams

string = "I really like python, it's pretty awesome."

sentence = 'this is a foo bar sentences and i want to ngramize it'
n = 3
ngrams = ngrams(sentence.split(), n)
for grams in ngrams:
    print (grams)

('this', 'is', 'a')
('is', 'a', 'foo')
('a', 'foo', 'bar')
('foo', 'bar', 'sentences')
('bar', 'sentences', 'and')
('sentences', 'and', 'i')
('and', 'i', 'want')
('i', 'want', 'to')
('want', 'to', 'ngramize')
('to', 'ngramize', 'it')


## Ngrami nad crkami

In [22]:
def word2ngrams(text, n=3, exact=True):
    return ["".join(j) for j in zip(*[text[i:] for i in range(n)])]

word2ngrams(' Hello ')

[' He', 'Hel', 'ell', 'llo', 'lo ']

In [23]:
from nltk.tokenize import word_tokenize

def zip_ngrams(text, n=3, exact=True):
    return ["".join(j) for j in zip(*[text[i:] for i in range(n)])]

def nozip_ngrams(text, n=3):
    return [text[i:i+n] for i in range(len(text)-n+1)]

sentence = 'this is a foo bar sentences and i want to ngramize it'
words = word_tokenize(sentence)

x = [zip_ngrams(w) for w in words]
y = [nozip_ngrams(w) for w in words]

print ('%s\n%s' % (x, y))

[['thi', 'his'], [], [], ['foo'], ['bar'], ['sen', 'ent', 'nte', 'ten', 'enc', 'nce', 'ces'], ['and'], [], ['wan', 'ant'], [], ['ngr', 'gra', 'ram', 'ami', 'miz', 'ize'], []]
[['thi', 'his'], [], [], ['foo'], ['bar'], ['sen', 'ent', 'nte', 'ten', 'enc', 'nce', 'ces'], ['and'], [], ['wan', 'ant'], [], ['ngr', 'gra', 'ram', 'ami', 'miz', 'ize'], []]


In [50]:
w1 = wordnet.synset('ship.n.01')
w2 = wordnet.synset('boat.n.01')
print(w1.wup_similarity(w2))

0.9090909090909091


In [51]:
w1 = wordnet.synset('ship.n.01')
w2 = wordnet.synset('car.n.01')
print(w1.wup_similarity(w2))

0.6956521739130435


In [52]:
w1 = wordnet.synset('ship.n.01')
w2 = wordnet.synset('cat.n.01')
print(w1.wup_similarity(w2))

0.32


# Text Classification

In [53]:
import nltk
import random
from nltk.corpus import movie_reviews

In [55]:
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

random.shuffle(documents)

print(documents[1])

all_words = []
for w in movie_reviews.words():
    all_words.append(w.lower())

(['i', "'", 'll', 'be', 'the', 'first', 'to', 'admit', 'it', '.', 'when', 'you', 'mention', 'the', 'book', 'great', 'expectations', ',', 'i', 'immediately', 'begin', 'experiencing', 'flashbacks', 'to', 'junior', 'high', 'english', 'class', ',', 'where', 'i', 'was', 'confronted', 'with', 'a', 'torturously', 'boring', 'book', 'filled', 'with', 'people', 'with', 'such', 'nonsensical', 'names', 'as', 'pip', 'and', 'magwitch', '.', 'yes', ',', 'it', "'", 's', 'a', 'classic', 'of', 'literature', ',', 'but', 'it', 'was', 'a', 'rather', 'dry', 'one', ',', 'shoved', 'down', 'my', 'young', 'throat', 'like', 'a', 'spoonful', 'of', 'bitter', 'medicine', '.', 'certainly', ',', 'the', 'experience', 'wasn', "'", 't', 'truly', 'that', 'bad', ',', 'but', ',', 'to', 'quote', 'ethan', 'hawke', 'in', 'the', 'latest', 'movie', 'adaptation', 'of', 'said', 'book', ',', '"', 'i', "'", 'm', 'not', 'going', 'to', 'tell', 'the', 'story', 'the', 'way', 'it', 'happened', '.', '.', '.', 'i', "'", 'll', 'gonna', 'te

In [56]:
all_words = nltk.FreqDist(all_words)

In [57]:
print(all_words.most_common(15))

[(',', 77717), ('the', 76529), ('.', 65876), ('a', 38106), ('and', 35576), ('of', 34123), ('to', 31937), ("'", 30585), ('is', 25195), ('in', 21822), ('s', 18513), ('"', 17612), ('it', 16107), ('that', 15924), ('-', 15595)]


In [58]:
print(all_words["stupid"])

253


In [60]:
random.shuffle(documents)
all_words = []

for w in movie_reviews.words():
    all_words.append(w.lower())

all_words = nltk.FreqDist(all_words)
word_features = list(all_words.keys())[:3000]

def find_features(document):
    words = set(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)
    return features

In [61]:
print((find_features(movie_reviews.words('neg/cv000_29416.txt'))))



In [None]:
featuresets = [(find_features(rev), category) for (rev, category) in documents]
print (featuresets)

## Naive Bayes Classifier

In [65]:
# set that we'll train our classifier with
training_set = featuresets[:1900]

# set that we'll test against.
testing_set = featuresets[1900:]

In [66]:
classifier = nltk.NaiveBayesClassifier.train(training_set)

In [67]:
print("Classifier accuracy percent:",(nltk.classify.accuracy(classifier, testing_set))*100)

Classifier accuracy percent: 79.0


In [68]:
classifier.show_most_informative_features(15)

Most Informative Features
                   sucks = True              neg : pos    =      9.8 : 1.0
                  annual = True              pos : neg    =      9.6 : 1.0
           unimaginative = True              neg : pos    =      8.4 : 1.0
                 frances = True              pos : neg    =      8.3 : 1.0
             silverstone = True              neg : pos    =      7.7 : 1.0
              schumacher = True              neg : pos    =      7.4 : 1.0
                    mena = True              neg : pos    =      7.0 : 1.0
                  shoddy = True              neg : pos    =      7.0 : 1.0
                  suvari = True              neg : pos    =      7.0 : 1.0
                 idiotic = True              neg : pos    =      6.8 : 1.0
                  alicia = True              neg : pos    =      6.6 : 1.0
               atrocious = True              neg : pos    =      6.6 : 1.0
                  turkey = True              neg : pos    =      6.6 : 1.0

## Saving Classifiers

In [69]:
import pickle

In [70]:
save_classifier = open("naivebayes.pickle","wb")
pickle.dump(classifier, save_classifier)
save_classifier.close()

In [71]:
classifier_f = open("naivebayes.pickle", "rb")
classifier = pickle.load(classifier_f)
classifier_f.close()

## Scikit-Learn Sklearn

In [75]:
from nltk.classify.scikitlearn import SklearnClassifier

### MultinomialNB in BernoulliNB

In [72]:
from sklearn.naive_bayes import MultinomialNB,BernoulliNB

In [73]:
MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
print("MultinomialNB accuracy percent:",nltk.classify.accuracy(MNB_classifier, testing_set))

BNB_classifier = SklearnClassifier(BernoulliNB())
BNB_classifier.train(training_set)
print("BernoulliNB accuracy percent:",nltk.classify.accuracy(BNB_classifier, testing_set))

<SklearnClassifier(MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))>

MultinomialNB accuracy percent: 0.81


<SklearnClassifier(BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True))>

BernoulliNB accuracy percent: 0.78


### Linearni modeli

In [74]:
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC

In [87]:
print("Original Naive Bayes Algo accuracy percent:", (nltk.classify.accuracy(classifier, testing_set))*100)
classifier.show_most_informative_features(15)

MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
print("MNB_classifier accuracy percent:", (nltk.classify.accuracy(MNB_classifier, testing_set))*100)

BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
BernoulliNB_classifier.train(training_set)
print("BernoulliNB_classifier accuracy percent:", (nltk.classify.accuracy(BernoulliNB_classifier, testing_set))*100)

LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
LogisticRegression_classifier.train(training_set)
print("LogisticRegression_classifier accuracy percent:", (nltk.classify.accuracy(LogisticRegression_classifier, testing_set))*100)

SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
SGDClassifier_classifier.train(training_set)
print("SGDClassifier_classifier accuracy percent:", (nltk.classify.accuracy(SGDClassifier_classifier, testing_set))*100)

SVC_classifier = SklearnClassifier(SVC())
SVC_classifier.train(training_set)
print("SVC_classifier accuracy percent:", (nltk.classify.accuracy(SVC_classifier, testing_set))*100)

LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(training_set)
print("LinearSVC_classifier accuracy percent:", (nltk.classify.accuracy(LinearSVC_classifier, testing_set))*100)

NuSVC_classifier = SklearnClassifier(NuSVC())
NuSVC_classifier.train(training_set)
print("NuSVC_classifier accuracy percent:", (nltk.classify.accuracy(NuSVC_classifier, testing_set))*100)

Original Naive Bayes Algo accuracy percent: 88.0
Most Informative Features
                   sucks = True              neg : pos    =      9.8 : 1.0
                  annual = True              pos : neg    =      9.6 : 1.0
           unimaginative = True              neg : pos    =      8.4 : 1.0
                 frances = True              pos : neg    =      8.3 : 1.0
             silverstone = True              neg : pos    =      7.7 : 1.0
              schumacher = True              neg : pos    =      7.4 : 1.0
                    mena = True              neg : pos    =      7.0 : 1.0
                  shoddy = True              neg : pos    =      7.0 : 1.0
                  suvari = True              neg : pos    =      7.0 : 1.0
                 idiotic = True              neg : pos    =      6.8 : 1.0
                  alicia = True              neg : pos    =      6.6 : 1.0
               atrocious = True              neg : pos    =      6.6 : 1.0
                  turkey 

<SklearnClassifier(MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))>

MNB_classifier accuracy percent: 83.0


<SklearnClassifier(BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True))>

BernoulliNB_classifier accuracy percent: 79.0


<SklearnClassifier(LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))>

LogisticRegression_classifier accuracy percent: 83.0


<SklearnClassifier(SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=5, n_jobs=1,
       penalty='l2', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False))>

SGDClassifier_classifier accuracy percent: 79.0


<SklearnClassifier(SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))>

SVC_classifier accuracy percent: 85.0


<SklearnClassifier(LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))>

LinearSVC_classifier accuracy percent: 82.0


<SklearnClassifier(NuSVC(cache_size=200, class_weight=None, coef0=0.0,
   decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
   max_iter=-1, nu=0.5, probability=False, random_state=None,
   shrinking=True, tol=0.001, verbose=False))>

NuSVC_classifier accuracy percent: 78.0


### Ansambli

In [88]:
from nltk.classify import ClassifierI
from statistics import mode

In [89]:
class VoteClassifier(ClassifierI):
    def __init__(self, *classifiers):
        self._classifiers = classifiers

    def classify(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        return mode(votes)

    def confidence(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)

        choice_votes = votes.count(mode(votes))
        conf = choice_votes / len(votes)
        return conf

In [90]:
voted_classifier = VoteClassifier(classifier,
                                  NuSVC_classifier,
                                  LinearSVC_classifier,
                                  SGDClassifier_classifier,
                                  MNB_classifier,
                                  BernoulliNB_classifier,
                                  LogisticRegression_classifier)

print("voted_classifier accuracy percent:", (nltk.classify.accuracy(voted_classifier, testing_set))*100)

print("Classification:", voted_classifier.classify(testing_set[0][0]), "Confidence %:",voted_classifier.confidence(testing_set[0][0])*100)
print("Classification:", voted_classifier.classify(testing_set[1][0]), "Confidence %:",voted_classifier.confidence(testing_set[1][0])*100)
print("Classification:", voted_classifier.classify(testing_set[2][0]), "Confidence %:",voted_classifier.confidence(testing_set[2][0])*100)
print("Classification:", voted_classifier.classify(testing_set[3][0]), "Confidence %:",voted_classifier.confidence(testing_set[3][0])*100)
print("Classification:", voted_classifier.classify(testing_set[4][0]), "Confidence %:",voted_classifier.confidence(testing_set[4][0])*100)
print("Classification:", voted_classifier.classify(testing_set[5][0]), "Confidence %:",voted_classifier.confidence(testing_set[5][0])*100)

voted_classifier accuracy percent: 83.0
Classification: pos Confidence %: 100.0
Classification: pos Confidence %: 100.0
Classification: pos Confidence %: 85.71428571428571
Classification: neg Confidence %: 100.0
Classification: neg Confidence %: 100.0
Classification: pos Confidence %: 100.0
