In [29]:
#!pip install nltk==3.3
import nltk
nltk.download('twitter_samples')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

[nltk_data] Downloading package twitter_samples to
[nltk_data]     /home/jano/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package punkt to /home/jano/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jano/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/jano/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /home/jano/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [78]:
from nltk.corpus import twitter_samples, stopwords
from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import classify, NaiveBayesClassifier
from nltk.tokenize import word_tokenize
import re, string

stop_words = stopwords.words('english')

### Tokenizing data

In [17]:
pos_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
print(pos_tweet_tokens[0])

['#FollowFriday', '@France_Inte', '@PKuchly57', '@Milipol_Paris', 'for', 'being', 'top', 'engaged', 'members', 'in', 'my', 'community', 'this', 'week', ':)']


### Normalizing data

In [26]:
print(pos_tag(pos_tweet_tokens[0]))

[('#FollowFriday', 'JJ'), ('@France_Inte', 'NNP'), ('@PKuchly57', 'NNP'), ('@Milipol_Paris', 'NNP'), ('for', 'IN'), ('being', 'VBG'), ('top', 'JJ'), ('engaged', 'VBN'), ('members', 'NNS'), ('in', 'IN'), ('my', 'PRP$'), ('community', 'NN'), ('this', 'DT'), ('week', 'NN'), (':)', 'NN')]


In [36]:
def lemmatize(tokens):
    lemmatizer = WordNetLemmatizer()
    lem_sentence = []
    for word, tag in pos_tag(tokens):
        if tag.startswith('NN'): #noun
            sen_type = 'n'
        elif tag.startswith('VB'): #verb
            sen_type = 'v'
        else:
            sen_type = 'a'
        lem_sentence.append(lemmatizer.lemmatize(word, sen_type))
    return lem_sentence

print(lemmatize(pos_tweet_tokens[0]))

['#FollowFriday', '@France_Inte', '@PKuchly57', '@Milipol_Paris', 'for', 'be', 'top', 'engage', 'member', 'in', 'my', 'community', 'this', 'week', ':)']


### Removing Noise

In [41]:
def remove_noise(tokens):
    cleaned_tokens = []
    for token in tokens:
        token = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'\
                       '(?:%[0-9a-fA-F][0-9a-fA-F]))+','', token)
        token = re.sub("(@[A-Za-z0-9_]+)","", token)
        
        if len(token) > 0 and token not in string.punctuation and token.lower() not in stop_words:
            cleaned_tokens.append(token.lower())
    
    return cleaned_tokens

print(remove_noise(pos_tweet_tokens[0]))

['#followfriday', 'top', 'engaged', 'members', 'community', 'week', ':)']


### Helpers

In [42]:
def get_all_words(tokens_list):
    for tokens in tokens_list:
        for token in tokens:
            yield token

In [57]:
def get_dictionary(tokens_list):
    for tokens in tokens_list:
        yield dict([token, True] for token in tokens)

### Building a model

In [61]:
pos_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
neg_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')

pos_cleaned_tokens_list = []
neg_cleaned_tokens_list = []

for tokens in pos_tweet_tokens:
    pos_cleaned_tokens_list.append(lemmatize(remove_noise(tokens)))

for tokens in neg_tweet_tokens:
    neg_cleaned_tokens_list.append(lemmatize(remove_noise(tokens)))
    
pos_dict = get_dictionary(pos_cleaned_tokens_list)
neg_dict = get_dictionary(neg_cleaned_tokens_list)

pos_dataset = [(single_dict, "Positive") for single_dict in pos_dict]
neg_dataset = [(single_dict, "Negative") for single_dict in neg_dict]

dataset = pos_dataset + neg_dataset
train_data = dataset[:8000]
test_data = dataset[8000:]

classifier = NaiveBayesClassifier.train(train_data)

print("Accuracy is: ", classify.accuracy(classifier, test_data))
print(classifier.show_most_informative_features(15))


Accuracy is:  0.992
Most Informative Features
                      :( = True           Negati : Positi =   2957.9 : 1.0
                      :) = True           Positi : Negati =    850.1 : 1.0
                     x15 = True           Negati : Positi =     39.4 : 1.0
                follower = True           Positi : Negati =     31.4 : 1.0
                     sad = True           Negati : Positi =     27.6 : 1.0
                     ugh = True           Negati : Positi =     20.6 : 1.0
                    glad = True           Positi : Negati =     17.4 : 1.0
                   shame = True           Negati : Positi =     15.0 : 1.0
                  ignore = True           Negati : Positi =     13.9 : 1.0
                  friday = True           Positi : Negati =     13.7 : 1.0
                    tire = True           Negati : Positi =     13.6 : 1.0
              appreciate = True           Positi : Negati =     12.6 : 1.0
               community = True           Positi : Neg

### Usage

In [92]:
test_data = "It is terrible"
tokens = lemmatize(remove_noise(word_tokenize(test_data)))
print(test_data, classifier.classify(dict([token, True] for token in tokens)))

It is terrible Negative


### Saving classifier

In [87]:
import pickle
f = open('sentiment_classifier.pickle', 'wb')
pickle.dump(classifier, f)
f.close()