#### Based on the DigitalOcean tutorial: https://www.digitalocean.com/community/tutorials/how-to-perform-sentiment-analysis-in-python-3-using-the-natural-language-toolkit-nltk

In [None]:
# Uncomment and run the below command to download all the datasets and the relevant resources in one go
# !python -m nltk.downloader all

In [1]:
from nltk.corpus import twitter_samples
# negative_tweets.json: 5000 tweets with negative sentiment
# positive_tweets.json: 5000 tweets with positive sentiment
# tweets.20150430-223406.json: 20000 tweets with no sentiment labels

## Tokenizing the Data

In [36]:
# Raw tweet strings: the positive corpus, the negative corpus and the unlabelled sample.
positive_tweet, negative_tweet, text = (
    twitter_samples.strings(fileid)
    for fileid in (
        'positive_tweets.json',
        'negative_tweets.json',
        'tweets.20150430-223406.json',
    )
)

# Tokenized form of the two labelled corpora.
pos_tokens, neg_tokens = (
    twitter_samples.tokenized(fileid)
    for fileid in ('positive_tweets.json', 'negative_tweets.json')
)

# Preview the tokens of the first positive tweet.
pos_tokens[0]

['#FollowFriday',
 '@France_Inte',
 '@PKuchly57',
 '@Milipol_Paris',
 'for',
 'being',
 'top',
 'engaged',
 'members',
 'in',
 'my',
 'community',
 'this',
 'week',
 ':)']

## Normalizing the Data

1. **Stemming** is the process of removing affixes from a word. It is a heuristic that works only with simple verb forms and simply chops off the ends of words.
    
2. **Lemmatization** normalizes a word using the context of the vocabulary and a morphological analysis of the words in the text.
    

In [10]:
from nltk.tag import pos_tag

# Part-of-speech tag the tokens of the first positive tweet; each result is a (token, tag) pair.
pos_tag(pos_tokens[0])

[('#FollowFriday', 'JJ'),
 ('@France_Inte', 'NNP'),
 ('@PKuchly57', 'NNP'),
 ('@Milipol_Paris', 'NNP'),
 ('for', 'IN'),
 ('being', 'VBG'),
 ('top', 'JJ'),
 ('engaged', 'VBN'),
 ('members', 'NNS'),
 ('in', 'IN'),
 ('my', 'PRP$'),
 ('community', 'NN'),
 ('this', 'DT'),
 ('week', 'NN'),
 (':)', 'NN')]

In [32]:
from nltk.stem.wordnet import WordNetLemmatizer

def normalized_tokens(tokens):
    """Lemmatize every token of one tweet, using its part-of-speech tag.

    Args:
        tokens: list of token strings for a single tweet.

    Returns:
        A new list with one lemmatized token per input token, in the same order.
    """
    wordnet_lemmatizer = WordNetLemmatizer()

    def to_wordnet_pos(treebank_tag):
        # Noun tags (NN*) -> 'n', verb tags (VB*) -> 'v', anything else -> 'a'.
        if treebank_tag.startswith('NN'):
            return 'n'
        return 'v' if treebank_tag.startswith('VB') else 'a'

    return [
        wordnet_lemmatizer.lemmatize(word, to_wordnet_pos(tag))
        for word, tag in pos_tag(tokens)
    ]

# Sanity check: lemmatize the tokens of the first positive tweet.
normalized_tokens(pos_tokens[0])

['#FollowFriday',
 '@France_Inte',
 '@PKuchly57',
 '@Milipol_Paris',
 'for',
 'be',
 'top',
 'engage',
 'member',
 'in',
 'my',
 'community',
 'this',
 'week',
 ':)']

## Removing Noise from the Data

In [35]:
import re, string
from nltk.corpus import stopwords

# NLTK's English stop-word list; passed to remove_noise() as the words to filter out.
eng_stop_words = stopwords.words('english')

# Matches http/https URLs. Written as a raw string so that `\(` and `\)` reach the regex
# engine unchanged instead of being (invalid) Python string escape sequences.
_URL_PATTERN = re.compile(
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
)
# Matches Twitter-style @mentions.
_MENTION_PATTERN = re.compile(r'(@[A-Za-z0-9_]+)')


def remove_noise(tweet_tokens, stop_words):
    """Clean the tokens of one tweet.

    Each token has URLs and @mentions stripped out, is lemmatized according to its
    part-of-speech tag, and is lowercased. Tokens that end up empty, that are
    punctuation, or whose lowercase form is a stop word are dropped.

    Args:
        tweet_tokens: list of token strings for a single tweet.
        stop_words: iterable of words to drop; compared against the lowercased token.

    Returns:
        A new list of cleaned, lowercased tokens in their original order.
    """
    # Hash-based membership test instead of scanning the stop-word list for every token.
    stop_word_set = frozenset(stop_words)
    lemmatizer = WordNetLemmatizer()
    cleaned_tokens = []
    for token, tag in pos_tag(tweet_tokens):
        token = _URL_PATTERN.sub('', token)
        token = _MENTION_PATTERN.sub('', token)
        if not token:
            # The whole token was a URL or a mention; nothing left to lemmatize.
            continue

        # Noun tags (NN*) -> 'n', verb tags (VB*) -> 'v', anything else -> 'a'.
        if tag.startswith('NN'):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'
        token = lemmatizer.lemmatize(token, pos)

        # NOTE: this is a substring test against string.punctuation, so single
        # punctuation characters are dropped while emoticons such as ':)' are kept.
        if not token or token in string.punctuation:
            continue
        lowered = token.lower()
        if lowered not in stop_word_set:
            cleaned_tokens.append(lowered)
    return cleaned_tokens
   
# Sanity check: clean the tokens of the first positive tweet.
remove_noise(pos_tokens[0], eng_stop_words)

['#followfriday', 'top', 'engage', 'member', 'community', 'week', ':)']

In [58]:
# Clean every tokenized tweet of both labelled corpora. Comprehensions replace the
# append-in-a-loop pattern and keep the loop variable out of the notebook namespace.
pos_cleaned_tokens = [remove_noise(tweet, eng_stop_words) for tweet in pos_tokens]
neg_cleaned_tokens = [remove_noise(tweet, eng_stop_words) for tweet in neg_tokens]

# Preview the first cleaned positive tweet.
pos_cleaned_tokens[0]

['#followfriday', 'top', 'engage', 'member', 'community', 'week', ':)']

## Determining Word Density

In [51]:
# Alternatively, flatten all the tokens into a single list and pass that list to FreqDist directly
def get_all_words(cleaned_tweets):
    """Lazily yield every token of every tweet, in order.

    Args:
        cleaned_tweets: iterable of token iterables (one per tweet).

    Yields:
        Each token in turn, tweet by tweet.
    """
    for tweet_tokens in cleaned_tweets:
        yield from tweet_tokens

# get_all_words() is a generator function, so nothing is iterated here and the cell
# output is only the generator object's repr. A generator can be consumed only once.
all_pos_words = get_all_words(pos_cleaned_tokens)
all_pos_words

<generator object get_all_words at 0x000001D73634DCC8>

In [52]:
from nltk import FreqDist

# Build the token stream inside this cell: a generator can only be consumed once, so
# counting a generator created in an earlier cell gives an empty distribution when
# this cell is re-run on its own.
pos_freq = FreqDist(get_all_words(pos_cleaned_tokens))
pos_freq.most_common(20)

[(':)', 3691),
 (':-)', 701),
 (':d', 658),
 ('thanks', 388),
 ('follow', 357),
 ('love', 333),
 ('...', 290),
 ('good', 283),
 ('get', 263),
 ('thank', 253),
 ('u', 245),
 ('day', 242),
 ('like', 229),
 ('see', 195),
 ('happy', 192),
 ("i'm", 183),
 ('great', 175),
 ('hi', 173),
 ('go', 167),
 ('back', 163)]

## Preparing Data for the Model

In [67]:
def get_tweets_with_labels(cleaned_tweets):
    """Lazily convert each tweet into a feature dict mapping every token to True.

    Args:
        cleaned_tweets: iterable of token iterables (one per tweet).

    Yields:
        One ``{token: True}`` dict per tweet, in input order.
    """
    for tweet_tokens in cleaned_tweets:
        yield {token: True for token in tweet_tokens}
            
# Materialize the feature dicts as lists. A bare generator is exhausted after its first
# use, so any cell that iterates these names would see no data when re-run.
pos_cleaned_labels = list(get_tweets_with_labels(pos_cleaned_tokens))
neg_cleaned_labels = list(get_tweets_with_labels(neg_cleaned_tokens))

In [68]:
import random

SEED = 42          # fixed seed so the train/test split is the same on every run
TRAIN_SIZE = 7000  # number of shuffled examples used for training; the rest are for testing

# Pair every feature dict with its sentiment label.
positive_dataset = [(tweet_dict, "Positive") for tweet_dict in pos_cleaned_labels]
negative_dataset = [(tweet_dict, "Negative") for tweet_dict in neg_cleaned_labels]

dataset = positive_dataset + negative_dataset

# Shuffle with a dedicated, seeded RNG: the split (and therefore the reported accuracy)
# is reproducible, and the global `random` state is left untouched.
random.Random(SEED).shuffle(dataset)

train_data = dataset[:TRAIN_SIZE]
test_data = dataset[TRAIN_SIZE:]
train_data[0]

({'sure': True, ':)': True}, 'Positive')

## Building and Testing the Model

In [69]:
from nltk import classify
from nltk import NaiveBayesClassifier
# Train a Naive Bayes classifier on the (feature dict, label) training pairs.
classifier = NaiveBayesClassifier.train(train_data)

print("Accuracy is:", classify.accuracy(classifier, test_data))

# show_most_informative_features() prints its table itself and returns None, so it must
# not be wrapped in print() (that added a stray "None" line to the output).
classifier.show_most_informative_features(10)

Accuracy is: 0.9953333333333333
Most Informative Features
                      :( = True           Negati : Positi =   2081.7 : 1.0
                follower = True           Positi : Negati =     32.9 : 1.0
                     sad = True           Negati : Positi =     32.9 : 1.0
                     bam = True           Positi : Negati =     19.9 : 1.0
                  arrive = True           Positi : Negati =     18.6 : 1.0
                     x15 = True           Negati : Positi =     16.0 : 1.0
               community = True           Positi : Negati =     15.3 : 1.0
                 welcome = True           Positi : Negati =     15.2 : 1.0
                     ugh = True           Negati : Positi =     14.0 : 1.0
                    blog = True           Positi : Negati =     12.7 : 1.0
None
