In [1]:
from nltk.corpus import twitter_samples as sample
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tag import pos_tag
from nltk import FreqDist, classify, NaiveBayesClassifier
from nltk.tokenize import word_tokenize

import nltk
import string
import random

In [2]:
# Shared WordNet lemmatizer, reused by lemmatize_tokens for every token.
lemmatizer = WordNetLemmatizer()

# English stop words, kept as a set because is_valid_token performs one
# membership test per token (O(1) on a set, a linear scan on a list).
# The download cell runs after this one, so on a machine without the corpus
# the lookup raises LookupError; fetch the corpus on demand in that case.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))

# POS-tag prefixes that lemmatize_tokens matches with str.startswith.
noun, verb = 'NN', 'VB'

In [3]:
# NLTK resources this notebook needs, mapped to what each one provides.
NLTK_RESOURCES = {
    'twitter_samples': 'data sample',
    'punkt': 'tokenizer',
    'wordnet': 'lexical db',
    'averaged_perceptron_tagger': 'pos tagger',
    'stopwords': 'stopwords',
}

# nltk.download signals failure through its return value, not an exception,
# so remember which resources could not be fetched and report them once.
# A failed download is not fatal here: the resource may already be installed.
failed_downloads = []
for resource in NLTK_RESOURCES:
    if not nltk.download(resource):
        failed_downloads.append(resource)

if failed_downloads:
    print('Could not download:', ', '.join(failed_downloads))

[nltk_data] Downloading package twitter_samples to
[nltk_data]     /Users/hum/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package punkt to /Users/hum/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/hum/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/hum/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /Users/hum/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
# Load the two labelled tweet sets as lists of token lists, positive first.
positive_tweets, negative_tweets = (
    sample.tokenized(fileid)
    for fileid in ('positive_tweets.json', 'negative_tweets.json')
)

In [5]:
# Reuse the tweets that are already loaded instead of reading and tokenizing
# 'positive_tweets.json' a second time.
tokens = positive_tweets

In [6]:
# Inspect positive_tweets directly: the global `tokens` is rebound by later
# cells, so reading it here would print something else when the cell is re-run.
print(len(positive_tweets), positive_tweets[0])

5000 ['#FollowFriday', '@France_Inte', '@PKuchly57', '@Milipol_Paris', 'for', 'being', 'top', 'engaged', 'members', 'in', 'my', 'community', 'this', 'week', ':)']


In [7]:
def is_valid_token(token: str) -> bool:
    """Return True if ``token`` should be kept as a feature.

    A token is rejected when it is empty, when it is a single punctuation
    character, or when its lower-cased form is a stop word. Multi-character
    tokens built from punctuation (for example the emoticon ``':)'``) are kept.

    Args:
        token: The token to check.

    Returns:
        ``True`` if the token is worth keeping, ``False`` otherwise.
    """
    if not token:
        return False
    # ``token in string.punctuation`` is a *substring* test on a str, so
    # without the length check sequences such as '()' or './' would be dropped
    # merely because they happen to be adjacent in string.punctuation.
    if len(token) == 1 and token in string.punctuation:
        return False
    return token.lower() not in stop_words

# normalize a list of tokens
def lemmatize_tokens(tokens):
    """Clean and lemmatize the tokens of one tweet.

    Each token is POS-tagged, dropped if ``is_noise`` rejects it, lemmatized
    with the part of speech derived from its tag, and kept only if
    ``is_valid_token`` accepts the resulting lemma.

    Args:
        tokens: List of string tokens belonging to a single text.

    Returns:
        A new list with the lemmas of the tokens that survived filtering,
        in their original order.
    """
    lemma_tokens = []

    for token, tag in pos_tag(tokens):
        # Despite its name, is_noise returns '' for noise and the token itself
        # otherwise, so a falsy result means "drop this token".
        if not is_noise(token):
            continue

        # Map the tag prefix to the part-of-speech code given to the lemmatizer.
        if tag.startswith(noun):
            pos = 'n'
        elif tag.startswith(verb):
            pos = 'v'
        else:
            pos = 'a'

        # Lemmatize exactly once and reuse the result: a second pass is wasted
        # work and is not guaranteed to leave the lemma unchanged.
        lemma = lemmatizer.lemmatize(token, pos)
        if is_valid_token(lemma):
            lemma_tokens.append(lemma)
    return lemma_tokens

# ad hoc noise filter for tweet tokens
def is_noise(token, stopwords = ()) -> str:
    """Blank out tokens that contain a URL scheme, an ``@`` or an underscore.

    Note the inverted naming: the return value is the token itself when it
    should be KEPT and the empty string when it is noise, so callers test the
    result for truthiness. The ``stopwords`` argument is accepted but unused.
    """
    noise_markers = ('http://', 'https://', '@', '_')
    if any(marker in token for marker in noise_markers):
        return ''
    return token

In [8]:
# original vs the dictionary form
# Index positive_tweets directly: the global `tokens` is rebound by later
# cells, so reading it here would show a different tweet when re-run.
print(positive_tweets[0], "\n\n", lemmatize_tokens(positive_tweets[0]))

['#FollowFriday', '@France_Inte', '@PKuchly57', '@Milipol_Paris', 'for', 'being', 'top', 'engaged', 'members', 'in', 'my', 'community', 'this', 'week', ':)'] 

 ['#FollowFriday', 'top', 'engage', 'member', 'community', 'week', ':)']


In [9]:
# Containers for the cleaned token lists, one per sentiment class.
pos_tokens, neg_tokens = [], []

In [10]:
# get data
# Rebuild both lists from scratch so re-running this cell never appends the
# same tweets twice, and keep the loop variable inside the comprehension so
# the global `tokens` is left untouched.
pos_tokens = [lemmatize_tokens(tweet) for tweet in positive_tweets]
neg_tokens = [lemmatize_tokens(tweet) for tweet in negative_tweets]

In [11]:
# Spot-check one tweet: its raw tokens first, then its cleaned tokens.
print(positive_tweets[500], "\n\n", pos_tokens[500])

['Dang', 'that', 'is', 'some', 'rad', '@AbzuGame', '#fanart', '!', ':D', 'https://t.co/bI8k8tb9ht'] 

 ['Dang', 'rad', '#fanart', ':D']


In [12]:
# flatten a list of token lists into one stream of words
def get_all_words(token_list):
    """Lazily yield every token of every token list, in order.

    This is a generator: the result can be iterated only once.
    """
    for words in token_list:
        yield from words

In [13]:
# Materialise the words into a list: a bare generator is exhausted after one
# pass, so re-running the frequency cell would silently count nothing.
all_pos_words = list(get_all_words(pos_tokens))

In [14]:
# most common words
# Count how often each cleaned token occurs in the positive tweets and print
# the ten most frequent (token, count) pairs.
freq_dist_pos = FreqDist(all_pos_words)
print(freq_dist_pos.most_common(10))

[(':)', 3691), (':-)', 701), (':D', 658), ('follow', 337), ('...', 290), ('love', 244), ('day', 235), ('get', 234), ('u', 228), ('like', 220)]


In [15]:
# turn each token list into a feature dict
def get_dict_from_list(token_list):
    """Lazily yield one ``{token: True}`` dict per token list.

    This is a generator: the result can be iterated only once.
    """
    for words in token_list:
        yield {word: True for word in words}

In [16]:
# get dicts of positive and negative tweets for the model
# Stored as lists rather than generators so the dataset cell can be re-run
# without finding them already exhausted (which would yield empty datasets).
pos_tokens_dict = list(get_dict_from_list(pos_tokens))
neg_tokens_dict = list(get_dict_from_list(neg_tokens))

In [17]:
# Pair every feature dict with its sentiment label, then pool both classes.
pos_dataset = [(features, "Positive") for features in pos_tokens_dict]
neg_dataset = [(features, "Negative") for features in neg_tokens_dict]

dataset = [*pos_dataset, *neg_dataset]

In [18]:
# Shuffle with a fixed seed into a NEW list, so the split (and therefore the
# reported accuracy) is reproducible and re-running this cell gives the same
# result instead of reshuffling an already shuffled list.
SEED = 42
TRAIN_SIZE = 7000  # examples used for training; the rest is held out for testing

shuffled_dataset = random.Random(SEED).sample(dataset, len(dataset))

train_data = shuffled_dataset[:TRAIN_SIZE]
test_data = shuffled_dataset[TRAIN_SIZE:]

In [19]:
# use Naive Bayes Classifier
# More at: https://en.wikipedia.org/wiki/Naive_Bayes_classifier
# Fit on the training split, then print the accuracy on the held-out split.
classifier = NaiveBayesClassifier.train(train_data)
print('Accuracy: ', classify.accuracy(classifier, test_data))

Accuracy:  0.9963333333333333


In [20]:
# show_most_informative_features prints the table itself and returns None;
# wrapping it in print() only appends a stray "None" line to the output.
classifier.show_most_informative_features(10)

Most Informative Features
                      :( = True           Negati : Positi =   2076.4 : 1.0
                      :) = True           Positi : Negati =   1636.1 : 1.0
                     sad = True           Negati : Positi =     33.6 : 1.0
                     See = True           Positi : Negati =     26.8 : 1.0
                follower = True           Positi : Negati =     23.2 : 1.0
                  THANKS = True           Negati : Positi =     23.2 : 1.0
                  FOLLOW = True           Negati : Positi =     22.5 : 1.0
                    MUCH = True           Negati : Positi =     21.2 : 1.0
                     x15 = True           Negati : Positi =     19.1 : 1.0
                   Thank = True           Positi : Negati =     18.9 : 1.0
None


In [21]:
# Hand-picked example texts, as plain untokenized strings, to run through the
# trained classifier.
tweets = [
    "Castaways ain't even the best backyardigans song 😭",
    "I don't get men who wouldn't date a girl with an Onlyfans.",
    "Great start to the day",
    "i know my worth bro i’m just dumb as hell",
    "my coworker keeps looking at my boobs lol",
    "Here’s a baby elephant loving life.",
    "Succesfully wasted almost 2 years of our life because of Covid19.",
    "When you feel depressed remember there are a million cells in your body and all they do is care about you.",
]

In [22]:
# Tokenize new text the same way the training data was tokenized. The
# twitter_samples tokens keep mentions and emoticons such as ':)' whole,
# which is the behaviour of NLTK's tweet tokenizer; word_tokenize would split
# them and produce features the classifier never saw during training.
tweet_tokenizer = nltk.tokenize.TweetTokenizer()

for tweet in tweets:
    # normalize tweet and remove noise (a dedicated name avoids rebinding the
    # global `tokens` used by earlier cells)
    cleaned_tokens = lemmatize_tokens(tweet_tokenizer.tokenize(tweet))

    # classify using the same {token: True} feature format used for training
    features = {token: True for token in cleaned_tokens}
    result = classifier.classify(features)
    print(result, '->', tweet)

Negative -> Castaways ain't even the best backyardigans song 😭
Positive -> I don't get men who wouldn't date a girl with an Onlyfans.
Positive -> Great start to the day
Positive -> i know my worth bro i’m just dumb as hell
Negative -> my coworker keeps looking at my boobs lol
Positive -> Here’s a baby elephant loving life.
Negative -> Succesfully wasted almost 2 years of our life because of Covid19.
Positive -> When you feel depressed remember there are a million cells in your body and all they do is care about you.
