In [109]:
#https://www.geeksforgeeks.org/python-lemmatization-with-nltk/#:~:text=Using%20PCA%20Implementation-,Python%20%7C%20Lemmatization%20with%20NLTK,similar%20meaning%20to%20one%20word.
#https://arxiv.org/pdf/2006.00804.pdf
#https://www.geeksforgeeks.org/twitter-sentiment-analysis-using-python/
#https://www.datacamp.com/community/tutorials/machine-learning-models-api-python
#https://www.jitsejan.com/python-and-javascript-in-flask.html


import nltk
#nltk.download('twitter_samples')
from nltk.corpus import twitter_samples
#nltk.download('wordnet')
#nltk.download('averaged_perceptron_tagger')
#parts of speech
from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer
#nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk import FreqDist
from nltk import classify
from nltk import NaiveBayesClassifier
from nltk.tokenize import word_tokenize

from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.optimizers import SGD

import re, string
import random

import pickle

from textblob import TextBlob

In [110]:
positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')
text = twitter_samples.strings('tweets.20150430-223406.json')

In [111]:
positive_tweets

['#FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)',
 '@Lamb2ja Hey James! How odd :/ Please call our Contact Centre on 02392441234 and we will be able to assist you :) Many thanks!',
 '@DespiteOfficial we had a listen last night :) As You Bleed is an amazing track. When are you in Scotland?!',
 '@97sides CONGRATS :)',
 'yeaaaah yippppy!!!  my accnt verified rqst has succeed got a blue tick mark on my fb profile :) in 15 days',
 '@BhaktisBanter @PallaviRuhail This one is irresistible :)\n#FlipkartFashionFriday http://t.co/EbZ0L2VENM',
 "We don't like to keep our lovely customers waiting for long! We hope you enjoy! Happy Friday! - LWWF :) https://t.co/smyYriipxI",
 '@Impatientraider On second thought, there’s just not enough time for a DD :) But new shorts entering system. Sheep must be buying.',
 'Jgh , but we have to go to Bayan :D bye',
 'As an act of mischievousness, am calling the ETL layer of our in-house warehousing 

In [112]:
#TwitterCorpusReader object has no attribute word_tokenize
pos_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
neg_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')
#print(pos_tag(pos_tweet_tokens[0]))

In [113]:
def lemmatize_sentence(tokens):
    lemmatizer = WordNetLemmatizer()
    lemmatized_sentence = []
    for word, tag in pos_tag(tokens):
        if tag.startswith('NN'):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'
        # stemming with lemmatization needs a part of speech parameter, so that's why we're finding POS
        lemmatized_sentence.append(lemmatizer.lemmatize(word, pos))
    return lemmatized_sentence

#print(lemmatize_sentence(tweet_tokens[0]))

In [114]:
def remove_noise(tweet_tokens, stop_words = ()):

    cleaned_tokens = []

    for token, tag in pos_tag(tweet_tokens):
        token = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'\
                       '(?:%[0-9a-fA-F][0-9a-fA-F]))+','', token)
        token = re.sub("(@[A-Za-z0-9_]+)","", token)

        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'

        lemmatizer = WordNetLemmatizer()
        token = lemmatizer.lemmatize(token, pos)

        if len(token) > 0 and token not in string.punctuation and token.lower() not in stop_words:
            cleaned_tokens.append(token.lower())
    return cleaned_tokens

In [115]:
stop_words = stopwords.words('english')
#print(remove_noise(tweet_tokens[0], stop_words))

In [116]:
positive_cleaned_tokens_list = []
negative_cleaned_tokens_list = []

for tokens in pos_tweet_tokens:
    positive_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

for tokens in neg_tweet_tokens:
    negative_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

In [141]:
def get_all_words(cleaned_tokens_list):
    for tokens in cleaned_tokens_list:
        for token in tokens:
            yield token

all_pos_words = get_all_words(positive_cleaned_tokens_list)

In [142]:
freq_dist_pos = FreqDist(all_pos_words)
#print(freq_dist_pos.most_common(10))

In [143]:
def get_tweets_for_model(cleaned_tokens_list):
    for tweet_tokens in cleaned_tokens_list:
        yield dict([token, True] for token in tweet_tokens)

positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)

In [144]:
positive_dataset = [(tweet_dict, "Positive")
                     for tweet_dict in positive_tokens_for_model]

negative_dataset = [(tweet_dict, "Negative")
                     for tweet_dict in negative_tokens_for_model]

dataset = positive_dataset + negative_dataset

random.shuffle(dataset)

train_data = dataset[:7000]
test_data = dataset[7000:]

In [145]:
positive_dataset

[({'#followfriday': True,
   'top': True,
   'engage': True,
   'member': True,
   'community': True,
   'week': True,
   ':)': True},
  'Positive'),
 ({'hey': True,
   'james': True,
   'odd': True,
   ':/': True,
   'please': True,
   'call': True,
   'contact': True,
   'centre': True,
   '02392441234': True,
   'able': True,
   'assist': True,
   ':)': True,
   'many': True,
   'thanks': True},
  'Positive'),
 ({'listen': True,
   'last': True,
   'night': True,
   ':)': True,
   'bleed': True,
   'amazing': True,
   'track': True,
   'scotland': True},
  'Positive'),
 ({'congrats': True, ':)': True}, 'Positive'),
 ({'yeaaaah': True,
   'yippppy': True,
   'accnt': True,
   'verify': True,
   'rqst': True,
   'succeed': True,
   'get': True,
   'blue': True,
   'tick': True,
   'mark': True,
   'fb': True,
   'profile': True,
   ':)': True,
   '15': True,
   'day': True},
  'Positive'),
 ({'one': True,
   'irresistible': True,
   ':)': True,
   '#flipkartfashionfriday': True},
  'P

In [146]:
classifier = NaiveBayesClassifier.train(train_data)

print("Accuracy is:", classify.accuracy(classifier, test_data))

print(classifier.show_most_informative_features(10))

Accuracy is: 0.9963333333333333
Most Informative Features
                      :( = True           Negati : Positi =   2100.6 : 1.0
                      :) = True           Positi : Negati =   1646.2 : 1.0
                     sad = True           Negati : Positi =     35.0 : 1.0
                follower = True           Positi : Negati =     21.5 : 1.0
                followed = True           Negati : Positi =     15.6 : 1.0
                    blog = True           Positi : Negati =     14.2 : 1.0
                      aw = True           Negati : Positi =     13.8 : 1.0
                    glad = True           Positi : Negati =     13.6 : 1.0
                    miss = True           Negati : Positi =     12.8 : 1.0
                   didnt = True           Negati : Positi =     11.8 : 1.0
None


In [147]:
pickle.dump(classifier, open('tweets_text.sav', 'wb'))

In [127]:
def clean_tweet(tweet): 
    return ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t]) |(\w+:\/\/\S+)", " ", tweet).split()) 

In [128]:
neutral_dataset = []
    
for custom_tweet in text:
    custom_tokens = remove_noise(word_tokenize(custom_tweet))
    analysis = TextBlob(clean_tweet(custom_tweet))

    
    if (analysis.sentiment.polarity == 0.0):
        neutral_dataset.append((dict([token, True] for token in custom_tokens), "Neutral"))
    
    else: 
        if classifier.classify(dict([token, True] for token in custom_token)) == "Positive":
            positive_dataset.append((dict([token, True] for token in custom_tokens), "Positive"))
        else:
            negative_dataset.append((dict([token, True] for token in custom_tokens), "Negative"))

In [138]:
len(positive_dataset)

14751

In [130]:
negative_dataset

[({'hopeless': True, 'tmr': True, ':(': True}, 'Negative'),
 ({'everything': True,
   'kid': True,
   'section': True,
   'ikea': True,
   'cute': True,
   'shame': True,
   "i'm": True,
   'nearly': True,
   '19': True,
   '2': True,
   'month': True,
   ':(': True},
  'Negative'),
 ({'heart': True, 'slide': True, 'waste': True, 'basket': True, ':(': True},
  'Negative'),
 ({'“': True,
   'hate': True,
   'japanese': True,
   'call': True,
   'ban': True,
   ':(': True,
   '”': True},
  'Negative'),
 ({'dang': True,
   'start': True,
   'next': True,
   'week': True,
   'work': True,
   ':(': True},
  'Negative'),
 ({'oh': True, 'god': True, 'baby': True, 'face': True, ':(': True},
  'Negative'),
 ({'make': True, 'smile': True, ':(': True}, 'Negative'),
 ({'work': True,
   'neighbour': True,
   'motor': True,
   'asked': True,
   'say': True,
   'hat': True,
   'update': True,
   'search': True,
   ':(': True},
  'Negative'),
 ({':(': True, 'sialan': True}, 'Negative'),
 ({'athabasca'

In [133]:
dataset = positive_dataset + negative_dataset + neutral_dataset

random.shuffle(dataset)

train_data = dataset[:25000]
test_data = dataset[25000:]

In [134]:
classifier = NaiveBayesClassifier.train(train_data)

print("Accuracy is:", classify.accuracy(classifier, test_data))

print(classifier.show_most_informative_features(10))

Accuracy is: 0.8992
Most Informative Features
                      :( = True           Negati : Positi =   7255.9 : 1.0
              tommy_colc = True           Neutra : Positi =    498.9 : 1.0
                      rt = True           Neutra : Negati =    451.4 : 1.0
                      :) = True           Positi : Negati =    405.6 : 1.0
              inequality = True           Neutra : Positi =    303.9 : 1.0
                   claim = True           Neutra : Negati =    216.2 : 1.0
                  thanks = True           Positi : Neutra =    185.2 : 1.0
                   times = True           Neutra : Positi =    180.6 : 1.0
               financial = True           Neutra : Negati =    174.2 : 1.0
                  rather = True           Neutra : Negati =    167.3 : 1.0
None


In [135]:
pickle.dump(classifier, open('tweets_text.sav', 'wb'))