## Using Text Blob


In [15]:
from textblob import TextBlob

text_1 = "The movie was so awesome."
text_2 = "The food here tastes terrible."
text_4 = "Its an fine okayish phone"
text_3 = "This was a helpful example but I would prefer another one"

#Determining the Polarity 
p_1 = TextBlob(text_1).sentiment.polarity
p_3 = TextBlob(text_3).sentiment.polarity

#Determining the Subjectivity
s_1 = TextBlob(text_1).sentiment.subjectivity
s_3 = TextBlob(text_3).sentiment.subjectivity

print("Polarity of Text 1 is", p_1)
print("Polarity of Text 2 is", p_3)
print("Subjectivity of Text 1 is", s_1)
print("Subjectivity of Text 2 is", s_3)

Polarity of Text 1 is 1.0
Polarity of Text 2 is 0.0
Subjectivity of Text 1 is 1.0
Subjectivity of Text 2 is 0.0


## Using VADER

In [16]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
sentiment = SentimentIntensityAnalyzer()
text_1 = "The book was a perfect balance between wrtiting style and plot."
text_2 =  "The pizza tastes terrible."
text_3 = "Its an fine okayish phone"
text_3 = "This was a helpful example but I would prefer another one"
sent_1 = sentiment.polarity_scores(text_1)
sent_2 = sentiment.polarity_scores(text_2)
sent_3 = sentiment.polarity_scores(text_3)
print("Sentiment of text 1:", sent_1)
print("Sentiment of text 2:", sent_2)
print("Sentiment of text 3:", sent_3)

Sentiment of text 1: {'neg': 0.0, 'neu': 0.73, 'pos': 0.27, 'compound': 0.5719}
Sentiment of text 2: {'neg': 0.508, 'neu': 0.492, 'pos': 0.0, 'compound': -0.4767}
Sentiment of text 3: {'neg': 0.0, 'neu': 0.84, 'pos': 0.16, 'compound': 0.2263}


## Using Bag of Words Vectorization-Based Models

In [6]:
# #Loading the Dataset
# import pandas as pd
# data = pd.read_csv('Finance_data.csv')
# #Pre-Prcoessing and Bag of Word Vectorization using Count Vectorizer
# from sklearn.feature_extraction.text import CountVectorizer
# from nltk.tokenize import RegexpTokenizer
# token = RegexpTokenizer(r'[a-zA-Z0-9]+')
# cv = CountVectorizer(stop_words='english',ngram_range = (1,1),tokenizer = token.tokenize)
# text_counts = cv.fit_transform(data['sentences'])
# #Splitting the data into trainig and testing
# from sklearn.model_selection import train_test_split
# X_train, X_test, Y_train, Y_test = train_test_split(text_counts, data['feedback'], test_size=0.25, random_state=5)
# #Training the model
# from sklearn.naive_bayes import MultinomialNB
# MNB = MultinomialNB()
# MNB.fit(X_train, Y_train)
# #Caluclating the accuracy score of the model
# from sklearn import metrics
# predicted = MNB.predict(X_test)
# accuracy_score = metrics.accuracy_score(predicted, Y_test)
# print("Accuracuy Score: ",accuracy_score)

## Using LSTM-Based Models

In [None]:
#Importing necessary libraries
import nltk
import pandas as pd
from textblob import Word
from nltk.corpus import stopwords
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split 
#Loading the dataset
data = pd.read_csv('Finance_data.csv')
#Pre-Processing the text 
def cleaning(df, stop_words):
    df['sentences'] = df['sentences'].apply(lambda x: ' '.join(x.lower() for x in x.split()))
    # Replacing the digits/numbers
    df['sentences'] = df['sentences'].str.replace('d', '')
    # Removing stop words
    df['sentences'] = df['sentences'].apply(lambda x: ' '.join(x for x in x.split() if x not in stop_words))
    # Lemmatization
    df['sentences'] = df['sentences'].apply(lambda x: ' '.join([Word(x).lemmatize() for x in x.split()]))
    return df
stop_words = stopwords.words('english')
data_cleaned = cleaning(data, stop_words)
#Generating Embeddings using tokenizer
tokenizer = Tokenizer(num_words=500, split=' ') 
tokenizer.fit_on_texts(data_cleaned['verified_reviews'].values)
X = tokenizer.texts_to_sequences(data_cleaned['verified_reviews'].values)
X = pad_sequences(X)
#Model Building
model = Sequential()
model.add(Embedding(500, 120, input_length = X.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(704, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(352, activation='LeakyReLU'))
model.add(Dense(3, activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam', metrics = ['accuracy'])
print(model.summary())
#Model Training
model.fit(X_train, y_train, epochs = 20, batch_size=32, verbose =1)
#Model Testing
model.evaluate(X_test,y_test)

## Using Transformer-Based Models

In [5]:
from transformers import pipeline
sentiment_pipeline = pipeline("sentiment-analysis")
data = ["It was the best of times.", "t was the worst of times."]
sentiment_pipeline(data)

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


Downloading (…)lve/main/config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading (…)"tf_model.h5";:   0%|          | 0.00/268M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFDistilBertForSequenceClassification.

All the layers of TFDistilBertForSequenceClassification were initialized from the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.


Downloading (…)okenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

[{'label': 'POSITIVE', 'score': 0.9994569420814514},
 {'label': 'NEGATIVE', 'score': 0.9987302422523499}]

In [7]:
data = ["Bitcoin should be transparent and it isn’t as easy as people thinks it is, there are so many strategies to be learnt and unfolded about Bitcoin trading"]
sentiment_pipeline(data)

[{'label': 'NEGATIVE', 'score': 0.9987401366233826}]

In [18]:
data = [" back ground music waste."," Unlable hear properly.", "good speech."]
sentiment_pipeline(data)

[{'label': 'NEGATIVE', 'score': 0.9997841715812683},
 {'label': 'NEGATIVE', 'score': 0.9415776133537292},
 {'label': 'POSITIVE', 'score': 0.999841570854187}]

In [19]:
data = ["If she's asking for it, she can just ask for it"]
sentiment_pipeline(data)

[{'label': 'NEGATIVE', 'score': 0.9972597360610962}]

In [10]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('vader_lexicon')

from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Create an instance of the SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()

# Define a function to clean the text and tokenize it
def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(text)
    words = [w for w in words if not w in stop_words]
    # Join the words back into a string
    text = ' '.join(words)
    return text

# Define a function to perform sentiment analysis on a given text
def analyze_sentiment(text):
    # Preprocess the text
    text = preprocess_text(text)
    # Analyze the sentiment using the SentimentIntensityAnalyzer
    sentiment = sia.polarity_scores(text)
    return sentiment

# Example usage
text = "I love driving my new car! It's so much fun."
sentiment = analyze_sentiment(text)
print(sentiment)

text_1 = "The book was a perfect balance between wrtiting style and plot."
text_2 =  "The pizza tastes terrible."
text_3 = "Its an fine okayish phone"
sent_1 = analyze_sentiment(text_1)
sent_2 = analyze_sentiment(text_2)
sent_3 = analyze_sentiment(text_3)
print("Sentiment of text 1:", sent_1)
print("Sentiment of text 2:", sent_2)
print("Sentiment of text 3:", sent_3)

{'neg': 0.0, 'neu': 0.391, 'pos': 0.609, 'compound': 0.8313}
Sentiment of text 1: {'neg': 0.0, 'neu': 0.575, 'pos': 0.425, 'compound': 0.5719}
Sentiment of text 2: {'neg': 0.608, 'neu': 0.392, 'pos': 0.0, 'compound': -0.4767}
Sentiment of text 3: {'neg': 0.0, 'neu': 0.526, 'pos': 0.474, 'compound': 0.2023}


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vansh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vansh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\vansh\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [20]:
import pickle
#define an empty dictionary
l = []

# load the pickle contents
with open ('comments1.pkl','rb') as pick:
    l.append(pickle.load(pick))
# print(l)
for i in l[0]:
    print(i[0], end='\n')
    print(analyze_sentiment(i[0]), end = '\n\n')

The plus is awesome but Apple should have at least included 90Hz refresh rate. It would have improved the sales.
{'neg': 0.0, 'neu': 0.556, 'pos': 0.444, 'compound': 0.802}

It amazes me the fact that despite having same os, same battery and a less energy consuming display this phone battery endurance is significantly worse compared to same size battery iPhone 13/14 max. Basically Apple has done this intentionally, there is no other reason.
{'neg': 0.103, 'neu': 0.731, 'pos': 0.166, 'compound': 0.228}

Love this phone 3 days of having and the battery last long ! I had the 11pro for 3 years it’s was all messed up and cracked so I had to get another phone ! I was going to get the pro max but they was out of stock I’m impatient and just wanted a fresh screen !
{'neg': 0.153, 'neu': 0.6, 'pos': 0.246, 'compound': 0.5826}

This phone covers a niche audience where they want a vanilla 14 in a bigger form factor without the need for the Pro features. This is a better attempt than the Mini phon

In [8]:
import requests

# make a GET request to the Instagram page
response = requests.get("https://www.instagram.com/rvcjinsta")

# check if the request was successful
if response.status_code == 200:
    # extract the content of the page
    content = response.content
    
    # search for the data you want to scrape, for example the user's followers count
    start = content.find("\"edge_followed_by\":{\"count\":") + 27
    end = content.find("},\"followed_by_viewer\"")
    followers = content[start:end]
    
    # print the followers count
    print("Followers:", followers)
else:
    print("Request failed with status code", response.status_code)

TypeError: argument should be integer or bytes-like object, not 'str'

In [15]:
import requests

# make a GET request to the Instagram page
response = requests.get("https://www.instagram.com/rvcjindia")

# check if the request was successful
if response.status_code == 200:
    # extract the content of the page
    content = response.content
    
    # search for the data you want to scrape, for example the user's followers count
    start = content.find("\"edge_followed_by\":{\"count\":") + 27
    end = content.find("},\"followed_by_viewer\"")
    followers = content[start:end]
    
    # print the followers count
    print("Followers:", followers)
else:
    print("Request failed with status code", response.status_code)


ProxyError: HTTPSConnectionPool(host='www.instagram.com', port=443): Max retries exceeded with url: /rvcjindia (Caused by ProxyError('Cannot connect to proxy.', NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x00000211AA9484F0>: Failed to establish a new connection: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond')))

In [13]:
! pip install praw

Collecting praw
  Downloading praw-7.6.1-py3-none-any.whl (188 kB)
Collecting prawcore<3,>=2.1
  Downloading prawcore-2.3.0-py3-none-any.whl (16 kB)
Collecting update-checker>=0.18
  Downloading update_checker-0.18.0-py3-none-any.whl (7.0 kB)
Collecting websocket-client>=0.54.0
  Downloading websocket_client-1.5.1-py3-none-any.whl (55 kB)
Installing collected packages: websocket-client, update-checker, prawcore, praw
Successfully installed praw-7.6.1 prawcore-2.3.0 update-checker-0.18.0 websocket-client-1.5.1


In [14]:
import praw
import pandas as pd

reddit_read_only = praw.Reddit(client_id="",		 # your client id
							client_secret="",	 # your client secret
							user_agent="")	 # your user agent


subreddit = reddit_read_only.subreddit("redditdev")

# Display the name of the Subreddit
print("Display Name:", subreddit.display_name)

# Display the title of the Subreddit
print("Title:", subreddit.title)

# Display the description of the Subreddit
print("Description:", subreddit.description)


  from .rate_limit import RateLimiter
  from .subreddit import Subreddit
  from ._core import WebSocket, getdefaulttimeout


Display Name: redditdev


RequestException: error with request Invalid return character or leading space in header: User-Agent

In [11]:
import twint

c = twint.Config()

c.Search = ['Taylor Swift']       # topic
c.Limit = 500      # number of Tweets to scrape
c.Store_csv = True       # store tweets in a csv file
c.Output = "taylor_swift_tweets.csv"     # path to csv file

twint.run.Search(c)

RuntimeError: This event loop is already running

In [5]:
import pandas as pd
df = pd.read_csv('taylor_swift_tweets.csv')

  return _abc_subclasscheck(cls, subclass)


FileNotFoundError: [Errno 2] No such file or directory: 'taylor_swift_tweets.csv'

In [6]:
df['tweet']

NameError: name 'df' is not defined

In [21]:
import nltk
nltk.download('twitter_samples')

[nltk_data] Downloading package twitter_samples to
[nltk_data]     C:\Users\vansh\AppData\Roaming\nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!


True

In [31]:
from nltk.corpus import twitter_samples

positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')
text = twitter_samples.strings('tweets.20150430-223406.json')
tweet_tokens = twitter_samples.tokenized('positive_tweets.json')

tweet_tokens = twitter_samples.tokenized('positive_tweets.json')[0]

print(tweet_tokens)

['#FollowFriday', '@France_Inte', '@PKuchly57', '@Milipol_Paris', 'for', 'being', 'top', 'engaged', 'members', 'in', 'my', 'community', 'this', 'week', ':)']


In [40]:
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

nltk.download('stopwords')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\vansh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\vansh\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vansh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [33]:
from nltk.tag import pos_tag
from nltk.corpus import twitter_samples

tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
print(pos_tag(tweet_tokens[0]))

[('#FollowFriday', 'JJ'), ('@France_Inte', 'NNP'), ('@PKuchly57', 'NNP'), ('@Milipol_Paris', 'NNP'), ('for', 'IN'), ('being', 'VBG'), ('top', 'JJ'), ('engaged', 'VBN'), ('members', 'NNS'), ('in', 'IN'), ('my', 'PRP$'), ('community', 'NN'), ('this', 'DT'), ('week', 'NN'), (':)', 'NN')]


In [38]:
from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer

def lemmatize_sentence(tokens):
    lemmatizer = WordNetLemmatizer()
    lemmatized_sentence = []
    for word, tag in pos_tag(tokens):
        if tag.startswith('NN'):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'
        lemmatized_sentence.append(lemmatizer.lemmatize(word, pos))
    return lemmatized_sentence

print(lemmatize_sentence(tweet_tokens[0]))

['#FollowFriday', '@France_Inte', '@PKuchly57', '@Milipol_Paris', 'for', 'be', 'top', 'engage', 'member', 'in', 'my', 'community', 'this', 'week', ':)']


In [39]:
import re, string

def remove_noise(tweet_tokens, stop_words = ()):

    cleaned_tokens = []

    for token, tag in pos_tag(tweet_tokens):
        token = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'\
                       '(?:%[0-9a-fA-F][0-9a-fA-F]))+','', token)
        token = re.sub("(@[A-Za-z0-9_]+)","", token)

        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'

        lemmatizer = WordNetLemmatizer()
        token = lemmatizer.lemmatize(token, pos)

        if len(token) > 0 and token not in string.punctuation and token.lower() not in stop_words:
            cleaned_tokens.append(token.lower())
    return cleaned_tokens

In [41]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

print(remove_noise(tweet_tokens[0], stop_words))

['#followfriday', 'top', 'engage', 'member', 'community', 'week', ':)']


In [45]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')

positive_cleaned_tokens_list = []
negative_cleaned_tokens_list = []

for tokens in positive_tweet_tokens:
    positive_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

for tokens in negative_tweet_tokens:
    negative_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

In [44]:
print(positive_tweet_tokens[500])
print(positive_cleaned_tokens_list[500])

['Dang', 'that', 'is', 'some', 'rad', '@AbzuGame', '#fanart', '!', ':D', 'https://t.co/bI8k8tb9ht']
['dang', 'rad', '#fanart', ':d']


In [53]:
def get_all_words(cleaned_tokens_list):
    for tokens in cleaned_tokens_list:
        for token in tokens:
            yield token

all_pos_words = get_all_words(positive_cleaned_tokens_list)

In [55]:
from nltk import FreqDist

freq_dist_pos = FreqDist(all_pos_words)
print(freq_dist_pos.most_common(10))

[]


In [57]:
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import twitter_samples, stopwords
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
from nltk import FreqDist, classify, NaiveBayesClassifier

import re, string, random

def remove_noise(tweet_tokens, stop_words = ()):

    cleaned_tokens = []

    for token, tag in pos_tag(tweet_tokens):
        token = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'\
                       '(?:%[0-9a-fA-F][0-9a-fA-F]))+','', token)
        token = re.sub("(@[A-Za-z0-9_]+)","", token)

        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'

        lemmatizer = WordNetLemmatizer()
        token = lemmatizer.lemmatize(token, pos)

        if len(token) > 0 and token not in string.punctuation and token.lower() not in stop_words:
            cleaned_tokens.append(token.lower())
    return cleaned_tokens

def get_all_words(cleaned_tokens_list):
    for tokens in cleaned_tokens_list:
        for token in tokens:
            yield token

def get_tweets_for_model(cleaned_tokens_list):
    for tweet_tokens in cleaned_tokens_list:
        yield dict([token, True] for token in tweet_tokens)

if __name__ == "__main__":

    positive_tweets = twitter_samples.strings('positive_tweets.json')
    negative_tweets = twitter_samples.strings('negative_tweets.json')
    text = twitter_samples.strings('tweets.20150430-223406.json')
    tweet_tokens = twitter_samples.tokenized('positive_tweets.json')[0]

    stop_words = stopwords.words('english')

    positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
    negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')

    positive_cleaned_tokens_list = []
    negative_cleaned_tokens_list = []

    for tokens in positive_tweet_tokens:
        positive_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

    for tokens in negative_tweet_tokens:
        negative_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

    all_pos_words = get_all_words(positive_cleaned_tokens_list)

    freq_dist_pos = FreqDist(all_pos_words)
    print(freq_dist_pos.most_common(10))

    positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
    negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)

    positive_dataset = [(tweet_dict, "Positive")
                         for tweet_dict in positive_tokens_for_model]

    negative_dataset = [(tweet_dict, "Negative")
                         for tweet_dict in negative_tokens_for_model]

    dataset = positive_dataset + negative_dataset

    random.shuffle(dataset)

    train_data = dataset[:7000]
    test_data = dataset[7000:]

    classifier = NaiveBayesClassifier.train(train_data)

    print("Accuracy is:", classify.accuracy(classifier, test_data))

    print(classifier.show_most_informative_features(10))

    custom_tweet = "I ordered just once from TerribleCo, they screwed up, never used the app again."

    custom_tokens = remove_noise(word_tokenize(custom_tweet))

    print(custom_tweet, classifier.classify(dict([token, True] for token in custom_tokens)))

[(':)', 3691), (':-)', 701), (':d', 658), ('thanks', 388), ('follow', 357), ('love', 333), ('...', 290), ('good', 283), ('get', 263), ('thank', 253)]
Accuracy is: 0.9923333333333333
Most Informative Features
                      :) = True           Positi : Negati =    992.6 : 1.0
                 welcome = True           Positi : Negati =     36.7 : 1.0
                follower = True           Positi : Negati =     35.4 : 1.0
                     sad = True           Negati : Positi =     30.0 : 1.0
                     bam = True           Positi : Negati =     22.8 : 1.0
                     x15 = True           Negati : Positi =     17.8 : 1.0
                     idk = True           Negati : Positi =     13.8 : 1.0
                congrats = True           Positi : Negati =     12.9 : 1.0
                    glad = True           Positi : Negati =     12.5 : 1.0
                   let's = True           Positi : Negati =     12.2 : 1.0
None
I ordered just once from TerribleCo, 

In [58]:
0.9963333333333333

0.9963333333333333

In [3]:
from nltk.corpus import twitter_samples
print (twitter_samples.fileids())
'''
Output:

['negative_tweets.json', 'positive_tweets.json', 'tweets.20150430-223406.json']
'''

pos_tweets = twitter_samples.strings('positive_tweets.json')
print (len(pos_tweets)) # Output: 5000

neg_tweets = twitter_samples.strings('negative_tweets.json')
print (len(neg_tweets)) # Output: 5000

all_tweets = twitter_samples.strings('tweets.20150430-223406.json')
print (len(all_tweets)) # Output: 20000

for tweet in pos_tweets[:5]:
    print (tweet)
'''
Output:

#FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)
@Lamb2ja Hey James! How odd :/ Please call our Contact Centre on 02392441234 and we will be able to assist you :) Many thanks!
@DespiteOfficial we had a listen last night :) As You Bleed is an amazing track. When are you in Scotland?!
@97sides CONGRATS :)
yeaaaah yippppy!!!  my accnt verified rqst has succeed got a blue tick mark on my fb profile :) in 15 days
'''

['negative_tweets.json', 'positive_tweets.json', 'tweets.20150430-223406.json']
5000
5000
20000
#FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)
@Lamb2ja Hey James! How odd :/ Please call our Contact Centre on 02392441234 and we will be able to assist you :) Many thanks!
@DespiteOfficial we had a listen last night :) As You Bleed is an amazing track. When are you in Scotland?!
@97sides CONGRATS :)
yeaaaah yippppy!!!  my accnt verified rqst has succeed got a blue tick mark on my fb profile :) in 15 days


'\nOutput:\n\n#FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)\n@Lamb2ja Hey James! How odd :/ Please call our Contact Centre on 02392441234 and we will be able to assist you :) Many thanks!\n@DespiteOfficial we had a listen last night :) As You Bleed is an amazing track. When are you in Scotland?!\n@97sides CONGRATS :)\nyeaaaah yippppy!!!  my accnt verified rqst has succeed got a blue tick mark on my fb profile :) in 15 days\n'

In [4]:
from nltk.tokenize import TweetTokenizer
tweet_tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)

for tweet in pos_tweets[:5]:
    print (tweet_tokenizer.tokenize(tweet))

['#followfriday', 'for', 'being', 'top', 'engaged', 'members', 'in', 'my', 'community', 'this', 'week', ':)']
['hey', 'james', '!', 'how', 'odd', ':/', 'please', 'call', 'our', 'contact', 'centre', 'on', '02392441234', 'and', 'we', 'will', 'be', 'able', 'to', 'assist', 'you', ':)', 'many', 'thanks', '!']
['we', 'had', 'a', 'listen', 'last', 'night', ':)', 'as', 'you', 'bleed', 'is', 'an', 'amazing', 'track', '.', 'when', 'are', 'you', 'in', 'scotland', '?', '!']
['congrats', ':)']
['yeaaah', 'yipppy', '!', '!', '!', 'my', 'accnt', 'verified', 'rqst', 'has', 'succeed', 'got', 'a', 'blue', 'tick', 'mark', 'on', 'my', 'fb', 'profile', ':)', 'in', '15', 'days']


In [5]:
import string
import re

from nltk.corpus import stopwords 
stopwords_english = stopwords.words('english')

from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

from nltk.tokenize import TweetTokenizer

# Happy Emoticons
emoticons_happy = set([
    ':-)', ':)', ';)', ':o)', ':]', ':3', ':c)', ':>', '=]', '8)', '=)', ':}',
    ':^)', ':-D', ':D', '8-D', '8D', 'x-D', 'xD', 'X-D', 'XD', '=-D', '=D',
    '=-3', '=3', ':-))', ":'-)", ":')", ':*', ':^*', '>:P', ':-P', ':P', 'X-P',
    'x-p', 'xp', 'XP', ':-p', ':p', '=p', ':-b', ':b', '>:)', '>;)', '>:-)',
    '<3'
    ])

# Sad Emoticons
emoticons_sad = set([
    ':L', ':-/', '>:/', ':S', '>:[', ':@', ':-(', ':[', ':-||', '=L', ':<',
    ':-[', ':-<', '=\\', '=/', '>:(', ':(', '>.<', ":'-(", ":'(", ':\\', ':-c',
    ':c', ':{', '>:\\', ';('
    ])

# all emoticons (happy + sad)
emoticons = emoticons_happy.union(emoticons_sad)

def clean_tweets(tweet):
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)

    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)

    # remove hyperlinks
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    
    # remove hashtags
    # only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)

    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    tweets_clean = []   
    for word in tweet_tokens:
        if (word not in stopwords_english and # remove stopwords
              word not in emoticons and # remove emoticons
                word not in string.punctuation): # remove punctuation
            #tweets_clean.append(word)
            stem_word = stemmer.stem(word) # stemming word
            tweets_clean.append(stem_word)

    return tweets_clean

custom_tweet = "RT @Twitter @chapagain Hello There! Have a great day. :) #good #morning http://chapagain.com.np"

# print cleaned tweet
print (clean_tweets(custom_tweet))
'''
Output:

['hello', 'great', 'day', 'good', 'morning']
'''

print (pos_tweets[5])
'''
Output:

@BhaktisBanter @PallaviRuhail This one is irresistible :)
#FlipkartFashionFriday http://t.co/EbZ0L2VENM
'''

print (clean_tweets(pos_tweets[5]))

['hello', 'great', 'day', 'good', 'morn']
@BhaktisBanter @PallaviRuhail This one is irresistible :)
#FlipkartFashionFriday http://t.co/EbZ0L2VENM
['one', 'irresist', 'flipkartfashionfriday']


In [7]:
# feature extractor function
def bag_of_words(tweet):
    words = clean_tweets(tweet)
    words_dictionary = dict([word, True] for word in words) 
    return words_dictionary

custom_tweet = "RT @Twitter @chapagain Hello There! Have a great day. :) #good #morning http://chapagain.com.np"
print (bag_of_words(custom_tweet))

# '''
# Output:
# {'great': True, 'good': True, 'morning': True, 'hello': True, 'day': True}
# '''

# positive tweets feature set
pos_tweets_set = []
for tweet in pos_tweets:
    pos_tweets_set.append((bag_of_words(tweet), 'pos')) 

# negative tweets feature set
neg_tweets_set = []
for tweet in neg_tweets:
    neg_tweets_set.append((bag_of_words(tweet), 'neg'))

print (len(pos_tweets_set), len(neg_tweets_set)) # Output: (5000, 5000)

{'hello': True, 'great': True, 'day': True, 'good': True, 'morn': True}
5000 5000


In [8]:
# radomize pos_reviews_set and neg_reviews_set
# doing so will output different accuracy result everytime we run the program
from random import shuffle 
shuffle(pos_tweets_set)
shuffle(neg_tweets_set)

test_set = pos_tweets_set[:1000] + neg_tweets_set[:1000]
train_set = pos_tweets_set[1000:] + neg_tweets_set[1000:]

print(len(test_set),  len(train_set)) # Output: (2000, 8000)

2000 8000


In [10]:
from nltk import classify
from nltk import NaiveBayesClassifier

classifier = NaiveBayesClassifier.train(train_set)

accuracy = classify.accuracy(classifier, test_set)
print(accuracy) # Output: 0.765

print (classifier.show_most_informative_features(10))   
# '''
# Output:
# 0.723
# Most Informative Features
#                      via = True              pos : neg    =     37.0 : 1.0
#                     glad = True              pos : neg    =     25.0 : 1.0
#                      sad = True              neg : pos    =     22.6 : 1.0
#                       aw = True              neg : pos    =     21.7 : 1.0
#                      bam = True              pos : neg    =     21.0 : 1.0
#                      x15 = True              neg : pos    =     19.7 : 1.0
#                  appreci = True              pos : neg    =     17.7 : 1.0
#                    arriv = True              pos : neg    =     15.0 : 1.0
#                      ugh = True              neg : pos    =     14.3 : 1.0
#                   justin = True              neg : pos    =     13.0 : 1.0
# '''

0.723
Most Informative Features
                     sad = True              neg : pos    =     40.2 : 1.0
                     via = True              pos : neg    =     36.3 : 1.0
                     bam = True              pos : neg    =     25.0 : 1.0
                     x15 = True              neg : pos    =     19.0 : 1.0
                  welcom = True              pos : neg    =     16.7 : 1.0
                   arriv = True              pos : neg    =     15.0 : 1.0
                   didnt = True              neg : pos    =     15.0 : 1.0
                    glad = True              pos : neg    =     14.2 : 1.0
               goodnight = True              pos : neg    =     13.0 : 1.0
                    poor = True              neg : pos    =     12.2 : 1.0
None


In [11]:
custom_tweet = "I hated the film. It was a disaster. Poor direction, bad acting."
custom_tweet_set = bag_of_words(custom_tweet)
print (classifier.classify(custom_tweet_set)) # Output: neg
# Negative tweet correctly classified as negative

# probability result
prob_result = classifier.prob_classify(custom_tweet_set)
print (prob_result) # Output: <ProbDist with 2 samples>
print (prob_result.max()) # Output: neg
print (prob_result.prob("neg")) # Output: 0.941844352481
print (prob_result.prob("pos")) # Output: 0.0581556475194


custom_tweet = "It was a wonderful and amazing movie. I loved it. Best direction, good acting."
custom_tweet_set = bag_of_words(custom_tweet)

print (classifier.classify(custom_tweet_set)) # Output: pos
# Positive tweet correctly classified as positive

# probability result
prob_result = classifier.prob_classify(custom_tweet_set)
print (prob_result) # Output: <ProbDist with 2 samples>
print (prob_result.max()) # Output: pos
print (prob_result.prob("neg")) # Output: 0.00131055449755
print (prob_result.prob("pos")) # Output: 0.998689445502

neg
<ProbDist with 2 samples>
neg
0.931815829423131
0.06818417057687025
pos
<ProbDist with 2 samples>
pos
0.0003673552891362668
0.999632644710862


In [19]:
from collections import defaultdict

actual_set = defaultdict(set)
predicted_set = defaultdict(set)

actual_set_cm = []
predicted_set_cm = []

for index, (feature, actual_label) in enumerate(test_set):
    actual_set[actual_label].add(index)
    actual_set_cm.append(actual_label)

    predicted_label = classifier.classify(feature)

    predicted_set[predicted_label].add(index)
    predicted_set_cm.append(predicted_label)
    
from nltk.metrics import precision, recall, f_measure, ConfusionMatrix

print('pos precision:', precision(actual_set['pos'], predicted_set['pos'])) # Output: pos precision: 0.762896825397
print('pos recall:', recall(actual_set['pos'], predicted_set['pos'])) # Output: pos recall: 0.769
print('pos F-measure:', f_measure(actual_set['pos'], predicted_set['pos'])) # Output: pos F-measure: 0.76593625498
print()
print('neg precision:', precision(actual_set['neg'], predicted_set['neg'])) # Output: neg precision: 0.767137096774
print('neg recall:', recall(actual_set['neg'], predicted_set['neg'])) # Output: neg recall: 0.761
print('neg F-measure:', f_measure(actual_set['neg'], predicted_set['neg'])) # Output: neg F-measure: 0.7640562249

pos precision: 0.7095864661654135
pos recall: 0.755
pos F-measure: 0.7315891472868217

neg precision: 0.7382478632478633
neg recall: 0.691
neg F-measure: 0.7138429752066116
