<a href="https://colab.research.google.com/github/kipsangmarion/NLTK-twitter-sentiment-analysis/blob/main/Twitter_sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Twitter Sentiment Analysis

I will use NLTK to perform sentiment analysis on the twitter dataset from NLTK

## 1. Import necessary libraries

In [26]:
import nltk
import random
from nltk.corpus import twitter_samples
nltk.download('twitter_samples')
nltk.download('punkt')
nltk.download('words')

[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

## 2. Load the twitter samples dataset

In [27]:
# Access positive and negative tweets
positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')

# Combine positive and negative tweets
all_tweets = [(tweet, 'positive') for tweet in positive_tweets] + [(tweet, 'negative') for tweet in negative_tweets]

# Shuffle the combined list of tweets
random.shuffle(all_tweets)

## 3. Tokenize the dataset

In [28]:
# Tokenize each tweet
tokenized_tweets = [(nltk.word_tokenize(tweet), sentiment) for (tweet, sentiment) in all_tweets]

## 4. Extract features

In [29]:
# Flatten the list of tokens
all_words = [word.lower() for tweet, _ in tokenized_tweets for word in tweet]

# Get the most common words as features
word_features = nltk.FreqDist(all_words).most_common(2000)

# Extract just the words from the tuples
word_features = [word for word, _ in word_features]

# Define a function to extract features (bag-of-words model)
def extract_features(tweet_tokens):
    tweet_words = set(tweet_tokens)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in tweet_words)
    return features

# Create feature sets
feature_sets = [(extract_features(tweet), sentiment) for tweet, sentiment in tokenized_tweets]

# Print an example feature set
print(feature_sets[0])

({'contains(:)': True, 'contains(@)': True, 'contains(()': False, 'contains())': True, 'contains(i)': False, 'contains(!)': False, 'contains(you)': False, 'contains(.)': True, 'contains(#)': False, 'contains(to)': True, 'contains(the)': False, 'contains(,)': False, 'contains(a)': False, 'contains(-)': False, 'contains(and)': False, 'contains(it)': False, 'contains(my)': False, 'contains(http)': False, 'contains(?)': False, 'contains(for)': False, 'contains(me)': True, 'contains(is)': False, 'contains(in)': False, "contains('s)": False, 'contains(so)': False, "contains(n't)": False, 'contains(have)': False, 'contains(of)': False, 'contains(that)': True, 'contains(;)': False, 'contains(d)': False, 'contains(on)': False, 'contains(this)': False, 'contains(but)': False, 'contains(&)': False, 'contains(do)': False, "contains('m)": False, 'contains(https)': False, 'contains(be)': False, 'contains(we)': False, 'contains(thanks)': False, 'contains(your)': True, 'contains(...)': False, 'contain

## 5. Split data into a training and testing set

In [30]:
from sklearn.model_selection import train_test_split

# Split feature sets into training and testing sets
train_set, test_set = train_test_split(feature_sets, test_size=0.2, random_state=42)

# Print the sizes of the training and testing sets
print("Training set size:", len(train_set))
print("Testing set size:", len(test_set))

Training set size: 8000
Testing set size: 2000


## 6. Modeling

### 6.1 Naive Bayes classifier

In [31]:
NBclassifier = nltk.NaiveBayesClassifier.train(train_set)

print("Naive Bayes Accuracy: ", (nltk.classify.accuracy(NBclassifier, test_set)))
NBclassifier.show_most_informative_features(10)

Naive Bayes Accuracy:  0.994
Most Informative Features
             contains()) = True           positi : negati =     62.8 : 1.0
             contains(() = True           negati : positi =     62.8 : 1.0
           contains(sad) = True           negati : positi =     25.7 : 1.0
          contains(miss) = True           negati : positi =     17.7 : 1.0
          contains(glad) = True           positi : negati =     16.4 : 1.0
       contains(arrived) = True           positi : negati =     15.7 : 1.0
       contains(welcome) = True           positi : negati =     13.7 : 1.0
         contains(hurts) = True           negati : positi =     12.9 : 1.0
          contains(lost) = True           negati : positi =     12.9 : 1.0
     contains(followers) = True           positi : negati =     11.5 : 1.0


### 6.2 Modeling with sk-learn

In [32]:
from nltk.classify.scikitlearn import SklearnClassifier

from sklearn.naive_bayes import MultinomialNB

MNBclassifier = SklearnClassifier(MultinomialNB())
MNBclassifier.train(train_set)
print("Multinomial Naive Bayes Accuracy: ", (nltk.classify.accuracy(MNBclassifier, test_set)))

Multinomial Naive Bayes Accuracy:  0.973
