In [1]:
import nltk
import pandas as pd
import re
from nltk.tokenize import word_tokenize

In [2]:
# Download the NLTK resources this notebook relies on.
# 'punkt' is the tokenizer model used by older NLTK releases; recent releases
# make word_tokenize load 'punkt_tab' instead, so fetch both to keep the
# notebook runnable on a fresh environment regardless of the installed version.
nltk.download('punkt')
nltk.download('punkt_tab')
# Stop-word lists (the English list is used for filtering below).
# Kept as the last expression so the cell still displays its True result.
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\15531\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\15531\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Read the IMDB reviews CSV into a DataFrame; the path is relative to the
# notebook's working directory, so adjust it if the file lives elsewhere.
df = pd.read_csv(filepath_or_buffer='IMDB Dataset.csv')

In [4]:
# Accumulators for the filtered tokens of each sentiment class. The processing
# loop below reads the text from the 'review' column and the label
# ('positive' / 'negative') from the 'sentiment' column.
positive_tokens, negative_tokens = [], []

In [5]:
# English stop words from NLTK, held in a set so the per-token membership
# test in the filtering step is a constant-time lookup.
unwanted_tokens = {*nltk.corpus.stopwords.words('english')}

In [6]:
print("----start----")

# Tokens that survive stop-word removal but carry no content: the quote marks
# emitted by word_tokenize, bare punctuation, and leftovers of HTML line breaks.
# Built once, as a set, so the membership test inside the loop is O(1) instead
# of re-creating and scanning a list for every token.
noise_tokens = {"''", '``', '*', '', ',', '.', 'br', '<', '>', '!', '(', ')', '--', '?'}

# Empty the accumulators first so re-running this cell does not append every
# review a second time and silently double all downstream counts.
positive_tokens.clear()
negative_tokens.clear()

# Walk the two columns in lockstep; this yields the same (review, sentiment)
# pairs as df.iterrows() without building a Series object for every row.
for review, sentiment in zip(df['review'], df['sentiment']):
    # Normalize the review: lowercase it and replace HTML tags (e.g. <br />)
    # with a space. Punctuation is not removed here; it is filtered per token.
    cleaned_review = re.sub(r'<.*?>', ' ', review.lower())

    # Tokenize the cleaned review
    tokens = word_tokenize(cleaned_review)

    # Drop purely numeric tokens, stop words, and the noise tokens above
    filtered_tokens = [
        token
        for token in tokens
        if not token.isnumeric()
        and token not in unwanted_tokens
        and token not in noise_tokens
    ]

    # Append to the list matching the review's sentiment; rows with any other
    # label are ignored.
    if sentiment == 'positive':
        positive_tokens.extend(filtered_tokens)
    elif sentiment == 'negative':
        negative_tokens.extend(filtered_tokens)

----start----


In [7]:
from nltk.util import ngrams
from nltk import FreqDist


In [8]:
# Pool the tokens of both sentiment classes, then report the corpus size
# (total number of tokens) and the vocabulary size (number of distinct tokens).
token_all = positive_tokens + negative_tokens
token_size = len(token_all)
token_freq = FreqDist(token_all)
vocab_size = len(token_freq)
print(f"token_size = {token_size}")
print(f"vocab_size = {vocab_size}")

token_size = 6210698
vocab_size = 151068


In [9]:
# Count every adjacent token pair in the positive reviews and show the ten
# most frequent. positive_tokens is one flat list covering all reviews, so
# pairs that straddle two consecutive reviews are counted as well.
bigrams_pos = ngrams(positive_tokens, 2)
bigram_freq_pos = FreqDist(bigrams_pos)
top_bigrams_pos = bigram_freq_pos.most_common(10)
print(f"bigram_freq_pos.most_common(10):{top_bigrams_pos}")

bigram_freq_pos.most_common(10):[(('ca', "n't"), 2858), (('film', "'s"), 1681), (('one', 'best'), 1654), (("'ve", 'seen'), 1332), (("n't", 'know'), 1238), (('wo', "n't"), 1215), (('even', 'though'), 1092), (('ever', 'seen'), 959), (('movie', "'s"), 945), (('could', "n't"), 942)]


In [10]:
# Count every adjacent token pair in the negative reviews and show the ten
# most frequent. negative_tokens is one flat list covering all reviews, so
# pairs that straddle two consecutive reviews are counted as well.
bigrams_neg = ngrams(negative_tokens, 2)
bigram_freq_neg = FreqDist(bigrams_neg)
top_bigrams_neg = bigram_freq_neg.most_common(10)
print(f"bigram_freq_neg.most_common(10):{top_bigrams_neg}")

bigram_freq_neg.most_common(10):[(('ca', "n't"), 4170), (("n't", 'even'), 2231), (('could', "n't"), 2092), (('ever', 'seen'), 1722), (("n't", 'know'), 1674), (('film', "'s"), 1438), (("'ve", 'seen'), 1434), (('waste', 'time'), 1420), (('special', 'effects'), 1402), (('would', "n't"), 1341)]


In [11]:
# Count every run of three adjacent tokens in the positive reviews and show
# the ten most frequent. positive_tokens is one flat list covering all
# reviews, so triples that straddle two consecutive reviews are counted too.
trigrams_pos = ngrams(positive_tokens, 3)
trigram_freq_pos = FreqDist(trigrams_pos)
top_trigrams_pos = trigram_freq_pos.most_common(10)
print(f"trigram_freq_pos.most_common(10):{top_trigrams_pos}")

trigram_freq_pos.most_common(10):[(("'ve", 'ever', 'seen'), 388), (('ca', "n't", 'help'), 222), (('new', 'york', 'city'), 193), (('ca', "n't", 'wait'), 172), (('world', 'war', 'ii'), 158), (('one', 'best', 'movies'), 143), (('based', 'true', 'story'), 133), (('ca', "n't", 'get'), 131), (('one', 'best', 'films'), 129), (('ca', "n't", 'say'), 126)]


In [12]:
# Count every run of three adjacent tokens in the negative reviews and show
# the ten most frequent. negative_tokens is one flat list covering all
# reviews, so triples that straddle two consecutive reviews are counted too.
trigrams_neg = ngrams(negative_tokens, 3)
trigram_freq_neg = FreqDist(trigrams_neg)
top_trigrams_neg = trigram_freq_neg.most_common(10)
print(f"trigram_freq_neg.most_common(10):{top_trigrams_neg}")

trigram_freq_neg.most_common(10):[(("'ve", 'ever', 'seen'), 707), (("n't", 'waste', 'time'), 387), (('ca', "n't", 'believe'), 368), (('worst', 'movie', 'ever'), 358), (('one', 'worst', 'movies'), 309), (('ca', "n't", 'even'), 242), (('movie', 'ever', 'seen'), 241), (('worst', 'movies', 'ever'), 204), (("n't", 'make', 'sense'), 199), (("n't", 'get', 'wrong'), 188)]


In [13]:
from nltk import ngrams, FreqDist
import math
def get_freq_prob(word1, word2, word3, tokens=None, vocab=None):
    """Print and return the add-one smoothed trigram probability of word3.

    Computes

        P(word3 | word1, word2) = (1 + count(word1, word2, word3))
                                  / (vocab + count(word1, word2))

    where the counts are occurrences of the adjacent pair / triple in the
    token stream.

    The counts are gathered in a single streaming pass that only tracks the
    requested pair and triple. This avoids materialising every bigram and
    trigram of the corpus, and two full frequency tables, on each call.

    Args:
        word1: First context word.
        word2: Second context word.
        word3: Word whose conditional probability is estimated.
        tokens: Iterable of tokens to count over. Defaults to the notebook's
            global ``token_all``.
        vocab: Vocabulary size used in the smoothing denominator. Defaults to
            the notebook's global ``vocab_size``.

    Returns:
        The smoothed probability as a float (it is also printed together with
        the raw pair and triple counts).
    """
    if tokens is None:
        tokens = token_all
    if vocab is None:
        vocab = vocab_size

    word1_word2_count = 0
    word1_word2_word3_count = 0

    # Sentinel that never compares equal to a real token, so the first one or
    # two tokens cannot complete a match against missing predecessors.
    unseen = object()
    before_last, last = unseen, unseen
    for token in tokens:
        if last == word2:
            if before_last == word1 and token == word3:
                word1_word2_word3_count += 1
        if last == word1 and token == word2:
            word1_word2_count += 1
        before_last, last = last, token

    res = (1 + word1_word2_word3_count)/(vocab + word1_word2_count)
    print(f'{word3}| {word1}, {word2} = {word1_word2_count}, {word1_word2_word3_count}, {res}')
    return res

In [16]:
# Report the smoothed trigram probability for a handful of sample trigrams:
# frequent ones taken from the rankings above plus one that never occurs.
sample_trigrams = [
    ('based', 'true', 'story'),
    ('new', 'york', 'city'),
    ('worst', 'movie', 'ever'),
    ('one', 'worst', 'movies'),
    ('hi', 'i', 'am'),
]
for first_word, second_word, third_word in sample_trigrams:
    get_freq_prob(first_word, second_word, third_word)


story|based,true=213,173,0.0011501774842842128
city|new,york=1299,273,0.001798289655896618
ever|worst,movie=728,365,0.0024111307280824264
movies|one,worst=983,313,0.0020650965794371625
am|hi,i=0,0,6.619535573384171e-06
