In [5]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer

nltk.download('punkt_tab')
nltk.download('stopwords')

#  Text Preprocessing

# Load CSV file
data = pd.read_csv('imdb_dataset.csv')
data.head()

# preprocessing function
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
  text = text.lower()  # Lowercase
  text = re.sub(r'[^a-zA-Z\s]', '', text) # Remove punctuation and numbers
  tokens = word_tokenize(text) # Tokenize
  tokens = [word for word in tokens if word not in stop_words]  # Remove stopewords
  return tokens

# Apply to your review column
data['processed_review'] = data['review'].apply(preprocess_text)
data.head()

# Generate n-grams

# Helper function for n-grams
def generate_ngrams(tokens, n):
  return zip(*[tokens[i:] for i in range(n)])

# Count top N n-grams
def get_top_ngrams(corpus, ngram=1, top=10):
  all_ngrams = []
  for tokens in corpus:
    ngrams = generate_ngrams(tokens, ngram)
    all_ngrams += [' '.join(ngram) for ngram in ngrams]
  return Counter(all_ngrams).most_common(top)

# Get top 10 unigrams,bigrams,trigrams
top_unigrams = get_top_ngrams(data['processed_review'], ngram=1)
top_bigrams = get_top_ngrams(data['processed_review'], ngram=2)
top_trigrams = get_top_ngrams(data['processed_review'], ngram=3)

print("Top 10 Unigrams:", top_unigrams)
print("Top 10 Bigrams:", top_bigrams)
print("Top 10 Trigram:", top_trigrams)

# Word Frequency Visualization

all_words = [word for tokens in data['processed_review'] for word in tokens]
word_freq = Counter(all_words)
most_common_words = word_freq.most_common(20)

# Plot
words, freqs = zip(*word_freq)
plt.figure(figsize=(10,5))
sns.barplot(x=list(words), y=list(freqs), palatte='viridis')
plt.xlabel('Words')
plt.ylabel('Frequency')
plt.title('Top 20 Most Frequent Words')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

# Bag-of-words vectorization

data['processed_review_str'] = data['processed_review'].apply(lambda x: ' '.join(x))
#Creat bag of words
vectorizer = CountVectorizer()
x = vectorizer.fit_transform(data['processed_review_str'])

print("Shape of Sparse Matrix:", x.shape)
print("Sample feature Name:", vectorizer.get_feature_names_out()[:10])


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Top 10 Unigrams: [('br', 114890), ('movie', 83523), ('film', 74459), ('one', 51028), ('like', 38992), ('good', 28570), ('even', 24576), ('would', 24024), ('time', 23269), ('really', 22951)]
Top 10 Bigrams: [('br br', 14098), ('ever seen', 2528), ('br film', 2481), ('br movie', 2430), ('ive seen', 2203), ('special effects', 2145), ('dont know', 2056), ('itbr br', 2009), ('even though', 1940), ('one best', 1864)]
Top 10 Trigram: [('ive ever seen', 984), ('dont waste time', 366), ('worst movie ever', 365), ('one worst movies', 313), ('movie ever seen', 299), ('movie br br', 298), ('br br movie', 297), ('br br film', 297), ('dont get wrong', 269), ('new york city', 252)]


ValueError: not enough values to unpack (expected 2, got 1)