In [1]:
!pip install kagglehub



In [None]:
# # Install and Import Libraries

# import kagglehub
# import os
# import pandas as pd
# import nltk
# from nltk.tokenize import word_tokenize
# from collections import defaultdict, Counter
# import re

# # --- IMPORTANT FIX FOR NLTK PUNKT_TAB ---
# # Ensure you have the necessary NLTK data.
# # Modern NLTK versions (3.8.2+) use 'punkt_tab' instead of 'punkt'.
# # The try-except block now checks for 'punkt_tab'.
# try:
#     nltk.data.find('tokenizers/punkt_tab')
#     print("'punkt_tab' NLTK resource found.")
# except LookupError: # This is the correct exception for missing NLTK data in newer versions
#     print("Downloading 'punkt_tab' NLTK resource...")
#     nltk.download('punkt_tab')
#     print("'punkt_tab' downloaded successfully.")
# except Exception as e: # Catch any other unexpected exceptions
#     print(f"An unexpected error occurred during NLTK data check: {e}")
#     print("Attempting to download 'punkt' as a fallback, though 'punkt_tab' is preferred.")
#     try:
#         nltk.download('punkt')
#         print("'punkt' downloaded successfully as a fallback.")
#     except Exception as e_fallback:
#         print(f"Fallback 'punkt' download also failed: {e_fallback}")


# print("Libraries imported successfully.")

'punkt_tab' NLTK resource found.
Libraries imported successfully.


In [None]:
# # Download latest version
# path = kagglehub.dataset_download("ahmadseloabadi/whatsapp-app-reviews-from-google-play-store")

# print("Path to dataset files:", path)

Path to dataset files: /home/sagemaker-user/.cache/kagglehub/datasets/ahmadseloabadi/whatsapp-app-reviews-from-google-play-store/versions/2


In [1]:
# --- REPLACE the KaggleHub download and CSV loading with the same corpus as nextword.ipynb ---

import os
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from collections import defaultdict, Counter
import re

# Ensure NLTK punkt is available
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

# Read the same corpus as nextword.ipynb
df = pd.read_parquet("train-00000-of-00010.parquet", engine="pyarrow")

# Use the same number of lines as in nextword.ipynb
numlines = 10000  # Set to match nextword.ipynb
corpus = df["text"].dropna().tolist()[:numlines]

# Preprocess: lowercase, remove punctuation, tokenize
def preprocess_text(text):
    text = str(text).lower()
    text = re.sub(r'[^\w\s]', '', text)
    return word_tokenize(text)

# Apply preprocessing
df_content = pd.DataFrame({'content': corpus})
df_content['processed_content'] = df_content['content'].apply(preprocess_text)

print(df_content.head())
print(f"Number of reviews: {len(df_content)}")

                                             content  \
0  usually , he would be tearing around the livin...   
1  but just one look at a minion sent him practic...   
2  that had been megan 's plan when she got him d...   
3  he 'd seen the movie almost by mistake , consi...   
4  she liked to think being surrounded by adults ...   

                                   processed_content  
0  [usually, he, would, be, tearing, around, the,...  
1  [but, just, one, look, at, a, minion, sent, hi...  
2  [that, had, been, megan, s, plan, when, she, g...  
3  [he, d, seen, the, movie, almost, by, mistake,...  
4  [she, liked, to, think, being, surrounded, by,...  
Number of reviews: 10000


In [2]:
# Define Preprocessing Function and Apply

def preprocess_text(text):
    """
    Cleans and tokenizes text: lowercasing, removing punctuation, and splitting into words.
    """
    if pd.isna(text): # Handle potential NaN values if any slipped through dropna
        return []
    text = str(text).lower() # Ensure it's a string and lowercase
    text = re.sub(r'[^\w\s]', '', text) # Remove punctuation (keep alphanumeric and whitespace)
    tokens = word_tokenize(text)
    return tokens

# Apply preprocessing to all review content
print("Starting text preprocessing...")
df_content['processed_content'] = df_content['content'].apply(preprocess_text)
print("Text preprocessing complete.")

# Display a sample of processed content
print("\nSample of processed content:")
print(df_content['processed_content'].head())

Starting text preprocessing...
Text preprocessing complete.

Sample of processed content:
0    [usually, he, would, be, tearing, around, the,...
1    [but, just, one, look, at, a, minion, sent, hi...
2    [that, had, been, megan, s, plan, when, she, g...
3    [he, d, seen, the, movie, almost, by, mistake,...
4    [she, liked, to, think, being, surrounded, by,...
Name: processed_content, dtype: object


In [3]:
# Cell 4: Build the Bigram Language Model

print("Building bigram model...")
bigram_model = defaultdict(lambda: defaultdict(int))

for tokens in df_content['processed_content']:
    if len(tokens) > 1: # Ensure there are at least two words to form a bigram
        for i in range(len(tokens) - 1):
            current_word = tokens[i]
            next_word = tokens[i+1]
            bigram_model[current_word][next_word] += 1
print("Bigram model built.")

# Optional: Print some sample bigram counts
print("\nSample bigram counts for 'the':")
if 'the' in bigram_model:
    sorted_the_bigrams = sorted(bigram_model['the'].items(), key=lambda item: item[1], reverse=True)
    print(sorted_the_bigrams[:5])
else:
    print("'the' not found in model (unlikely).")

Building bigram model...
Bigram model built.

Sample bigram counts for 'the':
[('door', 79), ('way', 66), ('last', 57), ('same', 56), ('first', 51)]


In [4]:
# Cell 5: Define Prediction Function

def predict_next_word(input_word, model, top_n=3):
    """
    Predicts the next most likely word(s) based on the input_word using the bigram model.
    """
    input_word = input_word.lower()
    if input_word in model:
        # Sort predictions by frequency in descending order
        predictions = sorted(model[input_word].items(), key=lambda item: item[1], reverse=True)
        # Filter out empty strings or non-alpha words that might have slipped through
        clean_predictions = [word for word, count in predictions if word.strip() and word.isalpha()]
        return clean_predictions[:top_n]
    else:
        return ["No prediction (word not in vocabulary or very rare)."]

print("Prediction function defined.")

Prediction function defined.


In [5]:
# Cell 6: Example Usage and Testing

print("\n--- Next Word Prediction Examples ---")

# Example 1
input_word_1 = "the"
predictions_1 = predict_next_word(input_word_1, bigram_model)
print(f"If you type '{input_word_1}', the next word could be: {predictions_1}")

# Example 2
input_word_2 = "app"
predictions_2 = predict_next_word(input_word_2, bigram_model)
print(f"If you type '{input_word_2}', the next word could be: {predictions_2}")

# Example 3
input_word_3 = "good"
predictions_3 = predict_next_word(input_word_3, bigram_model)
print(f"If you type '{input_word_3}', the next word could be: {predictions_3}")

# Example 4 (word not in model - might be rare or misspelled)
input_word_4 = "extraordinary"
predictions_4 = predict_next_word(input_word_4, bigram_model)
print(f"If you type '{input_word_4}', the next word could be: {predictions_4}")

# Example 5 (common phrase start)
input_word_5 = "i"
predictions_5 = predict_next_word(input_word_5, bigram_model)
print(f"If you type '{input_word_5}', the next word could be: {predictions_5}")

# Example 6 (another common phrase start)
input_word_6 = "this"
predictions_6 = predict_next_word(input_word_6, bigram_model)
print(f"If you type '{input_word_6}', the next word could be: {predictions_6}")


--- Next Word Prediction Examples ---
If you type 'the', the next word could be: ['door', 'way', 'last']
If you type 'app', the next word could be: ['No prediction (word not in vocabulary or very rare).']
If you type 'good', the next word could be: ['to', 'for', 'idea']
If you type 'extraordinary', the next word could be: ['No prediction (word not in vocabulary or very rare).']
If you type 'i', the next word could be: ['m', 'was', 'do']
If you type 'this', the next word could be: ['is', 'was', 'time']


In [6]:
# Cell 6: Optimized Example Usage and Testing with a loop

print("\n--- Next Word Prediction Examples (Optimized Loop) ---")

# Define a list of input words to test
test_words = [
    "the",
    "app",
    "good",
    "extraordinary", # Word likely not in vocabulary
    "i",
    "this",
    "whatsapp", # Another common word in reviews
    "update",   # A word related to app changes
    "messages"  # A word related to app function
]

for word in test_words:
    predictions = predict_next_word(word, bigram_model)
    print(f"If you type '{word}', the next word could be: {predictions}")

print("\n--- End of Examples ---")


--- Next Word Prediction Examples (Optimized Loop) ---
If you type 'the', the next word could be: ['door', 'way', 'last']
If you type 'app', the next word could be: ['No prediction (word not in vocabulary or very rare).']
If you type 'good', the next word could be: ['to', 'for', 'idea']
If you type 'extraordinary', the next word could be: ['No prediction (word not in vocabulary or very rare).']
If you type 'i', the next word could be: ['m', 'was', 'do']
If you type 'this', the next word could be: ['is', 'was', 'time']
If you type 'whatsapp', the next word could be: ['No prediction (word not in vocabulary or very rare).']
If you type 'update', the next word could be: ['No prediction (word not in vocabulary or very rare).']
If you type 'messages', the next word could be: ['first', 'were']

--- End of Examples ---


In [26]:
# Cell 6: Optimized Example Usage and Evaluation from Corpus

print("\n--- Next Word Prediction Evaluation from Corpus ---")

# We'll collect a sample of (current_word, actual_next_word) pairs from the corpus
# to test our model.
# To avoid testing on every single bigram (which can be very large and slow),
# let's randomly sample some bigrams from our processed content.

# Flatten all processed tokens into a single list
all_tokens = [word for sublist in df_content['processed_content'] for word in sublist]

# Create actual bigrams from the flattened list
corpus_bigrams = []
if len(all_tokens) > 1:
    for i in range(len(all_tokens) - 1):
        corpus_bigrams.append((all_tokens[i], all_tokens[i+1]))

print(f"Total bigrams in corpus: {len(corpus_bigrams)}")

# Sample a smaller number of bigrams for testing to keep it manageable
# Adjust num_samples as needed for performance vs. comprehensiveness
num_samples = min(50, len(corpus_bigrams)) # Test up to 50 samples or all if less than 50
import random
test_samples = random.sample(corpus_bigrams, num_samples) if num_samples > 0 else []

print(f"Testing with {len(test_samples)} random bigrams from the corpus.")

correct_in_top_1 = 0
correct_in_top_3 = 0 # Check if the actual next word is in the top 3 predictions
total_predictions = 0

if not test_samples:
    print("No test samples available. Corpus might be too small or empty after preprocessing.")
else:
    for current_word, actual_next_word in test_samples:
        total_predictions += 1
        predictions = predict_next_word(current_word, bigram_model, top_n=3)

        print(f"\nInput: '{current_word}'")
        print(f"Actual Next Word: '{actual_next_word}'")
        print(f"Model Predictions (Top 3): {predictions}")

        if actual_next_word in predictions:
            correct_in_top_3 += 1
            if actual_next_word == predictions[0]: # Check if it's the very first prediction
                correct_in_top_1 += 1
            print("Status: CORRECT (Actual word in top predictions)")
        else:
            print("Status: INCORRECT (Actual word NOT in top predictions)")

    # Calculate and print accuracy
    accuracy_top_1 = (correct_in_top_1 / total_predictions) * 100 if total_predictions > 0 else 0
    accuracy_top_3 = (correct_in_top_3 / total_predictions) * 100 if total_predictions > 0 else 0

    print("\n--- Prediction Summary ---")
    print(f"Total predictions made: {total_predictions}")
    print(f"Correct (Top 1 Prediction): {correct_in_top_1} / {total_predictions} ({accuracy_top_1:.2f}%)")
    print(f"Correct (Any in Top 3 Predictions): {correct_in_top_3} / {total_predictions} ({accuracy_top_3:.2f}%)")

print("\n--- End of Evaluation ---")


--- Next Word Prediction Evaluation from Corpus ---
Total bigrams in corpus: 110075
Testing with 50 random bigrams from the corpus.

Input: 'anymore'
Actual Next Word: 'dropping'
Model Predictions (Top 3): ['the', 'erectile', 'not']
Status: INCORRECT (Actual word NOT in top predictions)

Input: 'this'
Actual Next Word: 'one'
Model Predictions (Top 3): ['is', 'was', 'time']
Status: INCORRECT (Actual word NOT in top predictions)

Input: 'in'
Actual Next Word: 'place'
Model Predictions (Top 3): ['the', 'a', 'her']
Status: INCORRECT (Actual word NOT in top predictions)

Input: 'especially'
Actual Next Word: 'if'
Model Predictions (Top 3): ['since', 'if', 'after']
Status: CORRECT (Actual word in top predictions)

Input: 'and'
Actual Next Word: 'then'
Model Predictions (Top 3): ['i', 'then', 'she']
Status: CORRECT (Actual word in top predictions)

Input: 'is'
Actual Next Word: 'the'
Model Predictions (Top 3): ['nt', 'that', 'a']
Status: INCORRECT (Actual word NOT in top predictions)

Input: