# Ngrams

An n-gram is a contiguous sequence of 'n' items from a given sample of text or speech. N-grams are widely used in natural language processing for tasks like text prediction, language modeling, and machine translation. Here we are looking for the folkloristic version of an n-gram: the proverb.

In [4]:
# IMPORTS
import nltk
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from collections import Counter
import string

# Read the responses file line by line.
# The encoding is passed explicitly so decoding does not depend on the
# platform's default locale.
with open('responses-2-100.txt', 'r', encoding='utf-8') as f:
    lines = f.readlines()

texts = lines[1:-1]  # Exclude first and last line

# Concatenate the kept lines into one big text for tokenization.
# NOTE(review): `one_big_string` is consumed by the preprocessing cell below
# but was not defined anywhere in the visible notebook; confirm that this
# matches the definition used in the original session.
one_big_string = ''.join(texts)

# Sanity check: number of kept lines, then the last five of them.
# Each line still carries its trailing newline, so print() leaves a blank
# line after every entry.
print(len(texts))
for text in texts[-5:]:
    print(text)

97
"Silence can be a powerful statement amidst the noise of social media.",

"Reposts reveal not just your interests but your values.",

"Block out the noise that threatens your peace of mind.",

"Filters can distort reality, but authenticity shines through.",

"Be the signal in a sea of noise.",



In [11]:
def preprocess_text(text):
    """Lowercase *text*, tokenize it, and keep only alphanumeric tokens.

    Any token that contains a non-alphanumeric character is dropped, so
    punctuation never reaches the returned list.
    """
    lowered = text.lower()
    # str.isalnum() is False for every token made of (or containing)
    # punctuation, so this single predicate does all of the filtering.
    return [tok for tok in word_tokenize(lowered) if tok.isalnum()]

# Run the whole corpus through the preprocessing step in one pass
word_tokens = preprocess_text(one_big_string)
token_total = len(word_tokens)
print(f"Total number of tokens after preprocessing: {token_total}")

Total number of tokens after preprocessing: 4986


In [22]:
def get_all_ngrams_in_range(tokens, min_n, max_n):
    """
    Generates a list of all n-grams for n within the specified range.

    Args:
        tokens: Iterable of tokens. It is materialized once, so one-shot
            iterables (generators, iterators) contribute to every n rather
            than only to the first pass.
        min_n: Smallest n-gram length to generate (inclusive).
        max_n: Largest n-gram length to generate (inclusive).

    Returns:
        A list of tuples: all min_n-grams first, then the (min_n + 1)-grams,
        and so on up to max_n. The list is empty when min_n > max_n.
    """
    # Every pass below re-reads the tokens; capturing them up front keeps a
    # one-shot iterator from being exhausted after the first value of n.
    token_list = list(tokens)
    all_ngrams = []
    # Loop from min_n up to and including max_n
    for n in range(min_n, max_n + 1):
        # ngrams() yields tuples of n tokens; extend() consumes it directly,
        # so no intermediate list is needed.
        all_ngrams.extend(ngrams(token_list, n))
    return all_ngrams

In [23]:
# Inclusive n-gram length range; placeholder values until better ones emerge.
MIN_N, MAX_N = 8, 20

# Collect every n-gram whose length falls inside that range
long_ngrams = get_all_ngrams_in_range(word_tokens, MIN_N, MAX_N)

# Each n-gram is a tuple of tokens. Printing the unpacked tuple separates the
# tokens with single spaces, so the n-gram reads as a phrase.
print(f"\nExample of a generated {MIN_N}-gram (first one):")
print(*long_ngrams[0])


Example of a generated 8-gram (first one):
do compare your to someone else highlight reel


In [24]:
# Tally how many times each distinct n-gram occurs
ngram_counts = Counter(long_ngrams)

# Number of top-ranked n-grams to report
TOP_K = 10

# (n-gram, count) pairs, most frequent first
most_common_ngrams = ngram_counts.most_common(TOP_K)

print(f"\n--- Top {TOP_K} Most Frequent N-grams ({MIN_N} to {MAX_N} tokens) ---")
for gram, freq in most_common_ngrams:
    # Render the token tuple as a space-separated phrase, tagged with its length
    words = ' '.join(gram)
    print(f"[{len(gram)}-gram, Count: {freq}]: \"{words}\"")


--- Top 10 Most Frequent N-grams (8 to 20 tokens) ---
[8-gram, Count: 15]: "always believe that something wonderful is about to"
[8-gram, Count: 15]: "believe that something wonderful is about to happen"
[9-gram, Count: 15]: "always believe that something wonderful is about to happen"
[8-gram, Count: 14]: "keep your face always toward the sunshine and"
[8-gram, Count: 14]: "your face always toward the sunshine and shadows"
[8-gram, Count: 14]: "face always toward the sunshine and shadows will"
[8-gram, Count: 14]: "always toward the sunshine and shadows will fall"
[8-gram, Count: 14]: "toward the sunshine and shadows will fall behind"
[8-gram, Count: 14]: "the sunshine and shadows will fall behind you"
[8-gram, Count: 14]: "happiness is not the absence of problems it"
