# N-grams

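Definition: An n-gram is a contiguous sequence of n items (for example, words or characters) taken from a sequence of tokens. With n = 1 the items are unigrams (single tokens); with n = 2 they are bigrams (pairs of adjacent tokens).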

In [2]:
# Use scikit-learn to build a bag-of-words model with unigrams and bigrams

# Import the CountVectorizer class from sklearn's feature_extraction.text module
from sklearn.feature_extraction.text import CountVectorizer

# Toy corpus: five Lorem-ipsum sentences, one document per list entry
corpus = [
    "Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
    "Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.",
    "Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.",
    "Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.",
    "Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.",
]

# Configure the bag-of-words vectorizer:
#   max_features=40     keep only the 40 most frequent terms across the corpus
#   lowercase=True      lowercase the text before tokenizing (also the default)
#   ngram_range=(1, 2)  extract unigrams (single words) and bigrams (pairs of
#                       consecutive words)
vectorizer = CountVectorizer(max_features=40, lowercase=True, ngram_range=(1, 2))

# Learn the vocabulary and build the document-term count matrix in one step.
# fit_transform() both fits (learns the vocabulary) and transforms (counts terms per document).
X = vectorizer.fit_transform(corpus)

# Densify for inspection and keep the result in a name: a bare `X.toarray()`
# in the middle of a cell is neither displayed nor stored.
# Note: for large datasets, prefer working with the sparse matrix directly to save memory.
dense_counts = X.toarray()

# Column labels of the matrix, i.e. the vocabulary (words and n-grams) learned by the vectorizer
print(vectorizer.get_feature_names_out())


['ad' 'dolor' 'dolore' 'in' 'incididunt ut' 'ipsum' 'ipsum dolor' 'irure'
 'irure dolor' 'labore' 'labore et' 'laboris' 'laboris nisi' 'laborum'
 'lorem' 'lorem ipsum' 'magna' 'magna aliqua' 'minim' 'minim veniam'
 'mollit' 'mollit anim' 'nisi' 'nisi ut' 'non' 'non proident' 'nostrud'
 'nostrud exercitation' 'nulla' 'nulla pariatur' 'occaecat'
 'occaecat cupidatat' 'officia' 'ut' 'ut enim' 'ut labore' 'velit'
 'velit esse' 'veniam' 'veniam quis']
