In [2]:
## Importing libraries
import nltk
from nltk.probability import FreqDist
from nltk.corpus import PlaintextCorpusReader
from nltk.stem import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer

In [5]:
## Build the travel-blog corpus from the plain-text files under the data directory
corpus_root = "./data"
# '.*' is a regular expression, so the reader exposes every file found under the root
reviews = PlaintextCorpusReader(root=corpus_root, fileids='.*')
# Word tokens of the travel blog file only
travel_corpus = reviews.words(fileids='travelblog.txt')

---

# 1. length

In [12]:
## Size of the original travel corpus (every token, before any filtering)
travel_corpus_len = len(travel_corpus)

## Keep only the purely alphabetic tokens, normalised to lowercase, and measure that too
travel_alpha_corpus = [token.lower() for token in travel_corpus if token.isalpha()]
travel_alpha_corpus_len = len(travel_alpha_corpus)

## Report both sizes
print(f"Original corpus length: {travel_corpus_len}")
print(f"Alphabet-only and lower-cased corpus length: {travel_alpha_corpus_len}")

Original corpus length: 9922
Alphabet-only and lower-cased corpus length: 8515






# 2. lexical diversity



In [13]:
## Function for calculating the lexical diversity of a text
def lexical_diversity(text):
    sorted_words = sorted(w.lower() for w in text)
    unique_sorted_words = sorted(set(w.lower() for w in text))
    return len(set(unique_sorted_words)) / len(sorted_words)

In [14]:
## Compute the lexical diversity of the raw and the alphabet-only corpora, then report them
## NOTE(review): the second label says "recipe" although the data passed in is the
## travel corpus; the text is kept unchanged here so the printed output stays the same.
travel_diversity = lexical_diversity(travel_corpus)
travel_alpha_diversity = lexical_diversity(travel_alpha_corpus)
print(f"lexical diversity for the travel corpus: {travel_diversity}")
print(f"lexical diversity for the alphabet only recipe corpus: {travel_alpha_diversity}")

lexical diversity for the travel corpus: 0.19018343075992744
lexical diversity for the alphabet only recipe corpus: 0.21397533763945978


# 3. top 10 most frequent words + their counts

In [15]:
## Count how often each word occurs in the alphabet-only travel corpus
freq_dist = FreqDist(travel_alpha_corpus)

## Keep the ten highest-count (word, count) pairs, most frequent first
TOP_WORD_COUNT = 10
most_freq_words = freq_dist.most_common(TOP_WORD_COUNT)

In [16]:
## Report each of the top-ranked words together with how often it occurs
print("10 most frequent words in the alphabet-only travel corpus:")
for ranked_word, occurrences in most_freq_words:
    report_line = f"word: '{ranked_word}', count: {occurrences}"
    print(report_line)

10 most frequent words in the alphabet-only travel corpus:
word: 'to', count: 293
word: 'the', count: 286
word: 'you', count: 258
word: 'and', count: 223
word: 'i', count: 208
word: 'a', count: 205
word: 'in', count: 183
word: 'of', count: 155
word: 'your', count: 143
word: 'that', count: 113


# 4. words at least 10 characters long + their counts

In [17]:
## Collect the (word, count) pairs whose word has ten or more characters
MIN_WORD_LENGTH = 10
long_words_with_counts = [
    entry for entry in freq_dist.items() if len(entry[0]) >= MIN_WORD_LENGTH
]

## List each long word next to its count, in the order the distribution yields them
print("Words that are at least 10 characters long in the alphabet-only travel corpus:")
for long_word, occurrences in long_words_with_counts:
    print(f"word: '{long_word}', count: {occurrences}")

Words that are at least 10 characters long in the alphabet-only travel corpus:
word: 'shipwrecked', count: 2
word: 'antarctica', count: 1
word: 'lookalikes', count: 1
word: 'originally', count: 1
word: 'establishing', count: 1
word: 'continents', count: 1
word: 'developing', count: 2
word: 'conventional', count: 1
word: 'influential', count: 1
word: 'everything', count: 6
word: 'businesses', count: 1
word: 'opportunity', count: 2
word: 'consulting', count: 1
word: 'honeymoons', count: 1
word: 'preparation', count: 3
word: 'independently', count: 3
word: 'consolidate', count: 1
word: 'environment', count: 1
word: 'fearmongering', count: 1
word: 'destination', count: 18
word: 'everywhere', count: 2
word: 'frequently', count: 2
word: 'extensions', count: 1
word: 'possibilities', count: 1
word: 'independent', count: 2
word: 'philosophy', count: 1
word: 'environmentally', count: 1
word: 'economically', count: 1
word: 'responsible', count: 2
word: 'offsetting', count: 1
word: 'photographing'

# 5. the longest sentence

In [18]:
## Sentence-tokenised view of the travel blog
## (the variable keeps its "recipe" name even though the file is the travel blog)
recipe_sentences = reviews.sents('travelblog.txt')

## Pick the sentence with the most tokens; on a tie the earliest one wins,
## and a corpus without sentences yields an empty sentence
longest_sentence = max(recipe_sentences, key=len, default=[])

## Space-separate the tokens so the sentence can be printed as one string
joined_longest_sentence = ' '.join(longest_sentence)

## Show the sentence followed by its token count
print(f"longest sentence:\n{joined_longest_sentence}")
print()
print(f"number of words: {len(longest_sentence)}")

longest sentence:
Check in Regularly It ’ s a good idea for at least one designated friend or family member to have a copy of your itinerary in advance : your flight numbers , your accommodation , and a general schedule of where you ’ ll be on which dates , as well as information on your travel insurance , credit cards , and a bank account number .

number of words: 67


# 6. stemmed version of longest sentence

In [19]:
## Set up one Porter stemmer and one Lancaster stemmer
porter_stemmer = PorterStemmer()
lancaster_stemmer = LancasterStemmer()

## Run every token of the longest sentence through each stemmer
port_stemmed_sentence = [porter_stemmer.stem(token) for token in longest_sentence]
lanc_stemmed_sentence = [lancaster_stemmer.stem(token) for token in longest_sentence]

## Rebuild printable, space-separated strings from the stemmed tokens
joined_port_stemmed_sentence = ' '.join(port_stemmed_sentence)
joined_lanc_stemmed_sentence = ' '.join(lanc_stemmed_sentence)

## Show the Porter result first, then the Lancaster result
print(f"porter stemmed longest sentence:\n{joined_port_stemmed_sentence}")
print()
print(f"lancaster stemmed longest sentence:\n{joined_lanc_stemmed_sentence}")

porter stemmed longest sentence:
check in regularli it ’ s a good idea for at least one design friend or famili member to have a copi of your itinerari in advanc : your flight number , your accommod , and a gener schedul of where you ’ ll be on which date , as well as inform on your travel insur , credit card , and a bank account number .

lancaster stemmed longest sentence:
check in regul it ’ s a good ide for at least on design friend or famy memb to hav a cop of yo itin in adv : yo flight numb , yo accommod , and a gen schedule of wher you ’ ll be on which dat , as wel as inform on yo travel ins , credit card , and a bank account numb .
