In [855]:
with open('text.txt', 'r', encoding="utf-8") as f:
    text = f.read()

In [856]:
text

'The master is a person of an excellent disposition and is remarkable in the ship for his gentleness and the mildness of his discipline. This circumstance, added to his well-known integrity and dauntless courage, made me very desirous to engage him. A youth passed in solitude, my best years spent under your gentle and feminine fosterage, has so refined the groundwork of my character that I cannot overcome an intense distaste to the usual brutality exercised on board ship: I have never believed it to be necessary, and when I heard of a mariner equally noted for his kindliness of heart and the respect and obedience paid to him by his crew, I felt myself peculiarly fortunate in being able to secure his services. I heard of him first in rather a romantic manner, from a lady who owes to him the happiness of her life. This, briefly, is his story. Some years ago he loved a young Russian lady of moderate fortune, and having amassed a considerable sum in prize-money, the father of the girl cons

In [857]:
def clean_text(text):
    ponctuation = ['.', ',', '!', '?', ':', ';']
    text = text.lower()
    text = ''.join([c for c in text if c not in ponctuation])
    return text

In [858]:
def number_of_words(text):
    return len(text.split())

In [859]:
def number_of_sentences(text):
    return len(text.split('.'))

In [860]:
def number_of_unique_words(text):
    text = clean_text(text)
    return len(set(text.split()))

In [861]:
def word_length(text):
    text = clean_text(text)
    words = text.split()
    max_length = max([len(word) for word in words])
    min_length = min([len(word) for word in words])
    average_length = sum([len(word) for word in words]) / len(words)
    return max_length, min_length, average_length

In [862]:
def vocabulary(text):
    text = text.lower()
    with_symbols = set(text)
    without_symbols = set([c for c in text if c.isalpha()])
    return with_symbols, without_symbols

In [863]:
def starting(text, char):
    text = clean_text(text)
    words = text.split()
    start = [word for word in words if word[0] == char]
    return start

In [864]:
def ending(text, char):
    text = clean_text(text)
    words = text.split()
    end = [word for word in words if word[-1] == char]
    return end

In [865]:
def containing(text, substing):
    text = clean_text(text)
    words = text.split()
    contain = [word for word in words if substing in word]
    return contain

In [866]:
def most_frequent(text):
    text = clean_text(text)
    words = text.split()
    return max(set(words), key = words.count)

In [867]:
def most_frequent_substring_length_4(text):
    text = clean_text(text)
    words = text.split()
    substring = [word[i:i+4] for word in words for i in range(len(word)-3)]
    return max(set(substring), key = substring.count)
    
    

In [868]:
print('Number of words: ', number_of_words(text))
print('Number of unique words: ', number_of_unique_words(text))
print('Number of sentences: ', number_of_sentences(text))
print('Word length: ', word_length(text))
print('Vocabulary with symbols: ', vocabulary(text)[0])
print('Vocabulary without symbols: ', vocabulary(text)[1])
print('Words starting with "a": ', starting(text, 'a'))
print('Words ending with "e": ', ending(text, 'e'))
print('Words containing "a": ', containing(text, 'ad'))
print('Most frequent character: ', most_frequent(text))
print('Most frequent substring: ', most_frequent_substring_length_4(text))

Number of words:  395
Number of unique words:  216
Number of sentences:  13
Word length:  (12, 1, 4.589873417721519)
Vocabulary with symbols:  {'y', 'n', '“', 'x', ':', 'k', 'q', '-', 'v', 't', ',', 'g', 'h', 'f', 'm', 'l', '”', 's', ' ', 'i', 'w', ';', 'b', 'r', 'e', 'p', 'u', 'd', 'o', 'z', '!', '.', 'a', 'c', '’'}
Vocabulary without symbols:  {'y', 'n', 'x', 'k', 'q', 'v', 't', 'g', 'h', 'f', 'm', 'l', 's', 'i', 'w', 'b', 'r', 'e', 'p', 'u', 'd', 'o', 'z', 'a', 'c'}
Words starting with "a":  ['a', 'an', 'and', 'and', 'added', 'and', 'a', 'and', 'an', 'and', 'a', 'and', 'and', 'able', 'a', 'a', 'ago', 'a', 'and', 'amassed', 'a', 'and', 'at', 'at', 'another', 'and', 'and', 'abandoned', 'already', 'a', 'and', 'according', 'a', 'as', 'as', 'a', 'and', 'a', 'attends', 'astonishing', 'and']
Words ending with "e":  ['the', 'remarkable', 'the', 'the', 'discipline', 'circumstance', 'courage', 'made', 'me', 'engage', 'solitude', 'gentle', 'feminine', 'fosterage', 'the', 'overcome', 'intense',

In [869]:
import regex as re

In [870]:
import re
class Corpus:
    def __init__(self, text):
        self.text = text

    def clean_text(self, text):
        ponctuation = ['.', ',', '!', '?', ':', ';', "“", "”"]
        self.text = self.text.lower()
        text = ''.join([c for c in text if c not in ponctuation])
        text = text.replace('“', '').replace('”', '')
        return text

    def number_of_words(self):
        return len(self.text.split())

    def number_of_sentences(self):
        return len(self.text.split('.'))

    def number_of_unique_words(self):
        return len(set(self.text.split()))

    def word_length(self):
        text = self.clean_text(self.text)
        words = text.split()
        max_length = max([len(word) for word in words])
        min_length = min([len(word) for word in words])
        average_length = sum([len(word) for word in words]) / len(words)
        return max_length, min_length, average_length

    def vocabulary(self):
        text = self.text.lower()
        with_symbols = set(text)
        without_symbols = set([c for c in text if c.isalpha()])
        return with_symbols, without_symbols

    def starting(self, char):
        text = self.clean_text(self.text)
        words = text.split()
        start = [word for word in words if word[0] == char]
        return start

    def ending(self, char):
        text = self.clean_text(self.text)
        words = text.split()
        end = [word for word in words if word[-1] == char]
        return end

    def containing(self, substing):
        text = self.clean_text(self.text)
        words = text.split()
        contain = [word for word in words if substing in word]
        return contain

    def most_frequent(self):
        text = self.clean_text(self.text)
        words = text.split()
        return max(set(words), key=words.count)

    def most_frequent_substring_length_4(self):
        text = self.clean_text(self.text)
        words = text.split()
        substring = [
            word[i:i + 4] for word in words for i in range(len(word) - 3)
        ]
        return max(set(substring), key=substring.count)

    def find_substring(self, substring):
        substring_length = len(substring)
        position = [(True,
                     i) if self.text[i:i + substring_length] == substring else
                    (False, None)
                    for i in range(len(self.text) - substring_length)]
        return position

    def most_and_least_occurences(self):
        text = self.clean_text(self.text)
        words = list(text.replace(' ', ''))

        most_frequent = max(set(words), key=words.count)
        least_frequent = min(set(words), key=words.count)
        return most_frequent, least_frequent

    def is_anagram(self, word1, word2):
        return sorted(word1) == sorted(word2)

    def sentences(self):
        # Exclude decimal numbers, domain names, and abbreviations from the split pattern
        split_pattern = r'(?<!\d\.)\s*[.!?]+\s+(?!\w)|(?<=\w)\.(?=\w)(?<!etc\.)'
        sentences = re.split(split_pattern, self.text)
        return sentences


In [871]:
a = Corpus(text)

In [872]:
a.most_and_least_occurences()

('e', 'B')