In [1]:
import pickle
import string
from collections import defaultdict

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

Data Preprocessing

In [2]:
# Text preprocessing
# Step 1: read the whole corpus file into memory as one UTF-8 decoded string.
with open('gutenberg_corpus.txt', 'r' , encoding='utf-8') as file:
    text  = file.read()

In [3]:
# Split the raw corpus text into tokens with NLTK's word tokenizer.
tokens = word_tokenize(text)
# Keep only tokens containing at least one letter or digit; this drops
# tokens that consist purely of punctuation.
filtered_tokens = [token for token in tokens if any(map(str.isalnum, token))]

In [4]:
def preprocess_word(word):
    """Return ``word`` with ASCII punctuation stripped and letters lowercased.

    Only the characters listed in ``string.punctuation`` are removed; any
    other character (including non-ASCII quotes or dashes) is kept.
    """
    # A translation table with no mappings and a deletion set removes every
    # ASCII punctuation character in a single pass.
    punctuation_remover = str.maketrans('', '', string.punctuation)
    stripped = word.translate(punctuation_remover)
    return stripped.lower()

In [5]:
# Remove stop words.
# The tokens still carry their original casing at this point, while NLTK's
# English stop-word list is lowercase, so the membership test is done on the
# lowercased token. Otherwise capitalised stop words ("The", "And", ...)
# would slip through the filter.
stop_words = set(stopwords.words('english'))
filtered_words = [word for word in filtered_tokens if word.lower() not in stop_words]

# preprocess_word strips punctuation and lowercases, so no further
# normalisation pass is needed afterwards.
words = [preprocess_word(word) for word in filtered_words]

In [6]:
def create_word_occurences_map(words):
    """Count how often each item of ``words`` occurs.

    Returns a plain dict mapping each distinct word to its number of
    occurrences, with keys in order of first appearance.
    """
    counts = {}
    for token in words:
        # Unseen words start from an implicit count of zero.
        counts[token] = counts.get(token, 0) + 1
    return counts

In [7]:
def create_probability_map(word_occurrences_map):
    """Convert occurrence counts into relative frequencies.

    Each word's count is divided by the sum of all counts, so the returned
    values add up to 1 for a non-empty input. An empty input yields an empty
    dict (no division takes place).
    """
    total_occurrences = sum(word_occurrences_map.values())
    probability_map = {}
    for word, occurrences in word_occurrences_map.items():
        probability_map[word] = occurrences / total_occurrences
    return probability_map

In [8]:
# Final lookup tables used by the cells below:
#   word_occurrences: word -> number of occurrences in the preprocessed corpus
#   word_probability: word -> occurrences divided by the total occurrence count
word_occurrences = create_word_occurences_map(words)
word_probability = create_probability_map(word_occurrences)

BUILDING THE TRIE FOR AUTOCOMPLETE

In [9]:
class TrieNode:
    """A single trie node: links to child nodes plus an end-of-word marker."""

    def __init__(self):
        # Starts out False; Trie.insert sets it on the node that ends a word.
        self.is_end_of_word = False
        # Child nodes, indexed by the next character. Because this is a
        # defaultdict, indexing a missing key creates (and stores) a new
        # TrieNode, so use `in` / `.get` when a lookup must not add nodes.
        self.children = defaultdict(TrieNode)

In [10]:
class Trie:
    """Prefix tree over strings, used to find autocomplete candidates."""

    def __init__(self):
        self.root = TrieNode()

    def insert(self, word):
        """Store ``word`` in the trie, creating missing nodes along its path.

        Inserting the same word again is harmless: it only re-marks the
        terminal node.
        """
        node = self.root
        for char in word:
            if char not in node.children:
                node.children[char] = TrieNode()
            node = node.children[char]
        node.is_end_of_word = True

    def search(self, prefix):
        """Return every stored word that starts with ``prefix``.

        Words are listed in depth-first order, visiting children in the order
        they were first inserted. An unknown prefix yields an empty list, and
        an empty prefix yields every stored word.
        """
        node = self.root
        for char in prefix:
            # Test membership before indexing: children is a defaultdict, so
            # indexing a missing key would silently add a node to the trie.
            if char not in node.children:
                return []
            node = node.children[char]
        return self._get_words_with_prefix(node, prefix)

    def _get_words_with_prefix(self, node, prefix):
        """Collect all words in the subtree under ``node``.

        ``prefix`` is the string spelled by the path from the root to
        ``node``. The walk uses an explicit stack instead of recursion, so
        very long words cannot exceed Python's recursion limit.
        """
        words = []
        pending = [(node, prefix)]
        while pending:
            current, spelled = pending.pop()
            if current.is_end_of_word:
                words.append(spelled)
            # Push children in reverse so the first-inserted child is popped
            # first, giving the same pre-order as a recursive walk.
            for char, child_node in reversed(list(current.children.items())):
                pending.append((child_node, spelled + char))
        return words

In [11]:
# Distinct words to insert into the trie.
# Note: this rebinds `words`, which until now held the full preprocessed
# token list (with repeats); re-running earlier cells after this one would
# therefore see only the distinct words.
words = list(word_occurrences)

Insert the words in the trie

In [12]:
# Build the autocomplete trie by inserting every entry of `words`.
trie = Trie()
for word in words:
    trie.insert(word)

In [None]:
# Look up every stored word that starts with the given prefix.
suggestions = trie.search('heav')
# Pair each suggestion with its probability from the corpus.
suggestions_probability = {
    candidate: word_probability[candidate] for candidate in suggestions
}

# Rank the suggestions from most to least probable.
sorted_suggested_words = sorted(
    suggestions_probability.items(),
    key=lambda entry: entry[1],
    reverse=True,
)
sorted_suggested_words

In [14]:
# Persist the lookup tables and the trie for use inside the application.
# The three objects are written back-to-back into one file, so a reader has
# to call pickle.load() three times on the same handle, in this order:
# word_probability, word_occurrences, trie.
# NOTE(review): pickle stores class instances by reference to their class, so
# Trie and TrieNode presumably need to be importable under the same names
# where data.pkl is loaded -- confirm in the application.
with open('data.pkl', 'wb') as f:
    pickle.dump(word_probability, f)
    pickle.dump(word_occurrences, f)
    pickle.dump(trie, f)