In [1]:
from os import listdir
from pickle import dump as save_to_pickle_file
from typing import Dict, List
# from pandas import DataFrame
from methods import do_case_folding, INDEX_DIR, ARTICLES_DIR,remove_stop_words, preform_stemming_using_porter
from copy import deepcopy


In [2]:

def create_language_model_from_collection(collection: List[str]) -> Dict[str, float]:
    """
        generate language model from collection

    Args:
        collection (List[str]): [description]

    Returns:
        Dict[str, float]: [description]
    """
    language_model = {}
    counter = 0

    for doc in collection:
        splitted=doc.split()
        counter += len(splitted)
        for word in splitted:
            if word not in language_model:
                language_model[word] = 1
            else:
                language_model[word] += 1
    # print(f"how much words in the collection: {sum(language_model.values())}")

    # calculate probability of a word in the collection
    for word in language_model:
        language_model[word] = round(language_model[word] / counter, 7)
        # language_model[word] = language_model[word] / counter
        # language_model[word] / counter
    # print(f"amount of words in collection:{counter}")
    return language_model


In [3]:
res: Dict[str, float] = None
articels = listdir(ARTICLES_DIR)


In [4]:
collection = []
for art in articels:
    with open(f"{ARTICLES_DIR}/{art}", "r") as file:
        collection.append(file.read())


In [5]:
len(collection)

48

In [6]:
    # """
    # 1. create language model from the entire collection
    # 2. remove stop words and create new language model
    # 3. preform case folding and create new language model
    # 4. preform stemming and create new language model
    # """

In [7]:
# language_model for the entire collection, with all the words
basic_language_model=create_language_model_from_collection(deepcopy(collection))

In [8]:
len(collection)

48

In [9]:
def remove_stop_words_from_collection(collection: List[str]) -> List[str]:
    """
    remove stop words from the collection
    """
    return [remove_stop_words(doc) for doc in collection]


In [10]:
collection_without_stop_words = remove_stop_words_from_collection(deepcopy(collection))
language_model_without_stop_words = create_language_model_from_collection(collection_without_stop_words)

In [11]:
def preform_case_foldings(collection: List[str]) -> List[str]:
    """
    preform case folding on the collection
    """
    return [do_case_folding (doc) for doc in collection]

In [12]:
collection_case_folded = preform_case_foldings(deepcopy(language_model_without_stop_words))
language_model_case_folded = create_language_model_from_collection(collection_case_folded)

In [13]:
#stemming stage
def preform_stemming(collection: List[str]) -> List[str]:
    return [preform_stemming_using_porter(doc) for doc in collection]

In [14]:
collection_stemmed = preform_stemming(deepcopy(collection_case_folded))
language_model_stemmed = create_language_model_from_collection(collection_stemmed)

In [15]:
print(len(basic_language_model),len(language_model_without_stop_words),len(language_model_case_folded),len(language_model_stemmed))

41121 41021 32937 28668


In [16]:
f"change after case folding: {len(language_model_without_stop_words)/len(language_model_case_folded)-1}"

'change after case folding: 0.24543826092236687'

In [17]:
len(language_model_stemmed)/len(basic_language_model)

0.6971620339972276

In [18]:
def probability_of_word_from_the_language_models(word: str):
    """
    get probability of a word from the language models
    """
    if word in basic_language_model:
        print(f"word: {word} is in basic language model and probability is: {basic_language_model[word]:.7f}")
    if word in language_model_without_stop_words:
        print(f"word: {word} is in language model without stop words and probability is: {language_model_without_stop_words[word]:.7f}")
    word_case_folded = do_case_folding(word)
    if word_case_folded in language_model_case_folded:
        print(f"word: {word_case_folded} is in language model case folded and probability is: {language_model_case_folded[word_case_folded]:.7f}")
    word_after_stemming = preform_stemming_using_porter(word_case_folded)
    if word_after_stemming in language_model_stemmed:
        print(f"word: {word_after_stemming} is in language model stemmed and probability is: {language_model_stemmed[word_after_stemming]:.7f}")
    

In [19]:
probability_of_word_from_the_language_models("student")

word: student is in basic language model and probability is: 0.0000285
word: student is in language model without stop words and probability is: 0.0000425
word: student is in language model case folded and probability is: 0.0000488
word: student is in language model stemmed and probability is: 0.0001219


In [30]:
probability_of_word_from_the_language_models("be")

word: be is in basic language model and probability is: 0.0039269
word: be is in language model without stop words and probability is: 0.0009309
word: be is in language model case folded and probability is: 0.0001219
word: be is in language model stemmed and probability is: 0.0002682


In [26]:
probability_of_word_from_the_language_models("else")

word: else is in basic language model and probability is: 0.0000085
word: else is in language model case folded and probability is: 0.0000244
word: els is in language model stemmed and probability is: 0.0000244


In [25]:
set(basic_language_model.keys())-set(language_model_without_stop_words.keys())

{'accordance',
 'accordingly',
 'affecting',
 'ago',
 'ahead',
 'am',
 'anybody',
 'anymore',
 'anyone',
 'apart',
 'apparently',
 'arise',
 'backward',
 'begin',
 'begins',
 'brief',
 "couldn't",
 'date',
 'despite',
 "didn't",
 "don't",
 'downwards',
 'eleven',
 'else',
 'ending',
 'everyone',
 'except',
 'fewer',
 'fifteen',
 'fifty',
 'fire',
 'fix',
 'follows',
 'forever',
 'forth',
 'forward',
 'gets',
 'getting',
 'gone',
 'happens',
 "haven't",
 "he's",
 'herself',
 'hid',
 'him',
 'home',
 'hopefully',
 'hundred',
 'ignored',
 'kept',
 'knows',
 'liked',
 'maybe',
 'million',
 'mine',
 'minus',
 'nd',
 'nevertheless',
 'ninety',
 'nobody',
 'non',
 'nonetheless',
 'normally',
 'obviously',
 'okay',
 'ourselves',
 'page',
 'please',
 'plus',
 'poorly',
 'quickly',
 'respectively',
 'resulted',
 'round',
 'saying',
 'says',
 'seeming',
 'seriously',
 'sixty',
 'somehow',
 'soon',
 'specifying',
 'sure',
 "that's",
 'thin',
 'truly',
 'twelve',
 'twenty',
 'twice',
 'unfortunatel

In [31]:
collection_without_stop_words_after_all_stages = remove_stop_words_from_collection(collection_stemmed)
language_model_without_stop_words_after_all_stages = create_language_model_from_collection(collection_without_stop_words_after_all_stages)

In [32]:
print(len(basic_language_model),len(language_model_without_stop_words),len(language_model_case_folded),len(language_model_stemmed),len(language_model_without_stop_words_after_all_stages))

41121 41021 32937 28668 28313


In [36]:
language_model_without_stop_words_after_all_stages['be']

KeyError: 'be'