# TF-IDF notebook

In this notebook we build a first method to compute the frequency of words in a corpus and to assess their relevance.

We will work with the EHR corpus (CSV files - Lounes): we first build the TF (term frequency) model and then the IDF (inverse document frequency) model.

We will then try to add a visualization of the results.

In [37]:
import pandas as pd
import numpy as np

In [72]:
# Toy corpus used to develop the TF / IDF helpers: document id -> raw text.
TEST = {1: "le tf-idf de l'anglais term frequency-inverse document frequency est une méthode de pondération souvent utilisée en recherche d'information et en particulier dans la fouille de textes",
       2: "cette mesure statistique permet d'évaluer l'importance d'un terme contenu dans un document relativement à une collection ou un corpus",
       3: "le poids augmente proportionnellement au nombre d'occurrences du mot dans le document",
       4: "il varie également en fonction de la fréquence du mot dans le corpus ainsi le",
       5: "des variantes de la formule originale sont souvent utilisées dans des moteurs de recherche pour apprécier la pertinence d'un document en fonction des critères de recherche de l'utilisateur"}
# One row per document; the text lives in the single column labelled 0.
df = pd.DataFrame.from_dict(TEST, orient = 'index')
# Vocabulary of the corpus: every distinct whitespace-separated token.
# A single pass over the documents is enough to collect it.
all_words = {word for text in df[0] for word in text.split()}
# Zero-initialised template with one entry per vocabulary word.
all_words_dict = {word: 0 for word in all_words}

## TF aspect

In [55]:
# Finding the count in each document
def raw_count_text(doc):
    """
    Count the occurrences of every whitespace-separated token in a text.

    Returns a dictionary mapping each token to the number of times it
    appears in ``doc``, keyed in order of first appearance.
    """
    counts = {}
    for token in doc.split():
        counts[token] = counts.get(token, 0) + 1
    return counts
    
def term_frequency_text(doc, log = False):
    """
    Compute the term frequency of every token in a single text.

    Parameters
    ----------
    doc : str
        Text to analyse; tokens are obtained with ``str.split()``.
    log : bool, default False
        If truthy, return the log-scaled count ``log(1 + count)``.
        Otherwise return the relative frequency of each token as a
        percentage of the total number of tokens in ``doc``.

    Returns
    -------
    dict
        Token -> frequency, keyed in order of first appearance.
        Empty when ``doc`` contains no tokens.
    """
    counts = {}
    for token in doc.split():
        counts[token] = counts.get(token, 0) + 1
    if log:
        return {token: np.log(1 + count) for token, count in counts.items()}
    # The total is identical for every token, so compute it once instead of
    # once per dictionary entry.
    total = sum(counts.values())
    return {token: (count * 100) / total for token, count in counts.items()}

def bool_frequency(doc, vocabulary = None):
    """
    Flag which vocabulary words are present in a text.

    Parameters
    ----------
    doc : str
        Text to analyse; tokens are obtained with ``str.split()``.
    vocabulary : iterable of str, optional
        Words to look for. When omitted, the notebook-level ``all_words``
        collection is used, so that cell must have been run first.

    Returns
    -------
    dict
        Word -> 1 if the word is one of the tokens of ``doc``, else 0,
        with one entry per vocabulary word.
    """
    if vocabulary is None:
        vocabulary = all_words
    # Split once and use a set so each membership test is O(1) instead of
    # re-splitting the document for every vocabulary word.
    tokens = set(doc.split())
    return {word: 1 if word in tokens else 0 for word in vocabulary}

# Finding the count for the whole corpus
def raw_count_corpus(data):
    """
    Count the occurrences of every token across all texts of a data frame.

    Each row's text is read from the entry labelled 0 and split on
    whitespace. Returns a dictionary mapping each token to its total number
    of occurrences in the whole frame.
    """
    counts = {}
    # Flatten the corpus into one stream of tokens, row after row.
    tokens = (token for _, row in data.iterrows() for token in row[0].split())
    for token in tokens:
        counts[token] = counts.get(token, 0) + 1
    return counts
    
def term_frequency_corpus(data, log = False):
    """
    Compute the term frequency of every token across a whole corpus.

    Parameters
    ----------
    data : pandas.DataFrame
        One text per row, read from the entry labelled 0 of each row and
        tokenised with ``str.split()``.
    log : bool, default False
        If truthy, return the log-scaled count ``log(1 + count)``.
        Otherwise return the relative frequency of each token as a
        percentage of the total number of tokens in the corpus.

    Returns
    -------
    dict
        Token -> frequency, keyed in order of first appearance.
        Empty when the corpus contains no tokens.
    """
    counts = {}
    for _, row in data.iterrows():
        for token in row[0].split():
            counts[token] = counts.get(token, 0) + 1
    if log:
        return {token: np.log(1 + count) for token, count in counts.items()}
    # The corpus size is identical for every token, so compute it once
    # instead of once per dictionary entry.
    total = sum(counts.values())
    return {token: (count * 100) / total for token, count in counts.items()}

## IDF aspect

In [85]:
# Compute the IDF
def idf(data, inverse = False):
    """
    Compute per-word document statistics for a corpus.

    Parameters
    ----------
    data : pandas.DataFrame
        One text per row, read from the entry labelled 0 of each row and
        tokenised with ``str.split()``.
    inverse : bool, default False
        If False, return the document frequency: the number of documents
        in which each word appears at least once.
        If True, return the inverse document frequency
        ``log(N / document_frequency)``, where ``N`` is the number of rows
        in ``data``.

    Returns
    -------
    dict
        Word -> document frequency (int) or inverse document frequency
        (float). Empty when ``data`` has no rows.
    """
    n_docs = data.shape[0]
    doc_freq = {}
    for _, row in data.iterrows():
        # dict.fromkeys de-duplicates while keeping order: a word counts once
        # per document, however many times it is repeated inside it.
        for token in dict.fromkeys(row[0].split()):
            doc_freq[token] = doc_freq.get(token, 0) + 1
    if not inverse:
        return doc_freq
    # Every counted word occurs in at least one document, so count >= 1 and
    # the division is always defined.
    return {token: np.log(n_docs / count) for token, count in doc_freq.items()}

In [86]:
# Run idf() on the toy corpus; the returned dictionary is displayed as this cell's output.
idf(df)

{'particulier': 1,
 'le': 3,
 'term': 1,
 'méthode': 1,
 'utilisée': 1,
 'une': 2,
 'pondération': 1,
 'dans': 5,
 'fouille': 1,
 'document': 4,
 "l'anglais": 1,
 'en': 3,
 'est': 1,
 'frequency': 1,
 'textes': 1,
 'tf-idf': 1,
 'souvent': 2,
 'recherche': 2,
 'la': 3,
 "d'information": 1,
 'et': 1,
 'frequency-inverse': 1,
 'de': 3,
 'statistique': 1,
 'terme': 1,
 'ou': 1,
 'cette': 1,
 "d'évaluer": 1,
 'contenu': 1,
 'relativement': 1,
 'un': 1,
 'collection': 1,
 'à': 1,
 "d'un": 2,
 'permet': 1,
 'mesure': 1,
 "l'importance": 1,
 'corpus': 2,
 'nombre': 1,
 'mot': 2,
 'poids': 1,
 'proportionnellement': 1,
 'du': 2,
 'augmente': 1,
 'au': 1,
 "d'occurrences": 1,
 'il': 1,
 'fonction': 2,
 'ainsi': 1,
 'fréquence': 1,
 'varie': 1,
 'également': 1,
 'moteurs': 1,
 'originale': 1,
 'sont': 1,
 'critères': 1,
 'formule': 1,
 'variantes': 1,
 'pertinence': 1,
 "l'utilisateur": 1,
 'des': 1,
 'utilisées': 1,
 'apprécier': 1,
 'pour': 1}