 <h2> <center> <B> TP : Text Minning </B></center></h2>
<ul>
<li> Hassan Ait Baha </li>
<li> Moad Boulahdoud </li>
</ul>

In [63]:
import math
import pandas as pd
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import sent_tokenize, word_tokenize, PorterStemmer
from nltk.corpus import stopwords

In [46]:
text = """
You are trying to code TF-IDF all by yourself like a big girl/boy.
So this is a tinny doc.
And another tinny doc to test few stuff.
So in total, we are four documents, have fun ;).
"""

In [47]:
sentences = nltk.sent_tokenize(text)
total_documents = len(sentences)
total_documents

4

In [48]:
sentences

['\nYou are trying to code TF-IDF all by yourself like a big girl/boy.',
 'So this is a tinny doc.',
 'And another tinny doc to test few stuff.',
 'So in total, we are four documents, have fun ;).']

In [49]:
def _frequency_table(text_string) -> dict:
    stopWords = set(stopwords.words("english"))
    words = word_tokenize(text_string)
    ps = PorterStemmer()

    freqTable = dict()
    for word in words:
        word = ps.stem(word)
        if word in stopWords:
            continue
        if word in freqTable:
            freqTable[word] += 1
        else:
            freqTable[word] = 1

    return freqTable
_frequency_table(text)

{'tri': 1,
 'code': 1,
 'tf-idf': 1,
 'like': 1,
 'big': 1,
 'girl/boy': 1,
 '.': 4,
 'So': 2,
 'thi': 1,
 'tinni': 2,
 'doc': 2,
 'anoth': 1,
 'test': 1,
 'stuff': 1,
 'total': 1,
 ',': 2,
 'four': 1,
 'document': 1,
 'fun': 1,
 ';': 1,
 ')': 1}

In [50]:
def _frequency_matrix(words):
    frequency_matrix = {}
    stopWords = set(stopwords.words("english"))
    ps = PorterStemmer()

    for sent in sentences:
        freq_table = {}
        words = word_tokenize(sent)
        for word in words:
            word = word.lower()
            word = ps.stem(word)
            if word in stopWords:
                continue

            if word in freq_table:
                freq_table[word] += 1
            else:
                freq_table[word] = 1

        frequency_matrix[sent[:15]] = freq_table

    return frequency_matrix

In [51]:
freq_matrix = _frequency_matrix(sentences)
freq_matrix

{'\nYou are trying': {'tri': 1,
  'code': 1,
  'tf-idf': 1,
  'like': 1,
  'big': 1,
  'girl/boy': 1,
  '.': 1},
 'So this is a ti': {'thi': 1, 'tinni': 1, 'doc': 1, '.': 1},
 'And another tin': {'anoth': 1,
  'tinni': 1,
  'doc': 1,
  'test': 1,
  'stuff': 1,
  '.': 1},
 'So in total, we': {'total': 1,
  ',': 2,
  'four': 1,
  'document': 1,
  'fun': 1,
  ';': 1,
  ')': 1,
  '.': 1}}

In [52]:
def _tf_matrix(freq_matrix):
    tf_matrix = {}

    for sent, f_table in freq_matrix.items():
        tf_table = {}

        count_words_in_sentence = len(f_table)
        for word, count in f_table.items():
            tf_table[word] = count / count_words_in_sentence

        tf_matrix[sent] = tf_table

    return tf_matrix

In [53]:
tf = _tf_matrix(freq_matrix)
tf

{'\nYou are trying': {'tri': 0.14285714285714285,
  'code': 0.14285714285714285,
  'tf-idf': 0.14285714285714285,
  'like': 0.14285714285714285,
  'big': 0.14285714285714285,
  'girl/boy': 0.14285714285714285,
  '.': 0.14285714285714285},
 'So this is a ti': {'thi': 0.25, 'tinni': 0.25, 'doc': 0.25, '.': 0.25},
 'And another tin': {'anoth': 0.16666666666666666,
  'tinni': 0.16666666666666666,
  'doc': 0.16666666666666666,
  'test': 0.16666666666666666,
  'stuff': 0.16666666666666666,
  '.': 0.16666666666666666},
 'So in total, we': {'total': 0.125,
  ',': 0.25,
  'four': 0.125,
  'document': 0.125,
  'fun': 0.125,
  ';': 0.125,
  ')': 0.125,
  '.': 0.125}}

In [54]:
def _documents_per_words(freq_matrix):
    word_per_doc_table = {}

    for sent, f_table in freq_matrix.items():
        for word, count in f_table.items():
            if word in word_per_doc_table:
                word_per_doc_table[word] += 1
            else:
                word_per_doc_table[word] = 1

    return word_per_doc_table

In [55]:
count_doc_per_words = _documents_per_words(freq_matrix)
count_doc_per_words

{'tri': 1,
 'code': 1,
 'tf-idf': 1,
 'like': 1,
 'big': 1,
 'girl/boy': 1,
 '.': 4,
 'thi': 1,
 'tinni': 2,
 'doc': 2,
 'anoth': 1,
 'test': 1,
 'stuff': 1,
 'total': 1,
 ',': 1,
 'four': 1,
 'document': 1,
 'fun': 1,
 ';': 1,
 ')': 1}

In [56]:
def _idf_matrix(freq_matrix, count_doc_per_words, total_documents):
    idf_matrix = {}

    for sent, f_table in freq_matrix.items():
        idf_table = {}

        for word in f_table.keys():
            idf_table[word] = math.log10(total_documents / float(count_doc_per_words[word]))

        idf_matrix[sent] = idf_table

    return idf_matrix

In [57]:
idf = _idf_matrix(freq_matrix, count_doc_per_words, total_documents)
idf

{'\nYou are trying': {'tri': 0.6020599913279624,
  'code': 0.6020599913279624,
  'tf-idf': 0.6020599913279624,
  'like': 0.6020599913279624,
  'big': 0.6020599913279624,
  'girl/boy': 0.6020599913279624,
  '.': 0.0},
 'So this is a ti': {'thi': 0.6020599913279624,
  'tinni': 0.3010299956639812,
  'doc': 0.3010299956639812,
  '.': 0.0},
 'And another tin': {'anoth': 0.6020599913279624,
  'tinni': 0.3010299956639812,
  'doc': 0.3010299956639812,
  'test': 0.6020599913279624,
  'stuff': 0.6020599913279624,
  '.': 0.0},
 'So in total, we': {'total': 0.6020599913279624,
  ',': 0.6020599913279624,
  'four': 0.6020599913279624,
  'document': 0.6020599913279624,
  'fun': 0.6020599913279624,
  ';': 0.6020599913279624,
  ')': 0.6020599913279624,
  '.': 0.0}}

In [58]:
def _tf_idf_matrix(tf_matrix, idf_matrix):
    tf_idf_matrix = {}

    for (sent1, f_table1), (sent2, f_table2) in zip(tf_matrix.items(), idf_matrix.items()):

        tf_idf_table = {}

        for (word1, value1), (word2, value2) in zip(f_table1.items(), f_table2.items()):  # here, keys are the same in both the table
            tf_idf_table[word1] = float(value1 * value2)

        tf_idf_matrix[sent1] = tf_idf_table

    return tf_idf_matrix

In [61]:
tf_idf_matrix = _tf_idf_matrix(tf, idf)
tf_idf_matrix

{'\nYou are trying': {'tri': 0.08600857018970891,
  'code': 0.08600857018970891,
  'tf-idf': 0.08600857018970891,
  'like': 0.08600857018970891,
  'big': 0.08600857018970891,
  'girl/boy': 0.08600857018970891,
  '.': 0.0},
 'So this is a ti': {'thi': 0.1505149978319906,
  'tinni': 0.0752574989159953,
  'doc': 0.0752574989159953,
  '.': 0.0},
 'And another tin': {'anoth': 0.10034333188799373,
  'tinni': 0.050171665943996864,
  'doc': 0.050171665943996864,
  'test': 0.10034333188799373,
  'stuff': 0.10034333188799373,
  '.': 0.0},
 'So in total, we': {'total': 0.0752574989159953,
  ',': 0.1505149978319906,
  'four': 0.0752574989159953,
  'document': 0.0752574989159953,
  'fun': 0.0752574989159953,
  ';': 0.0752574989159953,
  ')': 0.0752574989159953,
  '.': 0.0}}

Les valeurs pour le mot "Boy" <br>
TF=0.14
IDF=0.60
TF-IDF=0.08

In [62]:
def _term_document(tf_idf_matrix) -> dict:
    """
    score a sentence by its word's TF
    Basic algorithm: adding the TF frequency of every non-stop word in a sentence divided by total no of words in a sentence.
    :rtype: dict
    """

    sentenceValue = {}

    for sent, f_table in tf_idf_matrix.items():
        total_score_per_sentence = 0

        count_words_in_sentence = len(f_table)
        for word, score in f_table.items():
            total_score_per_sentence += score

        sentenceValue[sent] = total_score_per_sentence / count_words_in_sentence

    return sentenceValue
_term_document(tf_idf_matrix)

{'\nYou are trying': 0.07372163159117906,
 'So this is a ti': 0.0752574989159953,
 'And another tin': 0.06689555459199582,
 'So in total, we': 0.0752574989159953}

In [71]:
vectorizer = TfidfVectorizer()
tfidf_vectorizer_vectors = vectorizer.fit_transform(sentences)
first_vector_tfidfvectorizer = tfidf_vectorizer_vectors[0] 
# place tf-idf values in a pandas data frame 
df = pd.DataFrame(first_vector_tfidfvectorizer.T.todense(), index=vectorizer.get_feature_names()) 
df

Unnamed: 0,0
all,0.274792
and,0.0
another,0.0
are,0.216649
big,0.274792
boy,0.274792
by,0.274792
code,0.274792
doc,0.0
documents,0.0


In [1]:
##q8

<h3> import data </h3>

In [4]:
import os
import nltk
import nltk.corpus
nltk.download('gutenberg')

[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\gutenberg.zip.


True

In [5]:
document = []
d1 = nltk.corpus.gutenberg.raw('shakespeare-caesar.txt')
d2 = nltk.corpus.gutenberg.raw('shakespeare-hamlet.txt')
d3 = nltk.corpus.gutenberg.raw('shakespeare-macbeth.txt')

document.append(d1)
document.append(d2)
document.append(d3)

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
#Creation et affichage du DTM 
v = vectorizer.fit_transform(document)

In [10]:
print(v.shape)
print(v)

(3, 51015)
  (0, 22043)	0.001043795799257597
  (0, 29582)	0.001043795799257597
  (0, 13747)	0.0006164826137042619
  (0, 30075)	0.0007938336096451947
  (0, 12674)	0.0007938336096451947
  (0, 9635)	0.0007938336096451947
  (0, 16965)	0.001043795799257597
  (0, 43126)	0.001043795799257597
  (0, 15529)	0.001043795799257597
  (0, 41038)	0.001043795799257597
  (0, 3389)	0.001043795799257597
  (0, 23249)	0.0007938336096451947
  (0, 34198)	0.001043795799257597
  (0, 44636)	0.001043795799257597
  (0, 13595)	0.001043795799257597
  (0, 37383)	0.0007938336096451947
  (0, 19615)	0.001043795799257597
  (0, 30614)	0.001043795799257597
  (0, 37979)	0.001043795799257597
  (0, 23596)	0.001043795799257597
  (0, 24401)	0.001043795799257597
  (0, 36203)	0.001043795799257597
  (0, 28202)	0.001043795799257597
  (0, 5192)	0.0007938336096451947
  (0, 18967)	0.0007938336096451947
  :	:
  (2, 28642)	0.12014092945824126
  (2, 22484)	0.025484439582051176
  (2, 47739)	0.08591896773377253
  (2, 42980)	0.0757251919009