✅ Montar o Google Drive no Colab

In [15]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [18]:
save_path = '/content/drive/MyDrive/Mestrado_PPGI/2AS2_IA_GENERATIVA/Aula 09/git/pyhton'

1️⃣ Definindo vocabulário, documentos e consulta

In [3]:
# Vocabulário e percentuais de documentos (para IDF)
vocab = ['computer', 'software', 'bugs', 'code', 'developer', 'programmers']
percent_docs = {'computer':0.10, 'software':0.10, 'bugs':0.05,
                'code':0.02, 'developer':0.02, 'programmers':0.02}

# Documentos
docs = {
    'D1': 'programmers write computer software code',
    'D2': 'most software has bug but good software has less bus than bad spftware',
    'D3': 'some bugs can be found only by executing the software not by examming the source code'
}

# Consulta
query = 'computer software programmers'


2️⃣ Pré-processamento: normalização e correção de termos

In [4]:
# Função para corrigir os termos conforme o enunciado
def preprocess(text):
    text = text.lower()
    # Correções ortográficas
    text = text.replace('sotware','software').replace('spftware','software')
    text = text.replace('bug','bugs').replace('bus','bugs')
    # separar palavras
    words = text.split()
    return words

# Pré-processamento dos documentos
docs_words = {doc: preprocess(text) for doc, text in docs.items()}
query_words = preprocess(query)

print(docs_words)
print(query_words)

{'D1': ['programmers', 'write', 'computer', 'software', 'code'], 'D2': ['most', 'software', 'has', 'bugs', 'but', 'good', 'software', 'has', 'less', 'bugs', 'than', 'bad', 'software'], 'D3': ['some', 'bugss', 'can', 'be', 'found', 'only', 'by', 'executing', 'the', 'software', 'not', 'by', 'examming', 'the', 'source', 'code']}
['computer', 'software', 'programmers']


3️⃣ Cálculo TF (frequência de termos) para cada documento

In [5]:
import numpy as np
import pandas as pd

# Inicializando TF
TF = pd.DataFrame(0, index=vocab, columns=docs.keys())

for doc, words in docs_words.items():
    for term in vocab:
        TF.loc[term, doc] = words.count(term)

TF

Unnamed: 0,D1,D2,D3
computer,1,0,0
software,1,3,1
bugs,0,2,0
code,1,0,1
developer,0,0,0
programmers,1,0,0


4️⃣ Normalização TF (1 + log10(tf)), TF=0 → 0

In [6]:
def normalize_tf(tf):
    if tf > 0:
        return 1 + np.log10(tf)
    else:
        return 0

TFn = TF.applymap(normalize_tf)
TFn

  TFn = TF.applymap(normalize_tf)


Unnamed: 0,D1,D2,D3
computer,1.0,0.0,0.0
software,1.0,1.477121,1.0
bugs,0.0,1.30103,0.0
code,1.0,0.0,1.0
developer,0.0,0.0,0.0
programmers,1.0,0.0,0.0


5️⃣ Cálculo do IDF (base 10)

In [7]:
IDF = {term: np.log10(1/prob) for term, prob in percent_docs.items()}
IDF_series = pd.Series(IDF)
IDF_series

Unnamed: 0,0
computer,1.0
software,1.0
bugs,1.30103
code,1.69897
developer,1.69897
programmers,1.69897


6️⃣ TF-IDF = TF normalizado × IDF

In [8]:
TFIDF = TFn.mul(IDF_series, axis=0)
TFIDF

Unnamed: 0,D1,D2,D3
computer,1.0,0.0,0.0
software,1.0,1.477121,1.0
bugs,0.0,1.692679,0.0
code,1.69897,0.0,1.69897
developer,0.0,0.0,0.0
programmers,1.69897,0.0,0.0


7️⃣ Vetor da consulta (binário)

In [9]:
q_vector = np.array([1 if term in query_words else 0 for term in vocab])
q_vector

array([1, 1, 0, 0, 0, 1])

8️⃣ Produto escalar e norma

In [12]:
# Produto escalar (Q · D) e normas
dot_products = {}
norms_docs = {}
norm_q = np.linalg.norm(q_vector)

for doc in docs.keys():
    doc_vector = TFIDF[doc].values
    dot_products[doc] = np.dot(q_vector, doc_vector)
    norms_docs[doc] = np.linalg.norm(doc_vector)

dot_products, norms_docs, norm_q

({'D1': np.float64(3.6989700043360187),
  'D2': np.float64(1.4771212547196624),
  'D3': np.float64(1.0)},
 {'D1': np.float64(2.788009711472875),
  'D2': np.float64(2.2465639465989655),
  'D3': np.float64(1.9714205729964196)},
 np.float64(1.7320508075688772))

9️⃣ Similaridade Cosseno

In [14]:
cosine_sim = {doc: round(dot_products[doc]/(norm_q*norms_docs[doc]), 5) for doc in docs.keys()}
cosine_sim_sorted = dict(sorted(cosine_sim.items(), key=lambda x: x[1], reverse=True))

cosine_sim_sorted

{'D1': np.float64(0.76599),
 'D2': np.float64(0.37961),
 'D3': np.float64(0.29286)}