# Praktikum 8 Probabilistic Information Retrieval

In [3]:
# !pip install rank_bm25



Okapi BM25, atau biasa disebut BM25, dikembangkan di City University London dan
didasarkan pada model probabilistik yang mengurutkan dokumen secara menurun
menurut nilai relevansinya terhadap kebutuhan informasi (query). BM25 memeringkat
dokumen menggunakan term frequency, inverse document frequency, dan normalisasi panjang dokumen.

### Menghitung Skor Relevansi BM25

In [59]:
from rank_bm25 import BM25Okapi

def tokenisasi(text):
    """Split *text* into whitespace-delimited tokens.

    Leading, trailing and repeated whitespace never produce empty tokens,
    so ``"sistem informasi "`` gives ``["sistem", "informasi"]``.

    Args:
        text: The string to tokenise.

    Returns:
        A list of the non-empty tokens in their original order; an empty
        list for an empty or all-whitespace string.
    """
    # split() without a separator collapses whitespace runs, whereas
    # split(" ") emits "" for every trailing or doubled space.
    return text.split()

from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
def stemming(text):
    """Return *text* stemmed by the Sastrawi stemmer.

    The stemmer is created on the first call and stored on the function
    object, so later calls reuse it instead of building a new
    ``StemmerFactory`` and stemmer every time.

    Args:
        text: The string (a single token or a longer text) to stem.

    Returns:
        The result of ``stemmer.stem(text)``.
    """
    stemmer = getattr(stemming, "_stemmer", None)
    if stemmer is None:
        # First call: build the stemmer once and remember it.
        stemmer = StemmerFactory().create_stemmer()
        stemming._stemmer = stemmer
    return stemmer.stem(text)
    
def stemming_sentence(text):
    """Stem every token of *text* and return the stems joined by single spaces.

    Tokens come from ``tokenisasi`` and each one is passed to ``stemming``;
    a text with no tokens gives an empty string.
    """
    stemmed_tokens = [stemming(word) for word in tokenisasi(text)]
    return " ".join(stemmed_tokens)

# Raw corpus: document id -> unprocessed text.
doc_dict_raw = {
    'doc1': "pengembangan sistem informasi penjadwalan",
    'doc2': "pengembangan model analisis sentimen berita",
    'doc3': "analisis sistem input output",
    'doc4': "pengembangan sistem informasi akademik universitas",
    'doc5': "pengembangan sistem cari berita ekonomi",
    'doc6': "analisis sistem neraca nasional",
    'doc7': "pengembangan sistem informasi layanan statistik",
    'doc8': "pengembangan sistem pencarian skripsi di universitas",
    'doc9': "analisis sentimen publik terhadap pemerintah",
    'doc10': "pengembangan model klasifikasi sentimen berita",
}

# Stemmed corpus: same ids, each text run through stemming_sentence.
doc_dict = {}
for doc_id, doc in doc_dict_raw.items():
    doc_dict[doc_id] = stemming_sentence(doc)

# One token list per stemmed document, in doc_dict order.
tokenized_corpus = [tokenisasi(stemmed_text) for stemmed_text in doc_dict.values()]


def querinisasi(query):
    """Score the corpus against *query* with Okapi BM25.

    The query is tokenised with ``tokenisasi`` and a ``BM25Okapi`` index is
    built from the module-level ``tokenized_corpus`` on every call.

    Args:
        query: Query text.

    Returns:
        The value of ``BM25Okapi.get_scores`` for the tokenised query.
    """
    query_terms = tokenisasi(query)
    return BM25Okapi(tokenized_corpus).get_scores(query_terms)



In [68]:
# print(tokenized_corpus)
# Show the stemmed text of every document, keyed by document id.
print(doc_dict)

{'doc1': 'kembang sistem informasi jadwal', 'doc2': 'kembang model analisis sentimen berita', 'doc3': 'analisis sistem input output', 'doc4': 'kembang sistem informasi akademik universitas', 'doc5': 'kembang sistem cari berita ekonomi', 'doc6': 'analisis sistem neraca nasional', 'doc7': 'kembang sistem informasi layan statistik', 'doc8': 'kembang sistem cari skripsi di universitas', 'doc9': 'analisis sentimen publik hadap perintah', 'doc10': 'kembang model klasifikasi sentimen berita'}


In [30]:
# Relevance scores for a specific query
querinisasi("sistem informasi statistik")
# querinisasi("sistem")
# querinisasi("sistem")

array([1.18979771, 0.        , 0.36586252, 1.08030712, 0.33219419,
       0.36586252, 2.89216154, 0.30420029, 0.        , 0.        ])

### Fungsi mengembalikan top k dokumen dengan BM25

In [53]:
from collections import OrderedDict
def exact_top_k_bm25(doc_dict, rank_score, k):
    """Return the *k* highest-scoring documents as ``{doc_id: score}``.

    Args:
        doc_dict: Mapping whose keys are the document ids; only the keys and
            their iteration order are used.
        rank_score: Indexable sequence of scores, where ``rank_score[i]`` is
            the score of the i-th key of *doc_dict*.
        k: Maximum number of documents to return.

    Returns:
        A dict with at most *k* entries, ordered from highest to lowest
        score. Documents with equal scores keep their *doc_dict* order.
        An empty *doc_dict* gives an empty dict.

    Raises:
        IndexError: If *rank_score* has fewer entries than *doc_dict*.
    """
    relevance_scores = {
        doc_id: rank_score[position] for position, doc_id in enumerate(doc_dict)
    }
    # Rank once after all scores are collected; sorting inside the collection
    # loop repeated the work per document and left the result unbound when
    # doc_dict was empty.
    ranked = sorted(relevance_scores.items(), key=lambda item: item[1], reverse=True)
    return dict(ranked[:k])

In [55]:
# Three best BM25 matches for the query, highest score first.
top_3_bm25 = exact_top_k_bm25(doc_dict, querinisasi("sistem informasi statistik"),3)
print(top_3_bm25)

{'doc7': 2.892161541603838, 'doc1': 1.1897977081195452, 'doc4': 1.080307121482777}


In [24]:
# Call the BM25 top-k helper by its defined name; `exact_top_k` is not defined
# in this notebook, so the original call fails on a fresh run.
exact_top_k_bm25(doc_dict, querinisasi("sistem informasi "),3)

{'doc1': 1.1897977081195452,
 'doc4': 1.080307121482777,
 'doc7': 1.080307121482777}

In [27]:
# Call the BM25 top-k helper by its defined name; `exact_top_k` is not defined
# in this notebook, so the original call fails on a fresh run.
exact_top_k_bm25(doc_dict, querinisasi("sistem"),3)

{'doc1': 0.3658625167174947,
 'doc3': 0.3658625167174947,
 'doc6': 0.3658625167174947}

## Perhitungan dengan VSM

In [67]:
#### Inverted Index
# vocab keeps every distinct token in first-seen order; inverted_index maps
# each token to the ids of the documents that contain it.
vocab = []
inverted_index = {}
for doc_id, doc in doc_dict.items():
    for token in tokenisasi(doc):
        print(token)
        postings = inverted_index.get(token)
        if postings is None:
            # First sighting of this token: register it in both structures.
            vocab.append(token)
            postings = inverted_index[token] = []
        if doc_id not in postings:
            postings.append(doc_id)
print(inverted_index)


kembang
sistem
informasi
jadwal
kembang
model
analisis
sentimen
berita
analisis
sistem
input
output
kembang
sistem
informasi
akademik
universitas
kembang
sistem
cari
berita
ekonomi
analisis
sistem
neraca
nasional
kembang
sistem
informasi
layan
statistik
kembang
sistem
cari
skripsi
di
universitas
analisis
sentimen
publik
hadap
perintah
kembang
model
klasifikasi
sentimen
berita
{'kembang': ['doc1', 'doc2', 'doc4', 'doc5', 'doc7', 'doc8', 'doc10'], 'sistem': ['doc1', 'doc3', 'doc4', 'doc5', 'doc6', 'doc7', 'doc8'], 'informasi': ['doc1', 'doc4', 'doc7'], 'jadwal': ['doc1'], 'model': ['doc2', 'doc10'], 'analisis': ['doc2', 'doc3', 'doc6', 'doc9'], 'sentimen': ['doc2', 'doc9', 'doc10'], 'berita': ['doc2', 'doc5', 'doc10'], 'input': ['doc3'], 'output': ['doc3'], 'akademik': ['doc4'], 'universitas': ['doc4', 'doc8'], 'cari': ['doc5', 'doc8'], 'ekonomi': ['doc5'], 'neraca': ['doc6'], 'nasional': ['doc6'], 'layan': ['doc7'], 'statistik': ['doc7'], 'skripsi': ['doc8'], 'di': ['doc8'], 'publik': [

In [66]:
# TF 
def termFrequencyInDoc(vocab, doc_dict):
    """Count how often each vocabulary term occurs as a token in each document.

    Args:
        vocab: Iterable of terms to count.
        doc_dict: Mapping of document id to document text.

    Returns:
        ``{doc_id: {term: count}}`` with one inner entry per term in *vocab*
        (0 when the term is absent from that document).
    """
    tf_docs = {}
    for doc_id, doc in doc_dict.items():
        # Count whole tokens only: str.count on the raw text would also count
        # a term that merely appears inside a longer word ("di" in "media").
        tokens = doc.split()
        tf_docs[doc_id] = {word: tokens.count(word) for word in vocab}
    return tf_docs

# Build the per-document term-frequency table for the corpus and show it.
tf = termFrequencyInDoc(vocab, doc_dict)
print(tf)

{'doc1': {'kembang': 1, 'sistem': 1, 'informasi': 1, 'jadwal': 1, 'model': 0, 'analisis': 0, 'sentimen': 0, 'berita': 0, 'input': 0, 'output': 0, 'akademik': 0, 'universitas': 0, 'cari': 0, 'ekonomi': 0, 'neraca': 0, 'nasional': 0, 'layan': 0, 'statistik': 0, 'skripsi': 0, 'di': 0, 'publik': 0, 'hadap': 0, 'perintah': 0, 'klasifikasi': 0}, 'doc2': {'kembang': 1, 'sistem': 0, 'informasi': 0, 'jadwal': 0, 'model': 1, 'analisis': 1, 'sentimen': 1, 'berita': 1, 'input': 0, 'output': 0, 'akademik': 0, 'universitas': 0, 'cari': 0, 'ekonomi': 0, 'neraca': 0, 'nasional': 0, 'layan': 0, 'statistik': 0, 'skripsi': 0, 'di': 0, 'publik': 0, 'hadap': 0, 'perintah': 0, 'klasifikasi': 0}, 'doc3': {'kembang': 0, 'sistem': 1, 'informasi': 0, 'jadwal': 0, 'model': 0, 'analisis': 1, 'sentimen': 0, 'berita': 0, 'input': 1, 'output': 1, 'akademik': 0, 'universitas': 0, 'cari': 0, 'ekonomi': 0, 'neraca': 0, 'nasional': 0, 'layan': 0, 'statistik': 0, 'skripsi': 0, 'di': 0, 'publik': 0, 'hadap': 0, 'perintah'

In [41]:
# IDF

## WordDocument Count 
def wordDocFre(vocab, doc_dict):
    """Return the document frequency of every vocabulary term.

    Args:
        vocab: Iterable of terms.
        doc_dict: Mapping of document id to document text; only the texts
            are used.

    Returns:
        ``{term: n}`` where *n* is the number of documents whose
        ``tokenisasi`` output contains the term.
    """
    # Tokenise each document once up front instead of once per vocabulary
    # term; sets make the membership test cheap.
    doc_token_sets = [set(tokenisasi(doc)) for doc in doc_dict.values()]
    return {
        word: sum(1 for tokens in doc_token_sets if word in tokens)
        for word in vocab
    }

import numpy as np
def inverseDocFre(vocab, doc_fre, length):
    """Compute a smoothed inverse document frequency for each vocabulary term.

    For every term: ``1 + ln((length + 1) / (doc_fre[term] + 1))``.

    Args:
        vocab: Iterable of terms.
        doc_fre: Mapping of term to its document frequency.
        length: Number of documents used as the corpus size.

    Returns:
        ``{term: idf}`` with one entry per term in *vocab*.
    """
    smoothed_total = length + 1
    weights = {}
    for term in vocab:
        ratio = smoothed_total / (doc_fre[term] + 1)
        weights[term] = np.log(ratio) + 1
    return weights
# IDF weight for every vocabulary term, with the corpus size taken from doc_dict.
idf = inverseDocFre(vocab,wordDocFre(vocab, doc_dict), len(doc_dict))
# print(idf)

In [43]:
# TF.IDF
def tfidf(vocab, tf, idf_scr, doc_dict):
    """Multiply term frequencies by IDF weights for every document.

    Args:
        vocab: Iterable of terms.
        tf: ``{doc_id: {term: count}}`` term-frequency table.
        idf_scr: ``{term: idf}`` weights.
        doc_dict: Mapping whose keys are the document ids to process.

    Returns:
        ``{doc_id: {term: tf * idf}}`` with one inner entry per term in *vocab*.
    """
    return {
        doc_id: {term: tf[doc_id][term] * idf_scr[term] for term in vocab}
        for doc_id in doc_dict
    }
# print(tfidf(vocab, tf, idf, doc_dict))


In [42]:
# Term Freq 
# Query text scored against the corpus in the VSM section.
query = "sistem informasi statistik"
# query = "sistem sentimen berita"
def termFrequency(vocab, query):
    """Count how often each vocabulary term occurs as a token in *query*.

    Args:
        vocab: Iterable of terms to count.
        query: Query text.

    Returns:
        ``{term: count}`` with one entry per term in *vocab* (0 when the
        term is not a token of the query).
    """
    # Count whole tokens only: str.count on the raw query would also count a
    # term that merely appears inside a longer word.
    tokens = query.split()
    return {word: tokens.count(word) for word in vocab}
# Term counts of the query over the corpus vocabulary.
tf_query = termFrequency(vocab, query)
# print(tf_query)

In [44]:
# Term - Query Matrix
# Query vector as a (len(vocab), 1) column holding tf * idf for each vocabulary term.
TQ = np.zeros((len(vocab), 1))
# enumerate supplies the row directly, avoiding a linear vocab.index() search per term.
for row, word in enumerate(vocab):
    TQ[row][0] = tf_query[word] * idf[word]
# print(TQ)

In [45]:
import math
def cosine_sim(vec1, vec2):
    """Return the cosine similarity between two numeric vectors.

    Args:
        vec1: Sequence of numbers.
        vec2: Sequence of numbers indexable by position; it must be at least
            as long as *vec1*.

    Returns:
        ``dot(vec1, vec2) / (|vec1| * |vec2|)``, or ``0.0`` when either
        vector has zero magnitude (the ratio is undefined there, and a
        zero vector shares no weighted term with anything).
    """
    dot_prod = sum(v * vec2[i] for i, v in enumerate(vec1))
    mag_1 = math.sqrt(sum(x ** 2 for x in vec1))
    mag_2 = math.sqrt(sum(x ** 2 for x in vec2))
    # Guard the division: an all-zero vector would otherwise raise
    # ZeroDivisionError (or give nan for numpy inputs).
    if mag_1 == 0 or mag_2 == 0:
        return 0.0
    return dot_prod / (mag_1 * mag_2)

In [46]:
def tfidf(vocab, tf, idf_scr, doc_dict):
    """Weight each document's term frequencies by the terms' IDF scores.

    Args:
        vocab: Iterable of terms.
        tf: ``{doc_id: {term: count}}`` term-frequency table.
        idf_scr: ``{term: idf}`` weights.
        doc_dict: Mapping whose keys are the document ids to process.

    Returns:
        ``{doc_id: {term: tf * idf}}`` with one inner entry per term in *vocab*.
    """
    weighted = {doc_id: {} for doc_id in doc_dict}
    for doc_id, row in weighted.items():
        for term in vocab:
            row[term] = tf[doc_id][term] * idf_scr[term]
    return weighted

# TF-IDF weights per document; TF and IDF are recomputed here from vocab and doc_dict.
tf_idf = tfidf(vocab, termFrequencyInDoc(vocab, doc_dict), inverseDocFre(vocab, wordDocFre(vocab, doc_dict), len(doc_dict)), doc_dict)
# print(tf_idf)

In [48]:
# Term - Document Matrix
# Term-document matrix: one row per vocabulary term, one column per document
# (columns follow the key order of tf_idf), holding the TF-IDF weights.
TD = np.zeros((len(vocab), len(doc_dict)))
# enumerate supplies row and column directly, avoiding a vocab.index() search
# and a rebuilt key list for every cell.
for row, word in enumerate(vocab):
    for col, doc_id in enumerate(tf_idf):
        TD[row][col] = tf_idf[doc_id][word]
# print(TD)

In [56]:
# Menyimpan skor kemiripan dalam suatu list 
from collections import OrderedDict
def exact_top_k_vsm(doc_dict, TD, q,k):
    """Return the *k* documents most similar to the query vector *q*.

    Args:
        doc_dict: Mapping whose keys are the document ids; the i-th key is
            matched with column i of *TD*.
        TD: Two-dimensional array sliced as ``TD[:, i]`` to get the vector
            of the i-th document.
        q: Query vector compared against every document column.
        k: Maximum number of documents to return.

    Returns:
        ``{doc_id: cosine similarity}`` with at most *k* entries, ordered
        from highest to lowest similarity.
    """
    similarities = {
        doc_id: cosine_sim(q, TD[:, column])
        for column, doc_id in enumerate(doc_dict.keys())
    }
    ranked = sorted(similarities.items(), key=lambda pair: pair[1], reverse=True)
    return dict(ranked[:k])
# Three best cosine-similarity matches for the query vector (the single column of TQ).
top_3_vsm = exact_top_k_vsm(doc_dict, TD, TQ[:, 0], 3)
print(top_3_vsm)

{'doc7': 0.7689768599816609, 'doc1': 0.414904809442661, 'doc4': 0.35626622628022314}


## Perbandingan BM25 dengan VSM

query = "sistem informasi statistik"


#### BM 25 

In [57]:
# BM25 side of the comparison: top 3 documents for the shared query.
exact_top_k_bm25(doc_dict, querinisasi("sistem informasi statistik"),3)

{'doc7': 2.892161541603838,
 'doc1': 1.1897977081195452,
 'doc4': 1.080307121482777}

#### VSM

In [58]:
# VSM side of the comparison: top 3 documents by cosine similarity to the query vector.
exact_top_k_vsm(doc_dict, TD, TQ[:, 0], 3)

{'doc7': 0.7689768599816609,
 'doc1': 0.414904809442661,
 'doc4': 0.35626622628022314}