# MODUL 4: TERM WEIGHTING, VECTOR SPACE MODEL, DAN UKURAN KEMIRIPAN TEKS


## A. Term Weighting

In [1]:
# Inverted Index
# Toy corpus: each document is given as a list of (unstemmed) terms.
doc1_term = [
    "pengembangan",
    "sistem",
    "informasi",
    "penjadwalan",
]
doc2_term = [
    "pengembangan",
    "model",
    "analisis",
    "sentimen",
    "berita",
]
doc3_term = [
    "pengembangan",
    "analisis",
    "sistem",
    "input",
    "output",
]

# The corpus keeps the documents in order: doc1, doc2, doc3.
corpus_term = [doc1_term, doc2_term, doc3_term]

corpus_term

[['pengembangan', 'sistem', 'informasi', 'penjadwalan'],
 ['pengembangan', 'model', 'analisis', 'sentimen', 'berita'],
 ['pengembangan', 'analisis', 'sistem', 'input', 'output']]

In [2]:
# Inverted index: maps each stemmed term to the list of 1-based document
# numbers in which it occurs (filled by the loop further down in this cell).
inverted_index = {}



def stemming(text):
    """Return the stemmed form of ``text`` using the Sastrawi stemmer.

    The stemmer is created on the first call and cached on the function
    object, so later calls reuse it instead of building a new
    ``StemmerFactory`` and stemmer for every single term.

    Args:
        text: String to stem.

    Returns:
        The value returned by ``stemmer.stem(text)``.
    """
    stemmer = getattr(stemming, "_stemmer", None)
    if stemmer is None:
        # Imported lazily so Sastrawi is only required once stemming is used.
        from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

        stemmer = StemmerFactory().create_stemmer()
        stemming._stemmer = stemmer
    return stemmer.stem(text)

# Build the inverted index: stemmed term -> list of 1-based document numbers.
for doc_no, terms in enumerate(corpus_term, start=1):
    for raw_term in terms:
        stem = stemming(raw_term)
        # Create the posting list the first time a term is seen, then record
        # the document number at most once per document.
        postings = inverted_index.setdefault(stem, [])
        if doc_no not in postings:
            postings.append(doc_no)
print(inverted_index)
        


{'kembang': [1, 2, 3], 'sistem': [1, 3], 'informasi': [1], 'jadwal': [1], 'model': [2], 'analisis': [2, 3], 'sentimen': [2], 'berita': [2], 'input': [3], 'output': [3]}


In [3]:
# Count raw term 
def termFrequencyInDoc(vocab, doc_dict):
    """Count how often each vocabulary term occurs in each document.

    Args:
        vocab: Iterable of terms to count.
        doc_dict: Mapping of document id to the document, given either as a
            whitespace-separated string or as an already tokenized sequence.

    Returns:
        A dict mapping every document id to a ``{term: raw count}`` dict whose
        keys follow the order of ``vocab``.
    """
    vocab = list(vocab)
    tf_docs = {}
    for doc_id, doc in doc_dict.items():
        # Count whole tokens. Calling str.count on the raw string would also
        # match substrings (e.g. "sistem" inside "sistematis").
        tokens = doc.split() if isinstance(doc, str) else list(doc)
        tf_docs[doc_id] = {word: tokens.count(word) for word in vocab}
    return tf_docs
   

In [4]:
# Vocabulary = the stemmed terms collected in inverted_index, in insertion
# order (this can be taken from the previous session).
vocab = list(inverted_index)

# Documents after cleaning and stemming, keyed by document id.
# NOTE(review): doc3_term in the first cell also contains "pengembangan"
# (stem "kembang"), which is absent from doc3 here - confirm this is intended.
doc_dict = {
    'doc1': "kembang sistem informasi jadwal",
    'doc2': "kembang model analisis sentimen berita",
    'doc3': "analisis sistem input output",
}


print(termFrequencyInDoc(vocab, doc_dict))

{'doc1': {'kembang': 1, 'sistem': 1, 'informasi': 1, 'jadwal': 1, 'model': 0, 'analisis': 0, 'sentimen': 0, 'berita': 0, 'input': 0, 'output': 0}, 'doc2': {'kembang': 1, 'sistem': 0, 'informasi': 0, 'jadwal': 0, 'model': 1, 'analisis': 1, 'sentimen': 1, 'berita': 1, 'input': 0, 'output': 0}, 'doc3': {'kembang': 0, 'sistem': 1, 'informasi': 0, 'jadwal': 0, 'model': 0, 'analisis': 1, 'sentimen': 0, 'berita': 0, 'input': 1, 'output': 1}}


In [5]:
def tokenn(doc):
    """Split a document string into a list of tokens.

    Splitting is done on any run of whitespace, so repeated, leading or
    trailing spaces (as well as tabs and newlines) never produce empty tokens.

    Args:
        doc: Document text.

    Returns:
        List of non-empty token strings; an empty list for blank input.
    """
    return doc.split()

In [6]:
def wordDocFre(vocab, doc_dict):
    """Compute the document frequency of every vocabulary term.

    Args:
        vocab: Iterable of terms.
        doc_dict: Mapping of document id to document text; each text is
            tokenized with ``tokenn``.

    Returns:
        A dict mapping each term to the number of documents whose tokens
        contain that term.
    """
    # Tokenize every document once (not once per vocabulary word) and keep the
    # tokens in sets so each membership test is a constant-time lookup.
    doc_token_sets = [set(tokenn(doc)) for doc in doc_dict.values()]
    return {
        word: sum(1 for tokens in doc_token_sets if word in tokens)
        for word in vocab
    }
# Show the document frequency of every vocabulary term over doc_dict.
print(wordDocFre(vocab, doc_dict))

{'kembang': 2, 'sistem': 2, 'informasi': 1, 'jadwal': 1, 'model': 1, 'analisis': 2, 'sentimen': 1, 'berita': 1, 'input': 1, 'output': 1}


In [9]:
import numpy as np
def inverseDocFre(vocab, doc_fre, length):
    """Compute a smoothed inverse document frequency for each term.

    The score is ``1 + log10((length + 1) / (doc_fre[term] + 1))``.

    Args:
        vocab: Iterable of terms.
        doc_fre: Mapping of term to its document frequency.
        length: Number of documents in the collection.

    Returns:
        A dict mapping each term in ``vocab`` to its IDF score.
    """
    smoothed_total = length + 1
    return {
        term: 1 + np.log10(smoothed_total / (doc_fre[term] + 1))
        for term in vocab
    }

In [10]:
# IDF of every vocabulary term; the collection size is the number of documents.
print(inverseDocFre(vocab, wordDocFre(vocab, doc_dict), len(doc_dict)))

{'kembang': 1.1249387366083, 'sistem': 1.1249387366083, 'informasi': 1.3010299956639813, 'jadwal': 1.3010299956639813, 'model': 1.3010299956639813, 'analisis': 1.1249387366083, 'sentimen': 1.3010299956639813, 'berita': 1.3010299956639813, 'input': 1.3010299956639813, 'output': 1.3010299956639813}


## B. Vector Space Model

In [18]:
def tfidf(vocab, tf, idf_scr, doc_dict):
    """Combine term frequencies and IDF scores into TF-IDF weights.

    Args:
        vocab: Iterable of terms.
        tf: Mapping ``{doc_id: {term: term frequency}}``.
        idf_scr: Mapping ``{term: idf score}``.
        doc_dict: Mapping whose keys are the document ids to score.

    Returns:
        A dict ``{doc_id: {term: tf * idf}}`` covering every document id in
        ``doc_dict`` and every term in ``vocab``.
    """
    terms = list(vocab)
    return {
        doc_id: {term: tf[doc_id][term] * idf_scr[term] for term in terms}
        for doc_id in doc_dict
    }

In [19]:
# TF-IDF weight of every vocabulary term in every document.
tf_idf = tfidf(
    vocab,
    termFrequencyInDoc(vocab, doc_dict),
    inverseDocFre(vocab, wordDocFre(vocab, doc_dict), len(doc_dict)),
    doc_dict,
)
print(tf_idf)

# Term - Document Matrix: one row per vocabulary term (in vocab order) and one
# column per document (in tf_idf key order).
TD = np.zeros((len(vocab), len(doc_dict)))
for col, doc_scores in enumerate(tf_idf.values()):
    for row, word in enumerate(vocab):
        # enumerate supplies the indices directly, avoiding a linear
        # .index() scan for every cell of the matrix.
        TD[row, col] = doc_scores[word]
print(TD)

{'doc1': {'kembang': 1.1249387366083, 'sistem': 1.1249387366083, 'informasi': 1.3010299956639813, 'jadwal': 1.3010299956639813, 'model': 0.0, 'analisis': 0.0, 'sentimen': 0.0, 'berita': 0.0, 'input': 0.0, 'output': 0.0}, 'doc2': {'kembang': 1.1249387366083, 'sistem': 0.0, 'informasi': 0.0, 'jadwal': 0.0, 'model': 1.3010299956639813, 'analisis': 1.1249387366083, 'sentimen': 1.3010299956639813, 'berita': 1.3010299956639813, 'input': 0.0, 'output': 0.0}, 'doc3': {'kembang': 0.0, 'sistem': 1.1249387366083, 'informasi': 0.0, 'jadwal': 0.0, 'model': 0.0, 'analisis': 1.1249387366083, 'sentimen': 0.0, 'berita': 0.0, 'input': 1.3010299956639813, 'output': 1.3010299956639813}}
[[1.12493874 1.12493874 0.        ]
 [1.12493874 0.         1.12493874]
 [1.30103    0.         0.        ]
 [1.30103    0.         0.        ]
 [0.         1.30103    0.        ]
 [0.         1.12493874 1.12493874]
 [0.         1.30103    0.        ]
 [0.         1.30103    0.        ]
 [0.         0.         1.30103   ]
 [0.         0.         1.30103   ]]


## C. Ukuran Kemiripan Teks _(Text Similarity)_
### 1. Edit Distance

In [20]:
def edit_distance(string1, string2):
    """Return a position-wise distance between two strings.

    The result is the difference in length plus the number of positions,
    within the length of the shorter string, at which the characters differ.

    Note that this is not the Levenshtein edit distance: characters are only
    compared at equal positions, so shifted text is counted as mismatching
    (e.g. "abc" vs "bca" gives 3, whereas the Levenshtein distance is 2).

    Args:
        string1: First string.
        string2: Second string.

    Returns:
        Non-negative integer distance; 0 when both strings are equal.
    """
    # Starting from the length gap also covers equal-length inputs, which
    # previously left the counter undefined and raised an error.
    difference = abs(len(string1) - len(string2))
    # zip stops at the shorter string, i.e. only the shared prefix is compared.
    for char1, char2 in zip(string1, string2):
        if char1 != char2:
            difference += 1
    return difference

In [21]:
# Pairwise distance between the three document strings.
print(edit_distance(doc_dict['doc1'],doc_dict['doc2']))  # doc1 & doc2
print(edit_distance(doc_dict['doc1'],doc_dict['doc3']))  # doc1 & doc3
print(edit_distance(doc_dict['doc2'],doc_dict['doc3']))  # doc2 & doc3

30
31
38


### 2. Jaccard Similarity

In [22]:
def jaccard_sim(list1, list2):
    """Return the Jaccard similarity of two token collections.

    Both inputs are reduced to sets first, so repeated tokens do not inflate
    the size of the union.

    Args:
        list1: First iterable of hashable tokens.
        list2: Second iterable of hashable tokens.

    Returns:
        ``|A & B| / |A | B|`` as a float in [0, 1]. Two empty inputs are
        treated as identical and yield 1.0 instead of dividing by zero.
    """
    set1, set2 = set(list1), set(list2)
    union = set1 | set2
    if not union:
        return 1.0
    return float(len(set1 & set2)) / len(union)

In [23]:
# Pairwise Jaccard similarity; each document is tokenized by splitting on spaces.
print(jaccard_sim(doc_dict['doc1'].split(" "), doc_dict['doc2'].split(" ")))  # doc1 & doc2
print(jaccard_sim(doc_dict['doc1'].split(" "), doc_dict['doc3'].split(" ")))  # doc1 & doc3
print(jaccard_sim(doc_dict['doc2'].split(" "), doc_dict['doc3'].split(" ")))  # doc2 & doc3

0.125
0.14285714285714285
0.125


### 3. Euclidean Distance

In [24]:
def euclidian_dist(vec1, vec2):
    """Return the Euclidean distance between two vectors.

    Args:
        vec1: First vector, a NumPy array or any array-like (list, tuple).
        vec2: Second vector of the same shape as ``vec1``.

    Returns:
        The square root of the sum of squared element-wise differences.
    """
    # Coerce to arrays so plain Python sequences can be subtracted too;
    # existing NumPy arrays pass through unchanged.
    diff = np.asarray(vec1) - np.asarray(vec2)

    # The dot product of the difference with itself is the sum of squares.
    sum_sq = np.dot(diff.T, diff)

    return np.sqrt(sum_sq)

In [25]:
# Pairwise Euclidean distance between the document columns of TD.
print(euclidian_dist(TD[:,0], TD[:, 1])) # doc1 & doc2
print(euclidian_dist(TD[:,0], TD[:, 2])) # doc1 & doc3
print(euclidian_dist(TD[:,1], TD[:, 2])) # doc2 & doc3

3.3157758624989797
3.049867295590651
3.3157758624989797


### 4. Cosine Similarity

In [52]:
import math
def cosine_sim(vec1, vec2):
    """Return the cosine similarity between two vectors.

    Args:
        vec1: First sequence of numbers.
        vec2: Second sequence of numbers, indexed in parallel with ``vec1``.

    Returns:
        ``dot(vec1, vec2) / (|vec1| * |vec2|)``. When either vector has zero
        magnitude the similarity is undefined and 0.0 is returned instead of
        dividing by zero.
    """
    dot_prod = sum(v * vec2[i] for i, v in enumerate(vec1))
    mag_1 = math.sqrt(sum(x ** 2 for x in vec1))
    mag_2 = math.sqrt(sum(x ** 2 for x in vec2))
    denominator = mag_1 * mag_2
    if denominator == 0:
        # At least one vector is all zeros: there is no direction to compare.
        return 0.0
    return dot_prod / denominator


In [53]:
# Cosine similarity between the first two document columns of TD.
print(cosine_sim(TD[:, 0], TD[:, 1])) # doc1 and doc2

0.18861114004375698
