## Persiapan Data

In [None]:
# Import Library Standard
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
# Import Library Sklearn
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.naive_bayes import MultinomialNB

# Import Library Sklearn (Evaluasi Tak Berperingkat)
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score


# Import Library Sklearn (Evaluasi Berperingkat)
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score

# Import Library untuk Stemming
!pip install Sastrawi
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

Collecting Sastrawi
[?25l  Downloading https://files.pythonhosted.org/packages/6f/4b/bab676953da3103003730b8fcdfadbdd20f333d4add10af949dd5c51e6ed/Sastrawi-1.0.1-py2.py3-none-any.whl (209kB)
[K    100% |████████████████████████████████| 215kB 24.1MB/s 
[?25hInstalling collected packages: Sastrawi
Successfully installed Sastrawi-1.0.1


In [None]:
# Load the dataset from the Excel workbook in the working directory.
data = pd.read_excel('dataset1.xlsx')
# Alternative source, currently disabled:
# data = pd.read_csv('dataset2.csv')
# Preview the first rows.
data.head()

Unnamed: 0,Topic,Title,Content
0,Politik,"Pelanggaran Pemilu, Tiga Caleg di Sulteng Dipr...","Komisioner Bawaslu Sigi, Sulawesi Tengah, Agus..."
1,Politik,"Pemilu Susulan di Kota Jayapura, Suara Jokowi ...",Walaupun dua dari lima distrik melakukan pemil...
2,Politik,"Tsamara Amany Dipinang, Pengurus PSI Daerah Me...","Tsamara Amany, politisi Partai Solidaritas Ind..."
3,Politik,Ada 47 TPS di Sulawesi Utara Berpotensi Pemili...,Badan Pengawas Pemilu (Bawaslu) Provinsi Sulaw...
4,Politik,Ketua KPPS di Sleman Ditemukan Tewas Gantung D...,"Tugiman, Ketua Kelompok Penyelenggara Pemungut..."


In [None]:
# Label Mapping
# data['Topic'] = data['Topic'].map({'Politik':0, 'Teknologi':1, 'Travel':2})
# data.head()

In [None]:
# Dataset size as (rows, columns).
print('Ukuran dataset:', data.shape)

Ukuran dataset: (60, 3)


In [None]:
# Split the article bodies ('Content') and their topics ('Topic') into training
# documents and test queries. train_size=0.5 and test_size=0.16 do not add up to
# 1, so part of the dataset is deliberately left out of both splits.
# random_state pins the shuffle: without it every "Restart & Run All" draws a
# different split and all downstream numbers change from run to run.
x_train, x_test, y_train, y_test = train_test_split(data['Content'], data['Topic'], train_size=0.5, test_size=0.16, random_state=42)

In [None]:
# Pair each text with its topic label, one frame per split; the original row
# index produced by the split is kept.
train_data = pd.concat([x_train.rename('teks'), y_train.rename('label')], axis=1)
test_data = pd.concat([x_test.rename('teks'), y_test.rename('label')], axis=1)

In [None]:
# Preview the training split (columns: 'teks', 'label').
train_data.head()

Unnamed: 0,teks,label
28,Masyarakat Indonesia menggunakan hak pilihnya ...,Teknologi
16,Pemungutan suara Pemilu 2019 telah usai. Tapi ...,Politik
47,Pemerintah Kota Banjarmasin baru saja menyeles...,Travel
18,Isu kecurangan di Pemilu 2019 terus menyeruak....,Politik
49,Bila biasanya mesin fotokopi terdapat di kios-...,Travel


In [None]:
# Preview the test split (columns: 'teks', 'label').
test_data.head()

Unnamed: 0,teks,label
30,Kemajuan teknologi memudahkan dan mengubah keb...,Teknologi
38,Kamu mungkin pernah merasa kesulitan untuk ber...,Teknologi
14,KPU memberikan klarifikasi mengenai banyaknya ...,Politik
53,Untuk mendukung pengembangan pariwisata Sulawe...,Travel
0,"Komisioner Bawaslu Sigi, Sulawesi Tengah, Agus...",Politik


In [None]:
# Keep the row counts of both splits for later positional slicing, then report
# each split's (rows, columns) shape.
n_train = len(train_data)
n_test = len(test_data)
print('Ukuran data train:', train_data.shape)
print('Ukuran data test:', test_data.shape)

Ukuran data train: (30, 2)
Ukuran data test: (10, 2)


In [None]:
# Stack the training rows first and the test rows after them, renumbering the
# index from 0 so that the first n_train positions are exactly the training rows.
sparse_data = pd.concat([train_data, test_data]).reset_index(drop=True)
sparse_data.head()

Unnamed: 0,teks,label
0,Masyarakat Indonesia menggunakan hak pilihnya ...,Teknologi
1,Pemungutan suara Pemilu 2019 telah usai. Tapi ...,Politik
2,Pemerintah Kota Banjarmasin baru saja menyeles...,Travel
3,Isu kecurangan di Pemilu 2019 terus menyeruak....,Politik
4,Bila biasanya mesin fotokopi terdapat di kios-...,Travel


In [None]:
# Total number of documents (train + test) in the combined frame.
n_document = len(sparse_data)
print('Ukuran sparse data:', sparse_data.shape)

Ukuran sparse data: (40, 2)


## Stemming Dengan Stopword

### Preprocessing

**Stemming**

In [None]:
# Build the Sastrawi stemmer once via its factory; the same instance is reused
# for every document in the stemming step.
stemmerFactory = StemmerFactory()
stemmer = stemmerFactory.create_stemmer()

In [None]:
# Stem process: replace the text of every document with its stemmed form.
sparse_data['teks'] = sparse_data['teks'].map(stemmer.stem)

In [None]:
# Preview the stemmed texts.
sparse_data.head()

Unnamed: 0,teks,label
0,masyarakat indonesia guna hak pilih dalam milu...,Teknologi
1,mungut suara milu 2019 telah usai tapi duka ma...,Politik
2,perintah kota banjarmasin baru saja selesai de...,Travel
3,isu curang di milu 2019 terus seruak dua timse...,Politik
4,bila biasa mesin fotokopi dapat di kios yang t...,Travel


### Perhitungan Bobot

In [None]:
# Learn the vocabulary from all stemmed documents (train and test together),
# then encode every document as a row of raw term counts.
vectorizer = CountVectorizer().fit(sparse_data['teks'])
tf = vectorizer.transform(sparse_data['teks'])
print('Jumlah dokumen:', tf.shape[0])
print('Jumlah term:', tf.shape[1])

Jumlah dokumen: 40
Jumlah term: 2284


In [None]:
print('Daftar Term:')
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Use get_feature_names_out() when the installed version provides it and fall
# back to the old method otherwise; list(...) keeps the displayed value a list.
list(vectorizer.get_feature_names_out() if hasattr(vectorizer, 'get_feature_names_out') else vectorizer.get_feature_names())

Daftar Term:


['00',
 '000',
 '02',
 '03',
 '030',
 '04',
 '08',
 '09',
 '10',
 '100',
 '105',
 '11',
 '113',
 '12',
 '120',
 '13',
 '14',
 '144',
 '15',
 '150',
 '16',
 '17',
 '1723',
 '18',
 '19',
 '191',
 '1958',
 '1997',
 '20',
 '200',
 '2004',
 '2005',
 '2007',
 '2013',
 '2014',
 '2015',
 '2016',
 '2017',
 '2018',
 '2019',
 '2020',
 '21',
 '210',
 '212',
 '218',
 '22',
 '23',
 '237',
 '24',
 '241',
 '247',
 '25',
 '250',
 '26',
 '28',
 '29',
 '30',
 '300',
 '31',
 '330',
 '34',
 '35',
 '36',
 '360',
 '366',
 '37',
 '38',
 '39',
 '3d',
 '40',
 '400',
 '420',
 '43',
 '44',
 '45',
 '46',
 '47',
 '50',
 '500',
 '51',
 '52',
 '53',
 '546',
 '55',
 '56',
 '562',
 '57',
 '579',
 '585',
 '59',
 '60',
 '600',
 '63',
 '630',
 '64',
 '640',
 '65',
 '650',
 '6703',
 '6s',
 '70',
 '75',
 '750',
 '79',
 '80',
 '800an',
 '810',
 '82',
 '835',
 '84',
 '85',
 '87',
 '870',
 '883',
 '893',
 '90',
 '94',
 '97',
 'abad',
 'abadi',
 'abdul',
 'academy',
 'acara',
 'action',
 'ada',
 'adalah',
 'adapun',
 'adik',
 '

In [None]:
print('Daftar Stopword:')
# The CountVectorizer was created without a stop_words argument, so no
# stop-word list is configured and this cell displays nothing.
vectorizer.get_stop_words()

Daftar Stopword:


In [None]:
print('Matriks Tf:')
# Dense term-frequency table: one row per document, one column per term.
# get_feature_names() was removed in scikit-learn 1.2, so prefer
# get_feature_names_out() when it exists and fall back to the old method otherwise.
tf_matrix = pd.DataFrame(
  tf.toarray(),
  columns=list(vectorizer.get_feature_names_out() if hasattr(vectorizer, 'get_feature_names_out') else vectorizer.get_feature_names()),
)
tf_matrix

Matriks Tf:


Unnamed: 0,00,000,02,03,030,04,08,09,10,100,...,yang,yerusalem,yesus,yogyakarta,yunani,zahid,zamih,zat,ziarah,zoetry
0,0,0,0,0,0,0,0,0,0,0,...,6,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,5,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,4,0,0,0,0,1,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,5,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,6,0,0,0,0,0,0,0,0,0
5,3,0,1,0,0,0,0,0,0,0,...,7,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,4,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,10,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,12,1,5,0,0,0,0,0,5,0
9,0,0,0,0,0,0,0,0,2,0,...,4,0,0,0,0,0,0,0,0,1


In [None]:
print('Matriks Tf (khusus data train):')
# The first n_train rows of the term-frequency table are the training documents.
tf_train = tf_matrix.iloc[:n_train]
tf_train.shape

Matriks Tf (khusus data train):


(30, 2284)

In [None]:
transformer = TfidfTransformer(sublinear_tf=True)

# Document frequency must come from the training rows only, so that the queries
# (test data) do not influence the IDF.
#
# The earlier version assigned `transformer.idf_` by hand and then called
# `fit_transform(tf)`. That call re-fits on every row (train + test) and
# overwrites the assigned IDF, so the adjustment never took effect. The manual
# log(n / df) was also infinite for terms that occur only in test rows (df == 0).
#
# Fitting on the training slice and then transforming all rows gives the
# intended behaviour. scikit-learn's default smoothed IDF,
# ln((1 + n) / (1 + df)) + 1, stays finite when df == 0.
n = n_train
df = tf_train.astype(bool).sum(axis=0)  # per-term training document frequency, kept for inspection
transformer.fit(tf[:n_train])
idf = transformer.idf_

# Weight every row (training rows first, then test rows) with the train-only IDF.
weight = transformer.transform(tf)
print('Jumlah dokumen:', weight.shape[0])
print('Jumlah term:', weight.shape[1])

Jumlah dokumen: 40
Jumlah term: 2284


In [None]:
# Dense TF-IDF table: one row per document, one column per term.
# get_feature_names() was removed in scikit-learn 1.2, so prefer
# get_feature_names_out() when it exists and fall back to the old method otherwise.
weight_matrix = pd.DataFrame(
  weight.toarray(),
  columns=list(vectorizer.get_feature_names_out() if hasattr(vectorizer, 'get_feature_names_out') else vectorizer.get_feature_names()),
)
weight_matrix

Unnamed: 0,00,000,02,03,030,04,08,09,10,100,...,yang,yerusalem,yesus,yogyakarta,yunani,zahid,zamih,zat,ziarah,zoetry
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.066301,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.069377,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.044423,0.0,0.0,0.0,0.0,0.074844,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.059516,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.060025,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.105043,0.0,0.05829,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.047502,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.050314,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.056157,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.057495,0.06633,0.173084,0.0,0.0,0.0,0.0,0.0,0.173084,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.10358,0.0,...,0.049963,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.084178


In [None]:
# Pembagian matriks bobot
weight_train = weight_matrix[:n_train]
weight_test = weight_matrix[n_train:]

### Perhitungan Cosine Similarity

In [None]:
# Perhitungan cosine similarity
cosim = cosine_similarity(weight_train, weight_test)
print('Ukuran matriks cosine similarity: ', cosim.shape)

Ukuran matriks cosine similarity:  (30, 10)


In [None]:
# Column labels for the test queries: 'Dokumen 0', 'Dokumen 1', ...
name = ['Dokumen ' + str(i) for i in range(n_test)]

In [None]:
# Baris = dokumen train, Kolom = dokumen test
print('Matriks Cosine Similarity:')
cosim_matrix = pd.DataFrame(cosim, columns=name)
cosim_matrix.shape

Matriks Cosine Similarity:


(30, 10)

In [None]:
# Cosim Matrix + Label
cosim_matrix['Label Train'] = train_data['label'].values
label_row = dict(zip(name, test_data['label'].values))
label_cosim = cosim_matrix.append(label_row, ignore_index=True)
label_cosim.rename({label_cosim.index[-1]:'Label Test'}, inplace=True)

label_test = pd.DataFrame(label_cosim.iloc[-1])
label_test = label_test.T
label_test

Unnamed: 0,Dokumen 0,Dokumen 1,Dokumen 2,Dokumen 3,Dokumen 4,Dokumen 5,Dokumen 6,Dokumen 7,Dokumen 8,Dokumen 9,Label Train
Label Test,Teknologi,Teknologi,Politik,Travel,Politik,Teknologi,Travel,Teknologi,Travel,Travel,


In [None]:
# Show the similarity matrix together with the train labels (last column) and
# the test labels (last row).
label_cosim

Unnamed: 0,Dokumen 0,Dokumen 1,Dokumen 2,Dokumen 3,Dokumen 4,Dokumen 5,Dokumen 6,Dokumen 7,Dokumen 8,Dokumen 9,Label Train
0,0.166523,0.125987,0.251644,0.0973246,0.121499,0.0881351,0.0810806,0.0875771,0.0817409,0.0599243,Teknologi
1,0.0760685,0.0858697,0.182546,0.0869399,0.0981643,0.0543936,0.0583188,0.0451769,0.0574865,0.0425029,Politik
2,0.0609758,0.0776311,0.0438048,0.12774,0.0444317,0.0744816,0.113667,0.0407356,0.0677027,0.062158,Travel
3,0.078341,0.094299,0.182418,0.0914816,0.0684309,0.0776131,0.0673495,0.055803,0.0637513,0.0543963,Politik
4,0.0764908,0.10425,0.0696865,0.0663873,0.0475658,0.0906005,0.0602488,0.0664026,0.0703064,0.0800367,Travel
5,0.127565,0.16722,0.143201,0.102751,0.0667467,0.105445,0.118526,0.114682,0.111002,0.076542,Politik
6,0.113296,0.120224,0.0728179,0.085999,0.0408695,0.0878792,0.0588184,0.184536,0.136251,0.0860654,Teknologi
7,0.156922,0.141301,0.137956,0.13048,0.0613107,0.0992285,0.088209,0.155896,0.120633,0.0854708,Teknologi
8,0.0969691,0.118859,0.0836936,0.109936,0.0748589,0.0980873,0.0873071,0.0888245,0.0959088,0.130477,Travel
9,0.0879894,0.125724,0.089263,0.108286,0.0772156,0.0829777,0.115987,0.08931,0.146899,0.145044,Travel


### Evaluasi

In [None]:
def average_precision(cosim_matrix, n_retrieve, label_test=None):
  """Compute the average precision of every query over its top-ranked documents.

  Each column of `cosim_matrix` except the last is treated as one query holding
  the similarity of every training document to that query. Documents are ranked
  by descending similarity and the first `n_retrieve` are inspected. A document
  is relevant when its 'Label Train' value equals the query's label.

  Parameters
  ----------
  cosim_matrix : pandas.DataFrame
    One column per query, followed by a final 'Label Train' column.
  n_retrieve : int
    Number of top-ranked documents to consider per query.
  label_test : pandas.DataFrame, optional
    Frame whose column `q` holds the label of query `q`. When omitted, the
    notebook-level `label_test` variable is used, as before.

  Returns
  -------
  list of float
    One value per query: the mean of precision@rank over the ranks at which a
    relevant document appears, or 0.0 when none of the retrieved documents is
    relevant (previously NaN with a RuntimeWarning, which also turned the mean
    over all queries into NaN).
  """
  if label_test is None:
    # Backward compatible: fall back to the global the original code relied on.
    label_test = globals()['label_test']

  ap_per_query = []

  # Every column except the trailing 'Label Train' column is a query.
  for column in cosim_matrix.columns[:-1]:
    ranked = cosim_matrix.sort_values(column, ascending=False)
    top_labels = np.array(ranked['Label Train'])[:n_retrieve]

    # True where the retrieved document shares the query's label.
    relevant = np.asarray(top_labels == np.array(label_test[column]), dtype=bool)

    # Precision at each rank where a relevant document occurs:
    # (relevant documents seen so far) / rank.
    hits_so_far = np.cumsum(relevant)
    ranks = np.arange(1, len(relevant) + 1)
    precisions = hits_so_far[relevant] / ranks[relevant]

    ap_per_query.append(precisions.mean() if precisions.size else 0.0)

  return ap_per_query

def mean_average_precision(cosim_matrix, n_retrieve):
  """Return the mean of the per-query average precisions (MAP).

  Arguments are passed unchanged to `average_precision`.
  """
  per_query_ap = average_precision(cosim_matrix, n_retrieve)
  return np.mean(per_query_ap)

def precision_at_k(cosim_matrix, k_retrieve, label_test=None):
  """Compute precision over the top `k_retrieve` documents of every query.

  Each column of `cosim_matrix` except the last is one query. Training
  documents are ranked by descending similarity, and a document is relevant
  when its 'Label Train' value equals the query's label.

  Parameters
  ----------
  cosim_matrix : pandas.DataFrame
    One column per query, followed by a final 'Label Train' column.
  k_retrieve : int
    Number of top-ranked documents to inspect per query.
  label_test : pandas.DataFrame, optional
    Frame whose column `q` holds the label of query `q`. When omitted, the
    notebook-level `label_test` variable is used, as before.

  Returns
  -------
  list of float
    One value per query: relevant retrieved documents divided by the number of
    documents actually retrieved, or 0.0 when nothing is retrieved (previously
    a 0/0 NaN).
  """
  if label_test is None:
    # Backward compatible: fall back to the global the original code relied on.
    label_test = globals()['label_test']

  scores = []

  # Every column except the trailing 'Label Train' column is a query.
  for column in cosim_matrix.columns[:-1]:
    ranked = cosim_matrix.sort_values(column, ascending=False)
    top_labels = np.array(ranked['Label Train'])[:k_retrieve]

    # True where the retrieved document shares the query's label.
    relevant = np.asarray(top_labels == np.array(label_test[column]), dtype=bool)

    scores.append(relevant.mean() if relevant.size else 0.0)

  return scores

def r_precision(cosim_matrix, n_retrieve=None, label_test=None):
  """Compute R-precision for every query.

  For a query with R relevant documents in the collection, R-precision is the
  fraction of relevant documents among the R top-ranked ones. The earlier body
  was a copy of `precision_at_k`: it used the caller-supplied cutoff and never
  computed R, so it reported P@n under the name R-precision.

  Each column of `cosim_matrix` except the last is one query. A training
  document is relevant when its 'Label Train' value equals the query's label.

  Parameters
  ----------
  cosim_matrix : pandas.DataFrame
    One column per query, followed by a final 'Label Train' column.
  n_retrieve : int, optional
    Accepted so existing calls keep working. It is not used, because the
    cutoff R is derived per query from the number of relevant documents.
  label_test : pandas.DataFrame, optional
    Frame whose column `q` holds the label of query `q`. When omitted, the
    notebook-level `label_test` variable is used, as before.

  Returns
  -------
  list of float
    One R-precision value per query; 0.0 when the query has no relevant
    document at all.
  """
  if label_test is None:
    # Backward compatible: fall back to the global the original code relied on.
    label_test = globals()['label_test']

  scores = []

  # Every column except the trailing 'Label Train' column is a query.
  for column in cosim_matrix.columns[:-1]:
    ranked = cosim_matrix.sort_values(column, ascending=False)
    ranked_labels = np.array(ranked['Label Train'])

    # Relevance of every document in ranked order.
    is_relevant = np.asarray(ranked_labels == np.array(label_test[column]), dtype=bool)

    # R = total number of relevant documents for this query.
    n_relevant = int(is_relevant.sum())

    scores.append(is_relevant[:n_relevant].mean() if n_relevant else 0.0)

  return scores

In [None]:
# Report each ranked-retrieval metric, all called with a cutoff of 10 documents
# per query.
for caption, metric in (
  ('Hasil Evaluasi MAP: ', mean_average_precision),
  ('Hasil Evaluasi Precision@K: ', precision_at_k),
  ('Hasil Evaluasi R-Precision: ', r_precision),
):
  print(caption, metric(cosim_matrix, 10))

Hasil Evaluasi MAP:  0.7668191609977324
Hasil Evaluasi Precision@K:  [0.7, 0.6, 0.6, 0.5, 0.5, 0.5, 0.5, 0.7, 0.5, 0.7]
Hasil Evaluasi R-Precision:  [0.7, 0.6, 0.6, 0.5, 0.5, 0.5, 0.5, 0.7, 0.5, 0.7]
