### Import All Third Party Libraries

In [1]:
import pandas as pd
from openpyxl import load_workbook
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory  # Rumus Library
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.metrics.pairwise import pairwise_distances
import numpy as np
import numpy.linalg as LA
import string
import matplotlib.pyplot as plt

stemmer = StemmerFactory().create_stemmer()  # Object stemmer
remover = StopWordRemoverFactory().create_stop_word_remover()  # objek stopword

### Define Custom Libraries

In [2]:
class Engine:
    def __init__(self):
        self.cosine_score = []
        self.train_set = []  # Documents
        self.test_set = []  # Query

    def addDocument(self, word):
        self.train_set.append(word)

    def setQuery(self, word):
        self.test_set.append(word)

    def process_score(self):
        stopWords = stopwords.words('english')
        vectorizer = CountVectorizer()

        transformer = TfidfTransformer()

        trainVectorizerArray = vectorizer.fit_transform(self.train_set).toarray()
        testVectorizerArray = vectorizer.transform(self.test_set).toarray()

        cx = lambda a, b: round(np.inner(a, b) / (LA.norm(a) * LA.norm(b)), 3)
        #         print testVectorizerArray
        output = []
        for i in range(0, len(testVectorizerArray)):
            output.append([])

        for vector in trainVectorizerArray:
            # print vector
            u = 0
            for testV in testVectorizerArray:
                # print testV
                cosine = cx(vector, testV)
                #                 self.cosine_score.append(cosine)
                #                 bulatin = (round(cosine),2)
                output[u].append((cosine))
                u = u + 1
        return output
        # return testVectorizerArray


def stemmerEN(text):
    porter = PorterStemmer()
    stop = set(stopwords.words('english'))
    text = text.lower()
    text = [i for i in text.lower().split() if i not in stop]
    text = ' '.join(text)
    preprocessed_text = text.translate(None, string.punctuation)
    text_stem = porter.stem(preprocessed_text)
    return text_stem


def preprocess(text):
    text = text.lower()
    text_clean = remover.remove(text)
    text_stem = stemmer.stem(text_clean)
    return text_stem


def kMedoids(D, k, tmax=100):
    # determine dimensions of distance matrix D
    m, n = D.shape

    if k > n:
        raise Exception('too many medoids')

    # find a set of valid initial cluster medoid indices since we
    # can't seed different clusters with two points at the same location
    valid_medoid_inds = set(range(n))
    invalid_medoid_inds = set([])
    rs, cs = np.where(D == 0)
    # the rows, cols must be shuffled because we will keep the first duplicate below
    index_shuf = list(range(len(rs)))
    wikwik = np.random.shuffle(index_shuf)
    #     print (wikwik)
    np.random.shuffle(index_shuf)
    rs = rs[index_shuf]
    cs = cs[index_shuf]
    for r, c in zip(rs, cs):
        # if there are two points with a distance of 0...
        # keep the first one for cluster init
        if r < c and r not in invalid_medoid_inds:
            invalid_medoid_inds.add(c)
    valid_medoid_inds = list(valid_medoid_inds - invalid_medoid_inds)

    if k > len(valid_medoid_inds):
        raise Exception('too many medoids (after removing {} duplicate points)'.format(
            len(invalid_medoid_inds)))

    # randomly initialize an array of k medoid indices
    M = np.array(valid_medoid_inds)
    np.random.shuffle(M)
    M = np.sort(M[:k])

    # create a copy of the array of medoid indices
    Mnew = np.copy(M)

    # initialize a dictionary to represent clusters
    C = {}
    for t in range(tmax):
        # determine clusters, i. e. arrays of data indices
        J = np.argmin(D[:, M], axis=1)
        for kappa in range(k):
            C[kappa] = np.where(J == kappa)[0]
        # update cluster medoids
        for kappa in range(k):
            J = np.mean(D[np.ix_(C[kappa], C[kappa])], axis=1)
            j = np.argmin(J)
            Mnew[kappa] = C[kappa][j]
        np.sort(Mnew)
        # check for convergence
        if np.array_equal(M, Mnew):
            break
        M = np.copy(Mnew)
    else:
        # final update of cluster memberships
        J = np.argmin(D[:, M], axis=1)
        for kappa in range(k):
            C[kappa] = np.where(J == kappa)[0]

    # return results
    return M, C

### Load Datasets

In [3]:
pd.set_option('display.max_colwidth', -1)
wb = load_workbook(filename='data-minimalis.xlsx')
dataset = pd.DataFrame(wb['Sheet1'].values)
dataset.columns = ["Judul"]
dataset

Unnamed: 0,Judul
0,Penerapan antarmuka bahasa alami dalam pencarian informasi skripsi pada suatu program studi
1,System reminder aktifitas akademik dosen
2,Perancangan Sistem Test Komputerisasi dan Pendukung Keputusan Penerimaan Pegawai
3,Pengolahan Bahasa Alami pada Agen Cerdas Alat Bantu Konsultasi Zakat
4,Perancangan dan implementasi system pengenalan jenis kulit hewan unutk kerajinan kulit berbasis system cerdas
5,Pengembangan Template Media Pembelajaran Berbasis Flash
6,Membangun system informasi eksekutif (SIE) dengan menggunakan pendekatan system cerdas
7,Perancangan visualisasi informasi untuk system evaluasi guru
8,Pengembangan Sistem Penilaian Kerja Dosen dengan Aspek IKD dan SKP 2014 Berbasis Online
9,Perancangan animasi wayang pendidikan sebagai pengembangan media edukasi sekolah


### Preprocessing Datasets

In [4]:
list_pre_judul = []
for data in dataset['Judul']:
    list_pre_judul.append(preprocess(data))
pre_judul = pd.DataFrame(list_pre_judul)
pre_judul.columns = ["Judul"]
pre_judul

Unnamed: 0,Judul
0,terap antarmuka bahasa alami cari informasi skripsi suatu program studi
1,system reminder aktifitas akademik dosen
2,ancang sistem test komputerisasi dukung putus terima pegawai
3,olah bahasa alami agen cerdas alat bantu konsultasi zakat
4,ancang implementasi system kenal jenis kulit hewan unutk rajin kulit bas system cerdas
5,kembang template media ajar bas flash
6,bangun system informasi eksekutif sie guna dekat system cerdas
7,ancang visualisasi informasi system evaluasi guru
8,kembang sistem nilai kerja dosen aspek ikd skp 2014 bas online
9,ancang animasi wayang didik kembang media edukasi sekolah


### Get Cosine Score from Title

In [5]:

# Call Engine from Custom Libraries
engine = Engine()

# Define data uji
list_dokumen = [str(x) for x in pre_judul['Judul']]
list_datauji = [str(x) for x in pre_judul['Judul']]
columnNames = []

for i, doc in enumerate(list_dokumen):
    engine.addDocument(doc)
    columnNames.append("Document_{}".format(i+1))
    
for doc in list_datauji:
    engine.setQuery(doc)
    
titles_score = engine.process_score()
titlesScoreDf = pd.DataFrame(titles_score)
titlesScoreDf.columns = columnNames
titlesScoreDf

Unnamed: 0,Document_1,Document_2,Document_3,Document_4,Document_5,Document_6,Document_7,Document_8,Document_9,Document_10,Document_11
0,1.0,0.0,0.0,0.211,0.0,0.0,0.095,0.129,0.0,0.0,0.085
1,0.0,1.0,0.0,0.0,0.217,0.0,0.27,0.183,0.135,0.0,0.0
2,0.0,0.0,1.0,0.0,0.086,0.0,0.0,0.144,0.107,0.125,0.0
3,0.211,0.0,0.0,1.0,0.081,0.0,0.101,0.0,0.0,0.0,0.0
4,0.0,0.217,0.086,0.081,1.0,0.099,0.366,0.297,0.073,0.086,0.0
5,0.0,0.0,0.0,0.0,0.099,1.0,0.0,0.0,0.246,0.289,0.0
6,0.095,0.27,0.0,0.101,0.366,0.0,1.0,0.369,0.0,0.0,0.0
7,0.129,0.183,0.144,0.0,0.297,0.0,0.369,1.0,0.0,0.144,0.0
8,0.0,0.135,0.107,0.0,0.073,0.246,0.0,0.0,1.0,0.107,0.0
9,0.0,0.0,0.125,0.0,0.086,0.289,0.0,0.144,0.107,1.0,0.0


### Testing K Medoids Methods

In [12]:
data = np.array(titles_score)
datasetResult = dataset.copy()
datasetResult['Cluster'] = 0
# Get distance matrix
D = pairwise_distances(data, metric='euclidean')

# Get medoids point
M, C = kMedoids(D, 3)

print("<Medoids:{}>".format(M))
hasil = []
for label in C:
    for point_idx in C[label]:
        datasetResult["Cluster"][point_idx] = label
#         print('label {0}:　{1}'.format(label, data[point_idx]))
#         hasil.append({'label':label, 'skor': list(data[point_idx])})
datasetResult

<Medoids:[5 6 7]>


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,Judul,Cluster
0,Penerapan antarmuka bahasa alami dalam pencarian informasi skripsi pada suatu program studi,2
1,System reminder aktifitas akademik dosen,1
2,Perancangan Sistem Test Komputerisasi dan Pendukung Keputusan Penerimaan Pegawai,2
3,Pengolahan Bahasa Alami pada Agen Cerdas Alat Bantu Konsultasi Zakat,1
4,Perancangan dan implementasi system pengenalan jenis kulit hewan unutk kerajinan kulit berbasis system cerdas,1
5,Pengembangan Template Media Pembelajaran Berbasis Flash,0
6,Membangun system informasi eksekutif (SIE) dengan menggunakan pendekatan system cerdas,1
7,Perancangan visualisasi informasi untuk system evaluasi guru,2
8,Pengembangan Sistem Penilaian Kerja Dosen dengan Aspek IKD dan SKP 2014 Berbasis Online,0
9,Perancangan animasi wayang pendidikan sebagai pengembangan media edukasi sekolah,0


### Get list of titles according clusters label

In [7]:
kumpul = []
for i in range(0,len(data)):
    kumpul.append({'judul':dataset['Judul'][i],'skor':list(titles_score[i]), 'id': i})
kumpul

[{'judul': 'Penerapan antarmuka bahasa alami dalam pencarian informasi skripsi pada suatu program studi ',
  'skor': [1.0, 0.0, 0.0, 0.211, 0.0, 0.0, 0.095, 0.129, 0.0, 0.0, 0.085],
  'id': 0},
 {'judul': 'System reminder aktifitas akademik dosen ',
  'skor': [0.0, 1.0, 0.0, 0.0, 0.217, 0.0, 0.27, 0.183, 0.135, 0.0, 0.0],
  'id': 1},
 {'judul': 'Perancangan Sistem Test Komputerisasi dan Pendukung Keputusan Penerimaan Pegawai',
  'skor': [0.0, 0.0, 1.0, 0.0, 0.086, 0.0, 0.0, 0.144, 0.107, 0.125, 0.0],
  'id': 2},
 {'judul': 'Pengolahan Bahasa Alami pada Agen Cerdas Alat Bantu Konsultasi Zakat',
  'skor': [0.211, 0.0, 0.0, 1.0, 0.081, 0.0, 0.101, 0.0, 0.0, 0.0, 0.0],
  'id': 3},
 {'judul': 'Perancangan dan implementasi system pengenalan  jenis kulit hewan unutk kerajinan kulit berbasis system cerdas ',
  'skor': [0.0,
   0.217,
   0.086,
   0.081,
   1.0,
   0.099,
   0.366,
   0.297,
   0.073,
   0.086,
   0.0],
  'id': 4},
 {'judul': 'Pengembangan Template Media Pembelajaran Berbasis F

In [8]:
fix_data = []
for i in range(0,len(kumpul)):
    for j in range(0,len(hasil)):
        if kumpul[i]['skor'] == hasil[j]['skor']:
            print(j,kumpul[i]['judul'],hasil[j]['label'])
            fix_data.append({'id':j,'judul':kumpul[i]['judul'],'cluster':hasil[j]['label']})

In [25]:
for data in range(len(datasetResult["Cluster"])):
    print(datasetResult["Judul"][data])

Penerapan antarmuka bahasa alami dalam pencarian informasi skripsi pada suatu program studi 
System reminder aktifitas akademik dosen 
Perancangan Sistem Test Komputerisasi dan Pendukung Keputusan Penerimaan Pegawai
Pengolahan Bahasa Alami pada Agen Cerdas Alat Bantu Konsultasi Zakat
Perancangan dan implementasi system pengenalan  jenis kulit hewan unutk kerajinan kulit berbasis system cerdas 
Pengembangan Template Media Pembelajaran Berbasis Flash
Membangun system informasi eksekutif (SIE) dengan menggunakan pendekatan system cerdas 
Perancangan visualisasi informasi untuk system evaluasi guru 
Pengembangan Sistem Penilaian Kerja Dosen dengan Aspek IKD dan SKP 2014 Berbasis Online
Perancangan animasi wayang  pendidikan sebagai pengembangan media edukasi sekolah 
Studi Deskriptif: Penyebab dan Bentuk Perilaku Mencontek pada SMP Muhammadiyah se-Kota Yogyakarta Tahun 2012/2013
