In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

import nltk
# Fetch the NLTK data packages this notebook needs: the stop-word lists read
# through stopwords.words(...) and the 'punkt' tokenizer models (presumably
# what word_tokenize relies on — verify against the installed NLTK version).
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# CSV dataset to search; the path is relative to the current working directory.
file_path = "datasets_bukuPengembanganDiri.csv"
# Load the dataset; later cells read its 'judul', 'penulis' and 'sinopsis' columns.
df = pd.read_csv(file_path)

In [3]:
# Untouched copy of the loaded data, so the original synopsis text is still
# available after df['sinopsis'] is replaced by its preprocessed form.
df_original = df.copy()

# Stemmer applied to every token that survives stop-word filtering.
# NOTE(review): NLTK's PorterStemmer targets English while the stop words
# below are Indonesian — confirm this pairing is intended.
ps = PorterStemmer()

# Indonesian stop words held in a set for fast membership tests.
stop_words = {*stopwords.words('indonesian')}

def preprocess_text(text):
    """Normalise a piece of text before TF-IDF vectorisation.

    The text is tokenised with NLTK's ``word_tokenize``; tokens whose
    lower-cased form appears in the module-level ``stop_words`` set are
    dropped, the remaining tokens are stemmed with the module-level ``ps``
    stemmer, and the stems are joined back together with single spaces.

    Parameters
    ----------
    text : str or None
        Raw text. Missing values (``None`` or a float NaN, which is how
        pandas represents an empty CSV cell) are treated as empty text.
        Any other non-string value is converted with ``str()`` first.

    Returns
    -------
    str
        Space-separated stemmed tokens, or ``''`` for missing input.
    """
    # NaN is the only float that is not equal to itself; catching it here
    # keeps a single empty synopsis from crashing the tokenizer.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    # Tokenise, drop stop words (compared case-insensitively), then stem.
    tokens = word_tokenize(text)
    stemmed_tokens = [ps.stem(token) for token in tokens if token.lower() not in stop_words]
    return ' '.join(stemmed_tokens)

# Replace every synopsis with its stop-word-filtered, stemmed form.
df['sinopsis'] = df['sinopsis'].map(preprocess_text)

# Learn the vocabulary and IDF weights from the preprocessed synopses and
# encode each synopsis as one row of the TF-IDF matrix.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df['sinopsis'])

# Pairwise cosine similarity between all synopses; given a single matrix,
# cosine_similarity compares it against itself.
cosine_sim = cosine_similarity(tfidf_matrix)

In [4]:
# Search function
def search_books(query, df, vectorizer, cosine_sim):
    """Rank the books in ``df`` by TF-IDF cosine similarity to ``query``.

    Parameters
    ----------
    query : str
        Free-text search keywords as typed by the user.
    df : pandas.DataFrame
        Book table; its 'judul' and 'penulis' columns are copied into the
        result.
    vectorizer : fitted vectorizer
        Object whose ``transform`` turns the preprocessed query into a
        vector comparable with the module-level ``tfidf_matrix``.
    cosine_sim : any
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    pandas.DataFrame or str
        A DataFrame with columns 'judul', 'penulis', 'sinopsis' and
        'similarity', restricted to rows with similarity > 0 and sorted by
        descending similarity. When no row qualifies, a message string that
        quotes the query exactly as the user entered it.

    Notes
    -----
    Relies on two module-level names besides ``preprocess_text``:
    ``tfidf_matrix`` (document vectors the query is compared against) and
    ``df_original`` (source of the unprocessed 'sinopsis' text shown in the
    results).
    """
    # Keep the user's wording in ``query`` for the "no results" message;
    # only the preprocessed form is used for matching.
    processed_query = preprocess_text(query)

    # TF-IDF vector for the query, in the vocabulary of the fitted vectorizer.
    query_vector = vectorizer.transform([processed_query])

    # One similarity score per document row.
    similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()

    # Assemble the candidate table and keep only rows that matched at all.
    results = pd.DataFrame({'judul': df['judul'], 'penulis': df['penulis'], 'sinopsis': df_original['sinopsis'], 'similarity': similarities})
    results = results[results['similarity'] > 0]

    if results.empty:
        return "Tidak ada hasil yang ditemukan untuk '{}'".format(query)

    # Best matches first.
    return results.sort_values(by='similarity', ascending=False)

In [6]:
# Search

query = "berbicara di depan umum"  # <======== Enter the search keywords here
search_results = search_books(query, df, vectorizer, cosine_sim)

# search_books returns a plain message string when nothing matched,
# otherwise a DataFrame of hits ordered by descending similarity.
if isinstance(search_results, str):
    print(search_results)
else:
    # Build the header as one string: passing the pieces as separate print()
    # arguments inserted separator spaces inside the quotes.
    print("Hasil pencarian untuk '{}'".format(query))
    print(search_results[['judul', 'penulis', 'sinopsis']])

Hasil pencarian untuk ' berbicara di depan umum '
                                                judul      penulis  \
1                              Bicara Itu Ada Seninya  Oh Su Hyang   
14  Seni Berbicara kepada Siapa Saja, Kapan Saja, ...   Larry King   
10                            Segala - galanya Ambyar  Mark Manson   

                                             sinopsis  
1   TAHUKAH ANDA BAHWA BERBICARA ITU ADA SENINYA?\...  
14  “Salah satu hal yang saya pelajari adalah tida...  
10  Dari penulis buku laris dunia Sebuah Seni untu...  
