In [1]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

stop_words = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    tokens = word_tokenize(text.lower())
    tokens = [token for token in tokens if token.isalpha() and token not in stop_words]
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return tokens

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\90505\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\90505\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\90505\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split

newsgroups = fetch_20newsgroups(subset='all')

X_train, X_test, y_train, y_test = train_test_split(newsgroups.data, newsgroups.target, test_size=0.2, random_state=42)


In [3]:
processed_docs_train = [preprocess(doc) for doc in X_train]
processed_docs_test = [preprocess(doc) for doc in X_test]

In [4]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# CountVectorizer kullanarak belge terim matrisini oluşturun
count_vectorizer = CountVectorizer(analyzer=lambda x: x)
X_train_counts = count_vectorizer.fit_transform(processed_docs_train)
X_test_counts = count_vectorizer.transform(processed_docs_test)

# TfidfVectorizer kullanarak belge terim matrisini oluşturun
tfidf_vectorizer = TfidfVectorizer(analyzer=lambda x: x, ngram_range=(2,6))
X_train_tfidf = tfidf_vectorizer.fit_transform(processed_docs_train)
X_test_tfidf = tfidf_vectorizer.transform(processed_docs_test)




In [13]:
from gensim import corpora
from gensim.models.ldamodel import LdaModel

# Gensim corpusu oluşturun
dictionary = corpora.Dictionary(processed_docs_train)
corpus_train = [dictionary.doc2bow(doc) for doc in processed_docs_train]
corpus_test = [dictionary.doc2bow(doc) for doc in processed_docs_test]

# LDA modelini eğitin
lda_model = LdaModel(corpus=corpus_train,
                     id2word=dictionary,
                     num_topics=20,
                     passes=10)

# Eğitim veri kümesi üzerindeki belge temsilcilerini alın
lda_train = lda_model[corpus_train]
lda_test = lda_model[corpus_test]


In [17]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import numpy as np

# CountVectorizer ve TF-IDF için ayrı ayrı sınıflandırma modeli eğitin
nb_count = MultinomialNB()
nb_count.fit(X_train_counts, y_train)
y_pred_count = nb_count.predict(X_test_counts)

nb_tfidf = MultinomialNB()
nb_tfidf.fit(X_train_tfidf, y_train)
y_pred_tfidf = nb_tfidf.predict(X_test_tfidf)

lda_train_array = np.zeros((len(lda_train), lda_model.num_topics))
for i, doc in enumerate(lda_train):
    for topic, prob in doc:
        lda_train_array[i, topic] = prob
lda_nb = MultinomialNB()
print(lda_train_array)
lda_nb.fit(lda_train_array, y_train)
lda_test_array = np.zeros((len(lda_test), lda_model.num_topics))
for i, doc in enumerate(lda_test):
    for topic, prob in doc:
        lda_test_array[i, topic] = prob
y_pred_lda = lda_nb.predict(lda_test_array)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[[0.         0.         0.09704007 ... 0.         0.         0.        ]
 [0.         0.         0.78262579 ... 0.         0.         0.        ]
 [0.04668192 0.         0.62100452 ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.04591082 0.        ]
 [0.         0.         0.04704035 ... 0.         0.         0.01710424]]


  y = column_or_1d(y, warn=True)


In [18]:
print("CountVectorizer accuracy:", accuracy_score(y_test, y_pred_count))
print("TF-IDF accuracy:", accuracy_score(y_test, y_pred_tfidf))
print("LDA accuracy:", accuracy_score(y_test, y_pred_lda))

CountVectorizer accuracy: 0.8774535809018568
TF-IDF accuracy: 0.8726790450928382
LDA accuracy: 0.5679045092838196


In [19]:
top_topics = lda_model.show_topics(num_topics=10, num_words=10, formatted=False)

for i, topic in enumerate(top_topics):
    print(f"Konu {i}: {' '.join([word[0] for word in topic[1]])}")

Konu 0: writes line article organization subject bike dod year run hit
Konu 1: max q n r g p bhj giz wm w
Konu 2: gun would people writes article line organization right subject government
Konu 3: window card do driver mode version video m ibm microsoft
Konu 4: game team player line organization subject year university hockey fan
Konu 5: san health number center rate research page detector ray april
Konu 6: car engine mile brake tire speed new dealer ford wheel
Konu 7: armenian turkish turk armenia greek russian turkey serdar argic greece
Konu 8: drug georgia tobacco michael vote university program government steve libertarian
Konu 9: god one would christian people say writes subject think line


In [20]:
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis

vis_data = gensimvis.prepare(lda_model, corpus_train, dictionary)
pyLDAvis.display(vis_data)

  by='saliency', ascending=False).head(R).drop('saliency', 1)
