In [2]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Parquet dosyasını yükleyelim
df = pd.read_parquet('train-00000-of-00001-0e99e58b23dccc25.parquet')

# Veri setinin bir kısmını inceleyelim
print(df.head())

# Yorumları bir listeye alalım
reviews = df['review_text'].tolist()

# Metin ön işleme
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    tokens = word_tokenize(text.lower())
    tokens = [token for token in tokens if token.isalnum()]
    tokens = [token for token in tokens if token not in stop_words]
    return tokens

# Metin madenciliği için CountVectorizer kullanarak vektörize edelim
vectorizer = CountVectorizer(tokenizer=preprocess_text)
X = vectorizer.fit_transform(reviews)

# Latent Dirichlet Allocation (LDA) modeliyle topic (aspect) belirleme
num_topics = 5  # Belirleyeceğimiz aspect sayısı
lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)
lda.fit(X)

# Her bir topic (aspect) için en önemli kelimeleri alalım
feature_names = vectorizer.get_feature_names_out()
aspect_terms = []
for topic_idx, topic in enumerate(lda.components_):
    top_indices = topic.argsort()[:-10 - 1:-1]  # Her aspect için en iyi 10 kelimeyi alalım
    aspect_terms.append([feature_names[i] for i in top_indices])

# Aspectleri yazdıralım
for i, aspect in enumerate(aspect_terms):
    print(f'Aspect {i + 1}: {aspect}')


                                                text  \
0  beautiful accomodations stayed hotel santa mar...   
1  great hotel great location hotel located la ra...   
2  beautiful hotel great location like beautifull...   
3  great deal waikiki trip hawaii outrigger luana...   
4  choose airport hotel busy triparound australia...   

                                              inputs  \
0  {'text': 'beautiful accomodations stayed hotel...   
1  {'text': 'great hotel great location hotel loc...   
2  {'text': 'beautiful hotel great location like ...   
3  {'text': 'great deal waikiki trip hawaii outri...   
4  {'text': 'choose airport hotel busy triparound...   

                       prediction prediction_agent annotation  \
0  [{'label': '4', 'score': 1.0}]          Argilla       None   
1  [{'label': '5', 'score': 1.0}]          Argilla       None   
2  [{'label': '4', 'score': 1.0}]          Argilla       None   
3  [{'label': '4', 'score': 1.0}]          Argilla       None   
4

KeyError: 'review_text'