In [115]:
import torch
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import pandas as pd
from transformers import AdamW

In [116]:
# 데이터 불러오기
apps_data = pd.read_csv('cleaned_apps_data.csv')

In [117]:
# 필요한 열만 선택
apps_data = apps_data[['Category', 'Cleaned_Description']]

In [118]:
apps_data.head()

Unnamed: 0,Category,Cleaned_Description
0,Health & Fitness,1 mobile health insurance card app public serv...
1,Video Players & Editors,tiktok lite global video community fun find co...
2,Shopping,shop temu exclusive offers matter youre lookin...
3,Social,say threads instagrams textbased conversation ...
4,Lifestyle,carbon neutrality activities easily practiced ...


## BERT - RoBert

In [119]:
## RoBert
from transformers import RobertaTokenizer, RobertaModel


In [120]:

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaModel.from_pretrained('roberta-base')

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [121]:
# 2. BERT 모델과 토크나이저 로드
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')



In [67]:
# 3. BERT 임베딩 생성 함수
def get_bert_embeddings(texts):
    embeddings = []
    with torch.no_grad():
        for text in texts:
            inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
            outputs = model(**inputs)
            cls_embedding = outputs.last_hidden_state[:, 0, :].numpy()
            embeddings.append(cls_embedding.flatten())
    return np.array(embeddings)

In [68]:
# 4. 앱 설명에 대한 BERT 임베딩 생성
bert_embeddings = get_bert_embeddings(apps_data['Cleaned_Description'])


#### Optimal Cluster

In [103]:
from sklearn.metrics import silhouette_score

In [122]:
# 최적의 클러스터 수 찾기
best_n_clusters = 0
best_silhouette = -1
for n_clusters in range(2, 10):
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    labels = kmeans.fit_predict(bert_embeddings)
    silhouette_avg = silhouette_score(bert_embeddings, labels)
    if silhouette_avg > best_silhouette:
        best_silhouette = silhouette_avg
        best_n_clusters = n_clusters

  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


In [123]:
print(f"Best number of clusters: {best_n_clusters} with silhouette score: {best_silhouette}")

Best number of clusters: 2 with silhouette score: 0.09935969114303589


In [132]:
# 5. K-Means 클러스터링 수행
n_clusters = 2  # 상위 카테고리 개수 설정
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
clusters = kmeans.fit_predict(bert_embeddings)

# 클러스터 결과를 데이터프레임에 추가
apps_data['Cluster'] = clusters

  super()._check_params_vs_input(X, default_n_init=10)


#### DBSCAN Clustering -> Nope

In [133]:
from sklearn.cluster import DBSCAN

In [126]:
dbscan = DBSCAN(eps=0.5, min_samples=100)
clusters = dbscan.fit_predict(bert_embeddings)

In [127]:
# 6. TF-IDF 기반 주요 키워드 추출 함수
vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
tfidf_matrix = vectorizer.fit_transform(apps_data['Cleaned_Description'])

def get_top_keywords(tfidf_matrix, clusters, top_n=5):
    cluster_centers = np.zeros((np.unique(clusters).size, tfidf_matrix.shape[1]))
    
    for cluster in np.unique(clusters):
        cluster_centers[cluster] = tfidf_matrix[clusters == cluster].mean(axis=0)
    
    terms = vectorizer.get_feature_names_out()
    top_keywords = []
    
    for cluster in range(cluster_centers.shape[0]):
        center = cluster_centers[cluster]
        top_indices = center.argsort()[::-1][:top_n]
        keywords = [terms[i] for i in top_indices]
        top_keywords.append(keywords)
    
    return top_keywords


In [128]:
# 7. 클러스터별 상위 5개 키워드 추출
top_keywords_per_cluster = get_top_keywords(tfidf_matrix, clusters, top_n=5)

In [129]:
# 8. 키워드 기반으로 상위 카테고리 이름 생성
def generate_category_name(keywords):
    return " & ".join(keywords[:2])  # 상위 2개의 키워드를 연결하여 이름 생성

In [130]:
# 9. 클러스터별 상위 카테고리 이름 생성 및 출력
for cluster_num, keywords in enumerate(top_keywords_per_cluster):
    category_name = generate_category_name(keywords)
    print(f"Cluster {cluster_num} Name: {category_name}")

Cluster 0 Name: watch & face


In [131]:
# 10. 클러스터 이름을 데이터프레임에 추가
for cluster_num, keywords in enumerate(top_keywords_per_cluster):
    category_name = generate_category_name(keywords)
    apps_data.loc[apps_data['Cluster'] == cluster_num, 'Super_Category'] = category_name



In [114]:
# 결과 확인
print(apps_data[['Category', 'Cleaned_Description', 'Super_Category']])

                   Category  \
0          Health & Fitness   
1   Video Players & Editors   
2                  Shopping   
3                    Social   
4                 Lifestyle   
5             Entertainment   
6          News & Magazines   
7                   Finance   
8              Food & Drink   
9             Entertainment   
10          Personalization   
11                    Tools   
12          Personalization   
13          Personalization   
14              Photography   
15             Productivity   
16          Personalization   

                                  Cleaned_Description      Super_Category  
0   1 mobile health insurance card app public serv...  insurance & tiktok  
1   TikTok Lite global video community fun find co...  insurance & tiktok  
2   Shop Temu exclusive offers matter youre lookin...  insurance & tiktok  
3   Say Threads Instagrams textbased conversation ...        news & watch  
4   carbon neutrality activities easily practiced ...        