In [33]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans

In [3]:
# CSV 파일에서 데이터 불러오기
apps_data = pd.read_csv('cleaned_apps_data.csv')

In [4]:
# 필요한 열을 사용하여 데이터 확인
print(apps_data[['Rank', 'App ID', 'Name', 'Category', 'Cleaned_Description']].head())

# 카테고리 라벨을 정수로 변환 (Fine-Tuning을 위해)
category_to_label = {cat: idx for idx, cat in enumerate(apps_data['Category'].unique())}
apps_data['Label'] = apps_data['Category'].map(category_to_label)

print(apps_data[['Category', 'Label']].head())

   Rank                          App ID                           Name  \
0     1                      kr.or.nhiq                      모바일 건강보험증   
1     2  com.ss.android.ugc.tiktok.lite                    TikTok Lite   
2     3            com.einnovation.temu  Temu: Shop Like a Billionaire   
3     4         com.instagram.barcelona                        Threads   
4     5                  kr.or.ggaction                  경기도 기후행동 기회소득   

                  Category                                Cleaned_Description  
0         Health & Fitness  1 mobile health insurance card app public serv...  
1  Video Players & Editors  tiktok lite global video community fun find co...  
2                 Shopping  shop temu exclusive offers matter youre lookin...  
3                   Social  say threads instagrams textbased conversation ...  
4                Lifestyle  carbon neutrality activities easily practiced ...  
                  Category  Label
0         Health & Fitness      0
1  Vide

In [5]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# BERT 토크나이저 로드
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')



In [7]:
# BERT 데이터셋 클래스 정의
class AppDataset(Dataset):
    def __init__(self, descriptions, labels, tokenizer, max_len):
        self.descriptions = descriptions
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.descriptions)

    def __getitem__(self, idx):
        desc = self.descriptions[idx]
        label = self.labels[idx]

        encoding = self.tokenizer.encode_plus(
            desc,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [8]:
# 데이터셋 및 데이터 로더 준비
train_descriptions, val_descriptions, train_labels, val_labels = train_test_split(
    apps_data['Cleaned_Description'].values, apps_data['Label'].values, test_size=0.2, random_state=42
)

train_dataset = AppDataset(train_descriptions, train_labels, tokenizer, max_len=128)
val_dataset = AppDataset(val_descriptions, val_labels, tokenizer, max_len=128)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)

In [9]:
# BERT 모델 준비
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(category_to_label))


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# 옵티마이저 설정
optimizer = AdamW(model.parameters(), lr=5e-5)



In [11]:
# GPU 사용 설정
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

In [12]:
# 학습 함수 정의
def train_epoch(model, data_loader, optimizer, device):
    model.train()
    losses = []
    correct_predictions = 0

    for batch in data_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        _, preds = torch.max(logits, dim=1)
        correct_predictions += torch.sum(preds == labels)
        losses.append(loss.item())

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    return correct_predictions.double() / len(data_loader.dataset), np.mean(losses)

In [13]:
# 평가 함수 정의
def eval_model(model, data_loader, device):
    model.eval()
    correct_predictions = 0

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            logits = outputs.logits

            _, preds = torch.max(logits, dim=1)
            correct_predictions += torch.sum(preds == labels)

    return correct_predictions.double() / len(data_loader.dataset)

In [17]:
epochs = 3  # 학습 반복 횟수
for epoch in range(epochs):
    print(f'Epoch {epoch + 1}/{epochs}')
    train_acc, train_loss = train_epoch(model, train_loader, optimizer, device)
    print(f'Train loss: {train_loss}, Train accuracy: {train_acc}')

    val_acc = eval_model(model, val_loader, device)
    print(f'Validation accuracy: {val_acc}')

Epoch 1/3
Train loss: 2.4362252950668335, Train accuracy: 0.15384615384615385
Validation accuracy: 0.0
Epoch 2/3
Train loss: 2.2602471113204956, Train accuracy: 0.3076923076923077
Validation accuracy: 0.0
Epoch 3/3
Train loss: 2.13934862613678, Train accuracy: 0.3076923076923077
Validation accuracy: 0.0


In [18]:
# 새로운 설명에 대한 카테고리 예측 함수
def predict(model, tokenizer, text, device):
    model.eval()

    inputs = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=128,
        return_token_type_ids=False,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )

    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        _, prediction = torch.max(logits, dim=1)

    return prediction.item()

In [19]:
# 예측 예시
new_description = "A fitness app that helps you track your workouts and progress."
predicted_label = predict(model, tokenizer, new_description, device)
predicted_category = list(category_to_label.keys())[list(category_to_label.values()).index(predicted_label)]
print(f"Predicted category: {predicted_category}")

Predicted category: Personalization


In [21]:
# 모델을 저장할 경로 설정
model.save_pretrained('fine_tuned_bert')  # 로컬 경로에 저장

In [22]:
# 저장된 모델을 로드하는 방법
model = BertModel.from_pretrained('./fine_tuned_bert')  # 저장된 경로에서 모델 로드
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')



In [39]:
import torch
from transformers import BertTokenizer, BertModel
import numpy as np
from sentence_transformers import SentenceTransformer

# Fine-Tuning된 BERT 모델 로드 (분류 작업에서 학습된 모델)
# model = BertModel.from_pretrained('fine_tuned_bert')
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Sentence-BERT를 사용하여 임베딩 생성
model = SentenceTransformer('all-MiniLM-L6-v2')  # 더 좋은 문장 임베딩을 생성할 수 있는 모델
embeddings = model.encode(apps_data['Cleaned_Description'].values)

# 임베딩 추출 함수
def get_bert_embeddings(texts, model, tokenizer, max_len=128):
    model.eval()
    embeddings = []
    
    with torch.no_grad():
        for text in texts:
            inputs = tokenizer.encode_plus(
                text,
                add_special_tokens=True,
                max_length=max_len,
                return_token_type_ids=False,
                padding='max_length',
                truncation=True,
                return_attention_mask=True,
                return_tensors='pt'
            )

            input_ids = inputs['input_ids']
            attention_mask = inputs['attention_mask']
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)

            # [CLS] 토큰의 임베딩을 사용 (첫 번째 토큰)
            cls_embedding = outputs.last_hidden_state[:, 0, :].numpy()
            embeddings.append(cls_embedding.flatten())

    return np.array(embeddings)

# 앱 설명에 대한 임베딩 생성
embeddings = get_bert_embeddings(apps_data['Cleaned_Description'], model, tokenizer)



TypeError: forward() got an unexpected keyword argument 'input_ids'

In [None]:
from sklearn.metrics import silhouette_score

In [35]:
# 최적의 클러스터 수 찾기
best_n_clusters = 0
best_silhouette = -1
for n_clusters in range(2, 10):
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    labels = kmeans.fit_predict(embeddings)
    silhouette_avg = silhouette_score(embeddings, labels)
    if silhouette_avg > best_silhouette:
        best_silhouette = silhouette_avg
        best_n_clusters = n_clusters

  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


In [36]:
print(f"Best number of clusters: {best_n_clusters} with silhouette score: {best_silhouette}")

Best number of clusters: 3 with silhouette score: 0.10557735711336136


In [37]:
# K-Means 클러스터링 수행
n_clusters = 3  # 클러스터 수 설정
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
clusters = kmeans.fit_predict(embeddings)

# 클러스터 결과를 데이터프레임에 추가
apps_data['Cluster'] = clusters

# 결과 확인
print(apps_data[['Category', 'Cleaned_Description', 'Cluster']])


                   Category  \
0          Health & Fitness   
1   Video Players & Editors   
2                  Shopping   
3                    Social   
4                 Lifestyle   
5             Entertainment   
6          News & Magazines   
7                   Finance   
8              Food & Drink   
9             Entertainment   
10          Personalization   
11                    Tools   
12          Personalization   
13          Personalization   
14              Photography   
15             Productivity   
16          Personalization   

                                  Cleaned_Description  Cluster  
0   1 mobile health insurance card app public serv...        1  
1   tiktok lite global video community fun find co...        0  
2   shop temu exclusive offers matter youre lookin...        0  
3   say threads instagrams textbased conversation ...        0  
4   carbon neutrality activities easily practiced ...        1  
5   enjoy coupang plays unlimited moviestv shows c.

  super()._check_params_vs_input(X, default_n_init=10)


In [38]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF 기반 주요 키워드 추출 함수
vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
tfidf_matrix = vectorizer.fit_transform(apps_data['Cleaned_Description'])

def get_top_keywords(tfidf_matrix, clusters, top_n=5):
    cluster_centers = np.zeros((np.unique(clusters).size, tfidf_matrix.shape[1]))
    
    for cluster in np.unique(clusters):
        cluster_centers[cluster] = tfidf_matrix[clusters == cluster].mean(axis=0)
    
    terms = vectorizer.get_feature_names_out()
    top_keywords = []
    
    for cluster in range(cluster_centers.shape[0]):
        center = cluster_centers[cluster]
        top_indices = center.argsort()[::-1][:top_n]
        keywords = [terms[i] for i in top_indices]
        top_keywords.append(keywords)
    
    return top_keywords

# 클러스터별 상위 5개 키워드 추출
top_keywords_per_cluster = get_top_keywords(tfidf_matrix, clusters, top_n=5)

# 클러스터별 상위 카테고리 이름 생성
def generate_category_name(keywords):
    return " & ".join(keywords[:2])  # 상위 2개의 키워드를 연결하여 이름 생성

# 클러스터별 상위 카테고리 이름 생성 및 출력
for cluster_num, keywords in enumerate(top_keywords_per_cluster):
    category_name = generate_category_name(keywords)
    print(f"Cluster {cluster_num} Name: {category_name}")

# 클러스터 이름을 데이터프레임에 추가
for cluster_num, keywords in enumerate(top_keywords_per_cluster):
    category_name = generate_category_name(keywords)
    apps_data.loc[apps_data['Cluster'] == cluster_num, 'Super_Category'] = category_name

# 결과 확인
print(apps_data[['Category', 'Cleaned_Description', 'Super_Category']])


Cluster 0 Name: coupang & tv
Cluster 1 Name: watch & face
Cluster 2 Name: game & hwamin
                   Category  \
0          Health & Fitness   
1   Video Players & Editors   
2                  Shopping   
3                    Social   
4                 Lifestyle   
5             Entertainment   
6          News & Magazines   
7                   Finance   
8              Food & Drink   
9             Entertainment   
10          Personalization   
11                    Tools   
12          Personalization   
13          Personalization   
14              Photography   
15             Productivity   
16          Personalization   

                                  Cleaned_Description Super_Category  
0   1 mobile health insurance card app public serv...   watch & face  
1   tiktok lite global video community fun find co...   coupang & tv  
2   shop temu exclusive offers matter youre lookin...   coupang & tv  
3   say threads instagrams textbased conversation ...   coupang & tv 