In [36]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import re

In [37]:
# 데이터 불러오기
free_apps = pd.read_csv('Free_apps.csv')
grossing_apps = pd.read_csv('Grossing_apps.csv')
paid_apps = pd.read_csv('Paid_apps.csv')

In [38]:
# 데이터 병합
apps_data = pd.concat([free_apps, grossing_apps, paid_apps])

# 중복된 행 제거
apps_data = apps_data.drop_duplicates(subset=['App ID'])

In [39]:
# 필요한 열만 선택
apps_data = apps_data[['Name', 'Category', 'Description']]

# 결측치 제거
apps_data.dropna(subset=['Name', 'Category', 'Description'], inplace=True)

In [40]:
def clean_description(text):
    text = re.sub(r'\s+', ' ', text)  # 줄 바꿈 제거
    text = re.sub(r'【.*?】', '', text)  # 특수한 텍스트 패턴 제거 (예: 【Fabulous content】)
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # 특수문자 제거
    return text.strip()


In [41]:
# 3. Description 전처리 적용
apps_data['Cleaned_Description'] = apps_data['Description'].apply(clean_description)


In [42]:
# 4. 결과 확인
print(apps_data[['Name', 'Description', 'Cleaned_Description']])

                       Name  \
0                PDF Reader   
1                 모바일 건강보험증   
2      All Documents Viewer   
3     A.(에이닷) - 나만의 AI 개인비서   
4                    온누리상품권   
..                      ...   
90              OruxMaps GP   
91           굴삭기 면허시험 시뮬레이터   
92  EVERYDAY 10-Hands Style   
93               Astro Gold   
94  ACRO Classic Wonder T50   

                                          Description  \
0   This is a PDF app for Android and the best vie...   
1   1. The mobile health insurance card app is a p...   
2   Our features include: View Multiple Types of F...   
3   Adot, your own newly evolved AI personal assis...   
4   Onnuri Gift Certificate is a service that allo...   
..                                                ...   
90  You can use OruxMaps for your outdoor activiti...   
91  The app made for the practical test excavators...   
92  A beautiful watch face for every day! Easy to ...   
93  Astro Gold is the professional-level astrology...   


In [43]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

apps_data['Cleaned_Description'] = apps_data['Cleaned_Description'].apply(
    lambda x: ' '.join([word for word in x.split() if word.lower() not in stop_words])
)

In [44]:
# 전처리된 데이터를 'cleaned_apps_data.csv' 파일로 저장
apps_data.to_csv('cleaned_apps_data.csv', index=False)