# chatGPT를 이용한 토픽모델링
### langchain 라이브러리

In [2]:
from langchain.chat_models import ChatOpenAI

llm = ChatOpenAI(temperature=0.1, api_key="...",model="gpt-4-turbo")


### 데이터 전처리

In [3]:
import pandas as pd

def read_abstracts_from_xls(file_path):
    try:
        # CSV 파일 읽기
        df = pd.read_excel(file_path)
        # Abstract 컬럼의 값들을 리스트로 저장
        abstracts = df['Abstract'].tolist()
        
        return abstracts
    except Exception as e:
        print("Error:", e)
        return []

# CSV 파일 경로 설정
file_path = "savedrecs.xls"

# Abstract 컬럼 값들을 리스트로 읽어오기
abstract_list = read_abstracts_from_xls(file_path)

# 결과 출력
print(abstract_list)





In [4]:
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
import re

from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD


docs_df = pd.DataFrame({'document':abstract_list})

def remove_special_characters(text):
    # 알파벳이 아닌 문자를 공백으로 대체
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    return text
# 특수 문자 제거
docs_df['clean_doc'] = docs_df['document'].apply(remove_special_characters)
# 길이가 3이하인 단어는 제거 (길이가 짧은 단어 제거)
docs_df['clean_doc'] = docs_df['clean_doc'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))
# 전체 단어에 대한 소문자 변환
docs_df['clean_doc'] = docs_df['clean_doc'].apply(lambda x: x.lower())

docs_df['clean_doc']

0      oral health older people living residential ag...
1      this paper explores health care purchasing coa...
2      introduction having health care home been show...
3      role health care inequalities social inequalit...
4      purpose examine performance properties revised...
                             ...                        
295    literature highlights that disparities health ...
296    affordable care greatly expanded health care c...
297    objectivethe objective study evaluate effectiv...
298    this paper introduces this mini series vertica...
299    backgroundan asymmetrical oral disease burden ...
Name: clean_doc, Length: 300, dtype: object

### 프롬프트 작성

In [6]:
from langchain.prompts import  ChatPromptTemplate

template = ChatPromptTemplate.from_messages([
    ("system",  '''
     Given a list of documents from the user, please extract nine topics that can categorize these documents. Additionally,for each topics, provide five influential words and brief explanation for that topic.  
     
     ex)  topic : topic1, topic2, topic3 ...
            topic 1: influencial word1,influencial word2, influencial word3, influencial word4, influencial word5
                desciption...
            topic 2: influencial word1,influencial word2, influencial word3, influencial word4, influencial word5
                desciption...
            topic 3: influencial word1,influencial word2, influencial word3, influencial word4, influencial word5
                desciption...
            ...
     '''),
    
    ("human", "{docs}"),
])

prompt = template.format_messages(docs =docs_df['clean_doc'] )

llm.predict_messages(prompt).content


'To categorize the provided documents into topics, I will analyze the content and extract common themes. Here are nine potential topics based on the recurring themes and keywords in the document excerpts:\n\n### Topic 1: Oral Health in Aging Populations\n- **Influential Words:** oral, health, older, residential, disease\n- **Description:** This topic focuses on the challenges and conditions related to oral health among elderly individuals, particularly those living in residential care facilities. It may cover prevalence studies, treatment options, and preventive measures.\n\n### Topic 2: Health Care Policy and Purchasing\n- **Influential Words:** health, care, policy, purchasing, cost\n- **Description:** Discussions under this topic revolve around the strategies and policies for health care purchasing, including cost management, insurance policies, and governmental regulations affecting health care procurement.\n\n### Topic 3: Health Care Accessibility and Home Care\n- **Influential Wo