# Riss 'rare diseases' 학술정보 스크래핑 data
- 방법 1) 주제어 뽑아서 topic 분류 후 지도학습 / 의미 있는 topic 분류가 안 된다면
- 방법 2) 수동 분류 index 부여해서 지도학습

In [1]:
import pandas as pd
import numpy as np

In [None]:
# Load the scraped research records and discard the '_id' column in one step.
df_Riss_research = pd.read_csv("./csv/Seleniums.eng_academic_research2.csv").drop(columns='_id')
df_Riss_research

# 방법 1) 주제어 뽑아서 topic 분류 후 지도학습

## 데이터 전처리

### dataframe 내 중복되는 학술정보 제거

In [None]:
# Count how many times each research title occurs; counts above 1 are duplicates.
df_Riss_research['research_title'].value_counts()

In [None]:
# Remove rows whose title repeats an earlier row (the first occurrence is
# kept by default), then recount the titles to confirm no duplicates remain.
df_Riss_research.drop_duplicates(subset=["research_title"], inplace=True)
df_Riss_research["research_title"].value_counts()

In [None]:
# Renumber the rows in place; drop=True discards the old index instead of keeping it as a column.
df_Riss_research.reset_index(drop=True, inplace=True)

### 영문 text만 남기기

In [None]:
import re
# Hangul syllables, CJK ideographs, ASCII digits and semicolons.
# Compiled once so the pattern is not rebuilt for every row.
_NON_ENGLISH_PATTERN = re.compile('[가-힣一-龥0-9;]')


def no_korean(text):
    """Blank out Korean/CJK characters, digits and semicolons in *text*.

    Every matching character is replaced by a single space, so the length
    of the string is preserved and neighbouring words are not fused.

    Args:
        text: The raw subject string. Non-string values (for example the
            ``NaN`` pandas uses for a missing cell, or ``None``) are accepted.

    Returns:
        The cleaned string, or ``''`` when *text* is not a string. The empty
        string keeps missing cells usable by later string operations instead
        of raising ``TypeError`` inside ``re``.
    """
    if not isinstance(text, str):
        return ''
    return _NON_ENGLISH_PATTERN.sub(' ', text)
# Run no_korean over every value of the subject column and store the result back.
df_Riss_research['research_subject'] = df_Riss_research['research_subject'].apply(no_korean)

### 대문자 -> 소문자

In [None]:
# Lower-case the subject text so that terms differing only in case are treated as the same word.
df_Riss_research['research_subject'] = df_Riss_research['research_subject'].str.lower()

### 불용어 제거

In [None]:
# Load the custom stop-word list: each stripped, non-empty line of the file
# becomes one stop word.
# The context manager closes the file even if reading fails, the encoding is
# explicit rather than platform-dependent, and blank lines are skipped so the
# empty string never ends up in the stop-word list.
with open('./csv/eng_academic_research_stopwords.txt', encoding='utf-8') as f:
    stopwords = [word for word in (line.strip() for line in f) if word]

### TfidfVectorizer

In [None]:
# Collect the cleaned subject strings into a plain Python list (the corpus for vectorization).
eng_subject = df_Riss_research['research_subject'].tolist()
eng_subject

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# stop_words removes unneeded words from the vocabulary.
# ngram_range=(1, 2) adds two-word phrases next to single words, which keeps
# terms that are only meaningful as a pair.
# max_df=0.95 ignores terms found in more than 95% of the documents;
# min_df=2 ignores terms found in fewer than two documents.
tfidfVectorizer = TfidfVectorizer(
    stop_words=stopwords,
    ngram_range=(1, 2),
    max_df=0.95,
    min_df=2,
)
# fit and transform are two separate steps: fit builds the vocabulary,
# transform converts the documents into TF-IDF vectors.
result_vectors = tfidfVectorizer.fit_transform(eng_subject)
# Preview the first two documents. Slice the sparse matrix BEFORE densifying
# so only two rows are materialised instead of the whole matrix.
result_vectors[:2].toarray()

In [None]:
# Inspect the fitted vocabulary: a mapping from each term to its feature (column) index.
tfidfVectorizer.vocabulary_ 

### LDA : topic 모델링

In [None]:
from sklearn.decomposition import LatentDirichletAllocation

# Fit a 3-topic LDA model on the document-term matrix using all CPU cores.
# random_state is fixed so the topic numbering and keywords are reproducible
# between runs; without it each run can yield differently ordered topics.
# NOTE(review): scikit-learn's LDA examples feed raw term counts
# (CountVectorizer) rather than TF-IDF weights — confirm TF-IDF is intended.
lda_model = LatentDirichletAllocation(n_components=3, n_jobs=-1, random_state=42)
lda_model.fit(result_vectors)

In [None]:
# Terms of the fitted vocabulary, ordered by feature index (position i names column i of result_vectors).
dictionary_list = tfidfVectorizer.get_feature_names_out()
dictionary_list

In [None]:
# Inspect the topic-term weight matrix: one row per topic, one column per vocabulary term.
lda_model.components_

In [None]:
# Compute the topic distribution of every document (one row per document,
# one column per topic) and wrap it in a DataFrame for inspection.
topics_output = lda_model.transform(result_vectors)
df_topics_score = pd.DataFrame(data=topics_output)
df_topics_score

In [None]:
# Label every document with the index of its highest-scoring topic.
df_topics_score['dominant_topic_number'] = np.argmax(topics_output, axis=1)
# Attach the source text by POSITION rather than by index label. Row i of
# topics_output belongs to the i-th document, so a positional assignment stays
# correct even if df_Riss_research does not carry a clean 0..n-1 index, and a
# length mismatch raises an error instead of silently producing NaN rows.
df_topics_score['sentences'] = df_Riss_research['research_subject'].to_numpy()
df_topics_score

### topic별 word 추출

In [None]:
# For every topic (one row of components_), take the five highest-weighted
# vocabulary terms and join them into one space-separated keyword string.
topics_list = []
for topic in lda_model.components_:
    # argsort is ascending, so reverse it and keep the first five indices.
    top_indices = np.argsort(topic)[::-1][:5]
    topics_text = ' '.join(dictionary_list[top_indices])
    print(topics_text)
    topics_list.append(topics_text)

# Derive the labels from the actual number of topics so this cell keeps
# working when n_components of the LDA model is changed.
topic_labels = [f'Topic{i}' for i in range(len(topics_list))]
topics_list_add = [topics_list, topic_labels]
# Row 0 holds the keyword strings, row 1 the matching topic labels.
df_topics_keywords = pd.DataFrame(topics_list_add)

In [None]:
# Show the keyword table: first row = top keywords per topic, second row = topic labels.
df_topics_keywords

### LDA 시각화

In [None]:
import pyLDAvis
import pyLDAvis.lda_model

In [None]:
# Build the pyLDAvis visualisation data from the fitted LDA model, the document-term matrix and the vectorizer.
vis = pyLDAvis.lda_model.prepare(lda_model, result_vectors, tfidfVectorizer)

In [None]:
# Turn on notebook rendering, then show the prepared topic visualisation in the cell output.
pyLDAvis.enable_notebook()
pyLDAvis.display(vis)