# 라이브러리 설치

In [None]:
# OpenAI
!pip install openai

# Google Translation
!pip install googletrans==4.0.0-rc1

# Word Cloud
!pip install wordcloud

# 단위 기능 구현

## 1. 논문 검색 (semantic scholar)
> https://www.semanticscholar.org/product/api  
> https://api.semanticscholar.org/api-docs/graph#tag/Paper-Data/operation/get_graph_get_paper_search

In [None]:
# example query: "photovoltaic machine learning" (the URL asks for limit=30 results)

import requests

# Full request URL for the Semantic Scholar paper-search endpoint.
query = "https://api.semanticscholar.org/graph/v1/paper/search?query=photovoltaic+machine+learning&limit=30"
# Keep the response object; later cells read its body with s.json().
s = requests.get(query)

In [None]:
# 첫번째 논문
s.json()["data"][0]

In [None]:
# Print the titles of the first 10 papers, in order.
# The response body is parsed once (not on every iteration), and slicing keeps
# the loop from raising IndexError when fewer than 10 papers were returned.
papers = s.json()["data"]
for i, paper in enumerate(papers[:10]):
    title = paper["title"]
    print(f"#{i+1}: {title}")

## 2. 초록 받기 (semantic scholar)

In [None]:
query = "https://api.semanticscholar.org/graph/v1/paper/search?query=photovoltaic+machine+learning"
fields = "title,journal,abstract,authors"

query = f"{query}&fields={fields}"
query

In [None]:
s = requests.get(query)

In [None]:
# 두번째 논문
s.json()["data"][1]

In [None]:
# Abstract of the paper at index 2, i.e. the third search result.
# NOTE(review): the previous cell displays index 1 (the second paper) — confirm
# which index is intended here.
abstract = s.json()["data"][2]["abstract"]
abstract

In [None]:
# Print title and abstract of the first 10 papers.
# The response body is parsed once, and slicing avoids an IndexError when the
# search returned fewer than 10 papers.
papers = s.json()["data"]
for i, paper in enumerate(papers[:10]):
    title = paper["title"]
    abstract = paper["abstract"]
    print(f"#{i+1}: {title}:: {abstract}")

## 3. 초록 요약 (ChatGPT)

In [None]:
import openai
openai.api_key = xxxxx    # placeholder: replace xxxxx with your own OpenAI API key

# Send a single user message asking for a 20-word summary of `abstract`
# (the variable set in an earlier cell).
# NOTE(review): `openai.ChatCompletion` looks like the pre-1.0 openai interface —
# confirm the installed package version still provides it.
response = openai.ChatCompletion.create(
  model="gpt-3.5-turbo",
  messages=[
    {"role": "user", 
     "content": f"summarize the following sentences in 20 words\n\n{abstract}"}
  ]
)

In [None]:
summary = response["choices"][0]["message"]["content"]
summary

In [None]:
len(summary.split(" "))

## 4. 한글 번역 (Google Translator)

In [None]:
from googletrans import Translator

def get_translate(prompt):
    """Translate ``prompt`` into Korean using googletrans.

    Args:
        prompt: Text to translate.

    Returns:
        The translated text, or ``""`` when ``prompt`` is empty or ``None``
        (no translation request is made in that case).
    """
    # Nothing to translate: skip the request instead of failing on empty input.
    if not prompt:
        return ""

    google = Translator()  # googletrans client
    response = google.translate(prompt, dest="ko")  # dest="ko": translate into Korean
    return response.text

In [None]:
summary_kr = get_translate(summary)
summary_kr

## 5. 일괄 초록 요약 번역 (python: for)

In [None]:
# 검색 함수
keywords = ["photovoltaic", "machine learning", "forecasting"]

def search(keywords, limit=30, fields=("title", "journal", "abstract", "authors")):
    """Search papers through the Semantic Scholar Graph API.

    Args:
        keywords: Iterable of search terms; they are joined with ``+``.
        limit: Value sent as the ``limit`` query parameter.
        fields: Iterable of field names sent as the comma-separated ``fields``
            query parameter. The default is an immutable tuple so it cannot be
            modified between calls.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within 30 seconds.
    """
    from urllib.parse import quote_plus

    # Encode each keyword so spaces and reserved characters such as "&" cannot
    # break the URL; a literal "+" is left untouched so pre-joined terms still work.
    keywords_join = "+".join(quote_plus(keyword, safe="+") for keyword in keywords)
    fields_join = ",".join(fields)
    query = f"https://api.semanticscholar.org/graph/v1/paper/search?query={keywords_join}&limit={limit}&fields={fields_join}"

    s = requests.get(query, timeout=30)
    # Fail here with a clear HTTP error rather than later with a missing "data" key.
    s.raise_for_status()

    return s.json()

s_json = search(keywords)

In [None]:
# Paper title accessor
def get_title(s_json, num=0):
    """Return the ``title`` of the paper at position ``num`` in ``s_json["data"]``."""
    paper = s_json["data"][num]
    return paper["title"]

In [None]:
# Abstract accessor
def get_abstract(s_json, num=0):
    """Return the ``abstract`` of the paper at position ``num`` in ``s_json["data"]``."""
    paper = s_json["data"][num]
    return paper["abstract"]

In [None]:
# 요약 함수

import openai

def get_summary(abstract, words=15):
    """Ask the chat model to summarize ``abstract`` in about ``words`` words.

    Args:
        abstract: Text to summarize.
        words: Word count written into the prompt.

    Returns:
        The content of the first choice in the response, or ``""`` when
        ``abstract`` is empty or ``None`` (no request is made).

    NOTE(review): ``openai.ChatCompletion`` looks like the pre-1.0 openai
    interface — confirm the installed package version still provides it.
    """
    if not abstract:
        return ""

    prompt = f"summarize the following sentences in {words} words\n\n{abstract}"
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": prompt}],
    )
    return response["choices"][0]["message"]["content"]

In [None]:
summary = get_summary(abstract)
summary

In [None]:
# 번역 함수

from googletrans import Translator

def get_translate(prompt):
    """Translate ``prompt`` into Korean using googletrans.

    Returns the translated text, or ``""`` when ``prompt`` is empty or ``None``.
    """
    if not prompt:
        return ""

    # dest="ko": translate into Korean
    translated = Translator().translate(prompt, dest="ko")
    return translated.text

In [None]:
summary_kr = get_translate(summary)
summary_kr

In [None]:
# Summarize and translate the first 10 papers. The range is capped at the number
# of papers actually returned, so a short result list does not raise IndexError.
for i in range(min(10, len(s_json["data"]))):
    title = get_title(s_json, i)
    abstract = get_abstract(s_json, i)
    summary = get_summary(abstract, words=30)
    summary_kr = get_translate(summary)
    print(f'#{i+1}: "{title}",\t{summary_kr}')

## 6. 명사 추출, 기본형 변환 (ChatGPT)

In [None]:
def get_keywords(abstract):
    """Ask the chat model for the singular, lemmatized nouns found in ``abstract``.

    Args:
        abstract: Text to extract nouns from.

    Returns:
        The raw reply text (the prompt requests a comma-separated list), or
        ``""`` when ``abstract`` is empty or ``None``.
    """
    if not abstract:
        # Always return a string so callers can use str methods such as
        # .split(",") on the result, even when there was nothing to process.
        return ""

    prompt = f"extract nouns, lemmatize and convert plural to singular form (for example, 'solar cells' to 'solar cell') from following text, separated by comma, and remove all other texts or bullets\n\n{abstract}"
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": prompt}],
    )
    return response["choices"][0]["message"]["content"]


keyword = get_keywords(abstract)
keyword

In [None]:
# Split the comma-separated reply and clean every item: trim any surrounding
# whitespace (including newlines), drop a leading bullet dash and a trailing
# period, then discard items that end up empty.
keyword_list = [
    item
    for item in (n.strip().lstrip("-").rstrip(".").strip() for n in keyword.split(","))
    if item
]
keyword_list

In [None]:
# list 변환 포함 함수 재정의

def get_keywords(abstract):
    """Extract singular, lemmatized nouns from ``abstract`` as a list of strings.

    The chat model is asked for a comma-separated list; the reply is split on
    commas and each item is cleaned.

    Args:
        abstract: Text to extract nouns from.

    Returns:
        A list of cleaned keywords. Empty when ``abstract`` is empty or
        ``None`` (no request is made in that case).
    """
    if not abstract:
        return []

    prompt = f"extract nouns, lemmatize and convert plural to singular form (for example, 'solar cells' to 'solar cell') from following text, separated by comma, and remove all other texts or bullets\n\n{abstract}"
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": prompt}],
    )
    keyword = response["choices"][0]["message"]["content"]

    # Trim all surrounding whitespace first (not only spaces) so that a bullet
    # dash preceded by a space or newline is still removed; then drop a trailing
    # period and skip items that end up empty.
    cleaned = (n.strip().lstrip("-").rstrip(".").strip() for n in keyword.split(","))
    return [item for item in cleaned if item]


keyword = get_keywords(abstract)
keyword

## 7. 데이터 취합, 엑셀 파일 저장 (pandas)

In [None]:
import pandas as pd

# simple example

# Prepare the data
A = ["apple", "banana", "cherry"]
B = [3000, 2500, 5000]
C = ["대한민국", "필리핀", "터키"]

# Build the DataFrame in one step; the dict order fixes the column order
df_ex = pd.DataFrame({"과일": A, "가격": B, "원산지": C})

# Save to an Excel file
df_ex.to_excel("df_ex.xlsx")

# Display the result
df_ex

In [None]:
# 엑셀 파일로 저장

# 검색 결과에서 title, journal name, volume, pages, abstract를 가져오고 요약, 번역을 각기 list로 저장
titles = []
journal_names = []
journal_volumes = []
journal_pages = []
abstracts = []
summarys = []
summarys_kr = []
keywords = []

### 서지 정보 수집 
# Journal name accessor
def get_journal_name(s_json, num=0):
    """Return the journal name of the paper at position ``num``.

    A missing or ``None`` ``journal`` entry yields ``None`` instead of raising,
    as does a journal record without a ``name`` key.
    """
    journal = s_json["data"][num].get("journal") or {}
    return journal.get("name")

# Journal volume accessor
def get_journal_volume(s_json, num=0):
    """Return the journal volume of the paper at position ``num``.

    A missing or ``None`` ``journal`` entry yields ``None`` instead of raising,
    as does a journal record without a ``volume`` key.
    """
    journal = s_json["data"][num].get("journal") or {}
    return journal.get("volume")

# Journal pages accessor
def get_journal_page(s_json, num=0):
    """Return the journal pages of the paper at position ``num``.

    A missing or ``None`` ``journal`` entry yields ``None`` instead of raising,
    as does a journal record without a ``pages`` key.
    """
    journal = s_json["data"][num].get("journal") or {}
    return journal.get("pages")


### Collect the paper data and save it to an Excel file
# Process at most 10 papers; the range is capped at the number of results so a
# short result list does not raise IndexError.
for i in range(min(10, len(s_json["data"]))):
    # Extract each value and append it to its list
    title = get_title(s_json, i)
    titles.append(title)

    journal_name = get_journal_name(s_json, i)
    journal_names.append(journal_name)

    journal_volume = get_journal_volume(s_json, i)
    journal_volumes.append(journal_volume)

    journal_page = get_journal_page(s_json, i)
    journal_pages.append(journal_page)

    abstract = get_abstract(s_json, i)
    abstracts.append(abstract)

    summary = get_summary(abstract, words=30)
    summarys.append(summary)

    summary_kr = get_translate(summary)
    summarys_kr.append(summary_kr)

    keyword = get_keywords(abstract)
    keywords.append(keyword)

# Build the DataFrame and save it
df_papers = pd.DataFrame()
df_papers["titles"] = titles
df_papers["journal_names"] = journal_names
df_papers["journal_volumes"] = journal_volumes
df_papers["journal_pages"] = journal_pages
df_papers["abstracts"] = abstracts
df_papers["summarys"] = summarys
# Column named "summarys_kr" to match the list name and the column written by
# get_paperdata(), which saves to the same file name.
df_papers["summarys_kr"] = summarys_kr
df_papers["keywords"] = keywords
df_papers.to_excel("df_papers.xlsx")

In [None]:
df_papers

## 8. 워드 클라우드 작성 (WordCloud)

> colormap: https://matplotlib.org/stable/gallery/color/colormap_reference.html

In [None]:
# 키워드 전체 통합

import itertools

# Flatten the per-paper keyword lists into a single list
keywords_all = list(itertools.chain.from_iterable(keywords))
print(len(keywords_all))
keywords_all

In [None]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Join all keywords into one space-separated string.
text = ' '.join(keywords_all)

# Generate the word cloud from that string.
wordcloud = WordCloud(width=800, height=800, 
                                       colormap="Set1", 
                                       background_color='white').generate(text)

# Draw the word cloud image without axes and save it as a PNG file.
fig, ax = plt.subplots(figsize=(8, 8))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')
fig.savefig("wordcloud.png")

# 키워드와 논문 수를 입력받아 엑셀 파일과 WordCloud 생성을 자동화

In [None]:
def _journal_field_or_none(getter, s_json, num):
    """Call a journal accessor; return ``None`` when the paper has no usable journal data."""
    try:
        return getter(s_json, num)
    except (AttributeError, KeyError, TypeError):
        # e.g. the "journal" entry is None or absent for this paper
        return None


def get_paperdata(keywords, limit=10, fields=("title", "journal", "abstract", "authors"), words=30, wc_width=1600, wc_height=900, wc_colormap="Set3"):
    """Search papers, export their data to Excel and draw a keyword word cloud.

    Steps: search with ``search()``, collect title / journal info / abstract /
    summary / Korean summary / keywords for every returned paper, write them to
    ``df_papers.xlsx``, then save a word cloud of all keywords as
    ``wordcloud.png``.

    Args:
        keywords: Search terms passed to ``search()``.
        limit: ``limit`` value passed to ``search()``.
        fields: Field names passed to ``search()``. The default is an immutable
            tuple so it cannot be modified between calls.
        words: Word count passed to ``get_summary()``.
        wc_width: Width passed to ``WordCloud``.
        wc_height: Height passed to ``WordCloud``.
        wc_colormap: Colormap name passed to ``WordCloud``.

    Returns:
        None. Results are written to ``df_papers.xlsx`` and ``wordcloud.png``.
    """
    # 1. Search papers
    print("# 1. Searching Paper....")
    s_json = search(keywords, limit=limit, fields=fields)
    papers = s_json["data"]
    print(f"#     {len(papers)} papers data retrieved.")

    # 2. Extract the data of every paper, one list per output column.
    # A separate dict is used so the ``keywords`` argument is not overwritten.
    columns = {
        name: []
        for name in ("titles", "journal_names", "journal_volumes", "journal_pages",
                     "abstracts", "summarys", "summarys_kr", "keywords")
    }

    print("# 2. Rearranging papers data...")
    for i, paper in enumerate(papers):
        # Echo the raw record so progress is visible in the output
        print(paper)

        columns["titles"].append(get_title(s_json, i))

        # Journal details can be unavailable; store None rather than aborting the run.
        columns["journal_names"].append(_journal_field_or_none(get_journal_name, s_json, i))
        columns["journal_volumes"].append(_journal_field_or_none(get_journal_volume, s_json, i))
        columns["journal_pages"].append(_journal_field_or_none(get_journal_page, s_json, i))

        abstract = get_abstract(s_json, i)
        columns["abstracts"].append(abstract)

        summary = get_summary(abstract, words=words)
        columns["summarys"].append(summary)

        columns["summarys_kr"].append(get_translate(summary))

        columns["keywords"].append(get_keywords(abstract))

    # 3. Build the DataFrame and save it
    print("# 3. Creating Excel file...")
    df_papers = pd.DataFrame(columns)
    df_papers.to_excel("df_papers.xlsx")
    print("#     DONE!")

    # 4. Build the word cloud from all keywords
    keywords_all = list(itertools.chain.from_iterable(columns["keywords"]))
    text = ' '.join(keywords_all)
    if not text.strip():
        # No keywords at all (e.g. every abstract was empty): there is nothing to
        # draw, so skip the image instead of failing after the Excel file is saved.
        print("#     No keywords extracted; skipping word cloud.")
        return

    wordcloud = WordCloud(width=wc_width, height=wc_height,
                          colormap=wc_colormap,
                          background_color='white').generate(text)

    # Draw the word cloud image without axes and save it
    fig, ax = plt.subplots(figsize=(8, 8))
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.axis('off')
    fig.savefig("wordcloud.png")
    print("#     DONE!")

In [None]:
get_paperdata(["perovskite", "stability"])