In [1]:
import pandas as pd

In [2]:
dt = pd.read_csv('data/booksummaries_with_parsed_genres.csv')
dt.head()

Unnamed: 0,Wikipedia article ID,Freebase ID,Book title,Author,Publication date,Book genres (Freebase ID:name tuples),Plot summary,Summarized Plot Summary,Parsed Genres
0,620,/m/0hhy,Animal Farm,George Orwell,1945-08-17,"{""/m/016lj8"": ""Roman \u00e0 clef"", ""/m/06nbt"":...","Old Major, the old boar on the Manor Farm, ca...","Old Major, an old boar, inspires a farm animal...","Roman à clef, Satire, Children's literature, S..."
1,843,/m/0k36,A Clockwork Orange,Anthony Burgess,1962,"{""/m/06n90"": ""Science Fiction"", ""/m/0l67h"": ""N...","Alex, a teenager living in near-future Englan...","In a near-future England, teenager Alex leads ...","Science Fiction, Novella, Speculative fiction,..."
2,986,/m/0ldx,The Plague,Albert Camus,1947,"{""/m/02m4t"": ""Existentialism"", ""/m/02xlf"": ""Fi...",The text of The Plague is divided into five p...,"In Oran, a plague begins with rats dying in th...","Existentialism, Fiction, Absurdist fiction, Novel"
3,2080,/m/0wkt,A Fire Upon the Deep,Vernor Vinge,,"{""/m/03lrw"": ""Hard science fiction"", ""/m/06n90...",The novel posits that space around the Milky ...,The novel explores a galaxy divided into Zones...,"Hard science fiction, Science Fiction, Specula..."
4,2152,/m/0x5g,All Quiet on the Western Front,Erich Maria Remarque,1929-01-29,"{""/m/098tmk"": ""War novel"", ""/m/016lj8"": ""Roman...","The book tells the story of Paul Bäumer, a Ge...","Paul Bäumer, a young German soldier in WWI, jo...","War novel, Roman à clef"


In [3]:
import os
from openai import OpenAI
from dotenv import load_dotenv
client = OpenAI()
# 환경 변수 로드
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
def embed_text(text):
    response = client.embeddings.create(
        model="text-embedding-3-large",  # 임베딩 모델
        input=text
    )
    return response.data[0].embedding

In [4]:
import time
from tqdm import tqdm

# 줄거리를 임베딩하고 새로운 컬럼에 저장
def embed_parsed_genres(df):
    embeddings = []
    
    # tqdm을 사용하여 진행 상황 표시
    for genre in tqdm(df['Parsed Genres'], desc="임베딩 진행 중"):
        try:
            # Parsed Genres를 임베딩
            embedding = embed_text(genre)
            embeddings.append(embedding)
            time.sleep(0.5)  # OpenAI API 과부하 방지
        except Exception as e:
            print(f"임베딩 실패: {e}")
            embeddings.append(None)
    
    # 새로운 컬럼에 임베딩 결과 저장
    df['Parsed Genres Embedding'] = embeddings
    return df




In [5]:
# 실행
dt = embed_parsed_genres(dt)

임베딩 진행 중: 100%|██████████| 12839/12839 [3:54:46<00:00,  1.10s/it]  


In [6]:
dt.head()

Unnamed: 0,Wikipedia article ID,Freebase ID,Book title,Author,Publication date,Book genres (Freebase ID:name tuples),Plot summary,Summarized Plot Summary,Parsed Genres,Parsed Genres Embedding
0,620,/m/0hhy,Animal Farm,George Orwell,1945-08-17,"{""/m/016lj8"": ""Roman \u00e0 clef"", ""/m/06nbt"":...","Old Major, the old boar on the Manor Farm, ca...","Old Major, an old boar, inspires a farm animal...","Roman à clef, Satire, Children's literature, S...","[-0.0268506221473217, 0.01663643680512905, -0...."
1,843,/m/0k36,A Clockwork Orange,Anthony Burgess,1962,"{""/m/06n90"": ""Science Fiction"", ""/m/0l67h"": ""N...","Alex, a teenager living in near-future Englan...","In a near-future England, teenager Alex leads ...","Science Fiction, Novella, Speculative fiction,...","[-0.022291284054517746, 0.01301165483891964, -..."
2,986,/m/0ldx,The Plague,Albert Camus,1947,"{""/m/02m4t"": ""Existentialism"", ""/m/02xlf"": ""Fi...",The text of The Plague is divided into five p...,"In Oran, a plague begins with rats dying in th...","Existentialism, Fiction, Absurdist fiction, Novel","[-0.006543333642184734, 0.014447270892560482, ..."
3,2080,/m/0wkt,A Fire Upon the Deep,Vernor Vinge,,"{""/m/03lrw"": ""Hard science fiction"", ""/m/06n90...",The novel posits that space around the Milky ...,The novel explores a galaxy divided into Zones...,"Hard science fiction, Science Fiction, Specula...","[-0.008878524415194988, 0.024636436253786087, ..."
4,2152,/m/0x5g,All Quiet on the Western Front,Erich Maria Remarque,1929-01-29,"{""/m/098tmk"": ""War novel"", ""/m/016lj8"": ""Roman...","The book tells the story of Paul Bäumer, a Ge...","Paul Bäumer, a young German soldier in WWI, jo...","War novel, Roman à clef","[-0.03799491003155708, -0.03112509287893772, -..."


In [7]:
import os

# data 폴더 경로
output_folder = 'data'

# 저장할 파일 경로
output_file = os.path.join(output_folder, 'parsed_genres_with_embeddings.csv')

# 데이터프레임을 CSV 파일로 저장
dt.to_csv(output_file, index=False, encoding='utf-8')

print(f"데이터프레임이 '{output_file}' 파일로 저장되었습니다.")



데이터프레임이 'data\parsed_genres_with_embeddings.csv' 파일로 저장되었습니다.


In [9]:
dt.shape

(12839, 10)