In [12]:
import pandas as pd

# Summarized book-summary CSV shards (file names carry the row range of each shard).
file_paths = [
    'data/booksummaries_summarized_0_3000.csv',
    'data/booksummaries_summarized_3001_9000.csv',
    'data/booksummaries_summarized_9001_16559.csv',
]

# Read every shard into its own DataFrame, keeping the list order of file_paths.
df_list = list(map(pd.read_csv, file_paths))

# Stack the shards vertically; ignore_index=True renumbers the rows 0..n-1.
df_combined = pd.concat(objs=df_list, axis=0, ignore_index=True)

# Sanity-check the merge: preview the first rows, then print (row count, column count).
print(df_combined.head())
print(df_combined.shape)




   Wikipedia article ID Freebase ID  \
0                   620     /m/0hhy   
1                   843     /m/0k36   
2                   986     /m/0ldx   
3                  1756     /m/0sww   
4                  2080     /m/0wkt   

                                  Book title           Author  \
0                                Animal Farm    George Orwell   
1                         A Clockwork Orange  Anthony Burgess   
2                                 The Plague     Albert Camus   
3  An Enquiry Concerning Human Understanding       David Hume   
4                       A Fire Upon the Deep     Vernor Vinge   

  Publication date              Book genres (Freebase ID:name tuples)  \
0       1945-08-17  {"/m/016lj8": "Roman \u00e0 clef", "/m/06nbt":...   
1             1962  {"/m/06n90": "Science Fiction", "/m/0l67h": "N...   
2             1947  {"/m/02m4t": "Existentialism", "/m/02xlf": "Fi...   
3              NaN                                                NaN   
4        

In [13]:
# Rich (table) preview of the first 5 rows of the merged frame.
df_combined.head(5)

Unnamed: 0,Wikipedia article ID,Freebase ID,Book title,Author,Publication date,Book genres (Freebase ID:name tuples),Plot summary,Summarized Plot Summary
0,620,/m/0hhy,Animal Farm,George Orwell,1945-08-17,"{""/m/016lj8"": ""Roman \u00e0 clef"", ""/m/06nbt"":...","Old Major, the old boar on the Manor Farm, ca...","Old Major, an old boar, inspires a farm animal..."
1,843,/m/0k36,A Clockwork Orange,Anthony Burgess,1962,"{""/m/06n90"": ""Science Fiction"", ""/m/0l67h"": ""N...","Alex, a teenager living in near-future Englan...","In a near-future England, teenager Alex leads ..."
2,986,/m/0ldx,The Plague,Albert Camus,1947,"{""/m/02m4t"": ""Existentialism"", ""/m/02xlf"": ""Fi...",The text of The Plague is divided into five p...,"In Oran, a plague begins with rats dying in th..."
3,1756,/m/0sww,An Enquiry Concerning Human Understanding,David Hume,,,The argument of the Enquiry proceeds by a ser...,"David Hume's ""Enquiry"" explores the foundation..."
4,2080,/m/0wkt,A Fire Upon the Deep,Vernor Vinge,,"{""/m/03lrw"": ""Hard science fiction"", ""/m/06n90...",The novel posits that space around the Milky ...,The novel explores a galaxy divided into Zones...


In [14]:
# Drop rows where the plot summary, its summarized version, or the genre column is NaN.
df_cleaned = df_combined.dropna(subset=['Plot summary', 'Summarized Plot Summary', 'Book genres (Freebase ID:name tuples)'])

# Also drop rows where any of those columns is empty or whitespace-only.
# The trailing .copy() makes df_cleaned an independent frame rather than a
# filtered selection of another frame, so adding or modifying columns on it
# later does not trigger pandas' SettingWithCopyWarning.
df_cleaned = df_cleaned[
    (df_cleaned['Plot summary'].str.strip() != '') &
    (df_cleaned['Summarized Plot Summary'].str.strip() != '') &
    (df_cleaned['Book genres (Freebase ID:name tuples)'].str.strip() != '')
].copy()

# Report the frame size before and after cleaning, then preview the kept rows.
print(f"전처리 전 데이터 크기: {df_combined.shape}")
print(f"전처리 후 데이터 크기: {df_cleaned.shape}")
print(df_cleaned[['Book title', 'Plot summary', 'Summarized Plot Summary', 'Book genres (Freebase ID:name tuples)']].head())


전처리 전 데이터 크기: (16557, 8)
전처리 후 데이터 크기: (12839, 8)
                       Book title  \
0                     Animal Farm   
1              A Clockwork Orange   
2                      The Plague   
4            A Fire Upon the Deep   
5  All Quiet on the Western Front   

                                        Plot summary  \
0   Old Major, the old boar on the Manor Farm, ca...   
1   Alex, a teenager living in near-future Englan...   
2   The text of The Plague is divided into five p...   
4   The novel posits that space around the Milky ...   
5   The book tells the story of Paul Bäumer, a Ge...   

                             Summarized Plot Summary  \
0  Old Major, an old boar, inspires a farm animal...   
1  In a near-future England, teenager Alex leads ...   
2  In Oran, a plague begins with rats dying in th...   
4  The novel explores a galaxy divided into Zones...   
5  Paul Bäumer, a young German soldier in WWI, jo...   

               Book genres (Freebase ID:name tuples) 

In [16]:
# Show the first 5 raw values of the 'Book genres (Freebase ID:name tuples)' column.
df_cleaned.loc[:, 'Book genres (Freebase ID:name tuples)'].head(5)


0    {"/m/016lj8": "Roman \u00e0 clef", "/m/06nbt":...
1    {"/m/06n90": "Science Fiction", "/m/0l67h": "N...
2    {"/m/02m4t": "Existentialism", "/m/02xlf": "Fi...
4    {"/m/03lrw": "Hard science fiction", "/m/06n90...
5    {"/m/098tmk": "War novel", "/m/016lj8": "Roman...
Name: Book genres (Freebase ID:name tuples), dtype: object

In [18]:
import ast  # 문자열을 딕셔너리로 변환

# 장르 이름만 추출하는 함수
def extract_genres(genre_str):
    """Turn a serialized ``{freebase_id: genre_name}`` mapping into ``"name1, name2"``.

    The text is decoded as JSON first; ``ast.literal_eval`` is kept as a
    fallback so single-quoted Python dict strings are still accepted.

    Args:
        genre_str: Raw cell value, normally a JSON object string. Other
            values (NaN, None, free text) are tolerated.

    Returns:
        The mapping's values joined with ``", "`` in their stored order.
        ``None`` values are skipped and other non-string values are passed
        through ``str``. If the input cannot be parsed, or parses to anything
        other than a dict, ``genre_str`` is returned unchanged.
    """
    # Local import so this definition does not depend on an import made in another cell.
    import json

    try:
        # JSON is the right decoder for this column: it understands null/true/false
        # and merges escaped surrogate pairs into a single code point, which
        # ast.literal_eval would leave as lone surrogates that cannot be encoded as UTF-8.
        genre_dict = json.loads(genre_str)
    except (ValueError, TypeError):
        # ValueError: not valid JSON. TypeError: not a str/bytes value (e.g. NaN).
        try:
            genre_dict = ast.literal_eval(genre_str)
        except (ValueError, SyntaxError, TypeError):
            # literal_eval may also raise TypeError (e.g. an unhashable dict key).
            return genre_str

    # A value that parses to a list, number, etc. has no .values(); keep the original.
    if not isinstance(genre_dict, dict):
        return genre_str

    return ', '.join(str(name) for name in genre_dict.values() if name is not None)

# Run extract_genres over every raw genre value and store the result in a new column.
df_cleaned['Parsed Genres'] = df_cleaned['Book genres (Freebase ID:name tuples)'].map(extract_genres)

# Preview the title, raw genre value and parsed genre value for the first 5 rows.
print(df_cleaned.loc[:, ['Book title', 'Book genres (Freebase ID:name tuples)', 'Parsed Genres']].head())


                       Book title  \
0                     Animal Farm   
1              A Clockwork Orange   
2                      The Plague   
4            A Fire Upon the Deep   
5  All Quiet on the Western Front   

               Book genres (Freebase ID:name tuples)  \
0  {"/m/016lj8": "Roman \u00e0 clef", "/m/06nbt":...   
1  {"/m/06n90": "Science Fiction", "/m/0l67h": "N...   
2  {"/m/02m4t": "Existentialism", "/m/02xlf": "Fi...   
4  {"/m/03lrw": "Hard science fiction", "/m/06n90...   
5  {"/m/098tmk": "War novel", "/m/016lj8": "Roman...   

                                       Parsed Genres  
0  Roman à clef, Satire, Children's literature, S...  
1  Science Fiction, Novella, Speculative fiction,...  
2  Existentialism, Fiction, Absurdist fiction, Novel  
4  Hard science fiction, Science Fiction, Specula...  
5                            War novel, Roman à clef  


In [19]:
# Rich (table) preview of the first 5 rows of the cleaned frame.
df_cleaned.head(5)

Unnamed: 0,Wikipedia article ID,Freebase ID,Book title,Author,Publication date,Book genres (Freebase ID:name tuples),Plot summary,Summarized Plot Summary,Parsed Genres
0,620,/m/0hhy,Animal Farm,George Orwell,1945-08-17,"{""/m/016lj8"": ""Roman \u00e0 clef"", ""/m/06nbt"":...","Old Major, the old boar on the Manor Farm, ca...","Old Major, an old boar, inspires a farm animal...","Roman à clef, Satire, Children's literature, S..."
1,843,/m/0k36,A Clockwork Orange,Anthony Burgess,1962,"{""/m/06n90"": ""Science Fiction"", ""/m/0l67h"": ""N...","Alex, a teenager living in near-future Englan...","In a near-future England, teenager Alex leads ...","Science Fiction, Novella, Speculative fiction,..."
2,986,/m/0ldx,The Plague,Albert Camus,1947,"{""/m/02m4t"": ""Existentialism"", ""/m/02xlf"": ""Fi...",The text of The Plague is divided into five p...,"In Oran, a plague begins with rats dying in th...","Existentialism, Fiction, Absurdist fiction, Novel"
4,2080,/m/0wkt,A Fire Upon the Deep,Vernor Vinge,,"{""/m/03lrw"": ""Hard science fiction"", ""/m/06n90...",The novel posits that space around the Milky ...,The novel explores a galaxy divided into Zones...,"Hard science fiction, Science Fiction, Specula..."
5,2152,/m/0x5g,All Quiet on the Western Front,Erich Maria Remarque,1929-01-29,"{""/m/098tmk"": ""War novel"", ""/m/016lj8"": ""Roman...","The book tells the story of Paul Bäumer, a Ge...","Paul Bäumer, a young German soldier in WWI, jo...","War novel, Roman à clef"


In [20]:
# Destination CSV for the cleaned frame.
save_path = 'data/booksummaries_with_parsed_genres.csv'

# Write the frame without its index; the 'utf-8-sig' codec prepends a UTF-8 BOM.
df_cleaned.to_csv(path_or_buf=save_path, encoding='utf-8-sig', index=False)

# Confirm where the file was written.
print(f"파일이 성공적으로 저장되었습니다: {save_path}")


파일이 성공적으로 저장되었습니다: data/booksummaries_with_parsed_genres.csv
