In [4]:
import ast
import csv

import pandas as pd

def extract_genres(genre_str):
    """Return the genre names held in a serialized ``{id: name}`` mapping.

    Parameters
    ----------
    genre_str : str
        Text of a dict literal, e.g. ``'{"/m/01jfsb": "Thriller"}'``.
        Any other kind of value (``None``, ``NaN``, ...) is treated as
        "no genres".

    Returns
    -------
    list
        The mapping's values in their original order, or an empty list when
        the input is not a string, cannot be parsed as a literal, or parses
        to something other than a dict.
    """
    if not isinstance(genre_str, str):
        return []
    try:
        genres_dict = ast.literal_eval(genre_str)
    # Only swallow the errors literal_eval documents for malformed input;
    # a bare ``except`` would also hide KeyboardInterrupt / SystemExit.
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return []
    if not isinstance(genres_dict, dict):
        return []
    return list(genres_dict.values())

# Parse the book summaries file: one tab-separated record per line.
# Only three fields are kept: index 2 (title), index 5 (genres) and
# index 6 (plot summary).
book_data = []

with open('./booksummaries/booksummaries.txt', 'r', encoding='utf-8') as file:
    for line in file:
        # Remove the line terminator before splitting; otherwise it stays
        # attached to the last field and every plot summary ends with '\n'.
        fields = line.rstrip('\n').split('\t')
        if len(fields) >= 7:  # ensure line has all required fields
            book_data.append({
                'Book Title': fields[2],
                'Genres': extract_genres(fields[5]),
                'Plot Summary': fields[6],
            })

# Explicit columns keep the schema stable even if no line qualified.
books_df = pd.DataFrame(book_data, columns=['Book Title', 'Genres', 'Plot Summary'])
books_df.head()


Unnamed: 0,Book Title,Genres,Plot Summary
0,Animal Farm,"[Roman à clef, Satire, Children's literature, ...","Old Major, the old boar on the Manor Farm, ca..."
1,A Clockwork Orange,"[Science Fiction, Novella, Speculative fiction...","Alex, a teenager living in near-future Englan..."
2,The Plague,"[Existentialism, Fiction, Absurdist fiction, N...",The text of The Plague is divided into five p...
3,An Enquiry Concerning Human Understanding,[],The argument of the Enquiry proceeds by a ser...
4,A Fire Upon the Deep,"[Hard science fiction, Science Fiction, Specul...",The novel posits that space around the Milky ...


In [6]:
# Load movie metadata (ID, title, genres) and plot summaries, then join on ID.
#
# Both inputs are read as plain tab-separated text (assumed not to be
# CSV-quoted -- TODO confirm against the dataset README). With pandas'
# default quote handling, a field that starts with a double quote is parsed
# as a quoted field and can absorb following tabs/newlines, silently merging
# rows. QUOTE_NONE keeps every '"' as literal text.
metadata_df = pd.read_csv(
    './moviesummaries/movie.metadata.tsv',
    sep='\t',
    header=None,
    names=['ID', 'Freebase ID', 'Movie Name', 'Release Date', 'Revenue', 'Runtime', 'Language', 'Country', 'Genres'],
    usecols=['ID', 'Movie Name', 'Genres'],
    quoting=csv.QUOTE_NONE,
)

plot_df = pd.read_csv(
    './moviesummaries/plot_summaries.txt',
    sep='\t',
    header=None,
    names=['ID', 'Plot Summary'],
    quoting=csv.QUOTE_NONE,
)

# Turn the serialized genre mapping into a plain list of genre names.
metadata_df['Genres'] = metadata_df['Genres'].apply(extract_genres)

# Inner join: only movies present in both files are kept.
merged = pd.merge(metadata_df, plot_df, on='ID', how='inner')

# Rename by name rather than by position so a column reorder cannot
# mislabel the data.
movies_df = (
    merged[['Movie Name', 'Genres', 'Plot Summary']]
    .rename(columns={'Genres': 'Genre List'})
)

movies_df.head()

Unnamed: 0,Movie Name,Genre List,Plot Summary
0,Ghosts of Mars,"[Thriller, Science Fiction, Horror, Adventure,...","Set in the second half of the 22nd century, th..."
1,White Of The Eye,"[Thriller, Erotic thriller, Psychological thri...",A series of murders of rich young women throug...
2,A Woman in Flames,[Drama],"Eva, an upper class housewife, becomes frustra..."
3,The Sorcerer's Apprentice,"[Family Film, Fantasy, Adventure, World cinema]","Every hundred years, the evil Morgana returns..."
4,Little city,"[Romantic comedy, Ensemble Film, Comedy-drama,...","Adam, a San Francisco-based artist who works a..."


In [7]:
# Write both frames with the same layout: space-delimited, no index column,
# no header row. Fields that contain a space are quoted by pandas' default
# (minimal) quoting.
export_options = dict(sep=' ', index=False, header=False)
books_df.to_csv('book_data.txt', **export_options)
movies_df.to_csv('movie_data.txt', **export_options)