In [73]:
import pandas as pd
import ast
import itertools #https://stackoverflow.com/questions/952914/how-do-i-make-a-flat-list-out-of-a-list-of-lists

def extract_genres(genre_str):
    """Return the genre names stored in a dict-literal string.

    Parameters
    ----------
    genre_str : object
        Usually a string holding a Python/JSON-style dict literal whose
        values are genre names. Any other value (empty string, NaN, a
        literal that is not a dict, ...) is treated as "no genres".

    Returns
    -------
    list
        The dict's values in insertion order, or ``[]`` when ``genre_str``
        cannot be parsed into a dict.
    """
    try:
        return list(ast.literal_eval(genre_str).values())
    # Catch only what a malformed / non-dict literal can raise, so that
    # KeyboardInterrupt, SystemExit and genuine programming errors are no
    # longer silently swallowed. AttributeError covers literals that parse
    # fine but are not dicts (no ``.values()``).
    except (ValueError, SyntaxError, TypeError, AttributeError,
            MemoryError, RecursionError):
        return []
    
def extract_genres_omit_empty(genre_str):
    """Return the genre names stored in a dict-literal string, or ``None``.

    ``None`` is returned both when ``genre_str`` cannot be parsed into a
    dict and when the parsed dict is empty (e.g. ``'{}'``), so that a
    subsequent ``DataFrame.dropna`` on the genre column removes every row
    without genres.

    Parameters
    ----------
    genre_str : object
        Usually a string holding a dict literal whose values are genre
        names; other values (empty string, NaN, ...) yield ``None``.

    Returns
    -------
    list or None
        Non-empty list of the dict's values in insertion order, else ``None``.
    """
    try:
        genres = list(ast.literal_eval(genre_str).values())
    # Only the failures a malformed / non-dict literal can produce are
    # treated as "missing"; anything else (KeyboardInterrupt, programming
    # errors) propagates instead of being hidden by a bare ``except``.
    except (ValueError, SyntaxError, TypeError, AttributeError,
            MemoryError, RecursionError):
        return None
    # An empty dict literal carries no genres: report it as missing too,
    # otherwise the empty list would survive dropna().
    return genres or None
    
def extract_genres_normalized(genre_str, allowed_genres=None):
    """Return only the genres of a dict-literal string that are allowed.

    Parameters
    ----------
    genre_str : object
        Usually a string holding a dict literal whose values are genre
        names; other values (empty string, NaN, ...) yield ``None``.
    allowed_genres : container, optional
        Genres to keep. When omitted, the module-level ``common_genre_set``
        is used; it must already be defined at call time, otherwise a
        ``NameError`` is raised rather than being silently swallowed.

    Returns
    -------
    list or None
        The dict's values (insertion order) that are members of
        ``allowed_genres``, or ``None`` when parsing fails or nothing is kept.
    """
    if allowed_genres is None:
        # Resolved outside the try-block on purpose: a missing master genre
        # set is a notebook-ordering error that should be loud.
        allowed_genres = common_genre_set
    try:
        parsed = ast.literal_eval(genre_str)
        kept = [genre for genre in parsed.values() if genre in allowed_genres]
    # AttributeError: literal is not a dict; TypeError: unhashable value
    # tested against a set. Everything else comes from literal_eval itself.
    except (ValueError, SyntaxError, TypeError, AttributeError,
            MemoryError, RecursionError):
        return None
    return kept or None

#GENRE NORMALIZATION
# Gather the genre list of every book record that has all seven
# tab-separated columns (column index 5 holds the genre dict literal).
with open('./booksummaries/booksummaries.txt', 'r', encoding='utf-8') as file:
    raw_book_genres = [
        extract_genres(columns[5])
        for columns in (record.split('\t') for record in file)
        if len(columns) >= 7
    ]

# Flatten the per-book lists into one list, then deduplicate.
list_of_all_book_genres = list(itertools.chain(*raw_book_genres))
set_of_book_genres = set(list_of_all_book_genres)

#Getting movie genres
# Movie metadata: the file has no header row, so name all nine columns and
# keep only the three that are used.
metadata_df = pd.read_csv(
    './moviesummaries/movie.metadata.tsv',
    sep='\t',
    header=None,
    names=[
        'ID', 'Freebase ID', 'Movie Name', 'Release Date', 'Revenue',
        'Runtime', 'Language', 'Country', 'Genres',
    ],
    usecols=['ID', 'Movie Name', 'Genres'],
)

# Plot summaries, keyed by the same movie ID.
plot_df = pd.read_csv(
    './moviesummaries/plot_summaries.txt',
    sep='\t',
    header=None,
    names=['ID', 'Plot Summary'],
)

# Replace each genre dict literal by the list of its genre names.
metadata_df['Genres'] = metadata_df['Genres'].apply(extract_genres)
raw_movie_genres = metadata_df['Genres'].tolist()
list_of_movie_genres = list(itertools.chain(*raw_movie_genres))
set_of_movie_genres = set(list_of_movie_genres)

# Master genre list: genres that occur for both movies and books.
common_genre_set = set_of_movie_genres.intersection(set_of_book_genres)
print(common_genre_set)
print(len(common_genre_set))



#Previous code-----------
# Build the book table (title, genre list, plot summary) from the raw
# tab-separated dump, keeping only books that have at least one genre.
book_data = []

with open('./booksummaries/booksummaries.txt', 'r', encoding='utf-8') as file:
    for line in file:
        # Iterating a text file yields each line WITH its trailing '\n'.
        # Remove it before splitting so the last column (the plot summary)
        # does not end with the record terminator.
        fields = line.rstrip('\n').split('\t')
        if len(fields) >= 7:  # ensure line has all required fields
            book_data.append({
                'Book Title': fields[2],
                'Genre': extract_genres_omit_empty(fields[5]),
                'Plot Summary': fields[6],
            })

# extract_genres_omit_empty() returns None for books without usable genres;
# dropna() removes exactly those rows (original index labels are kept).
books_df = pd.DataFrame(book_data).dropna(subset=['Genre'])
books_df.head()



{'Anti-war', 'Tragicomedy', 'Suspense', 'Biography', 'Science Fiction', 'Time travel', 'Wuxia', 'History', 'Anthology', 'Whodunit', 'Historical fiction', 'School story', 'Business', 'Romantic comedy', 'Travel', 'Satire', 'Nature', 'Sword and sorcery', 'Alien invasion', 'Pornography', 'Cyberpunk', 'Existentialism', 'Comedy', 'Humour', 'Steampunk', 'Detective fiction', 'Drama', 'Conspiracy fiction', 'Erotica', 'Black comedy', 'Coming of age', 'Fantasy', 'Thriller', 'Music', 'Horror', 'Crime Fiction', 'Space opera', 'Foreign legion', 'Adventure', 'Apocalyptic and post-apocalyptic fiction', 'Albino bias', 'Space western', 'Dystopia', 'Comedy of manners', 'Mystery', 'Education', 'Gay Themed', 'Fairy tale', 'Parody', 'Western', 'Sports', 'Supernatural', 'Anthropology'}
53


Unnamed: 0,Book Title,Genre,Plot Summary
0,Animal Farm,"[Roman à clef, Satire, Children's literature, ...","Old Major, the old boar on the Manor Farm, ca..."
1,A Clockwork Orange,"[Science Fiction, Novella, Speculative fiction...","Alex, a teenager living in near-future Englan..."
2,The Plague,"[Existentialism, Fiction, Absurdist fiction, N...",The text of The Plague is divided into five p...
4,A Fire Upon the Deep,"[Hard science fiction, Science Fiction, Specul...",The novel posits that space around the Milky ...
5,All Quiet on the Western Front,"[War novel, Roman à clef]","The book tells the story of Paul Bäumer, a Ge..."


In [74]:
# Re-read the raw movie files so this cell starts from untouched data
# (the 'Genres' column of the previous metadata_df was already overwritten).
metadata_df = pd.read_csv(
    './moviesummaries/movie.metadata.tsv',
    sep='\t',
    header=None,
    names=[
        'ID', 'Freebase ID', 'Movie Name', 'Release Date', 'Revenue',
        'Runtime', 'Language', 'Country', 'Genres',
    ],
    usecols=['ID', 'Movie Name', 'Genres'],
)

plot_df = pd.read_csv(
    './moviesummaries/plot_summaries.txt',
    sep='\t',
    header=None,
    names=['ID', 'Plot Summary'],
)

# Genre dict literal -> list of names (None when it cannot be parsed).
metadata_df['Genres'] = metadata_df['Genres'].apply(extract_genres_omit_empty)

# Default (inner) join on the movie ID: keeps movies present in both files.
merged = metadata_df.merge(plot_df, on='ID')

# Keep the three columns of interest, rename the genre column and drop the
# rows whose genre entry is missing.
movies_df = (
    merged[['Movie Name', 'Genres', 'Plot Summary']]
    .rename(columns={'Genres': 'Genre List'})
    .dropna(subset=['Genre List'])
)

movies_df.head()

Unnamed: 0,Movie Name,Genre List,Plot Summary
0,Ghosts of Mars,"[Thriller, Science Fiction, Horror, Adventure,...","Set in the second half of the 22nd century, th..."
1,White Of The Eye,"[Thriller, Erotic thriller, Psychological thri...",A series of murders of rich young women throug...
2,A Woman in Flames,[Drama],"Eva, an upper class housewife, becomes frustra..."
3,The Sorcerer's Apprentice,"[Family Film, Fantasy, Adventure, World cinema]","Every hundred years, the evil Morgana returns..."
4,Little city,"[Romantic comedy, Ensemble Film, Comedy-drama,...","Adam, a San Francisco-based artist who works a..."


In [75]:
# Dump both tables to text files: space as the field separator, no index
# column and no header row.
for frame, out_path in ((books_df, 'book_data.txt'), (movies_df, 'movie_data.txt')):
    frame.to_csv(out_path, sep=' ', index=False, header=False)

In [76]:
#Above but using normalized/common genre set only

#Books
# Same book table as before, but each genre list is restricted to the genres
# in common_genre_set; books left without any such genre are dropped.
book_data = []

with open('./booksummaries/booksummaries.txt', 'r', encoding='utf-8') as file:
    for line in file:
        # Iterating a text file yields each line WITH its trailing '\n'.
        # Remove it before splitting so the last column (the plot summary)
        # does not end with the record terminator.
        fields = line.rstrip('\n').split('\t')
        if len(fields) >= 7:  # ensure line has all required fields
            book_data.append({
                'Book Title': fields[2],
                'Genre': extract_genres_normalized(fields[5]),
                'Plot Summary': fields[6],
            })

# extract_genres_normalized() returns None when no common genre remains;
# dropna() removes exactly those rows (original index labels are kept).
books_normalized_genre_df = pd.DataFrame(book_data).dropna(subset=['Genre'])

books_normalized_genre_df.head()


Unnamed: 0,Book Title,Genre,Plot Summary
0,Animal Farm,[Satire],"Old Major, the old boar on the Manor Farm, ca..."
1,A Clockwork Orange,"[Science Fiction, Satire]","Alex, a teenager living in near-future Englan..."
2,The Plague,[Existentialism],The text of The Plague is divided into five p...
4,A Fire Upon the Deep,"[Science Fiction, Fantasy]",The novel posits that space around the Milky ...
6,A Wizard of Earthsea,[Fantasy],"Ged is a young boy on Gont, one of the larger..."


In [77]:
#Movies
# Re-read the raw movie files so this cell starts from untouched data.
metadata_df = pd.read_csv(
    './moviesummaries/movie.metadata.tsv',
    sep='\t',
    header=None,
    names=[
        'ID', 'Freebase ID', 'Movie Name', 'Release Date', 'Revenue',
        'Runtime', 'Language', 'Country', 'Genres',
    ],
    usecols=['ID', 'Movie Name', 'Genres'],
)

plot_df = pd.read_csv(
    './moviesummaries/plot_summaries.txt',
    sep='\t',
    header=None,
    names=['ID', 'Plot Summary'],
)

# Genre dict literal -> list restricted to common_genre_set (None when empty).
metadata_df['Genres'] = metadata_df['Genres'].apply(extract_genres_normalized)

# Default (inner) join on the movie ID: keeps movies present in both files.
merged = metadata_df.merge(plot_df, on='ID')

# Keep the three columns of interest, rename the genre column and delete the
# rows whose genre entry is null, see
#https://saturncloud.io/blog/how-to-delete-rows-with-null-values-in-a-specific-column-in-pandas-dataframe/
movies_normalized_genre_df = (
    merged[['Movie Name', 'Genres', 'Plot Summary']]
    .rename(columns={'Genres': 'Genre List'})
    .dropna(subset=['Genre List'])
)

movies_normalized_genre_df.head()

Unnamed: 0,Movie Name,Genre List,Plot Summary
0,Ghosts of Mars,"[Thriller, Science Fiction, Horror, Adventure,...","Set in the second half of the 22nd century, th..."
1,White Of The Eye,[Thriller],A series of murders of rich young women throug...
2,A Woman in Flames,[Drama],"Eva, an upper class housewife, becomes frustra..."
3,The Sorcerer's Apprentice,"[Fantasy, Adventure]","Every hundred years, the evil Morgana returns..."
4,Little city,"[Romantic comedy, Drama, Comedy]","Adam, a San Francisco-based artist who works a..."


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Pull the plot-summary text out of both frames as plain Python lists and
# build the combined corpus: all movie summaries first, then all book summaries.
movie_plot_summaries = movies_df['Plot Summary'].to_list()
book_plot_summaries = books_df['Plot Summary'].to_list()
all_summaries = [*movie_plot_summaries, *book_plot_summaries]


In [None]:
# min_df=0.01 (a float, i.e. a proportion): ignore terms that occur in fewer
# than 1% of the documents.
# max_df=0.9: ignore terms that occur in more than 90% of the documents.
# stop_words='english': also remove the built-in English stop-word list.
# Possible extension: include bigrams and trigrams.
vectorizer = TfidfVectorizer(min_df=0.01, max_df=0.9, stop_words='english')

In [None]:
# Learn the vocabulary and the IDF weights from the combined movie + book
# summaries (a count vectorizer, by contrast, would learn the vocabulary only).
# fit() only learns these statistics; the document-term matrices themselves
# are produced by transform().
vectorizer.fit(all_summaries)



In [None]:
# All of the distinct terms kept in the fitted vocabulary
# (i.e. after the min_df / max_df / stop-word filtering configured on the vectorizer).
vectorizer.get_feature_names_out()

In [None]:

# Project each corpus onto the fitted vocabulary: one TF-IDF row per summary,
# one column per vocabulary term.
# NOTE(review): the original note here read "42204 docs and 142364 words" -
# presumably a shape observed in an earlier run; the term count depends on the
# min_df / max_df settings, so confirm against movie_tfidf.shape.
movie_tfidf = vectorizer.transform(movie_plot_summaries)
book_tfidf = vectorizer.transform(book_plot_summaries)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [None]:
# Rows correspond to movies and columns to books: similarity_matrix[i, j] is
# the cosine similarity between the i-th movie summary and the j-th book summary.
similarity_matrix = cosine_similarity(movie_tfidf, book_tfidf)

# For every movie, report the five books with the highest similarity score.
for movie_pos, scores in enumerate(similarity_matrix):
    # argsort sorts ascending, so reverse it and keep the first five positions.
    # These are positions within book_tfidf (the order of book_plot_summaries).
    best_book_positions = np.argsort(scores)[::-1][:5]

    print(f"Top 5 similar books for movie {movie_pos}:")
    for place, book_pos in enumerate(best_book_positions, start=1):
        print(f"{place}. Book Index: {book_pos}, Similarity Score: {scores[book_pos]:.4f}")
    print()