In [2]:
import pandas as pd
import ast

def extract_genres(genre_str):
    """Return the genre names stored in a serialized ``{id: name}`` mapping.

    Parameters
    ----------
    genre_str : str
        Text of a dict literal mapping genre identifiers to genre names,
        e.g. ``'{"/m/01": "Satire"}'``.

    Returns
    -------
    list
        The mapping's values in their original order. An empty list is
        returned when the input cannot be parsed as a literal (including
        non-string input such as a float NaN) or does not evaluate to a dict.
    """
    try:
        parsed = ast.literal_eval(genre_str)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Only parsing failures are treated as "no genres"; KeyboardInterrupt
        # and SystemExit are deliberately allowed to propagate.
        return []
    # A valid literal that is not a mapping (list, number, ...) has no genres.
    if not isinstance(parsed, dict):
        return []
    return list(parsed.values())

# Parse the tab-separated book dump into one record per book.
book_data = []

with open('./booksummaries/booksummaries.txt', 'r', encoding='utf-8') as file:
    for line in file:
        # Remove the line terminator before splitting; otherwise it stays
        # attached to the last field and ends up inside the plot summary.
        fields = line.rstrip('\n').split('\t')
        if len(fields) >= 7:  # ensure line has all required fields
            book_data.append({
                'Book Title': fields[2],
                'Genres': extract_genres(fields[5]),
                'Plot Summary': fields[6],
            })

# Naming the columns explicitly keeps them present even when no line qualified.
books_df = pd.DataFrame(book_data, columns=['Book Title', 'Genres', 'Plot Summary'])
books_df.head()


Unnamed: 0,Book Title,Genres,Plot Summary
0,Animal Farm,"[Roman à clef, Satire, Children's literature, ...","Old Major, the old boar on the Manor Farm, ca..."
1,A Clockwork Orange,"[Science Fiction, Novella, Speculative fiction...","Alex, a teenager living in near-future Englan..."
2,The Plague,"[Existentialism, Fiction, Absurdist fiction, N...",The text of The Plague is divided into five p...
3,An Enquiry Concerning Human Understanding,[],The argument of the Enquiry proceeds by a ser...
4,A Fire Upon the Deep,"[Hard science fiction, Science Fiction, Specul...",The novel posits that space around the Milky ...


In [3]:
# Column layout of the header-less metadata TSV; only three of them are loaded.
metadata_columns = [
    'ID', 'Freebase ID', 'Movie Name', 'Release Date', 'Revenue',
    'Runtime', 'Language', 'Country', 'Genres',
]
metadata_df = pd.read_csv(
    './moviesummaries/movie.metadata.tsv',
    sep='\t',
    header=None,
    names=metadata_columns,
    usecols=['ID', 'Movie Name', 'Genres'],
)

# Plot summaries are keyed by the same movie ID.
plot_df = pd.read_csv(
    './moviesummaries/plot_summaries.txt',
    sep='\t',
    header=None,
    names=['ID', 'Plot Summary'],
)

# Replace the raw genre text with a plain list of genre names.
metadata_df['Genres'] = metadata_df['Genres'].apply(extract_genres)

# Default (inner) join on ID: keeps movies present in both tables.
merged = metadata_df.merge(plot_df, on='ID')

# Keep the three columns used downstream and expose genres as 'Genre List'.
movies_df = merged[['Movie Name', 'Genres', 'Plot Summary']].rename(
    columns={'Genres': 'Genre List'}
)

movies_df.head()

Unnamed: 0,Movie Name,Genre List,Plot Summary
0,Ghosts of Mars,"[Thriller, Science Fiction, Horror, Adventure,...","Set in the second half of the 22nd century, th..."
1,White Of The Eye,"[Thriller, Erotic thriller, Psychological thri...",A series of murders of rich young women throug...
2,A Woman in Flames,[Drama],"Eva, an upper class housewife, becomes frustra..."
3,The Sorcerer's Apprentice,"[Family Film, Fantasy, Adventure, World cinema]","Every hundred years, the evil Morgana returns..."
4,Little city,"[Romantic comedy, Ensemble Film, Comedy-drama,...","Adam, a San Francisco-based artist who works a..."


In [4]:
# Write both tables as space-delimited text, without index or header row.
csv_options = dict(sep=' ', index=False, header=False)
books_df.to_csv('book_data.txt', **csv_options)
movies_df.to_csv('movie_data.txt', **csv_options)

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [37]:
# 
# Pull the plot text out of each table as plain Python lists.
book_plot_summaries = books_df['Plot Summary'].tolist()
movie_plot_summaries = movies_df['Plot Summary'].tolist()

# Combined corpus: movie summaries first, then book summaries.
all_summaries = [*movie_plot_summaries, *book_plot_summaries]


In [38]:
# Configure the TF-IDF vectorizer.
# min_df=0.01: ignore terms whose document frequency is below the threshold
#   (a float is a proportion, i.e. terms in fewer than 1% of the documents).
# max_df=0.9: ignore terms that appear in more than 90% of the documents.
# stop_words='english': drop scikit-learn's built-in English stop words.
# Possible extension: include bigrams and trigrams.
vectorizer = TfidfVectorizer(min_df=0.01, max_df=0.9, stop_words='english')

In [43]:
# Learn the vocabulary and the IDF weights from the combined corpus.
# (A count vectorizer, by contrast, would learn only the vocabulary.)
# fit() only learns these statistics; no document-term matrix is produced
# here, that is what transform() is for.
vectorizer.fit(all_summaries)



In [45]:
# The distinct terms kept in the fitted vocabulary, i.e. what remains after
# the min_df / max_df / stop-word filtering configured on the vectorizer.
vectorizer.get_feature_names_out()

array(['000', '10', '100', ..., 'young', 'younger', 'youth'], dtype=object)

In [46]:

# Encode movies and books with the same fitted vectorizer so both matrices
# share one set of feature columns.
# Original note: "42204 docs and 142364 words".
# NOTE(review): the word count looks stale (it presumably predates the
# min_df=0.01 filter) -- confirm against movie_tfidf.shape.
movie_tfidf = vectorizer.transform(movie_plot_summaries)
book_tfidf = vectorizer.transform(book_plot_summaries)




In [47]:

# Print the TF-IDF rows of the first five movies.
n_preview = 5
for row in range(n_preview):
    print(movie_tfidf[row])

  (0, 2341)	0.0696120487596576
  (0, 2311)	0.0903885652107767
  (0, 2290)	0.09486792421808116
  (0, 2282)	0.08431957862066244
  (0, 2268)	0.09531488959388194
  (0, 2231)	0.08462566722792332
  (0, 2225)	0.09355465035127226
  (0, 2209)	0.06528986083537672
  (0, 2181)	0.15492182314616834
  (0, 2174)	0.12265484434321151
  (0, 2114)	0.3004966205035224
  (0, 2090)	0.09640639710334847
  (0, 2080)	0.09906901620420523
  (0, 2031)	0.047701711253778256
  (0, 1986)	0.08975370448848796
  (0, 1956)	0.08016234203336856
  (0, 1950)	0.06226395928823302
  (0, 1916)	0.06522108698201694
  (0, 1889)	0.07229029071648208
  (0, 1888)	0.0569642411406077
  (0, 1878)	0.3079996366001819
  (0, 1872)	0.06570406708815718
  (0, 1862)	0.07592231111225933
  (0, 1846)	0.1321942969285566
  (0, 1831)	0.06665359123849679
  :	:
  (0, 588)	0.09372997297156703
  (0, 585)	0.0821458501202221
  (0, 584)	0.07812263894911106
  (0, 565)	0.09943215918238321
  (0, 554)	0.05307090706601857
  (0, 548)	0.05118070364520567
  (0, 505)	0.0

In [48]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [49]:
# Rows index movies and columns index books:
# similarity_matrix[i, j] is the cosine similarity between movie i and book j.
similarity_matrix = cosine_similarity(movie_tfidf, book_tfidf)

# Report the five most similar books for every movie.
# NOTE(review): this prints one block per movie, so the output is very long.
for i, book_scores in enumerate(similarity_matrix):
    # Sort ascending, reverse to descending, keep the first five book indices.
    top_books_indices = np.argsort(book_scores)[::-1][:5]

    print(f"Top 5 similar books for movie {i}:")
    for rank, index in enumerate(top_books_indices, start=1):
        print(f"{rank}. Book Index: {index}, Similarity Score: {book_scores[index]:.4f}")
    print()

Book Index: 3644, Similarity Score: 0.26192847836149347
Book Index: 1332, Similarity Score: 0.2616001803715346
Book Index: 11567, Similarity Score: 0.24975270870301755
Book Index: 9194, Similarity Score: 0.24665526681173314
Book Index: 9201, Similarity Score: 0.24416712019303405
