In [14]:
import pandas as pd
import ast
import itertools #https://stackoverflow.com/questions/952914/how-do-i-make-a-flat-list-out-of-a-list-of-lists

def extract_genres(genre_str):
    """Return the genre names stored in a dict-literal genre field.

    Args:
        genre_str: Text of a Python dict literal; the dict's values are the
            genre names. Non-string input (e.g. NaN from pandas) is tolerated.

    Returns:
        list: The dict's values in order, or an empty list when the field
        cannot be parsed or does not evaluate to a dict.
    """
    try:
        genres_dict = ast.literal_eval(genre_str)
        return list(genres_dict.values())
    # Only the failures a malformed/missing field can produce are treated as
    # "no genres"; a bare except would also swallow KeyboardInterrupt/SystemExit.
    except (ValueError, SyntaxError, TypeError, AttributeError, MemoryError, RecursionError):
        return []
    
def extract_genres_omit_empty(genre_str):
    """Return the genre names of a dict-literal genre field, or None if there are none.

    None (rather than an empty list) is returned so that a later
    ``dropna(subset=...)`` removes rows without any genre.

    Args:
        genre_str: Text of a Python dict literal; the dict's values are the
            genre names. Non-string input (e.g. NaN from pandas) is tolerated.

    Returns:
        list | None: The non-empty list of genre names, or None when the field
        cannot be parsed, is not a dict, or holds no genres at all.
    """
    try:
        genres = list(ast.literal_eval(genre_str).values())
    # Only parse/shape failures mean "no genres"; a bare except would also
    # swallow KeyboardInterrupt/SystemExit.
    except (ValueError, SyntaxError, TypeError, AttributeError, MemoryError, RecursionError):
        return None
    # An empty dict ("{}") used to come back as [], which dropna() keeps;
    # map it to None so empty genre lists are really omitted.
    return genres or None
    
def extract_genres_normalized(genre_str, allowed_genres=None):
    """Return the genres of a dict-literal field that belong to the master genre set.

    Args:
        genre_str: Text of a Python dict literal; the dict's values are the
            genre names. Non-string input (e.g. NaN from pandas) is tolerated.
        allowed_genres: Container of genre names to keep. Defaults to the
            module-level ``common_genre_set``.

    Returns:
        list | None: The genres found in ``allowed_genres`` (original order),
        or None when nothing remains or the field cannot be parsed.

    Raises:
        NameError: If ``allowed_genres`` is omitted and ``common_genre_set``
            has not been defined yet.
    """
    if allowed_genres is None:
        # Looked up outside the try block: previously a missing common_genre_set
        # was swallowed by the bare except and every row silently became None.
        allowed_genres = common_genre_set
    try:
        genres = ast.literal_eval(genre_str).values()
        # Remove genres not listed in the master genre list
        kept = [genre for genre in genres if genre in allowed_genres]
    # Only parse/shape failures mean "no genres"; a bare except would also
    # swallow KeyboardInterrupt/SystemExit.
    except (ValueError, SyntaxError, TypeError, AttributeError, MemoryError, RecursionError):
        return None
    return kept or None

#GENRE NORMALIZATION
# Parse the genre column (7th-field records only) of every book record.
with open('./booksummaries/booksummaries.txt', 'r', encoding='utf-8') as file:
    raw_book_genres = [
        extract_genres(columns[5])
        for columns in (record.split('\t') for record in file)
        if len(columns) >= 7  # ensure line has all required fields
    ]

# Flatten the per-book lists, then de-duplicate into the set of book genres.
list_of_all_book_genres = list(itertools.chain.from_iterable(raw_book_genres))
set_of_book_genres = set(list_of_all_book_genres)

#Getting movie genres
# Only the id, title and genre columns of the metadata TSV are loaded.
metadata_df = pd.read_csv(
    './MovieSummaries/movie.metadata.tsv',
    sep='\t',
    header=None,
    names=['ID', 'Freebase ID', 'Movie Name', 'Release Date', 'Revenue', 'Runtime', 'Language', 'Country', 'Genres'],
    usecols=['ID', 'Movie Name', 'Genres'],
)

plot_df = pd.read_csv(
    './MovieSummaries/plot_summaries.txt',
    sep='\t',
    header=None,
    names=['ID', 'Plot Summary'],
)

# Replace the raw genre text with the parsed list of genre names.
metadata_df = metadata_df.assign(Genres=metadata_df['Genres'].apply(extract_genres))
raw_movie_genres = metadata_df['Genres'].tolist()
list_of_movie_genres = list(itertools.chain.from_iterable(raw_movie_genres))
set_of_movie_genres = set(list_of_movie_genres)

# Master genre list: genre names that occur for both movies and books.
common_genre_set = set_of_movie_genres.intersection(set_of_book_genres)
print(common_genre_set)
print(len(common_genre_set))



#Previous code-----------
# Build one record per book: title, parsed genre list (None if unparseable) and plot summary.
book_data = []

with open('./booksummaries/booksummaries.txt', 'r', encoding='utf-8') as file:
    for line in file:
        # Strip the line terminator before splitting; otherwise it stays
        # attached to the last field and ends up inside every plot summary.
        fields = line.rstrip('\r\n').split('\t')
        if len(fields) >= 7:  # ensure line has all required fields
            book_entry = {
                'Book Title': fields[2],
                'Genre': extract_genres_omit_empty(fields[5]),
                'Plot Summary': fields[6]
            }
            book_data.append(book_entry)

# Explicit columns keep the frame well-formed (and dropna's subset valid)
# even when no record passed the field-count check.
books_df = pd.DataFrame(book_data, columns=['Book Title', 'Genre', 'Plot Summary'])
# Books whose genre field yielded None are discarded.
books_df = books_df.dropna(subset=['Genre'])
books_df.head()



{'Detective fiction', 'Foreign legion', 'Biography', 'Alien invasion', 'Education', 'Anthology', 'Whodunit', 'Coming of age', 'Albino bias', 'School story', 'Western', 'Steampunk', 'Sports', 'Pornography', 'Fantasy', 'Tragicomedy', 'Adventure', 'Gay Themed', 'Horror', 'Wuxia', 'Black comedy', 'Travel', 'History', 'Thriller', 'Anthropology', 'Fairy tale', 'Romantic comedy', 'Nature', 'Existentialism', 'Comedy of manners', 'Parody', 'Space opera', 'Historical fiction', 'Erotica', 'Comedy', 'Crime Fiction', 'Drama', 'Mystery', 'Conspiracy fiction', 'Humour', 'Suspense', 'Satire', 'Supernatural', 'Apocalyptic and post-apocalyptic fiction', 'Dystopia', 'Space western', 'Cyberpunk', 'Sword and sorcery', 'Time travel', 'Anti-war', 'Business', 'Science Fiction', 'Music'}
53


Unnamed: 0,Book Title,Genre,Plot Summary
0,Animal Farm,"[Roman à clef, Satire, Children's literature, ...","Old Major, the old boar on the Manor Farm, ca..."
1,A Clockwork Orange,"[Science Fiction, Novella, Speculative fiction...","Alex, a teenager living in near-future Englan..."
2,The Plague,"[Existentialism, Fiction, Absurdist fiction, N...",The text of The Plague is divided into five p...
4,A Fire Upon the Deep,"[Hard science fiction, Science Fiction, Specul...",The novel posits that space around the Milky ...
5,All Quiet on the Western Front,"[War novel, Roman à clef]","The book tells the story of Paul Bäumer, a Ge..."


In [15]:
# Re-read the raw movie tables so the genre column holds unparsed text again.
metadata_df = pd.read_csv(
    './MovieSummaries/movie.metadata.tsv',
    sep='\t',
    header=None,
    names=['ID', 'Freebase ID', 'Movie Name', 'Release Date', 'Revenue', 'Runtime', 'Language', 'Country', 'Genres'],
    usecols=['ID', 'Movie Name', 'Genres'],
)

plot_df = pd.read_csv(
    './MovieSummaries/plot_summaries.txt',
    sep='\t',
    header=None,
    names=['ID', 'Plot Summary'],
)

# Parse genres (None where parsing fails) and attach each movie's plot summary by ID.
metadata_df = metadata_df.assign(Genres=metadata_df['Genres'].apply(extract_genres_omit_empty))
merged = metadata_df.merge(plot_df, on='ID')

# Keep title/genres/summary, rename the genre column, drop rows without a genre value.
movies_df = (
    merged[['Movie Name', 'Genres', 'Plot Summary']]
    .rename(columns={'Genres': 'Genre List'})
    .dropna(subset=['Genre List'])
)

movies_df.head()

Unnamed: 0,Movie Name,Genre List,Plot Summary
0,Ghosts of Mars,"[Thriller, Science Fiction, Horror, Adventure,...","Set in the second half of the 22nd century, th..."
1,White Of The Eye,"[Thriller, Erotic thriller, Psychological thri...",A series of murders of rich young women throug...
2,A Woman in Flames,[Drama],"Eva, an upper class housewife, becomes frustra..."
3,The Sorcerer's Apprentice,"[Family Film, Fantasy, Adventure, World cinema]","Every hundred years, the evil Morgana returns..."
4,Little city,"[Romantic comedy, Ensemble Film, Comedy-drama,...","Adam, a San Francisco-based artist who works a..."


In [16]:
# Write both tables as space-separated text, without header row or index column.
_export_kwargs = dict(sep=' ', index=False, header=False)
books_df.to_csv('book_data.txt', **_export_kwargs)
movies_df.to_csv('movie_data.txt', **_export_kwargs)

In [17]:
#Above but using normalized/common genre set only

#Books
# One record per book; the genre list is restricted to the master genre set
# (None when nothing remains or the field cannot be parsed).
book_data = []

with open('./booksummaries/booksummaries.txt', 'r', encoding='utf-8') as file:
    for line in file:
        # Strip the line terminator before splitting; otherwise it stays
        # attached to the last field and ends up inside every plot summary.
        fields = line.rstrip('\r\n').split('\t')
        if len(fields) >= 7:  # ensure line has all required fields
            book_entry = {
                'Book Title': fields[2],
                'Genre': extract_genres_normalized(fields[5]),
                'Plot Summary': fields[6]
            }
            book_data.append(book_entry)

# Explicit columns keep the frame well-formed (and dropna's subset valid)
# even when no record passed the field-count check.
books_normalized_genre_df = pd.DataFrame(book_data, columns=['Book Title', 'Genre', 'Plot Summary'])
# Books left without any genre from the master set are discarded.
books_normalized_genre_df = books_normalized_genre_df.dropna(subset=['Genre'])

books_normalized_genre_df.head()


Unnamed: 0,Book Title,Genre,Plot Summary
0,Animal Farm,[Satire],"Old Major, the old boar on the Manor Farm, ca..."
1,A Clockwork Orange,"[Science Fiction, Satire]","Alex, a teenager living in near-future Englan..."
2,The Plague,[Existentialism],The text of The Plague is divided into five p...
4,A Fire Upon the Deep,"[Science Fiction, Fantasy]",The novel posits that space around the Milky ...
6,A Wizard of Earthsea,[Fantasy],"Ged is a young boy on Gont, one of the larger..."


In [18]:
#Movies
# Re-read the raw movie tables so the genre column holds unparsed text again.
metadata_df = pd.read_csv(
    './MovieSummaries/movie.metadata.tsv',
    sep='\t',
    header=None,
    names=['ID', 'Freebase ID', 'Movie Name', 'Release Date', 'Revenue', 'Runtime', 'Language', 'Country', 'Genres'],
    usecols=['ID', 'Movie Name', 'Genres'],
)

plot_df = pd.read_csv(
    './MovieSummaries/plot_summaries.txt',
    sep='\t',
    header=None,
    names=['ID', 'Plot Summary'],
)

# Keep only genres from the master set (None where nothing remains), then join summaries by ID.
metadata_df = metadata_df.assign(Genres=metadata_df['Genres'].apply(extract_genres_normalized))
merged = metadata_df.merge(plot_df, on='ID')

#https://saturncloud.io/blog/how-to-delete-rows-with-null-values-in-a-specific-column-in-pandas-dataframe/
movies_normalized_genre_df = (
    merged[['Movie Name', 'Genres', 'Plot Summary']]
    .rename(columns={'Genres': 'Genre List'})
    .dropna(subset=['Genre List'])
)

movies_normalized_genre_df.head()

Unnamed: 0,Movie Name,Genre List,Plot Summary
0,Ghosts of Mars,"[Thriller, Science Fiction, Horror, Adventure,...","Set in the second half of the 22nd century, th..."
1,White Of The Eye,[Thriller],A series of murders of rich young women throug...
2,A Woman in Flames,[Drama],"Eva, an upper class housewife, becomes frustra..."
3,The Sorcerer's Apprentice,"[Fantasy, Adventure]","Every hundred years, the evil Morgana returns..."
4,Little city,"[Romantic comedy, Drama, Comedy]","Adam, a San Francisco-based artist who works a..."


In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [20]:
# 
# Plot summaries in frame row order, so list position k matches .iloc[k] of its frame.
book_plot_summaries = books_normalized_genre_df['Plot Summary'].tolist()
movie_plot_summaries = movies_normalized_genre_df['Plot Summary'].tolist()
# Combined corpus: movie summaries first, then book summaries.
all_summaries = [*movie_plot_summaries, *book_plot_summaries]


In [21]:
# min_df=0.01: ignore terms that occur in fewer than 1% of the documents.
# max_df=0.9: ignore terms that occur in more than 90% of the documents.
# English stop words are removed as well.
# Idea for later: include bigrams and trigrams.
vectorizer = TfidfVectorizer(
    stop_words='english',
    min_df=0.01,
    max_df=0.9,
)

In [22]:
# fit() learns the vocabulary and the IDF weights from the combined movie+book corpus
# (a count vectorizer, by contrast, would only learn the vocabulary).
# No document-term matrix is stored by this call; the matrices come from transform().
vectorizer.fit(all_summaries)



In [23]:
# All of the distinct terms the fitted vectorizer kept as features
vectorizer.get_feature_names_out()

array(['000', '10', '100', ..., 'young', 'younger', 'youth'], dtype=object)

In [24]:

# Project movie and book summaries into the same fitted TF-IDF feature space.
# NOTE(review): an earlier note here read "42204 docs and 142364 words"; no visible
# output backs those figures - confirm with movie_tfidf.shape / book_tfidf.shape.
movie_tfidf = vectorizer.transform(movie_plot_summaries)
book_tfidf = vectorizer.transform(book_plot_summaries)

In [25]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [26]:
from transformers import MobileBertTokenizer, MobileBertModel
import torch
#Use Mobile BERT for resource-constrained environments.
# Tokenizer and model are loaded from the same pretrained checkpoint name.
MOBILEBERT_CHECKPOINT = 'google/mobilebert-uncased'
tokenizer = MobileBertTokenizer.from_pretrained(MOBILEBERT_CHECKPOINT)
model = MobileBertModel.from_pretrained(MOBILEBERT_CHECKPOINT)




In [27]:
def get_mobilebert_embedding(text):
    """Return a mean-pooled MobileBERT embedding of ``text`` as a NumPy array.

    Uses the module-level ``tokenizer`` and ``model``. The input is truncated
    to at most 256 tokens, the model runs without gradient tracking, and the
    last hidden state is averaged over dim 1 (presumably the token axis -
    TODO confirm) before size-1 dimensions are squeezed away.
    """
    model.eval()  # inference mode for the module
    tokens = tokenizer(text, return_tensors='pt', truncation=True, max_length=256, padding=True)
    with torch.no_grad():
        hidden_states = model(**tokens).last_hidden_state
    pooled = hidden_states.mean(dim=1).squeeze()
    # .cpu() matters only if the tensor lives on another device (e.g. a GPU)
    return pooled.cpu().numpy()

In [28]:
def batch_process_embeddings(texts, batch_size=32):
    """Embed every text and return the embeddings stacked row-wise.

    Texts are walked in slices of ``batch_size``; within a slice each text is
    still embedded individually via ``get_mobilebert_embedding``, and the
    per-slice arrays are stacked vertically at the end.
    """
    chunk_arrays = [
        np.array([get_mobilebert_embedding(text) for text in texts[start:start + batch_size]])
        for start in range(0, len(texts), batch_size)
    ]
    return np.vstack(chunk_arrays)

In [29]:
# Embed every movie and book plot summary with MobileBERT (one model call per summary).
# Row k of each array corresponds to row k (iloc) of the source DataFrame.
movie_embeddings = batch_process_embeddings(movies_normalized_genre_df['Plot Summary'].tolist())
book_embeddings = batch_process_embeddings(books_normalized_genre_df['Plot Summary'].tolist())

In [30]:
# Calculate cosine similarity between movie and book embeddings
# (movies are passed first, books second, matching the TF-IDF similarity call).
similarity_matrix_bert = cosine_similarity(movie_embeddings, book_embeddings)

In [31]:
def compute_jaccard(movie_genres, book_genres):
    """Jaccard similarity of two genre collections: |intersection| / |union|.

    Both arguments may be any iterable of hashable genre names; duplicates are
    ignored. Returns 0 when both collections are empty.
    """
    movie_set, book_set = set(movie_genres), set(book_genres)
    combined = movie_set | book_set
    if len(combined) == 0:
        return 0
    shared = movie_set & book_set
    return len(shared) / len(combined)

In [32]:
# each row is a movie and each col is a book
# note: similarity_matrix[i, j] will give the similarity between the i-th movie and the j-th book

similarity_matrix = cosine_similarity(movie_tfidf, book_tfidf)

TOP_K = 10  # number of most-similar books scored per movie

all_average_jaccards = []
zeros = 0  # movies whose average Jaccard over their top books is exactly 0

# Explicit UTF-8 (same as the input files): movie titles may contain non-ASCII
# characters that the platform default encoding cannot always write.
# NOTE(review): append mode means re-running this cell adds a second copy of
# the results to scoring.txt - confirm that is intended.
with open('scoring.txt', 'a', encoding='utf-8') as file:
    # loop through each movie
    for i in range(similarity_matrix.shape[0]):
        movie_row = movies_normalized_genre_df.iloc[i]
        movie_genres = movie_row['Genre List']
        # column indices of the TOP_K books with the highest similarity, best first
        top_books_indices = np.argsort(similarity_matrix[i])[::-1][:TOP_K]

        # genre overlap between this movie and each of its top books
        jaccard_scores = [
            compute_jaccard(movie_genres, books_normalized_genre_df.iloc[index]['Genre'])
            for index in top_books_indices
        ]

        # average over the top books; 0 when there are no books to compare against
        average_jaccard = sum(jaccard_scores) / len(jaccard_scores) if jaccard_scores else 0
        if average_jaccard == 0:
            zeros += 1
        all_average_jaccards.append(average_jaccard)

        # write to file
        file.write(f"{movie_row['Movie Name']}\tAverage Jaccard: {average_jaccard:.4f}\n")

        file.write("\n")

overall_average_jaccard = sum(all_average_jaccards) / len(all_average_jaccards) if all_average_jaccards else 0
print(f"Overall Average Jaccard Score: {overall_average_jaccard:.4f}")
total = similarity_matrix.shape[0]
# Fraction (0-1) of movies with a zero average, despite the "Percentage" label;
# guarded so an empty movie set does not raise ZeroDivisionError.
percentage_zero = zeros / total if total else 0
print(f"Percentage Zero: {percentage_zero}")


: 