# Hetrec - IMDB Matching

In [2]:
import openai
import json
import pandas as pd
import numpy as np
import h5py
import re
import os

# Load the OpenAI API key from a local key file so it is never hard-coded in the notebook.
with open("../openai.key", "r") as f:
    # Use the first line only and strip surrounding whitespace: the raw line keeps
    # its trailing newline, which would otherwise become part of the key.
    os.environ["OPENAI_API_KEY"] = f.readline().strip()

In [3]:
def combine_columns(row):
    """Concatenate the usable fields of one row into a single text block.

    Every field is rendered with an f-string and followed by a blank line
    ("\\n\\n"). A field is left out when its value equals the placeholder
    "none" or "N/A", or when its label is "Combined" (so a previously
    computed result is never folded back in).

    Parameters
    ----------
    row : pandas.Series
        One record, indexed by column name.

    Returns
    -------
    str
        The kept fields in index order, each terminated by "\\n\\n";
        the empty string when nothing is kept.
    """
    pieces = [
        f"{row[name]}\n\n"
        for name in row.index.values
        if not (row[name] == "none" or row[name] == "N/A" or name == "Combined")
    ]
    return "".join(pieces)

# Hetrec ML-10m

## Collecting Features

### Base

In [159]:
# Read the tab-separated movie table, expose its "id" column as "movieID",
# and keep only the identifying columns.
movies_raw = pd.read_csv("../data/hetrec/movies.dat", encoding="latin-1", sep="\t")
movies_base = movies_raw.rename(columns={"id": "movieID"})[["movieID", "title", "year"]]
movies_base.head(5)

Unnamed: 0,movieID,title,year
0,1,Toy story,1995
1,2,Jumanji,1995
2,3,Grumpy Old Men,1993
3,4,Waiting to Exhale,1995
4,5,Father of the Bride Part II,1995


### Genres

In [160]:
from functools import partial

def combined_string(row, items):
    """Join the entries of ``items`` that are switched on in ``row``.

    Parameters
    ----------
    row : pandas.Series
        Flags for one record; any truthy value selects the entry at the same position.
    items : numpy.ndarray
        Labels, positionally aligned with ``row``.

    Returns
    -------
    str
        The selected labels separated by ", " (empty string if none are selected).
    """
    mask = row.values.astype(bool)
    selected = items[mask]
    return ", ".join(selected)

# Wide movie x genre table. The pivot puts the movieID itself into every
# (movie, genre) cell that exists, so any positive cell is turned into a 1
# and every missing cell into a 0 to obtain a 0/1 mask.
genre_flags = pd.read_csv(
    "../data/hetrec/movie_genres.dat", encoding="latin-1", sep="\t"
).pivot(index="movieID", columns="genre", values="movieID")
genre_flags[genre_flags > 0] = 1
genre_flags = genre_flags.fillna(0).astype(int)

# Capture the genre labels before the text column is added to the frame.
genres_names = genre_flags.columns.values
genre_flags["Genres"] = genre_flags.apply(
    partial(combined_string, items=genres_names), axis=1
)

# Back to a long-form frame: one row per movie with its comma-separated genres.
genres = genre_flags.reset_index()[["movieID", "Genres"]]
genres

genre,movieID,Genres
0,1,"Adventure, Animation, Children, Comedy, Fantasy"
1,2,"Adventure, Children, Fantasy"
2,3,"Comedy, Romance"
3,4,"Comedy, Drama, Romance"
4,5,Comedy
...,...,...
10192,65088,"Adventure, Children, Comedy"
10193,65091,"Crime, Drama, Romance"
10194,65126,"Comedy, Drama"
10195,65130,"Drama, Romance"


### Tags

In [161]:
# Tag vocabulary (its "id" column exposed as "tagID") and the per-movie tag weights.
tags = pd.read_csv(
    "../data/hetrec/tags.dat", encoding="latin-1", sep="\t"
).rename(columns={"id": "tagID"})
tag_weights = pd.read_csv("../data/hetrec/movie_tags.dat", encoding="latin-1", sep="\t")

# Keep only tags whose weight for the movie is greater than 1.
tag_weights = tag_weights.loc[tag_weights["tagWeight"] > 1]

# Attach the tag text, then spread into a movie x tag matrix of weights (0 = tag absent).
tag_matrix = (
    tag_weights
    .merge(tags, how="left", on="tagID")
    .pivot(index="movieID", columns="value", values="tagWeight")
    .fillna(0)
    .astype(int)
)

# Capture the tag labels before the text column is added to the frame.
tag_names = tag_matrix.columns.values
tag_matrix["Tags"] = tag_matrix.apply(partial(combined_string, items=tag_names), axis=1)

# One row per movie with its comma-separated tags.
movie_tags = tag_matrix.reset_index()[["movieID", "Tags"]]
movie_tags

value,movieID,Tags
0,1,"adventure, animated, animation, classic, comed..."
1,2,"animals, board game, fantasy, game, robin will..."
2,3,old
3,5,"childhood classics, family, pregnancy, wedding"
4,6,"cant remember, recommendz top pick, too long"
...,...,...
2958,63113,"007 (series), assassin, murder"
2959,63876,"biography, gay, politics, san francisco"
2960,63992,so bad its good
2961,64983,"assassin, nazis"


### Actors

In [162]:
# Per-movie cast list, built as a movie x actor 0/1 mask and then collapsed to text.
actors = pd.read_csv("../data/hetrec/movie_actors.dat", encoding="latin-1", sep="\t")

# The pivot puts the movieID into every (movie, actor) cell that exists;
# positive cells become 1 and missing cells become 0.
actors = actors.pivot(index="movieID", columns="actorName", values="movieID")
actors[actors > 0] = 1
actors = actors.replace(np.nan, 0).astype(int)
# Drop the column created for rows with a missing actor name.
# `np.nan` is used (not the `np.NaN` alias, which NumPy 2.0 removed),
# matching the spelling used everywhere else in this notebook.
actors = actors.drop(np.nan, axis=1)

# Capture the actor labels before the text column is added to the frame.
actor_names = actors.columns.values
actors["Actors"] = actors.apply(partial(combined_string, items=actor_names), axis=1)

# One row per movie with its comma-separated actor names.
actors = actors.reset_index().loc[:, ["movieID", "Actors"]]
actors

actorName,movieID,Actors
0,1,"Annie Potts, Bill Farmer, Don Rickles, Erik vo..."
1,2,"Adam Hann-Byrd, Bebe Neuwirth, Bonnie Hunt, Br..."
2,3,"Ann-Margret, Buck Henry, Buffy Sedlachek, Burg..."
3,4,"Angela Bassett, Brandon Hammond, Dennis Haysbe..."
4,5,"Ann Walker, Annie Meyers-Shyer, April Ortiz, B..."
...,...,...
10169,65088,"Abigail Leone Droeger, Adam Sandler, Adam Shan..."
10170,65091,"Al Thompson, Alex Melesh, Bert Russell, Bert S..."
10171,65126,"Anjelica Huston, Bijou Phillips, Brad William ..."
10172,65130,"David Harbour, Dylan Baker, Jay O Sanders, Kat..."


### Directors

In [163]:
# Director per movie: keep the id and the name, and rename the name column
# to "DirectorName".
directors_raw = pd.read_csv(
    "../data/hetrec/movie_directors.dat", encoding="latin-1", sep="\t"
)
directors = directors_raw[["movieID", "directorName"]].rename(
    columns={"directorName": "DirectorName"}
)
directors

Unnamed: 0,movieID,DirectorName
0,1,John Lasseter
1,2,Joe Johnston
2,3,Donald Petrie
3,4,Forest Whitaker
4,5,Charles Shyer
...,...,...
10150,65088,Adam Shankman
10151,65091,W.S. Van Dyke
10152,65126,Clark Gregg
10153,65130,Sam Mendes


### Countries

In [164]:
# Country per movie, with the "country" column renamed to "Country".
countries = pd.read_csv(
    "../data/hetrec/movie_countries.dat", encoding="latin-1", sep="\t"
).rename(columns={"country": "Country"})
countries

Unnamed: 0,movieID,Country
0,1,USA
1,2,USA
2,3,USA
3,4,USA
4,5,USA
...,...,...
10192,65088,USA
10193,65091,USA
10194,65126,USA
10195,65130,USA


### Locations

In [165]:
from collections import defaultdict
# Build one comma-separated "FilmingLocations" string per movie.
locations = pd.read_csv("../data/hetrec/movie_locations.dat", encoding="latin-1", sep="\t")

# movieID -> list of formatted location strings, keyed in order of first appearance.
# Every movieID in the file gets an entry, even if none of its rows is usable.
loc_dict = {k: [] for k in locations["movieID"].values}
for _, row in locations.iterrows():
    # Column 0 is used as the movie id; every following column is one part of the
    # location. Positional access goes through .iloc because integer keys passed
    # to [] on a label-indexed Series are deprecated.
    parts = row.iloc[1:]
    # A row is only used when its first three parts are all present. pd.notna
    # replaces the `is not np.nan` identity test, which misses NaN values that
    # are not the np.nan singleton itself.
    if parts.iloc[:3].notna().all():
        location_str = " - ".join(parts[parts.notna()])
        loc_dict[row.iloc[0]].append(location_str)
    # Unusable rows are skipped here rather than stored as "" and removed later:
    # removing from a list while iterating over it skips elements, which left
    # empty entries (and stray ", " separators) behind.

# Collapse each movie's locations into a single string ("" when it has none).
combined_loc_dict = {k: ", ".join(v) for k, v in loc_dict.items()}

combined_loc_df = pd.DataFrame(
    data=list(combined_loc_dict.values()),
    index=list(combined_loc_dict.keys()),
    columns=["FilmingLocations"],
)

combined_loc_df = combined_loc_df.reset_index().rename(columns={"index": "movieID"})

# Merging with Base

In [628]:
# Start from the movies that have genre information.
full_movies = movies_base.merge(genres, how="inner", on="movieID")

# Remaining feature tables are outer-merged one after another. After each merge
# the given "missing" marker in that table's column is swapped for a placeholder
# string: (table, column, marker to replace, placeholder).
optional_features = [
    (movie_tags, "Tags", np.nan, "none"),
    (actors, "Actors", np.nan, "none"),
    (directors, "DirectorName", np.nan, "none"),
    (countries, "Country", np.nan, "none"),
    (combined_loc_df, "FilmingLocations", "", "N/A"),
]
for feature_df, feature_column, missing_marker, placeholder in optional_features:
    full_movies = full_movies.merge(feature_df, how="outer", on="movieID").replace(
        {feature_column: missing_marker}, placeholder
    )

# Final column names used by the rest of the notebook.
full_movies = full_movies.rename(
    columns={"movieID": "MovieID", "title": "MovieTitle", "year": "ReleaseYear"}
)
full_movies.head(2)

Unnamed: 0,MovieID,MovieTitle,ReleaseYear,Genres,Tags,Actors,DirectorName,Country,FilmingLocations
0,1,Toy story,1995,"Adventure, Animation, Children, Comedy, Fantasy","adventure, animated, animation, classic, comed...","Annie Potts, Bill Farmer, Don Rickles, Erik vo...",John Lasseter,USA,
1,2,Jumanji,1995,"Adventure, Children, Fantasy","animals, board game, fantasy, game, robin will...","Adam Hann-Byrd, Bebe Neuwirth, Bonnie Hunt, Br...",Joe Johnston,USA,"Canada - British Columbia - Delta, Canada - Br..."


# Adding in Wiki Descriptions (Level 3)

In [539]:
# Wikipedia descriptions keyed by movie: align the key name with full_movies
# and discard the "pageid" column.
wiki_df = (
    pd.read_csv("./hetrec_movie_wiki_descriptions.csv")
    .rename(columns={"movieID": "MovieID"})
    .drop(columns="pageid")
)

# Outer-merge the descriptions in; every value still missing afterwards becomes "N/A".
full_movies = full_movies.merge(wiki_df, how="outer", on="MovieID").fillna("N/A")

# Minimal Extra Information (Level 1)

In [137]:
# Reduce full_movies to the id, title and release year only; every other column
# is discarded, so the "Combined" text built afterwards contains just these fields.
full_movies = full_movies.loc[:, ["MovieID", "MovieTitle", "ReleaseYear"]]

# Combining

In [629]:
# Build the per-movie text block from all current columns (row-wise apply),
# then show the result for the row labelled 0 as a sanity check.
full_movies["Combined"] = full_movies.apply(combine_columns, axis=1)
print(full_movies.loc[0, "Combined"])

Toy story

1995

Adventure, Animation, Children, Comedy, Fantasy

adventure, animated, animation, classic, comedy, computer animation, disney, family, pixar, tim allen, time travel, tom hanks, toys

Annie Potts, Bill Farmer, Don Rickles, Erik von Detten, Greg Berg, Jack Angel, Jan Rabson, Jim Varney, Joan Cusack, Joe Ranft, John Morris, John Ratzenberger, Kendall Cunningham, Laurie Metcalf, Patrick Pinney, Penn Jillette, Philip Proctor, R. Lee Ermey, Sarah Freeman, Scott McAfee, Sherry Lynn, Tim Allen, Tom Hanks, Wallace Shawn

John Lasseter

USA




# Removing Most Common Tokens (Level 4)

In [541]:
# Series of per-movie combined texts; this is the corpus that the tokenizer
# cells below iterate over.
combined_column = full_movies["Combined"]
combined_column

0        Toy story\n\n1995\n\nAdventure, Animation, Chi...
1        Jumanji\n\n1995\n\nAdventure, Children, Fantas...
2        Grumpy Old Men\n\n1993\n\nComedy, Romance\n\no...
3        Waiting to Exhale\n\n1995\n\nComedy, Drama, Ro...
4        Father of the Bride Part II\n\n1995\n\nComedy\...
                               ...                        
10192    Bedtime Stories\n\n2008\n\nAdventure, Children...
10193    Manhattan Melodrama\n\n1934\n\nCrime, Drama, R...
10194    Choke\n\n2008\n\nComedy, Drama\n\nAnjelica Hus...
10195    Revolutionary Road\n\n2008\n\nDrama, Romance\n...
10196    Blackadder Back & Forth\n\n1999\n\nComedy\n\nC...
Name: Combined, Length: 10197, dtype: object

In [542]:
from spacy.lang.en import English

# Tokenizer taken from spaCy's `English` language class; the token-counting and
# token-removal cells below stream the combined texts through it.
tokenizer = English().tokenizer

In [578]:
from collections import Counter

# Document frequency: for each token, the number of texts it appears in at least once.
all_count = Counter()

for doc in tokenizer.pipe(combined_column, 100):
    # Purely alphabetic, non-stop-word tokens of this text. dict.fromkeys keeps
    # each distinct token once (in first-seen order) with a count of 1, so a
    # token contributes at most 1 per document.
    seen_in_doc = dict.fromkeys(
        (tok.orth_ for tok in doc if tok.orth_.isalpha() and not tok.is_stop),
        1,
    )
    all_count.update(seen_in_doc)

In [579]:
# Tokens that occur in more than 5% of all texts, mapped to their document count.
# The earlier "top-N tokens" assignment was dead code (immediately overwritten by
# this threshold-based selection) and has been removed; the threshold is computed
# once instead of once per token.
min_doc_count = int(len(combined_column) * 0.05)
most_common_dict = {k: v for k, v in all_count.most_common() if v > min_doc_count}

In [580]:
# Inspect the selected (token, document count) pairs.
list(most_common_dict.items())

[('film', 9536),
 ('Plot', 9323),
 ('directed', 8795),
 ('USA', 7337),
 ('American', 6424),
 ('Drama', 5088),
 ('written', 4680),
 ('John', 4632),
 ('time', 3926),
 ('tells', 3851),
 ('starring', 3786),
 ('released', 3778),
 ('Comedy', 3609),
 ('home', 3559),
 ('stars', 3524),
 ('later', 3494),
 ('life', 3491),
 ('California', 3475),
 ('based', 3472),
 ('Michael', 3458),
 ('finds', 3316),
 ('comedy', 3289),
 ('takes', 3248),
 ('story', 3240),
 ('find', 3214),
 ('David', 3192),
 ('drama', 3061),
 ('New', 3029),
 ('Robert', 2946),
 ('man', 2923),
 ('night', 2908),
 ('away', 2883),
 ('help', 2864),
 ('James', 2858),
 ('day', 2836),
 ('new', 2822),
 ('father', 2782),
 ('United', 2762),
 ('young', 2709),
 ('City', 2707),
 ('received', 2674),
 ('goes', 2672),
 ('death', 2667),
 ('family', 2644),
 ('leaves', 2588),
 ('friend', 2561),
 ('Los', 2559),
 ('returns', 2532),
 ('Angeles', 2521),
 ('years', 2510),
 ('Film', 2495),
 ('tries', 2481),
 ('wife', 2478),
 ('Richard', 2478),
 ('Studios', 24

In [608]:
# Rebuild every combined text with the very common tokens removed.
new_combined = []
for doc in tokenizer.pipe(combined_column, 150):
    # Filter into a new list. Removing from `tokens` while iterating over it
    # skipped the element after each removal, so adjacent common tokens were not
    # all dropped. A plain membership test replaces the try/except KeyError lookup.
    tokens = [token.orth_ for token in doc if token.orth_ not in most_common_dict]

    new_string = " ".join(tokens)
    # Drop commas, then collapse the double spaces left behind (single pass).
    new_string = new_string.replace(",", "")
    new_string = new_string.replace("  ", " ")
    new_combined.append(new_string)

In [612]:
# Replace the combined text with the filtered version.
# NOTE(review): pd.Series(new_combined) carries a default 0..n-1 index and column
# assignment aligns on index labels, so this assumes full_movies still has its
# default integer index — confirm.
full_movies["Combined"] = pd.Series(new_combined)

# Auxiliary Dataset

In [4]:
# Load the auxiliary dataset, map its column names onto the ones used for the
# HetRec data, and turn missing values into empty strings.
aux_column_names = {"title": "MovieTitle", "genres": "Genres", "year": "ReleaseYear"}
full_movies = (
    pd.read_csv("../data/auxiliary_dataset_wiki.csv")
    .rename(columns=aux_column_names)
    .fillna("")
)

In [6]:
# Build the combined text for each row of the auxiliary dataset. Empty-string
# fields are not filtered by combine_columns (it only skips "none" / "N/A"), so
# they still contribute a blank "\n\n" segment.
full_movies["Combined"] = full_movies.apply(combine_columns, axis=1)

# Getting GPT-3.5 Embeddings

In [7]:
import tiktoken

# Embedding configuration.
embedding_model = "text-embedding-ada-002"  # model name passed to get_embedding as `engine`
embedding_encoding = "cl100k_base"  # tiktoken encoding used to count tokens
MAX_TOKEN = 8000  # NOTE(review): not referenced by any cell visible here — presumably a per-request token cap; confirm

# Tokenizer used below to measure the token length of each combined text.
encoding = tiktoken.get_encoding(embedding_encoding)

In [13]:
# Token length of every combined text under the configured tiktoken encoding,
# followed by summary statistics of those lengths.
n_tokens = full_movies["Combined"].map(lambda text: len(encoding.encode(text)))
n_tokens.describe()

count      70.000000
mean     1049.614286
std       434.206699
min        15.000000
25%       899.000000
50%      1158.500000
75%      1323.000000
max      1786.000000
Name: Combined, dtype: float64

In [14]:
# Total token count in thousands, multiplied by 0.0004.
# NOTE(review): 0.0004 is presumably the price per 1K tokens for the embedding
# model, making this a cost estimate — confirm against current pricing.
(n_tokens.sum()/1000) * 0.0004

0.0293892

In [9]:
from openai.embeddings_utils import get_embedding

# Request one embedding per combined text: get_embedding is called separately
# for every row, with the configured model passed as `engine`.
embedding = full_movies["Combined"].apply(lambda x: get_embedding(x, engine=embedding_model))

In [10]:
# Inspect the resulting Series (one embedding per row).
embedding

0      [-0.023528121411800385, -0.01260435115545988, ...
1      [0.00486621493473649, -0.00323661370202899, -0...
2      [-0.026798883453011513, -0.03953718766570091, ...
3      [-0.01732458919286728, -0.045089658349752426, ...
4      [-0.0016022056806832552, 0.002192316809669137,...
                             ...                        
328    [-0.005953111220151186, -0.007994270883500576,...
329    [0.009177283383905888, -0.002706709550693631, ...
330    [0.00604227464646101, -0.0031645353883504868, ...
331    [0.005159646272659302, -0.03872064873576164, -...
332    [-0.009587714448571205, 0.0004927201080136001,...
Name: Combined, Length: 333, dtype: object

In [19]:
import pickle

# Persist the raw embedding Series with the highest available pickle protocol.
with open("../data/pickle_jar/aux_gpt3_embed.pickle", "wb") as pickle_file:
    pickle.dump(embedding, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

In [11]:
# Stack the per-row embedding lists into a single float64 array (one row per text).
embeddings_new = np.array(list(embedding.values), dtype="float64")

In [12]:
import h5py

# Write every column of full_movies as its own dataset (named after the column),
# plus the embedding matrix under "Embedding".
with h5py.File("../data/embeddings/aux_gpt3_embed_level3.h5", "w") as h5_out:
    for column, column_data in full_movies.items():
        h5_out.create_dataset(column, data=column_data.values)

    h5_out.create_dataset("Embedding", data=embeddings_new)

In [13]:
def h5py_filetree(path):
    """Print the name of each top-level entry of the HDF5 file at ``path``.

    The file is opened read-only and closed again before returning.
    """
    with h5py.File(path, "r") as h5_file:
        for entry_name in h5_file.keys():
            print(entry_name)

# List what was just written to the embeddings file.
h5py_filetree("../data/embeddings/aux_gpt3_embed_level3.h5")

Combined
Description
Embedding
Genres
MovieTitle
ReleaseYear


In [128]:
with h5py.File("./hetrec_gpt3_embed_level3.h5", "r") as f:
    embeds = f["Embedding"][:]
    movie_ids = f["MovieID"][:]
    titles = f["MovieTitle"][:]
    tags = f["Tags"][:]
    genres = f["Genres"][:]