In [1]:
import os
import warnings
import pandas as pd
import matplotlib.pyplot as plt
from dotenv import load_dotenv

load_dotenv()
warnings.filterwarnings('ignore')

AWS_KEY = os.getenv('AWS_ACCESS_KEY')
AWS_SECRET_KEY = os.getenv('AWS_SECRET_ACCESS_KEY')
AWS_S3_BUCKET = os.getenv('AWS_S3_BUCKET')
REVIEWS_FILE_TEMPLATE = 'reviews/reviews_partition_{}.parquet'

storage_options = {
    'key': AWS_KEY,
    'secret': AWS_SECRET_KEY
}

reviews_files = [
    os.path.join(
        's3://',
        AWS_S3_BUCKET,
        REVIEWS_FILE_TEMPLATE.format(partition + 1)
    )
    for partition in range(20)
]
metadata_file = os.path.join('s3://', AWS_S3_BUCKET, 'metadata/metadata.json')
metadata_proc_file = os.path.join('s3://', AWS_S3_BUCKET, 'metadata/metadata_features.json')

%load_ext autoreload
%autoreload 2

In [2]:
metadata = pd.read_json(metadata_proc_file, storage_options=storage_options, orient='index')
metadata = metadata.astype({'release_date': 'datetime64[D]'})
metadata.head()

Unnamed: 0,main_genre,original_title,poster_url,actors,imdb_recommendations,reviews_collected_flg,rating,num_votes,user_review_num,critic_review_num,...,original_language,production_company_1,production_company_2,production_company_3,filming_location,filming_country,budget,boxoffice_gross_domestic,boxoffice_gross_opening,boxoffice_gross_worldwide
/title/tt0002130/,Adventure,Dante's Inferno,https://m.media-amazon.com/images/M/MV5BNjU1ND...,"{'1': '/name/nm0660139', '2': '/name/nm0685283...","{'1': '/title/tt0003740/', '2': '/title/tt0001...",1,7.0,2900,36.0,13.0,...,Italian,Milano Film,SAFFI-Comerio,,"Bovisa, Milano, Lombardia, Italy",Italy,,,,
/title/tt0003740/,Adventure,Cabiria,https://m.media-amazon.com/images/M/MV5BZmVjNz...,"{'1': '/name/nm0021935', '2': '/name/nm0702894...","{'1': '/title/tt0006864/', '2': '/title/tt0009...",1,7.1,3600,35.0,26.0,...,Italian,Itala Film,,,"FERT Studios, Turin, Piedmont, Italy",Italy,"ITL 1,000,000 (estimated)",,,
/title/tt0004635/,Action,The Squaw Man,https://m.media-amazon.com/images/M/MV5BMjMwOD...,"{'1': '/name/nm0267914', '2': '/name/nm0758457...","{'1': '/title/tt0014532/', '2': '/title/tt0006...",1,5.7,979,9.0,4.0,...,,Jesse L. Lasky Feature Play Company,,,Hollywood Heritage Museum - 2100 North Highlan...,USA,"$20,000 (estimated)",,,
/title/tt0004707/,Comedy,Tillie's Punctured Romance,https://m.media-amazon.com/images/M/MV5BMDZjOW...,"{'1': '/name/nm0000122', '2': '/name/nm0237597...","{'1': '/title/tt0005074/', '2': '/title/tt0006...",1,6.3,3500,41.0,19.0,...,,Keystone Film Company,,,"Sans Souci Castle, Los Angeles, California, USA",USA,"$50,000 (estimated)",,,
/title/tt0004972/,Drama,The Birth of a Nation,https://m.media-amazon.com/images/M/MV5BNWZlNj...,"{'1': '/name/nm0001273', '2': '/name/nm0550615...","{'1': '/title/tt0006864/', '2': '/title/tt0009...",1,6.2,25000,379.0,79.0,...,,David W. Griffith Corp.,Epoch Producing Corporation,,"Calexico, California, USA",USA,"$110,000 (estimated)",,,


In [4]:
from tqdm.notebook import tqdm

reviews_partitions = []
for path in tqdm(reviews_files):
    reviews_partition = pd.read_parquet(path, storage_options=storage_options)
    reviews_partitions.append(reviews_partition)
    break
reviews = pd.concat(reviews_partitions)

  0%|          | 0/20 [00:00<?, ?it/s]

In [6]:
user_item_matrix = (
    reviews
    [['id', 'author', 'rating']]
    .drop_duplicates()
    .pivot(index='author', columns='id', values='rating')
)
memusage = user_item_matrix.memory_usage(deep=True).sum() / 1024 / 1024
print(f'{memusage:.2f} Mb')

601.00 Mb


In [10]:
reviews

Unnamed: 0,index,id,text,rating,title,author,upvotes,total_votes,review_date
0,0,/title/tt0113142/,I liked this movie the best out of all the '90...,9.0,A modern day classic kaijyu movie,/user/ur3225430/,24,26,2005-10-10
1,1,/title/tt0113142/,When Gamera first appeared in Japanese theater...,10.0,a magnificently entertaining monster movie; t...,/user/ur6321000/,13,13,2009-11-23
2,2,/title/tt0113142/,"Sooner than I expected, GAMERA Trilogy COMPLET...",9.0,Didn't Disappoint !!,/user/ur20916867/,10,10,2009-04-09
3,3,/title/tt0113142/,This film is the first of the newer Gamera mon...,8.0,Good Introduction to Revive Gamera Trilogy.,/user/ur0437174/,6,6,2007-06-13
4,4,/title/tt0113142/,"This is a good, definitely 90ish monster movie...",8.0,good 90ish monster movie,/user/ur5156469/,7,7,2005-12-04
...,...,...,...,...,...,...,...,...,...
204813,53,/title/tt0118691/,The Nanny is a great sitcom and I love it. Bu...,,The Nanny goes to Eastern Europe,/user/ur0398112/,0,2,1999-07-28
204814,54,/title/tt0118691/,Go into this movie for what it is and it's hum...,10.0,Forgot how funny this movie actually was,/user/ur119571184/,0,1,2020-06-10
204815,55,/title/tt0118691/,"Seen some poor reviews, so what, I love it to ...",10.0,Love it,/user/ur136147672/,0,0,2021-10-23
204816,56,/title/tt0118691/,Drescher has an excellent performance!!! And D...,8.0,Very kind movie!!!,/user/ur19378453/,0,0,2022-01-29


In [11]:
num_users, num_titles = user_item_matrix.shape
num_total_elements = num_users * num_titles
num_nonempty_elements = (~user_item_matrix.isna()).sum().sum()
sparsity = 100 * (num_total_elements - num_nonempty_elements) / num_total_elements
print(sparsity)

99.88075755787395


In [None]:
memusage_f32 = user_item_matrix.memory_usage(deep=True).sum() / 1024 / 1024
memusage_f16 = user_item_matrix.astype('float16').memory_usage(deep=True).sum() / 1024 / 1024
print(
    f'Float32 memory footprint: {memusage_f32:.1f} Mb',
    f'Float16 memory footprint: {memusage_f16:.1f} Mb',
    sep='\n'
)

In [23]:
vals = metadata.nlargest(20, 'rating').index
relevant_metadata = metadata[metadata.index.isin(vals)]
relevant_metadata

Unnamed: 0,main_genre,original_title,poster_url,actors,imdb_recommendations,reviews_collected_flg,rating,num_votes,user_review_num,critic_review_num,...,original_language,production_company_1,production_company_2,production_company_3,filming_location,filming_country,budget,boxoffice_gross_domestic,boxoffice_gross_opening,boxoffice_gross_worldwide
/title/tt0050083/,Crime,12 Angry Men,https://m.media-amazon.com/images/M/MV5BMWU4N2...,"{'1': '/name/nm0000020', '2': '/name/nm0002011...","{'1': '/title/tt0108052/', '2': '/title/tt0111...",1,9.0,771000,1900.0,152.0,...,English,Orion-Nova Productions,,,"New York County Courthouse - 60 Centre Street,...",USA,"$350,000 (estimated)",,,$955
/title/tt0068646/,Crime,The Godfather,https://m.media-amazon.com/images/M/MV5BM2MyNj...,"{'1': '/name/nm0000008', '2': '/name/nm0000199...","{'1': '/title/tt0071562/', '2': '/title/tt0111...",1,9.2,1800000,5000.0,197.0,...,English,Paramount Pictures,Albert S. Ruddy Productions,Alfran Productions,"Forza d'Agrò, Messina, Sicily, Italy",Italy,"$6,000,000 (estimated)","$136,381,073","$302,393","$250,341,816"
/title/tt0071562/,Crime,The Godfather: Part II,https://m.media-amazon.com/images/M/MV5BMWMwMG...,"{'1': '/name/nm0000199', '2': '/name/nm0000134...","{'1': '/title/tt0068646/', '2': '/title/tt0099...",1,9.0,1200000,1200.0,187.0,...,English,Paramount Pictures,The Coppola Company,American Zoetrope,"Kaiser Estate, 4000 W Lake Blvd, Homewood, Lak...",USA,"$13,000,000 (estimated)","$47,834,595","$171,417","$47,961,919"
/title/tt0108052/,Biography,Schindler's List,https://m.media-amazon.com/images/M/MV5BNDE4OT...,"{'1': '/name/nm0000553', '2': '/name/nm0000146...","{'1': '/title/tt0110912/', '2': '/title/tt0111...",1,9.0,1300000,2100.0,173.0,...,English,Universal Pictures,Amblin Entertainment,,"Auschwitz-Birkenau Concentration Camp, Oswieci...",Poland,"$22,000,000 (estimated)","$96,898,818","$656,636","$322,161,245"
/title/tt0111161/,Drama,The Shawshank Redemption,https://m.media-amazon.com/images/M/MV5BMDFkYT...,"{'1': '/name/nm0000209', '2': '/name/nm0000151...","{'1': '/title/tt0468569/', '2': '/title/tt0109...",1,9.3,2600000,10200.0,164.0,...,English,Castle Rock Entertainment,,,"127A Smithfield Road, Frederiksted, Virgin Isl...",Islands,"$25,000,000 (estimated)","$28,767,189","$727,327","$28,884,504"
/title/tt0167260/,Action,The Lord of the Rings: The Return of the King,https://m.media-amazon.com/images/M/MV5BNzA5ZD...,"{'1': '/name/nm0000704', '2': '/name/nm0001557...","{'1': '/title/tt0167261/', '2': '/title/tt0120...",1,9.0,1800000,4000.0,342.0,...,English,New Line Cinema,WingNut Films,The Saul Zaentz Company,"Hinuera Valley, Matamata, Waikato, New Zealand",Zealand,"$94,000,000 (estimated)","$378,251,207","$72,629,713","$1,146,436,214"
/title/tt0232152/,Drama,Nagara Haavu,https://m.media-amazon.com/images/M/MV5BMTEyYW...,"{'1': '/name/nm0024259', '2': '/name/nm0033175...","{'1': '/title/tt1339248/', '2': '/title/tt0315...",1,9.0,1100,10.0,1.0,...,Kannada,Sri Eswari Productions,,,,,,,,
/title/tt0249795/,Comedy,Mayabazar,https://m.media-amazon.com/images/M/MV5BMmQwNj...,"{'1': '/name/nm0004417', '2': '/name/nm0710036...","{'1': '/title/tt0311594/', '2': '/title/tt0249...",1,9.1,5000,34.0,1.0,...,Telugu,Vijaya Pictures,,,"Vauhini Studios, Chennai, Tamil Nadu, India",India,,,,
/title/tt0252487/,Comedy,The Chaos Class,https://m.media-amazon.com/images/M/MV5BOWI4NG...,"{'1': '/name/nm0839017', '2': '/name/nm0654805...","{'1': '/title/tt0252488/', '2': '/title/tt0252...",1,9.3,40000,72.0,1.0,...,Turkish,Arzu Film,,,"Camlica, Istanbul, Turkey",Turkey,,,,
/title/tt0259534/,Animation,Ramayana: The Legend of Prince Rama,https://m.media-amazon.com/images/M/MV5BOTk4NG...,"{'1': '/name/nm4468244', '2': '/name/nm1336047...","{'1': '/title/tt0488836/', '2': '/title/tt1674...",1,9.1,4800,29.0,,...,Hindi,Nippon Ramayana Film Co.,TEM,,,,,,,


In [34]:
relevant_metadata[['genre_1', 'genre_2', 'genre_3']].apply(lambda x: ', '.join(x) if x else '', axis=1)

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [24]:
relevant_metadata.columns

Index(['main_genre', 'original_title', 'poster_url', 'actors',
       'imdb_recommendations', 'reviews_collected_flg', 'rating', 'num_votes',
       'user_review_num', 'critic_review_num', 'metascore', 'genre_1',
       'genre_2', 'genre_3', 'release_date', 'also_known_as', 'runtime',
       'country_of_origin_1', 'country_of_origin_2', 'country_of_origin_3',
       'original_language', 'production_company_1', 'production_company_2',
       'production_company_3', 'filming_location', 'filming_country', 'budget',
       'boxoffice_gross_domestic', 'boxoffice_gross_opening',
       'boxoffice_gross_worldwide'],
      dtype='object')

In [43]:
metadata['release_date'].dt.strftime('%B %Y')

/title/tt0002130/        July 1911
/title/tt0003740/         May 1914
/title/tt0004635/    February 1914
/title/tt0004707/    December 1914
/title/tt0004972/       March 1915
                         ...      
/title/tt9898858/       April 2020
/title/tt9900782/     October 2019
/title/tt9907782/    February 2022
/title/tt9911196/    February 2020
/title/tt9916362/     October 2020
Name: release_date, Length: 23583, dtype: object

In [36]:
metadata['genre_1']

/title/tt0002130/    Adventure
/title/tt0003740/    Adventure
/title/tt0004635/       Action
/title/tt0004707/       Comedy
/title/tt0004972/        Drama
                       ...    
/title/tt9898858/       Action
/title/tt9900782/       Action
/title/tt9907782/      Fantasy
/title/tt9911196/       Comedy
/title/tt9916362/        Drama
Name: genre_1, Length: 23583, dtype: object

In [None]:
relevant_metadata[['original_title', 'poster_url', 'rating', '']]

In [204]:
from abc import ABCMeta
from typing import List, Dict, Any
from itertools import chain


class BaseRecommender(metaclass=ABCMeta):
    RELEVANT_METADATA = [
        'original_title',
        'genre',
        'release_date',
        'country_of_origin_1',
        'production_company_1',
        'rating',
        'poster_url'
    ]
    def __init__(self, metadata: pd.DataFrame, **model_params):
        self.metadata = metadata

    def _recommend(self, **user_input) -> Dict[str, float]:
        pass

    def _format_recommendations(
        self,
        identifiers: Dict[str, float]
    ) -> Dict[str, Any]:
        """
        Returns relevant metadata for each predicted movie.
        """
        metadata_ = self.metadata[self.metadata.index.isin(identifiers)]
        metadata_.loc[identifiers.keys(), 'relevance'] = list(identifiers.values())
        metadata_['genre'] = (
            metadata_[['genre_1', 'genre_2', 'genre_3']]
            .apply(lambda x: ', '.join(x.dropna()), axis=1)
        )
        metadata_['release_date'] = (
            metadata_['release_date']
            .dt.strftime('%B %Y')
        )
        metadata_['rating'] = metadata_['rating'].round(1)

        return (
            metadata_
            .sort_values('relevance', ascending=False)
            [self.RELEVANT_METADATA]
            .to_dict(orient='records')
        )
    
    def recommend(
        self,
        user_liked_movies: List[str] = None,
        user_preferences: Dict[str, Any] = None
    ) -> Dict[str, Any]:
        user_input = {
            'user_liked_movies': user_liked_movies,
            'user_preferences': user_preferences
        }
        raw_recommendations = self._recommend(**user_input)
        return self._format_recommendations(raw_recommendations)


class PopularRecommender(BaseRecommender):
    def __init__(self, metadata: pd.DataFrame, top_n: int):
        BaseRecommender.__init__(self, metadata)
        self.popular_titles = (
            self.metadata
            .query('num_votes > 50000')
            .nlargest(top_n, 'rating')
        )

    def _recommend(self, **user_input) -> Dict[str, float]:
        return self.popular_titles['rating'].to_dict()


class IMDBRecommender(BaseRecommender):
    def __init__(self, metadata: pd.DataFrame, top_n: int):
        BaseRecommender.__init__(self, metadata)
        self.metadata['imdb_recommendations_set'] = (
            self.metadata['imdb_recommendations']
            .apply(lambda x: set(x.values()))
        )
        self.top_n = top_n
        
    def _recommend(self, user_liked_movies: List[str], **user_input)\
            -> List[str]:
        # metadata about movies that user liked
        user_liked_movies_metadata = self.metadata[
            self.metadata.index.isin(user_liked_movies)
        ]
        # set of recommendations for each liked movie
        all_imdb_recommendations = (
            user_liked_movies_metadata['imdb_recommendations_set']
            .tolist()
        )
        # metadata for unique recommendations for all liked movies
        # recommend only movies with highest rating
        unique_recommendations_metadata = self.metadata[
            self.metadata.index.isin(set(chain(*all_imdb_recommendations)))
        ]
        return (
            unique_recommendations_metadata
            .nlargest(self.top_n, 'rating')
            ['rating']
            .to_dict()
        )

In [206]:
liked_movies = np.random.choice(metadata.index, 5).tolist()

In [209]:
%%time
poprec = PopularRecommender(metadata, 10)
poprec.recommend()[:2]

CPU times: user 15 ms, sys: 674 µs, total: 15.7 ms
Wall time: 15.1 ms


[{'original_title': 'The Shawshank Redemption',
  'genre': 'Drama',
  'release_date': 'October 1994',
  'country_of_origin_1': 'United States',
  'production_company_1': 'Castle Rock Entertainment',
  'rating': 9.3,
  'poster_url': 'https://m.media-amazon.com/images/M/MV5BMDFkYTc0MGEtZmNhMC00ZDIzLWFmNTEtODM1ZmRlYWMwMWFmXkEyXkFqcGdeQXVyMTMxODk2OTU@._V1_QL75_UX190_CR0,0,190,281_.jpg'},
 {'original_title': 'The Godfather',
  'genre': 'Crime, Drama',
  'release_date': 'March 1972',
  'country_of_origin_1': 'United States',
  'production_company_1': 'Paramount Pictures',
  'rating': 9.2,
  'poster_url': 'https://m.media-amazon.com/images/M/MV5BM2MyNjYxNmUtYTAwNi00MTYxLWJmNWYtYzZlODY3ZTk3OTFlXkEyXkFqcGdeQXVyNzkwMjQ5NzM@._V1_QL75_UY281_CR4,0,190,281_.jpg'}]

In [208]:
%%time
imdbrec = IMDBRecommender(metadata, 10)
imdbrec.recommend(liked_movies)[:2]

CPU times: user 52.1 ms, sys: 1.17 ms, total: 53.2 ms
Wall time: 52.5 ms


[{'original_title': 'Rocky',
  'genre': 'Drama, Sport',
  'release_date': 'December 1976',
  'country_of_origin_1': 'United States',
  'production_company_1': 'Chartoff-Winkler Productions',
  'rating': 8.1,
  'poster_url': 'https://m.media-amazon.com/images/M/MV5BNTBkMjg2MjYtYTZjOS00ODQ0LTg0MDEtM2FiNmJmOGU1NGEwXkEyXkFqcGdeQXVyMjUzOTY1NTc@._V1_QL75_UX190_CR0,4,190,281_.jpg'},
 {'original_title': 'First Blood',
  'genre': 'Action, Adventure, Thriller',
  'release_date': 'October 1982',
  'country_of_origin_1': 'United States',
  'production_company_1': 'Anabasis N.V.',
  'rating': 7.7,
  'poster_url': 'https://m.media-amazon.com/images/M/MV5BODBmOWU2YWMtZGUzZi00YzRhLWJjNDAtYTUwNWVkNDcyZmU5XkEyXkFqcGdeQXVyNDk3NzU2MTQ@._V1_QL75_UX190_CR0,2,190,281_.jpg'}]

In [211]:
metadata.columns

Index(['main_genre', 'original_title', 'poster_url', 'actors',
       'imdb_recommendations', 'reviews_collected_flg', 'rating', 'num_votes',
       'user_review_num', 'critic_review_num', 'metascore', 'genre_1',
       'genre_2', 'genre_3', 'release_date', 'also_known_as', 'runtime',
       'country_of_origin_1', 'country_of_origin_2', 'country_of_origin_3',
       'original_language', 'production_company_1', 'production_company_2',
       'production_company_3', 'filming_location', 'filming_country', 'budget',
       'boxoffice_gross_domestic', 'boxoffice_gross_opening',
       'boxoffice_gross_worldwide', 'imdb_recommendations_set'],
      dtype='object')

In [218]:
metadata.groupby('original_language')['main_genre'].count().sort_values(ascending=False)[:10]

original_language
English     15589
French       1122
Hindi        1066
Japanese      633
Italian       515
Spanish       500
Turkish       452
German        368
Tamil         268
Korean        244
Name: main_genre, dtype: int64

In [212]:
metadata['genre_1']

/title/tt0002130/    Adventure
/title/tt0003740/    Adventure
/title/tt0004635/       Action
/title/tt0004707/       Comedy
/title/tt0004972/        Drama
                       ...    
/title/tt9898858/       Action
/title/tt9900782/       Action
/title/tt9907782/      Fantasy
/title/tt9911196/       Comedy
/title/tt9916362/        Drama
Name: genre_1, Length: 23583, dtype: object

In [3]:
top_n_actors = 5
actors = [
    (list(item.values()) + ['']*(top_n_actors - len(item)))[:top_n_actors]
    for item in metadata['actors']
]
actors_df = pd.DataFrame.from_records(actors)

In [5]:
actors_df

Unnamed: 0,0,1,2,3,4
0,/name/nm0660139,/name/nm0685283,/name/nm0209738,/name/nm3942815,/name/nm1375863
1,/name/nm0021935,/name/nm0702894,/name/nm0656034,/name/nm0146028,/name/nm0544842
2,/name/nm0267914,/name/nm0758457,/name/nm0455612,/name/nm0277317,/name/nm0298243
3,/name/nm0000122,/name/nm0237597,/name/nm0635667,/name/nm0841501,/name/nm0071658
4,/name/nm0001273,/name/nm0550615,/name/nm0910400,/name/nm0178270,/name/nm0017488
...,...,...,...,...,...
23578,/name/nm1159180,/name/nm0378245,/name/nm10067359,/name/nm2365811,/name/nm0498165
23579,/name/nm1912683,/name/nm1230844,/name/nm10183124,/name/nm6998719,/name/nm6649988
23580,/name/nm2933542,/name/nm0717709,/name/nm0677944,/name/nm3646923,/name/nm0079451
23581,/name/nm0277932,/name/nm0824373,/name/nm3362584,/name/nm2558112,/name/nm10877188


In [7]:
from sklearn.preprocessing import OneHotEncoder

In [38]:
main_actors_categories = 20
actor_encoder = OneHotEncoder(max_categories=main_actors_categories)
actor_encoder.fit(actors_df[0].to_frame())

In [56]:
def get_actor_categories(
    actors: pd.Series,
    max_categories: int,
    prefix: str = ''
) -> pd.DataFrame:
    actor_encoder = OneHotEncoder(max_categories=max_categories)
    actor_encoder.fit(actors.to_frame())

    actor_encoded = pd.DataFrame(
        data=actor_encoder.transform(actors.to_frame()).todense(),
        columns=actor_encoder.get_feature_names(),
        dtype=bool
    )
    actor_encoded.columns = actor_encoded.columns.str.removeprefix('x0_')
    return actor_encoded.add_prefix(prefix)

get_actor_categories(actors_df[0], 20, 'first_')

Unnamed: 0,first_/name/nm0000044,first_/name/nm0000078,first_/name/nm0000115,first_/name/nm0000134,first_/name/nm0000142,first_/name/nm0000158,first_/name/nm0000230,first_/name/nm0000243,first_/name/nm0000246,first_/name/nm0000323,first_/name/nm0000329,first_/name/nm0000821,first_/name/nm0001191,first_/name/nm0006795,first_/name/nm0222426,first_/name/nm0451321,first_/name/nm0474774,first_/name/nm0482320,first_/name/nm0839017,first_infrequent_sklearn
0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True
1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True
2,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True
3,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True
4,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23578,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True
23579,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True
23580,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True
23581,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True


In [43]:
actor_encoder.get_feature_names()

array(['x0_/name/nm0000044', 'x0_/name/nm0000078', 'x0_/name/nm0000115',
       'x0_/name/nm0000134', 'x0_/name/nm0000142', 'x0_/name/nm0000158',
       'x0_/name/nm0000230', 'x0_/name/nm0000243', 'x0_/name/nm0000246',
       'x0_/name/nm0000323', 'x0_/name/nm0000329', 'x0_/name/nm0000821',
       'x0_/name/nm0001191', 'x0_/name/nm0006795', 'x0_/name/nm0222426',
       'x0_/name/nm0451321', 'x0_/name/nm0474774', 'x0_/name/nm0482320',
       'x0_/name/nm0839017', 'x0_infrequent_sklearn'], dtype=object)

In [57]:
metadata.columns

Index(['main_genre', 'original_title', 'poster_url', 'actors',
       'imdb_recommendations', 'reviews_collected_flg', 'rating', 'num_votes',
       'user_review_num', 'critic_review_num', 'metascore', 'genre_1',
       'genre_2', 'genre_3', 'release_date', 'also_known_as', 'runtime',
       'country_of_origin_1', 'country_of_origin_2', 'country_of_origin_3',
       'original_language', 'production_company_1', 'production_company_2',
       'production_company_3', 'filming_location', 'filming_country', 'budget',
       'boxoffice_gross_domestic', 'boxoffice_gross_opening',
       'boxoffice_gross_worldwide'],
      dtype='object')

In [745]:
from typing import List, Any


def find_boxoffice_quantile_range(x, q):
    for i, j, l, r in zip(q, q[1:], q.index, q.index[1:]):
        if i <= x < j:
            return f'boxoffice_from_q{l}_to_q{r}'
    return 'boxoffice_unknown_q'


def concat_genres(row):
    genre = [row['genre_1']]
    if row['genre_2']:
        genre += [row['genre_2']]
    if row['genre_3']:
        genre += [row['genre_3']]
    return ', '.join(sorted(genre))


def get_runtime_bin_labels(bins: List[float]) -> List[str]:
    return [
        f'runtime_from_q{100*l:.0f}_to_q{100*r:.0f}'
        for l, r in zip(bins, bins[1:])
    ]



TransformerHyperparams = {
    'n_actor_feat': 10,
    'n_genre_feat': 20,
    'n_prod_comp_feat': 20,
    'n_country_feat': 20,
    'n_lang_feat': 20,
    'boxoffice_quantiles': [0, 0.1, 0.25, 0.5, 0.75, 0.9, 1],
    'runtime_quantiles': [0, 0.1, 0.25, 0.5, 0.75, 0.9, 1],
    'rating_bins': [0, 4, 6, 8, 10]
}

class SimpleFeatureTransformer:
    def __init__(
        self,
        n_actor_feat: int,
        n_genre_feat: int,
        n_prod_comp_feat: int,
        n_country_feat: int,
        n_lang_feat: int,
        boxoffice_quantiles: List[float],
        runtime_quantiles: List[float],
        rating_bins: List[int]
    ):
        self.n_actor_feat = n_actor_feat
        self.n_genre_feat = n_genre_feat
        self.n_prod_comp_feat = n_prod_comp_feat
        self.n_country_feat = n_country_feat
        self.n_lang_feat = n_lang_feat
        self.boxoffice_quantiles = boxoffice_quantiles
        self.runtime_quantiles = runtime_quantiles
        self.rating_bins = rating_bins

    @staticmethod
    def get_genre_features(
        metadata: pd.DataFrame,
        n_categories: int
    ) -> pd.DataFrame:
        genre_comb = (
            metadata[['genre_1', 'genre_2', 'genre_3']]
            .apply(concat_genres, axis=1)
            .to_frame()
        )
        encoder = OneHotEncoder(max_categories=n_categories)
        encoder.fit(genre_comb)
        features = pd.DataFrame(
            data=encoder.transform(genre_comb).todense(),
            index=metadata.index,
            columns=encoder.get_feature_names()
        )
        features.columns = features.columns.str.removeprefix('x0_')
        return features

    @staticmethod
    def get_release_decade_features(metadata: pd.DataFrame) -> pd.DataFrame:
        release_decade = (
            metadata['release_date']
            .dt.year.round(-1)
            .to_frame()
        )
        encoder = OneHotEncoder()
        encoder.fit(release_decade)
        features = pd.DataFrame(
            data=encoder.transform(release_decade).todense(),
            index=metadata.index,
            columns=encoder.get_feature_names()
        )
        features.columns = features.columns.str.removeprefix('x0_')
        features.columns = features.columns.str[:-2]
        features = features.add_prefix('released_in_').add_suffix('s')
        features.columns.values[-1] = 'unknown_release_decade'
        return features.astype('int8')

    @staticmethod
    def get_actor_features(
        metadata: pd.DataFrame,
        n_categories: int
    ) -> pd.DataFrame:
        top_n = 3
        actors = [
            (list(item.values()) + ['']*(top_n - len(item)))[:top_n]
            for item in metadata['actors']
        ]
        actors = pd.DataFrame.from_records(actors, index=metadata.index)

        encoder = OneHotEncoder(max_categories=n_categories)

        features = []
        for i in range(top_n):
            encoder.fit(actors[i].to_frame())
            feat = pd.DataFrame(
                data=encoder.transform(actors[i].to_frame()).todense(),
                index=metadata.index,
                columns=encoder.get_feature_names()
            ).drop('x0_infrequent_sklearn', axis=1)
            features.append(feat)

        features = pd.concat(features, axis=1)
        features = features[features.columns.unique()]
        features['another'] = features.sum(axis=1) == 0
        features.columns = features.columns.str.removeprefix('x0_/name/')
        return (
            features
            .astype('int8')
            .add_prefix('actor_')
            .drop(columns='actor_x0_', errors='ignore')
        )
        
    @staticmethod
    def get_country_features(
        metadata: pd.DataFrame,
        n_categories: int
    ) -> pd.DataFrame:
        country = metadata['country_of_origin_1'].to_frame()
        encoder = OneHotEncoder(max_categories=n_categories)
        encoder.fit(country)
        features = pd.DataFrame(
            data=encoder.transform(country).todense(),
            index=metadata.index,
            columns=encoder.get_feature_names()
        )
        features.columns = features.columns.str.removeprefix('x0_')
        features.columns.values[-1] = 'infrequent_country'
        return features.add_prefix('originated_in_').astype('int8')

    @staticmethod
    def get_language_features(metadata: pd.DataFrame, n_categories: int) -> pd.DataFrame:
        language = metadata['original_language'].to_frame()
        encoder = OneHotEncoder(max_categories=n_categories)
        encoder.fit(language)
        features = pd.DataFrame(
            data=encoder.transform(language).todense(),
            index=metadata.index,
            columns=encoder.get_feature_names()
        )
        features.columns = features.columns.str.removeprefix('x0_')
        features.columns.values[-1] = 'infrequent'
        return features.add_suffix('_language').astype('int8')

    @staticmethod
    def get_rating_features(metadata: pd.DataFrame, bins: List[int]) -> pd.DataFrame:
        rating_cat_mapping = {
            f'({l}, {r}]': f'rating_from_{l}_to_{r}'
            for l, r in zip(bins, bins[1:])
        }
        rating = (
            pd.cut(metadata['rating'], bins)
            .astype(str)
            .apply(lambda x: rating_cat_mapping.get(x))
            .to_frame()
        )
        encoder = OneHotEncoder()
        encoder.fit(rating)
        features = pd.DataFrame(
            data=encoder.transform(rating).todense(),
            index=metadata.index,
            columns=encoder.get_feature_names()
        )
        features.columns = features.columns.str.removeprefix('x0_')
        return features.astype('int8')

    @staticmethod
    def get_prod_company_features(metadata: pd.DataFrame, n_categories: int) -> pd.DataFrame:
        company = metadata['production_company_1'].to_frame()
        encoder = OneHotEncoder(max_categories=n_categories)
        encoder.fit(company)
        features = pd.DataFrame(
            data=encoder.transform(company).todense(),
            index=metadata.index,
            columns=encoder.get_feature_names()
        )
        features.columns = features.columns.str.removeprefix('x0_')
        features.columns.values[-1] = 'infrequent_company'
        if 'None' in features.columns:
            features = features.rename({'None': 'unknown_company'}, axis=1)
        return features.add_prefix('produced_by_').astype('int8')        

    @staticmethod
    def get_boxoffice_features(
        metadata: pd.DataFrame,
        quantiles: List[float]
    ) -> pd.DataFrame:
        dollars = metadata[metadata['boxoffice_gross_worldwide'].str[0] == '$']
        dollars['decade'] = dollars['release_date'].dt.year.round(-1)
        dollars['boxoffice'] = (
            dollars['boxoffice_gross_worldwide']
            .str.replace('$', '')
            .str.replace(',', '')
            .astype('int')
        )
        dollars_pivot = (
            dollars
            .groupby('decade')
            ['boxoffice']
            .quantile(quantiles, interpolation='nearest')
            .reset_index()
            .assign(quantile=lambda x: (100*x['level_1']).astype(int))
            .pivot(index='decade', columns='quantile', values='boxoffice')
        )
        dollars_pivot[100] += 1
        
        boxoffice_cat = (
            dollars[['boxoffice', 'decade']]
            .apply(lambda x: find_boxoffice_quantile_range(
                x['boxoffice'], dollars_pivot.loc[x['decade']]
            ), axis=1)
        )
        boxoffice_cat.name = 'boxoffice_category'
        unknown_boxoffice_cat = (
            metadata[~metadata.index.isin(boxoffice_cat.index)]
            .assign(boxoffice_category='boxoffice_unknown_q')
            ['boxoffice_category']
        )
        boxoffice = pd.concat([boxoffice_cat, unknown_boxoffice_cat])
        return pd.get_dummies(boxoffice)

    @staticmethod
    def get_runtime_features(metadata, quantiles: List[float]) -> pd.DataFrame:
        labels = get_runtime_bin_labels(quantiles)
        runtime_features = pd.qcut(metadata['runtime'], quantiles, labels=labels)
        return pd.get_dummies(runtime_features)

    def transform(self, metadata: pd.DataFrame) -> pd.DataFrame:
        metadata_ = metadata.copy(deep=False)
        features = pd.concat([
            self.get_boxoffice_features(metadata_, self.boxoffice_quantiles),
            self.get_actor_features(metadata_, self.n_actor_feat),
            self.get_country_features(metadata_, self.n_country_feat),
            self.get_genre_features(metadata_, self.n_genre_feat),
            self.get_language_features(metadata_, self.n_lang_feat),
            self.get_prod_company_features(metadata_, self.n_prod_comp_feat),
            self.get_rating_features(metadata_, self.rating_bins),
            self.get_release_decade_features(metadata_),
            self.get_runtime_features(metadata_, self.runtime_quantiles)
        ], axis=1)
        return features.astype('uint8')

In [751]:
threshold = 5_000
metadata_truncated = metadata[metadata['num_votes'] > threshold]
feats = SimpleFeatureTransformer(**TransformerHyperparams).transform(metadata_truncated)
feats.shape

(14502, 136)

In [752]:
%%time
feat_mat = linear_kernel(feats)

CPU times: user 3.36 s, sys: 921 ms, total: 4.28 s
Wall time: 1.45 s


In [754]:
feat_mat

array([[9., 4., 6., ..., 2., 3., 2.],
       [4., 9., 4., ..., 4., 4., 4.],
       [6., 4., 9., ..., 2., 3., 2.],
       ...,
       [2., 4., 2., ..., 9., 8., 7.],
       [3., 4., 3., ..., 8., 9., 6.],
       [2., 4., 2., ..., 7., 6., 9.]])

In [749]:
metadata_truncated.shape

(14502, 30)

In [683]:
%%time
feats_mat = feats.dot(feats.T)
feats_mat.shape

CPU times: user 8.07 s, sys: 51 ms, total: 8.12 s
Wall time: 8.12 s


(9530, 9530)

In [702]:
%%time
cos_feats_mat = cosine_similarity(feats)
cos_feats_mat = pd.DataFrame(cos_feats_mat, index=feats.index, columns=feats.index)
cos_feats_mat.shape

CPU times: user 1.82 s, sys: 614 ms, total: 2.44 s
Wall time: 519 ms


(9530, 9530)

In [777]:
metadata_truncated[metadata_truncated['original_title'] == 'Toy Story']

Unnamed: 0,main_genre,original_title,poster_url,actors,imdb_recommendations,reviews_collected_flg,rating,num_votes,user_review_num,critic_review_num,...,original_language,production_company_1,production_company_2,production_company_3,filming_location,filming_country,budget,boxoffice_gross_domestic,boxoffice_gross_opening,boxoffice_gross_worldwide
/title/tt0114709/,Animation,Toy Story,https://m.media-amazon.com/images/M/MV5BMDU2ZW...,"{'1': '/name/nm0000158', '2': '/name/nm0000741...","{'1': '/title/tt0435761/', '2': '/title/tt0120...",1,8.3,973000,742.0,159.0,...,English,Walt Disney Pictures,Pixar Animation Studios,,,,"$30,000,000 (estimated)","$223,225,679","$29,140,617","$394,436,586"


In [778]:
titles = [
    '/title/tt7286456/',
    '/title/tt0172495/',
    '/title/tt0468569/',
    '/title/tt0068646/',
    '/title/tt0114709/'
]
titles

['/title/tt7286456/',
 '/title/tt0172495/',
 '/title/tt0468569/',
 '/title/tt0068646/',
 '/title/tt0114709/']

In [787]:
%%time
cos_feats_mat.loc[set(cos_feats_mat.index) - set(titles)]

CPU times: user 91.6 ms, sys: 125 ms, total: 217 ms
Wall time: 245 ms


Unnamed: 0,/title/tt0010323/,/title/tt0012349/,/title/tt0013442/,/title/tt0015648/,/title/tt0015864/,/title/tt0016220/,/title/tt0017075/,/title/tt0017136/,/title/tt0018455/,/title/tt0018578/,...,/title/tt9701940/,/title/tt9701942/,/title/tt9777644/,/title/tt9783600/,/title/tt9784456/,/title/tt9845564/,/title/tt9848626/,/title/tt9860728/,/title/tt9866072/,/title/tt9898858/
/title/tt0278488/,0.333333,0.666667,0.555556,0.444444,0.777778,0.555556,0.555556,0.333333,0.444444,0.666667,...,0.666667,0.666667,0.666667,0.444444,0.333333,0.444444,0.333333,0.444444,0.555556,0.444444
/title/tt0108037/,0.222222,0.555556,0.333333,0.333333,0.555556,0.444444,0.333333,0.222222,0.222222,0.555556,...,0.555556,0.555556,0.555556,0.444444,0.222222,0.444444,0.333333,0.444444,0.555556,0.333333
/title/tt0762114/,0.111111,0.444444,0.222222,0.111111,0.555556,0.333333,0.333333,0.111111,0.333333,0.333333,...,0.333333,0.333333,0.333333,0.444444,0.444444,0.111111,0.444444,0.555556,0.444444,0.444444
/title/tt1416801/,0.444444,0.555556,0.444444,0.444444,0.555556,0.444444,0.333333,0.333333,0.333333,0.666667,...,0.666667,0.666667,0.666667,0.555556,0.333333,0.555556,0.333333,0.555556,0.666667,0.444444
/title/tt0092099/,0.222222,0.444444,0.333333,0.333333,0.444444,0.555556,0.222222,0.333333,0.222222,0.555556,...,0.666667,0.666667,0.555556,0.333333,0.222222,0.333333,0.333333,0.333333,0.444444,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
/title/tt0095188/,0.333333,0.555556,0.333333,0.333333,0.555556,0.333333,0.444444,0.222222,0.333333,0.555556,...,0.555556,0.555556,0.555556,0.555556,0.333333,0.444444,0.333333,0.555556,0.666667,0.444444
/title/tt0120630/,0.444444,0.444444,0.333333,0.444444,0.333333,0.333333,0.444444,0.333333,0.222222,0.444444,...,0.444444,0.444444,0.444444,0.333333,0.444444,0.333333,0.444444,0.333333,0.444444,0.444444
/title/tt0437863/,0.333333,0.777778,0.333333,0.444444,0.666667,0.333333,0.333333,0.333333,0.333333,0.555556,...,0.555556,0.555556,0.555556,0.555556,0.444444,0.333333,0.555556,0.555556,0.444444,0.666667
/title/tt0090583/,0.222222,0.444444,0.222222,0.222222,0.444444,0.333333,0.333333,0.111111,0.222222,0.444444,...,0.555556,0.555556,0.444444,0.333333,0.222222,0.222222,0.333333,0.333333,0.444444,0.333333


In [794]:
%%time
cos_feats_mat[titles].drop(titles)

CPU times: user 3.81 ms, sys: 368 µs, total: 4.18 ms
Wall time: 3.94 ms


Unnamed: 0,/title/tt7286456/,/title/tt0172495/,/title/tt0468569/,/title/tt0068646/,/title/tt0114709/
/title/tt0010323/,0.111111,0.111111,0.111111,0.111111,0.111111
/title/tt0012349/,0.444444,0.444444,0.444444,0.555556,0.444444
/title/tt0013442/,0.111111,0.111111,0.111111,0.222222,0.000000
/title/tt0015648/,0.111111,0.111111,0.111111,0.222222,0.111111
/title/tt0015864/,0.444444,0.444444,0.444444,0.555556,0.333333
...,...,...,...,...,...
/title/tt9845564/,0.222222,0.111111,0.111111,0.222222,0.000000
/title/tt9848626/,0.444444,0.333333,0.333333,0.333333,0.444444
/title/tt9860728/,0.444444,0.333333,0.333333,0.333333,0.222222
/title/tt9866072/,0.444444,0.333333,0.333333,0.333333,0.222222


In [796]:
cos_feats_mat[titles].drop(titles)

Unnamed: 0,/title/tt7286456/,/title/tt0172495/,/title/tt0468569/,/title/tt0068646/,/title/tt0114709/
/title/tt0010323/,0.111111,0.111111,0.111111,0.111111,0.111111
/title/tt0012349/,0.444444,0.444444,0.444444,0.555556,0.444444
/title/tt0013442/,0.111111,0.111111,0.111111,0.222222,0.000000
/title/tt0015648/,0.111111,0.111111,0.111111,0.222222,0.111111
/title/tt0015864/,0.444444,0.444444,0.444444,0.555556,0.333333
...,...,...,...,...,...
/title/tt9845564/,0.222222,0.111111,0.111111,0.222222,0.000000
/title/tt9848626/,0.444444,0.333333,0.333333,0.333333,0.444444
/title/tt9860728/,0.444444,0.333333,0.333333,0.333333,0.222222
/title/tt9866072/,0.444444,0.333333,0.333333,0.333333,0.222222


In [801]:
top_n

30

In [803]:
cos_feats_mat[titles[0]].nlargest(30)

/title/tt7286456/    1.000000
/title/tt0070047/    0.777778
/title/tt0133093/    0.777778
/title/tt0407887/    0.777778
/title/tt0448115/    0.777778
/title/tt1745960/    0.777778
/title/tt2179136/    0.777778
/title/tt3315342/    0.777778
/title/tt3741700/    0.777778
/title/tt6966692/    0.777778
/title/tt0040897/    0.666667
/title/tt0070735/    0.666667
/title/tt0073195/    0.666667
/title/tt0073486/    0.666667
/title/tt0076759/    0.666667
/title/tt0080684/    0.666667
/title/tt0086190/    0.666667
/title/tt0096895/    0.666667
/title/tt0097165/    0.666667
/title/tt0097576/    0.666667
/title/tt0100404/    0.666667
/title/tt0102926/    0.666667
/title/tt0103064/    0.666667
/title/tt0103776/    0.666667
/title/tt0106977/    0.666667
/title/tt0107290/    0.666667
/title/tt0109635/    0.666667
/title/tt0112462/    0.666667
/title/tt0114369/    0.666667
/title/tt0118688/    0.666667
Name: /title/tt7286456/, dtype: float64

In [815]:
%%time
candidate_titles = []
cos_feat_mat_ = cos_feats_mat[titles].drop(titles)
for title in titles:
    candidate_titles.append(cos_feat_mat_[title].nlargest(top_n))
top_titles = pd.concat(candidate_titles).groupby(level=0).last().nlargest(top_n)

CPU times: user 12.3 ms, sys: 5.53 ms, total: 17.8 ms
Wall time: 17.1 ms


In [817]:
top_titles.to_dict()

{'/title/tt0071562/': 0.8888888888888891,
 '/title/tt0372784/': 0.8888888888888891,
 '/title/tt0407887/': 0.8888888888888891,
 '/title/tt0816692/': 0.8888888888888891,
 '/title/tt1375666/': 0.8888888888888891,
 '/title/tt0031381/': 0.7777777777777779,
 '/title/tt0036868/': 0.7777777777777779,
 '/title/tt0044672/': 0.7777777777777779,
 '/title/tt0052618/': 0.7777777777777779,
 '/title/tt0059742/': 0.7777777777777779,
 '/title/tt0062622/': 0.7777777777777779,
 '/title/tt0066206/': 0.7777777777777779,
 '/title/tt0070047/': 0.7777777777777779,
 '/title/tt0070735/': 0.7777777777777779,
 '/title/tt0071315/': 0.7777777777777779,
 '/title/tt0072308/': 0.7777777777777779,
 '/title/tt0078788/': 0.7777777777777779,
 '/title/tt0082971/': 0.7777777777777779,
 '/title/tt0097576/': 0.7777777777777779,
 '/title/tt0113277/': 0.7777777777777779,
 '/title/tt0120737/': 0.7777777777777779,
 '/title/tt0120855/': 0.7777777777777779,
 '/title/tt0167260/': 0.7777777777777779,
 '/title/tt0167261/': 0.7777777777

In [766]:
cos_feats_mat

Unnamed: 0,/title/tt0010323/,/title/tt0012349/,/title/tt0013442/,/title/tt0015648/,/title/tt0015864/,/title/tt0016220/,/title/tt0017075/,/title/tt0017136/,/title/tt0018455/,/title/tt0018578/,...,/title/tt9701940/,/title/tt9701942/,/title/tt9777644/,/title/tt9783600/,/title/tt9784456/,/title/tt9845564/,/title/tt9848626/,/title/tt9860728/,/title/tt9866072/,/title/tt9898858/
/title/tt0010323/,1.000000,0.444444,0.666667,0.555556,0.333333,0.333333,0.333333,0.444444,0.222222,0.333333,...,0.333333,0.333333,0.333333,0.222222,0.222222,0.333333,0.222222,0.222222,0.333333,0.333333
/title/tt0012349/,0.444444,1.000000,0.444444,0.555556,0.888889,0.444444,0.333333,0.444444,0.444444,0.555556,...,0.555556,0.555556,0.555556,0.444444,0.333333,0.333333,0.444444,0.444444,0.444444,0.555556
/title/tt0013442/,0.666667,0.444444,1.000000,0.555556,0.555556,0.555556,0.444444,0.555556,0.333333,0.444444,...,0.444444,0.444444,0.444444,0.222222,0.222222,0.444444,0.111111,0.222222,0.333333,0.222222
/title/tt0015648/,0.555556,0.555556,0.555556,1.000000,0.444444,0.444444,0.333333,0.333333,0.333333,0.444444,...,0.444444,0.444444,0.444444,0.222222,0.222222,0.444444,0.222222,0.222222,0.333333,0.333333
/title/tt0015864/,0.333333,0.888889,0.555556,0.444444,1.000000,0.555556,0.444444,0.444444,0.555556,0.555556,...,0.555556,0.555556,0.555556,0.444444,0.333333,0.333333,0.333333,0.444444,0.444444,0.444444
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
/title/tt9845564/,0.333333,0.333333,0.444444,0.444444,0.333333,0.333333,0.333333,0.333333,0.222222,0.444444,...,0.666667,0.666667,0.666667,0.555556,0.444444,1.000000,0.333333,0.555556,0.666667,0.444444
/title/tt9848626/,0.222222,0.444444,0.111111,0.222222,0.333333,0.222222,0.111111,0.111111,0.222222,0.333333,...,0.555556,0.555556,0.555556,0.666667,0.555556,0.333333,1.000000,0.666667,0.555556,0.777778
/title/tt9860728/,0.222222,0.444444,0.222222,0.222222,0.444444,0.222222,0.222222,0.222222,0.333333,0.444444,...,0.666667,0.666667,0.666667,0.888889,0.777778,0.555556,0.666667,1.000000,0.888889,0.777778
/title/tt9866072/,0.333333,0.444444,0.333333,0.333333,0.444444,0.333333,0.333333,0.222222,0.333333,0.555556,...,0.777778,0.777778,0.777778,0.777778,0.666667,0.666667,0.555556,0.888889,1.000000,0.666667


In [705]:
cos_feats_mat.loc[selected_titles].T.nlargest(top_n + 1, '/title/tt0068646/')[1:]

Unnamed: 0,/title/tt0068646/
/title/tt0071562/,0.888889
/title/tt0816692/,0.888889
/title/tt0031381/,0.777778
/title/tt0036868/,0.777778
/title/tt0044672/,0.777778
/title/tt0052618/,0.777778
/title/tt0059742/,0.777778
/title/tt0062622/,0.777778
/title/tt0066206/,0.777778
/title/tt0070047/,0.777778


In [707]:
%%time
selected_titles = [
    '/title/tt0068646/',
    # '/title/tt0372784/',
    # '/title/tt0468569/'
]
top_n = 30
recommended_title_id = cos_feats_mat.loc[selected_titles].T.nlargest(top_n + 1, '/title/tt0068646/')[1:]
metadata_truncated.loc[recommended_title_id.index]

CPU times: user 2.9 ms, sys: 743 µs, total: 3.64 ms
Wall time: 3.3 ms


Unnamed: 0,main_genre,original_title,poster_url,actors,imdb_recommendations,reviews_collected_flg,rating,num_votes,user_review_num,critic_review_num,...,original_language,production_company_1,production_company_2,production_company_3,filming_location,filming_country,budget,boxoffice_gross_domestic,boxoffice_gross_opening,boxoffice_gross_worldwide
/title/tt0071562/,Crime,The Godfather: Part II,https://m.media-amazon.com/images/M/MV5BMWMwMG...,"{'1': '/name/nm0000199', '2': '/name/nm0000134...","{'1': '/title/tt0068646/', '2': '/title/tt0099...",1,9.0,1200000,1200.0,187.0,...,English,Paramount Pictures,The Coppola Company,American Zoetrope,"Kaiser Estate, 4000 W Lake Blvd, Homewood, Lak...",USA,"$13,000,000 (estimated)","$47,834,595","$171,417","$47,961,919"
/title/tt0816692/,Adventure,Interstellar,https://m.media-amazon.com/images/M/MV5BZjdkOT...,"{'1': '/name/nm0000190', '2': '/name/nm0004266...","{'1': '/title/tt1375666/', '2': '/title/tt0468...",1,8.6,1800000,5100.0,633.0,...,English,Paramount Pictures,Warner Bros.,Legendary Entertainment,Iceland,Iceland,"$165,000,000 (estimated)","$188,020,017","$47,510,360","$701,729,206"
/title/tt0031381/,Drama,Gone with the Wind,https://m.media-amazon.com/images/M/MV5BYjUyZW...,"{'1': '/name/nm0000022', '2': '/name/nm0000046...","{'1': '/title/tt0034583/', '2': '/title/tt0052...",1,8.2,312000,965.0,192.0,...,English,Selznick International Pictures,Metro-Goldwyn-Mayer (MGM),,"North Little Rock, Arkansas, USA",USA,"$3,977,000 (estimated)","$200,882,193","$1,192,593","$402,382,193"
/title/tt0036868/,Drama,The Best Years of Our Lives,https://m.media-amazon.com/images/M/MV5BY2RmNT...,"{'1': '/name/nm0001485', '2': '/name/nm0000763...","{'1': '/title/tt0032551/', '2': '/title/tt0035...",1,8.1,64000,333.0,113.0,...,English,The Samuel Goldwyn Company,,,Ontario International Airport - 2900 E. Airpor...,USA,"$2,100,000 (estimated)","$23,650,000",,"$23,656,620"
/title/tt0044672/,Drama,The Greatest Show on Earth,https://m.media-amazon.com/images/M/MV5BMzg5MW...,"{'1': '/name/nm0000071', '2': '/name/nm0000032...","{'1': '/title/tt0048960/', '2': '/title/tt0033...",1,6.6,15000,146.0,63.0,...,English,Paramount Pictures,,,"Sarasota, Florida, USA",USA,"$4,000,000 (estimated)","$36,000,000",,"$36,000,000"
/title/tt0052618/,Adventure,Ben-Hur,https://m.media-amazon.com/images/M/MV5BNjgxY2...,"{'1': '/name/nm0000032', '2': '/name/nm0370144...","{'1': '/title/tt0056172/', '2': '/title/tt0031...",1,8.1,237000,493.0,134.0,...,English,Metro-Goldwyn-Mayer (MGM),,,Corner of Via di Salone and Via delle Case Ros...,Italy,"$15,000,000 (estimated)","$74,432,704","$241,792","$74,437,720"
/title/tt0059742/,Biography,The Sound of Music,https://m.media-amazon.com/images/M/MV5BODIxNj...,"{'1': '/name/nm0000267', '2': '/name/nm0001626...","{'1': '/title/tt0058331/', '2': '/title/tt0032...",1,8.1,229000,529.0,136.0,...,English,Robert Wise Productions,Argyle Enterprises,,"Felsenreitschule, Salzburg, Austria",Austria,"$8,200,000 (estimated)","$159,287,539","$413,497","$159,428,329"
/title/tt0062622/,Adventure,2001: A Space Odyssey,https://m.media-amazon.com/images/M/MV5BMmNlYz...,"{'1': '/name/nm0001158', '2': '/name/nm0516972...","{'1': '/title/tt0066921/', '2': '/title/tt0081...",1,8.3,656000,2400.0,286.0,...,English,Metro-Goldwyn-Mayer (MGM),Stanley Kubrick Productions,,"Isle of Harris, Western Isles, Scotland, UK",UK,"$12,000,000 (estimated)","$60,481,243","$202,759","$65,877,808"
/title/tt0066206/,Biography,Patton,https://m.media-amazon.com/images/M/MV5BMmNhZm...,"{'1': '/name/nm0001715', '2': '/name/nm0001500...","{'1': '/title/tt0066473/', '2': '/title/tt0067...",1,7.9,102000,330.0,120.0,...,English,Twentieth Century Fox,,,"Cabo de Gata, Almería, Andalucía, Spain",Spain,"$12,000,000 (estimated)","$61,749,765",,"$61,749,765"
/title/tt0070047/,Horror,The Exorcist,https://m.media-amazon.com/images/M/MV5BYWFlZG...,"{'1': '/name/nm0000995', '2': '/name/nm0001884...","{'1': '/title/tt0073195/', '2': '/title/tt0088...",1,8.1,396000,1300.0,231.0,...,English,Warner Bros.,Hoya Productions,,"Mosul, Iraq",Iraq,"$11,000,000 (estimated)","$233,005,644","$8,175,666","$441,306,145"


In [None]:
selected_titles = [
    '/title/tt0068646/',
    '/title/tt0372784/',
    '/title/tt0468569/'
]

In [531]:
metadata_truncated.query('original_title == "The Godfather"').index[0]

'/title/tt0068646/'

In [687]:
metadata_truncated.query('original_title == "Batman Begins"').index[0]

'/title/tt0372784/'

In [688]:
metadata_truncated.query('original_title == "The Dark Knight"').index[0]

'/title/tt0468569/'

In [532]:
target_title = metadata_truncated.query('original_title == "The Godfather"').index[0]
target_feats = feats.loc[target_title].values.reshape(1, -1)
target_feats.shape

(1, 136)

In [819]:
os.listdir('..')

['.DS_Store',
 'recsys.egg-info',
 'requirements.txt',
 'Dockerfile',
 'tests',
 'recsys',
 'docs',
 'README.md',
 'setup.py',
 'logs',
 '.gitignore',
 '.env',
 'scripts',
 '.git',
 '.vscode',
 'data',
 'notebooks']

In [533]:
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

In [534]:
dotprod = linear_kernel(feats, target_feats)
sim = cosine_similarity(feats, target_feats)

In [756]:
from typing import List, Literal

def get_similar_titles(
    target_titles: List[str],
    features: pd.DataFrame,
    metadata: pd.DataFrame,
    top_k: int = 10,
    sim_metric: Literal['linear', 'cosine'] = 'cosine'
) -> pd.Index:
    mask = metadata['original_title'].isin(target_titles)
    mask_idx = mask[mask == True].index

    if len(mask_idx) == 0:
        return []

    target_features = features.loc[mask_idx]
    source_features = features.drop(mask_idx)
    if sim_metric == 'linear':
        sim = linear_kernel(
            source_features.values,
            target_features.values
        )
    elif sim_metric == 'cosine':
        sim = cosine_similarity(
            source_features.values,
            target_features.values
        )
    else:
        raise ValueError(f'Unknown similarity metric: {sim_metric}')

    sim = sim.flatten()
    top_title_idx = np.argpartition(sim, len(sim) - top_k)[-top_k:]
    return source_features.iloc[top_title_idx].index, sim[top_title_idx]

#np.argpartition(sim, len(sim) - top_k)[-top_k:]

In [652]:
from Levenshtein import distance

In [654]:
from scipy.spatial.distance import cdist

In [None]:
['The Godfather', 'Batman Begins', "Harry Potter and the Sorcerer's Stone"]

In [710]:
metadata_truncated['original_title'].values

array(['The Birth of a Nation', 'Intolerance',
       'The Cabinet of Dr. Caligari', ..., 'I Care a Lot',
       'Coffee & Kareem', 'Kaithi'], dtype=object)

In [720]:
distance?

[0;31mDocstring:[0m
Compute absolute Levenshtein distance of two strings.

distance(string1, string2)

Examples (it's hard to spell Levenshtein correctly):

>>> distance('Levenshtein', 'Lenvinsten')
4
>>> distance('Levenshtein', 'Levensthein')
2
>>> distance('Levenshtein', 'Levenshten')
1
>>> distance('Levenshtein', 'Levenshtein')
0

Yeah, we've managed it at last.
[0;31mType:[0m      builtin_function_or_method


In [724]:
distance()

TypeError: distance expected two Strings or two Unicodes

In [728]:
%%time
xa = np.array(['The Godfather', 'Batman Begins', "Harry Potter and the Sorcerer's Stone"])
xb = metadata_truncated['original_title'].values.flatten()
res = np.zeros((xa.shape[0], xb.shape[0]))
for num1, i in enumerate(xa):
    for num2, j in enumerate(xb):
        res[num1, num2] = distance(i, j)

CPU times: user 25.1 ms, sys: 332 µs, total: 25.4 ms
Wall time: 25.1 ms


In [743]:
np.vectorize(distance)

<numpy.vectorize at 0x15b2f6070>

In [744]:
pairwise_distances(xa, xb, metric=np.vectorize(distance))

ValueError: could not convert string to float: 'The Godfather'

In [738]:
from sklearn.metrics import pairwise_distances

In [734]:
res.argmin(axis=1)

array([ 768, 4307, 3522])

In [737]:
xb[3522]

"Harry Potter and the Sorcerer's Stone"

In [731]:
res.min()

0.0

In [721]:
xa = np.array(['The Godfather', 'Batman Begins', "Harry Potter and the Sorcerer's Stone"]).reshape(-1, 1)
xb = metadata_truncated['original_title'].values.reshape(-1, 1)
cdist(xa, xb, metrics=lambda u, v: distance(u, v))

TypeError: cdist_euclidean(): incompatible function arguments. The following argument types are supported:
    1. (x: object, y: object, w: object = None, out: object = None) -> numpy.ndarray

Invoked with: array([['The Godfather'],
       ['Batman Begins'],
       ["Harry Potter and the Sorcerer's Stone"]], dtype='<U37'), array([['The Birth of a Nation'],
       ['Intolerance'],
       ['The Cabinet of Dr. Caligari'],
       ...,
       ['I Care a Lot'],
       ['Coffee & Kareem'],
       ['Kaithi']], dtype=object); kwargs: out=None, metrics=<function <lambda> at 0x15b1368b0>

In [653]:
distance

[0;31mDocstring:[0m
Compute absolute Levenshtein distance of two strings.

distance(string1, string2)

Examples (it's hard to spell Levenshtein correctly):

>>> distance('Levenshtein', 'Lenvinsten')
4
>>> distance('Levenshtein', 'Levensthein')
2
>>> distance('Levenshtein', 'Levenshten')
1
>>> distance('Levenshtein', 'Levenshtein')
0

Yeah, we've managed it at last.
[0;31mType:[0m      builtin_function_or_method


In [658]:
from thefuzz import fuzz
from thefuzz import process

In [649]:
target = 'the godphather'
choices = metadata['original_title'].values

In [661]:
fuzz.ratio(target, 'The Godfather')

74

In [662]:
fuzz.ratio(target, 'Go')

12

In [659]:
process.extract?

[0;31mSignature:[0m
[0mprocess[0m[0;34m.[0m[0mextract[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mquery[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mchoices[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mprocessor[0m[0;34m=[0m[0;34m<[0m[0mfunction[0m [0mfull_process[0m [0mat[0m [0;36m0x1578f2e50[0m[0;34m>[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mscorer[0m[0;34m=[0m[0;34m<[0m[0mfunction[0m [0mWRatio[0m [0mat[0m [0;36m0x157130f70[0m[0;34m>[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mlimit[0m[0;34m=[0m[0;36m5[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Select the best match in a list or dictionary of choices.

Find best matches in a list or dictionary of choices, return a
list of tuples containing the match and its score. If a dictionary
is used, also returns the key for each match.

Arguments:
    query: An object representing the thing we want to find.
    choices: An iterable or dicti

In [664]:
process.extractOne(target, list(choices), scorer=fuzz.ratio)

('The Godfather', 89)

In [668]:
15000*15000

225000000

In [637]:
m = metadata['original_title'].str.contains('Harry Potter')
metadata[m]

Unnamed: 0,main_genre,original_title,poster_url,actors,imdb_recommendations,reviews_collected_flg,rating,num_votes,user_review_num,critic_review_num,...,original_language,production_company_1,production_company_2,production_company_3,filming_location,filming_country,budget,boxoffice_gross_domestic,boxoffice_gross_opening,boxoffice_gross_worldwide
/title/tt0241527/,Adventure,Harry Potter and the Sorcerer's Stone,https://m.media-amazon.com/images/M/MV5BMzkyZG...,"{'1': '/name/nm0705356', '2': '/name/nm0342488...","{'1': '/title/tt0295297/', '2': '/title/tt0304...",1,7.6,759000,1900.0,278.0,...,English,Warner Bros.,Heyday Films,1492 Pictures,"Alnwick Castle, Alnwick, Northumberland, Engla...",UK,"$125,000,000 (estimated)","$318,886,962","$90,294,621","$1,022,290,019"
/title/tt0295297/,Adventure,Harry Potter and the Chamber of Secrets,https://m.media-amazon.com/images/M/MV5BMjE0Yj...,"{'1': '/name/nm0705356', '2': '/name/nm0342488...","{'1': '/title/tt0304141/', '2': '/title/tt0330...",1,7.4,620000,1000.0,244.0,...,English,Warner Bros.,Heyday Films,1492 Pictures,"Glenfinnan Viaduct, Fort William, Highland, Sc...",UK,"$100,000,000 (estimated)","$262,641,637","$88,357,488","$879,928,511"
/title/tt0304141/,Adventure,Harry Potter and the Prisoner of Azkaban,https://m.media-amazon.com/images/M/MV5BMTY4NT...,"{'1': '/name/nm0705356', '2': '/name/nm0914612...","{'1': '/title/tt0295297/', '2': '/title/tt0330...",1,7.9,620000,1700.0,291.0,...,English,Warner Bros.,1492 Pictures,Heyday Films,"Glenfinnan, Highland, Scotland, UK",UK,"$130,000,000 (estimated)","$250,105,651","$93,687,367","$797,568,607"
/title/tt0330373/,Adventure,Harry Potter and the Goblet of Fire,https://m.media-amazon.com/images/M/MV5BMTI1ND...,"{'1': '/name/nm0705356', '2': '/name/nm0914612...","{'1': '/title/tt0373889/', '2': '/title/tt0304...",1,7.7,613000,2000.0,314.0,...,English,Warner Bros.,Heyday Films,Patalex IV Productions Limited,"Glenfinnan Viaduct, Fort William, Highland, Sc...",UK,"$150,000,000 (estimated)","$290,469,928","$102,685,961","$896,730,264"
/title/tt0373889/,Action,Harry Potter and the Order of the Phoenix,https://m.media-amazon.com/images/M/MV5BMTM0NT...,"{'1': '/name/nm0705356', '2': '/name/nm0914612...","{'1': '/title/tt0417741/', '2': '/title/tt0330...",1,7.5,569000,1200.0,347.0,...,English,Warner Bros.,Heyday Films,Cool Music,"Blenheim Palace, Woodstock, Oxfordshire, Engla...",UK,"$150,000,000 (estimated)","$292,382,727","$77,108,414","$942,201,710"
/title/tt0417741/,Action,Harry Potter and the Half-Blood Prince,https://m.media-amazon.com/images/M/MV5BNzU3ND...,"{'1': '/name/nm0705356', '2': '/name/nm0914612...","{'1': '/title/tt0373889/', '2': '/title/tt0926...",1,7.6,533000,1100.0,392.0,...,English,Warner Bros.,Heyday Films,,"Bjorli, Norway",Norway,"$250,000,000 (estimated)","$302,334,374","$77,835,727","$934,483,039"
/title/tt0926084/,Adventure,Harry Potter and the Deathly Hallows: Part 1,https://m.media-amazon.com/images/M/MV5BMTQ2OT...,"{'1': '/name/nm0705356', '2': '/name/nm0914612...","{'1': '/title/tt0417741/', '2': '/title/tt0373...",1,7.7,536000,765.0,441.0,...,English,Warner Bros.,Heyday Films,,"Pembrokeshire, Wales, UK",UK,"£150,000,000 (estimated)","$296,374,621","$125,017,372","$977,070,383"
/title/tt1201607/,Adventure,Harry Potter and the Deathly Hallows: Part 2,https://m.media-amazon.com/images/M/MV5BMGVmMW...,"{'1': '/name/nm0705356', '2': '/name/nm0914612...","{'1': '/title/tt0325980/', '2': '/title/tt0372...",1,8.1,854000,1000.0,489.0,...,English,Warner Bros.,Heyday Films,Moving Picture Company (MPC),"Freshwater West, Pembrokeshire, Wales, UK",UK,"$125,000,000 (estimated)","$381,447,587","$169,189,427","$1,342,359,942"


In [763]:
%%time
titles = [
    #'The Godfather',
    'Batman Begins',
    # "Harry Potter and the Sorcerer's Stone"
]
similar_titles = get_similar_titles(titles, feats, metadata, 20)
similar_titles

CPU times: user 22 ms, sys: 1.82 ms, total: 23.8 ms
Wall time: 18.7 ms


(Index(['/title/tt0328107/', '/title/tt1454029/', '/title/tt0407887/',
        '/title/tt0113277/', '/title/tt0120036/', '/title/tt0172495/',
        '/title/tt0264464/', '/title/tt1375666/', '/title/tt0112573/',
        '/title/tt0266697/', '/title/tt1877830/', '/title/tt0325710/',
        '/title/tt0325980/', '/title/tt0212720/', '/title/tt1345836/',
        '/title/tt0234215/', '/title/tt0332452/', '/title/tt0468569/',
        '/title/tt0133093/', '/title/tt0405159/'],
       dtype='object'),
 array([0.66666667, 0.66666667, 0.77777778, 1.        , 0.66666667,
        0.77777778, 0.77777778, 0.77777778, 0.77777778, 0.77777778,
        0.77777778, 0.77777778, 0.77777778, 0.77777778, 0.77777778,
        0.77777778, 0.77777778, 0.88888889, 0.77777778, 0.66666667]))

In [758]:
similar_titles[0]

Index(['/title/tt0059742/', '/title/tt1959490/', '/title/tt0120382/',
       '/title/tt1950186/', '/title/tt0071562/', '/title/tt0109830/',
       '/title/tt0082971/', '/title/tt0816692/', '/title/tt0099674/',
       '/title/tt0110912/'],
      dtype='object')

In [764]:
metadata_truncated.loc[similar_titles[0]]

Unnamed: 0,main_genre,original_title,poster_url,actors,imdb_recommendations,reviews_collected_flg,rating,num_votes,user_review_num,critic_review_num,...,original_language,production_company_1,production_company_2,production_company_3,filming_location,filming_country,budget,boxoffice_gross_domestic,boxoffice_gross_opening,boxoffice_gross_worldwide
/title/tt0328107/,Action,Man on Fire,https://m.media-amazon.com/images/M/MV5BODFlMm...,"{'1': '/name/nm0000243', '2': '/name/nm0000686...","{'1': '/title/tt0455944/', '2': '/title/tt0453...",1,7.7,359000,867.0,158.0,...,English,Fox 2000 Pictures,New Regency Productions,Scott Free Productions,"Ciudad Juárez, Chihuahua, Mexico",Mexico,"$70,000,000 (estimated)","$77,911,774","$22,751,490","$130,834,852"
/title/tt1454029/,Drama,The Help,https://m.media-amazon.com/images/M/MV5BMTM5OT...,"{'1': '/name/nm1297015', '2': '/name/nm0205626...","{'1': '/title/tt4846340/', '2': '/title/tt3170...",1,8.1,453000,641.0,296.0,...,English,DreamWorks,Dreamworks Pictures,Reliance Film & Entertainment,"Greenwood, Mississippi, USA",USA,"$25,000,000 (estimated)","$169,708,112","$26,044,590","$216,639,112"
/title/tt0407887/,Crime,The Departed,https://m.media-amazon.com/images/M/MV5BMTI1MT...,"{'1': '/name/nm0000138', '2': '/name/nm0000354...","{'1': '/title/tt0482571/', '2': '/title/tt0114...",1,8.5,1300000,2500.0,297.0,...,English,Warner Bros.,Plan B Entertainment,Initial Entertainment Group (IEG),"SUNY Maritime College, Bronx, New York City, N...",USA,"$90,000,000 (estimated)","$132,399,394","$26,887,467","$291,480,452"
/title/tt0113277/,Action,Heat,https://m.media-amazon.com/images/M/MV5BYjZjNT...,"{'1': '/name/nm0000199', '2': '/name/nm0000134...","{'1': '/title/tt0112641/', '2': '/title/tt0086...",1,8.3,640000,1200.0,211.0,...,English,Warner Bros.,New Regency Productions,Forward Pass,"1219 Dodds Circle, East Los Angeles, Californi...",USA,"$60,000,000 (estimated)","$67,436,818","$8,445,656","$187,436,818"
/title/tt0120036/,Action,Rosewood,https://m.media-amazon.com/images/M/MV5BNGJlMz...,"{'1': '/name/nm0000685', '2': '/name/nm0000609...","{'1': '/title/tt0113305/', '2': '/title/tt0255...",1,7.2,8200,67.0,39.0,...,English,Warner Bros.,Peters Entertainment,New Deal Productions,"Sanford, Florida, USA",USA,"$25,000,000 (estimated)","$13,130,349","$3,154,075","$13,130,349"
/title/tt0172495/,Action,Gladiator,https://m.media-amazon.com/images/M/MV5BMDliMm...,"{'1': '/name/nm0000128', '2': '/name/nm0001618...","{'1': '/title/tt0120815/', '2': '/title/tt0120...",1,8.5,1500000,2800.0,219.0,...,English,Dreamworks Pictures,Universal Pictures,Scott Free Productions,Malta,Malta,"$103,000,000 (estimated)","$187,705,427","$34,819,017","$465,380,802"
/title/tt0264464/,Biography,Catch Me If You Can,https://m.media-amazon.com/images/M/MV5BMTY5Mz...,"{'1': '/name/nm0000138', '2': '/name/nm0000158...","{'1': '/title/tt1130884/', '2': '/title/tt0993...",1,8.1,950000,971.0,198.0,...,English,Dreamworks Pictures,Kemp Company,Splendid Pictures,"Alcatraz Prison, Alcatraz Island, San Francisc...",USA,"$52,000,000 (estimated)","$164,615,351","$30,053,627","$352,114,312"
/title/tt1375666/,Action,Inception,https://m.media-amazon.com/images/M/MV5BMjAxMz...,"{'1': '/name/nm0000138', '2': '/name/nm0330687...","{'1': '/title/tt0816692/', '2': '/title/tt0137...",1,8.8,2300000,4600.0,482.0,...,English,Warner Bros.,Legendary Entertainment,Syncopy,"Fortress Mountain, Kananaskis Country, Alberta...",Canada,"$160,000,000 (estimated)","$292,587,330","$62,785,337","$836,848,102"
/title/tt0112573/,Biography,Braveheart,https://m.media-amazon.com/images/M/MV5BMzkzMm...,"{'1': '/name/nm0000154', '2': '/name/nm0000521...","{'1': '/title/tt0172495/', '2': '/title/tt0120...",1,8.4,1000000,1400.0,149.0,...,English,Icon Entertainment International,The Ladd Company,B.H. Finance C.V.,"Fort William, Glen Coe, Highland, Scotland, UK",UK,"$72,000,000 (estimated)","$75,609,945","$9,938,276","$213,216,216"
/title/tt0266697/,Action,Kill Bill: Vol. 1,https://m.media-amazon.com/images/M/MV5BNzM3ND...,"{'1': '/name/nm0000235', '2': '/name/nm0001016...","{'1': '/title/tt0378194/', '2': '/title/tt0105...",1,8.2,1100000,2400.0,333.0,...,English,Miramax,A Band Apart,Super Cool ManChu,"Shinjuku, Tokyo, Japan",Japan,"$30,000,000 (estimated)","$70,099,045","$22,200,000","$180,906,076"
