In [1]:
import os
import warnings
import pandas as pd
import matplotlib.pyplot as plt
from dotenv import load_dotenv

load_dotenv()
warnings.filterwarnings('ignore')

AWS_KEY = os.getenv('AWS_ACCESS_KEY')
AWS_SECRET_KEY = os.getenv('AWS_SECRET_ACCESS_KEY')
AWS_S3_BUCKET = os.getenv('AWS_S3_BUCKET')
REVIEWS_FILE_TEMPLATE = 'reviews/reviews_partition_{}.parquet'

storage_options = {
    'key': AWS_KEY,
    'secret': AWS_SECRET_KEY
}

reviews_files = [
    os.path.join(
        's3://',
        AWS_S3_BUCKET,
        REVIEWS_FILE_TEMPLATE.format(partition + 1)
    )
    for partition in range(20)
]
metadata_file = os.path.join('s3://', AWS_S3_BUCKET, 'metadata/metadata.json')
metadata_proc_file = os.path.join('s3://', AWS_S3_BUCKET, 'metadata/metadata_features.json')

%load_ext autoreload
%autoreload 2

In [2]:
metadata = pd.read_json(metadata_proc_file, storage_options=storage_options, orient='index')
metadata = metadata.astype({'release_date': 'datetime64[D]'})
metadata.head()

Unnamed: 0,main_genre,original_title,poster_url,actors,imdb_recommendations,reviews_collected_flg,rating,num_votes,user_review_num,critic_review_num,...,original_language,production_company_1,production_company_2,production_company_3,filming_location,filming_country,budget,boxoffice_gross_domestic,boxoffice_gross_opening,boxoffice_gross_worldwide
/title/tt0002130/,Adventure,Dante's Inferno,https://m.media-amazon.com/images/M/MV5BNjU1ND...,"{'1': '/name/nm0660139', '2': '/name/nm0685283...","{'1': '/title/tt0003740/', '2': '/title/tt0001...",1,7.0,2900,36.0,13.0,...,Italian,Milano Film,SAFFI-Comerio,,"Bovisa, Milano, Lombardia, Italy",Italy,,,,
/title/tt0003740/,Adventure,Cabiria,https://m.media-amazon.com/images/M/MV5BZmVjNz...,"{'1': '/name/nm0021935', '2': '/name/nm0702894...","{'1': '/title/tt0006864/', '2': '/title/tt0009...",1,7.1,3600,35.0,26.0,...,Italian,Itala Film,,,"FERT Studios, Turin, Piedmont, Italy",Italy,"ITL 1,000,000 (estimated)",,,
/title/tt0004635/,Action,The Squaw Man,https://m.media-amazon.com/images/M/MV5BMjMwOD...,"{'1': '/name/nm0267914', '2': '/name/nm0758457...","{'1': '/title/tt0014532/', '2': '/title/tt0006...",1,5.7,979,9.0,4.0,...,,Jesse L. Lasky Feature Play Company,,,Hollywood Heritage Museum - 2100 North Highlan...,USA,"$20,000 (estimated)",,,
/title/tt0004707/,Comedy,Tillie's Punctured Romance,https://m.media-amazon.com/images/M/MV5BMDZjOW...,"{'1': '/name/nm0000122', '2': '/name/nm0237597...","{'1': '/title/tt0005074/', '2': '/title/tt0006...",1,6.3,3500,41.0,19.0,...,,Keystone Film Company,,,"Sans Souci Castle, Los Angeles, California, USA",USA,"$50,000 (estimated)",,,
/title/tt0004972/,Drama,The Birth of a Nation,https://m.media-amazon.com/images/M/MV5BNWZlNj...,"{'1': '/name/nm0001273', '2': '/name/nm0550615...","{'1': '/title/tt0006864/', '2': '/title/tt0009...",1,6.2,25000,379.0,79.0,...,,David W. Griffith Corp.,Epoch Producing Corporation,,"Calexico, California, USA",USA,"$110,000 (estimated)",,,


In [4]:
from tqdm.notebook import tqdm

reviews_partitions = []
for path in tqdm(reviews_files):
    reviews_partition = pd.read_parquet(path, storage_options=storage_options)
    reviews_partitions.append(reviews_partition)
    break
reviews = pd.concat(reviews_partitions)

  0%|          | 0/20 [00:00<?, ?it/s]

In [6]:
user_item_matrix = (
    reviews
    [['id', 'author', 'rating']]
    .drop_duplicates()
    .pivot(index='author', columns='id', values='rating')
)
memusage = user_item_matrix.memory_usage(deep=True).sum() / 1024 / 1024
print(f'{memusage:.2f} Mb')

601.00 Mb


In [10]:
reviews

Unnamed: 0,index,id,text,rating,title,author,upvotes,total_votes,review_date
0,0,/title/tt0113142/,I liked this movie the best out of all the '90...,9.0,A modern day classic kaijyu movie,/user/ur3225430/,24,26,2005-10-10
1,1,/title/tt0113142/,When Gamera first appeared in Japanese theater...,10.0,a magnificently entertaining monster movie; t...,/user/ur6321000/,13,13,2009-11-23
2,2,/title/tt0113142/,"Sooner than I expected, GAMERA Trilogy COMPLET...",9.0,Didn't Disappoint !!,/user/ur20916867/,10,10,2009-04-09
3,3,/title/tt0113142/,This film is the first of the newer Gamera mon...,8.0,Good Introduction to Revive Gamera Trilogy.,/user/ur0437174/,6,6,2007-06-13
4,4,/title/tt0113142/,"This is a good, definitely 90ish monster movie...",8.0,good 90ish monster movie,/user/ur5156469/,7,7,2005-12-04
...,...,...,...,...,...,...,...,...,...
204813,53,/title/tt0118691/,The Nanny is a great sitcom and I love it. Bu...,,The Nanny goes to Eastern Europe,/user/ur0398112/,0,2,1999-07-28
204814,54,/title/tt0118691/,Go into this movie for what it is and it's hum...,10.0,Forgot how funny this movie actually was,/user/ur119571184/,0,1,2020-06-10
204815,55,/title/tt0118691/,"Seen some poor reviews, so what, I love it to ...",10.0,Love it,/user/ur136147672/,0,0,2021-10-23
204816,56,/title/tt0118691/,Drescher has an excellent performance!!! And D...,8.0,Very kind movie!!!,/user/ur19378453/,0,0,2022-01-29


In [11]:
num_users, num_titles = user_item_matrix.shape
num_total_elements = num_users * num_titles
num_nonempty_elements = (~user_item_matrix.isna()).sum().sum()
sparsity = 100 * (num_total_elements - num_nonempty_elements) / num_total_elements
print(sparsity)

99.88075755787395


In [None]:
memusage_f32 = user_item_matrix.memory_usage(deep=True).sum() / 1024 / 1024
memusage_f16 = user_item_matrix.astype('float16').memory_usage(deep=True).sum() / 1024 / 1024
print(
    f'Float32 memory footprint: {memusage_f32:.1f} Mb',
    f'Float16 memory footprint: {memusage_f16:.1f} Mb',
    sep='\n'
)

In [23]:
vals = metadata.nlargest(20, 'rating').index
relevant_metadata = metadata[metadata.index.isin(vals)]
relevant_metadata

Unnamed: 0,main_genre,original_title,poster_url,actors,imdb_recommendations,reviews_collected_flg,rating,num_votes,user_review_num,critic_review_num,...,original_language,production_company_1,production_company_2,production_company_3,filming_location,filming_country,budget,boxoffice_gross_domestic,boxoffice_gross_opening,boxoffice_gross_worldwide
/title/tt0050083/,Crime,12 Angry Men,https://m.media-amazon.com/images/M/MV5BMWU4N2...,"{'1': '/name/nm0000020', '2': '/name/nm0002011...","{'1': '/title/tt0108052/', '2': '/title/tt0111...",1,9.0,771000,1900.0,152.0,...,English,Orion-Nova Productions,,,"New York County Courthouse - 60 Centre Street,...",USA,"$350,000 (estimated)",,,$955
/title/tt0068646/,Crime,The Godfather,https://m.media-amazon.com/images/M/MV5BM2MyNj...,"{'1': '/name/nm0000008', '2': '/name/nm0000199...","{'1': '/title/tt0071562/', '2': '/title/tt0111...",1,9.2,1800000,5000.0,197.0,...,English,Paramount Pictures,Albert S. Ruddy Productions,Alfran Productions,"Forza d'Agrò, Messina, Sicily, Italy",Italy,"$6,000,000 (estimated)","$136,381,073","$302,393","$250,341,816"
/title/tt0071562/,Crime,The Godfather: Part II,https://m.media-amazon.com/images/M/MV5BMWMwMG...,"{'1': '/name/nm0000199', '2': '/name/nm0000134...","{'1': '/title/tt0068646/', '2': '/title/tt0099...",1,9.0,1200000,1200.0,187.0,...,English,Paramount Pictures,The Coppola Company,American Zoetrope,"Kaiser Estate, 4000 W Lake Blvd, Homewood, Lak...",USA,"$13,000,000 (estimated)","$47,834,595","$171,417","$47,961,919"
/title/tt0108052/,Biography,Schindler's List,https://m.media-amazon.com/images/M/MV5BNDE4OT...,"{'1': '/name/nm0000553', '2': '/name/nm0000146...","{'1': '/title/tt0110912/', '2': '/title/tt0111...",1,9.0,1300000,2100.0,173.0,...,English,Universal Pictures,Amblin Entertainment,,"Auschwitz-Birkenau Concentration Camp, Oswieci...",Poland,"$22,000,000 (estimated)","$96,898,818","$656,636","$322,161,245"
/title/tt0111161/,Drama,The Shawshank Redemption,https://m.media-amazon.com/images/M/MV5BMDFkYT...,"{'1': '/name/nm0000209', '2': '/name/nm0000151...","{'1': '/title/tt0468569/', '2': '/title/tt0109...",1,9.3,2600000,10200.0,164.0,...,English,Castle Rock Entertainment,,,"127A Smithfield Road, Frederiksted, Virgin Isl...",Islands,"$25,000,000 (estimated)","$28,767,189","$727,327","$28,884,504"
/title/tt0167260/,Action,The Lord of the Rings: The Return of the King,https://m.media-amazon.com/images/M/MV5BNzA5ZD...,"{'1': '/name/nm0000704', '2': '/name/nm0001557...","{'1': '/title/tt0167261/', '2': '/title/tt0120...",1,9.0,1800000,4000.0,342.0,...,English,New Line Cinema,WingNut Films,The Saul Zaentz Company,"Hinuera Valley, Matamata, Waikato, New Zealand",Zealand,"$94,000,000 (estimated)","$378,251,207","$72,629,713","$1,146,436,214"
/title/tt0232152/,Drama,Nagara Haavu,https://m.media-amazon.com/images/M/MV5BMTEyYW...,"{'1': '/name/nm0024259', '2': '/name/nm0033175...","{'1': '/title/tt1339248/', '2': '/title/tt0315...",1,9.0,1100,10.0,1.0,...,Kannada,Sri Eswari Productions,,,,,,,,
/title/tt0249795/,Comedy,Mayabazar,https://m.media-amazon.com/images/M/MV5BMmQwNj...,"{'1': '/name/nm0004417', '2': '/name/nm0710036...","{'1': '/title/tt0311594/', '2': '/title/tt0249...",1,9.1,5000,34.0,1.0,...,Telugu,Vijaya Pictures,,,"Vauhini Studios, Chennai, Tamil Nadu, India",India,,,,
/title/tt0252487/,Comedy,The Chaos Class,https://m.media-amazon.com/images/M/MV5BOWI4NG...,"{'1': '/name/nm0839017', '2': '/name/nm0654805...","{'1': '/title/tt0252488/', '2': '/title/tt0252...",1,9.3,40000,72.0,1.0,...,Turkish,Arzu Film,,,"Camlica, Istanbul, Turkey",Turkey,,,,
/title/tt0259534/,Animation,Ramayana: The Legend of Prince Rama,https://m.media-amazon.com/images/M/MV5BOTk4NG...,"{'1': '/name/nm4468244', '2': '/name/nm1336047...","{'1': '/title/tt0488836/', '2': '/title/tt1674...",1,9.1,4800,29.0,,...,Hindi,Nippon Ramayana Film Co.,TEM,,,,,,,


In [34]:
relevant_metadata[['genre_1', 'genre_2', 'genre_3']].apply(lambda x: ', '.join(x) if x else '', axis=1)

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [24]:
relevant_metadata.columns

Index(['main_genre', 'original_title', 'poster_url', 'actors',
       'imdb_recommendations', 'reviews_collected_flg', 'rating', 'num_votes',
       'user_review_num', 'critic_review_num', 'metascore', 'genre_1',
       'genre_2', 'genre_3', 'release_date', 'also_known_as', 'runtime',
       'country_of_origin_1', 'country_of_origin_2', 'country_of_origin_3',
       'original_language', 'production_company_1', 'production_company_2',
       'production_company_3', 'filming_location', 'filming_country', 'budget',
       'boxoffice_gross_domestic', 'boxoffice_gross_opening',
       'boxoffice_gross_worldwide'],
      dtype='object')

In [43]:
metadata['release_date'].dt.strftime('%B %Y')

/title/tt0002130/        July 1911
/title/tt0003740/         May 1914
/title/tt0004635/    February 1914
/title/tt0004707/    December 1914
/title/tt0004972/       March 1915
                         ...      
/title/tt9898858/       April 2020
/title/tt9900782/     October 2019
/title/tt9907782/    February 2022
/title/tt9911196/    February 2020
/title/tt9916362/     October 2020
Name: release_date, Length: 23583, dtype: object

In [36]:
metadata['genre_1']

/title/tt0002130/    Adventure
/title/tt0003740/    Adventure
/title/tt0004635/       Action
/title/tt0004707/       Comedy
/title/tt0004972/        Drama
                       ...    
/title/tt9898858/       Action
/title/tt9900782/       Action
/title/tt9907782/      Fantasy
/title/tt9911196/       Comedy
/title/tt9916362/        Drama
Name: genre_1, Length: 23583, dtype: object

In [None]:
relevant_metadata[['original_title', 'poster_url', 'rating', '']]

In [85]:
from abc import ABCMeta
from typing import List, Dict, Any


class BaseRecommender(metaclass=ABCMeta):
    RELEVANT_METADATA = [
        'original_title',
        'genre',
        'release_date',
        'country_of_origin_1',
        'production_company_1',
        'rating',
        'poster_url'
    ]
    def __init__(self, metadata: pd.DataFrame, **model_params):
        self.metadata = metadata

    def _recommend(self, **params) -> List[str]:
        pass

    def _measure_relevance(self, **params) -> List[float]:
        pass

    def _format_recommendations(
        self,
        identifiers: List[str],
        relevance: List[float]
    ) -> Dict[str, Any]:
        """
        Returns relevant metadata for each predicted movie.
        """
        metadata_ = self.metadata[self.metadata.index.isin(identifiers)]
        metadata_['genre'] = (
            metadata_[['genre_1', 'genre_2', 'genre_3']]
            .apply(lambda x: ', '.join(x.dropna()), axis=1)
        )
        metadata_['release_date'] = (
            metadata_['release_date']
            .dt.strftime('%B %Y')
        )
        metadata_['rating'] = metadata_['rating'].round(1)
        relevant_metadata = metadata_[self.RELEVANT_METADATA]
        relevant_metadata['relevance'] = relevance
        return (
            relevant_metadata
            .sort_values('relevance', ascending=False)
            .drop('relevance', axis=1)
            .to_dict(orient='records')
        )
    
    def recommend(self, **params) -> Dict[str, Any]:
        raw_recommendations = self._recommend(**params)
        relevance = self._measure_relevance(**params)
        return self._format_recommendations(raw_recommendations, relevance)


class PopularRecommender(BaseRecommender):
    def __init__(self, metadata: pd.DataFrame, top_n: int):
        BaseRecommender.__init__(self, metadata)
        self.popular_titles = (
            self.metadata
            .query('num_votes > 50000')
            .nlargest(top_n, 'rating')
        )

    def _recommend(self):
        return self.popular_titles.index

    def _measure_relevance(self):
        return self.popular_titles['rating']


class IMDBRecommender(BaseRecommender):
    pass


a = PopularRecommender(metadata, 10)


In [86]:
a.recommend()

[{'original_title': 'The Shawshank Redemption',
  'genre': 'Drama',
  'release_date': 'October 1994',
  'country_of_origin_1': 'United States',
  'production_company_1': 'Castle Rock Entertainment',
  'rating': 9.3,
  'poster_url': 'https://m.media-amazon.com/images/M/MV5BMDFkYTc0MGEtZmNhMC00ZDIzLWFmNTEtODM1ZmRlYWMwMWFmXkEyXkFqcGdeQXVyMTMxODk2OTU@._V1_QL75_UX190_CR0,0,190,281_.jpg'},
 {'original_title': 'The Godfather',
  'genre': 'Crime, Drama',
  'release_date': 'March 1972',
  'country_of_origin_1': 'United States',
  'production_company_1': 'Paramount Pictures',
  'rating': 9.2,
  'poster_url': 'https://m.media-amazon.com/images/M/MV5BM2MyNjYxNmUtYTAwNi00MTYxLWJmNWYtYzZlODY3ZTk3OTFlXkEyXkFqcGdeQXVyNzkwMjQ5NzM@._V1_QL75_UY281_CR4,0,190,281_.jpg'},
 {'original_title': '12 Angry Men',
  'genre': 'Crime, Drama',
  'release_date': 'April 1957',
  'country_of_origin_1': 'United States',
  'production_company_1': 'Orion-Nova Productions',
  'rating': 9.0,
  'poster_url': 'https://m.media-