In [1]:
import os
import warnings
import pandas as pd
import matplotlib.pyplot as plt
from dotenv import load_dotenv

load_dotenv()
warnings.filterwarnings('ignore')

AWS_KEY = os.getenv('AWS_ACCESS_KEY')
AWS_SECRET_KEY = os.getenv('AWS_SECRET_ACCESS_KEY')
AWS_S3_BUCKET = os.getenv('AWS_S3_BUCKET')
REVIEWS_FILE_TEMPLATE = 'reviews/reviews_partition_{}.parquet'

storage_options = {
    'key': AWS_KEY,
    'secret': AWS_SECRET_KEY
}

reviews_files = [
    os.path.join(
        's3://',
        AWS_S3_BUCKET,
        REVIEWS_FILE_TEMPLATE.format(partition + 1)
    )
    for partition in range(20)
]
metadata_file = os.path.join('s3://', AWS_S3_BUCKET, 'metadata/metadata.json')
metadata_proc_file = os.path.join('s3://', AWS_S3_BUCKET, 'metadata/metadata_features.json')

%load_ext autoreload
%autoreload 2

In [2]:
metadata = pd.read_json(metadata_proc_file, storage_options=storage_options, orient='index')
metadata = metadata.astype({'release_date': 'datetime64[D]'})
metadata.head()

Unnamed: 0,main_genre,original_title,poster_url,actors,imdb_recommendations,reviews_collected_flg,rating,num_votes,user_review_num,critic_review_num,...,original_language,production_company_1,production_company_2,production_company_3,filming_location,filming_country,budget,boxoffice_gross_domestic,boxoffice_gross_opening,boxoffice_gross_worldwide
/title/tt0002130/,Adventure,Dante's Inferno,https://m.media-amazon.com/images/M/MV5BNjU1ND...,"{'1': '/name/nm0660139', '2': '/name/nm0685283...","{'1': '/title/tt0003740/', '2': '/title/tt0001...",1,7.0,2900,36.0,13.0,...,Italian,Milano Film,SAFFI-Comerio,,"Bovisa, Milano, Lombardia, Italy",Italy,,,,
/title/tt0003740/,Adventure,Cabiria,https://m.media-amazon.com/images/M/MV5BZmVjNz...,"{'1': '/name/nm0021935', '2': '/name/nm0702894...","{'1': '/title/tt0006864/', '2': '/title/tt0009...",1,7.1,3600,35.0,26.0,...,Italian,Itala Film,,,"FERT Studios, Turin, Piedmont, Italy",Italy,"ITL 1,000,000 (estimated)",,,
/title/tt0004635/,Action,The Squaw Man,https://m.media-amazon.com/images/M/MV5BMjMwOD...,"{'1': '/name/nm0267914', '2': '/name/nm0758457...","{'1': '/title/tt0014532/', '2': '/title/tt0006...",1,5.7,979,9.0,4.0,...,,Jesse L. Lasky Feature Play Company,,,Hollywood Heritage Museum - 2100 North Highlan...,USA,"$20,000 (estimated)",,,
/title/tt0004707/,Comedy,Tillie's Punctured Romance,https://m.media-amazon.com/images/M/MV5BMDZjOW...,"{'1': '/name/nm0000122', '2': '/name/nm0237597...","{'1': '/title/tt0005074/', '2': '/title/tt0006...",1,6.3,3500,41.0,19.0,...,,Keystone Film Company,,,"Sans Souci Castle, Los Angeles, California, USA",USA,"$50,000 (estimated)",,,
/title/tt0004972/,Drama,The Birth of a Nation,https://m.media-amazon.com/images/M/MV5BNWZlNj...,"{'1': '/name/nm0001273', '2': '/name/nm0550615...","{'1': '/title/tt0006864/', '2': '/title/tt0009...",1,6.2,25000,379.0,79.0,...,,David W. Griffith Corp.,Epoch Producing Corporation,,"Calexico, California, USA",USA,"$110,000 (estimated)",,,


In [4]:
from tqdm.notebook import tqdm

reviews_partitions = []
for path in tqdm(reviews_files):
    reviews_partition = pd.read_parquet(path, storage_options=storage_options)
    reviews_partitions.append(reviews_partition)
    break
reviews = pd.concat(reviews_partitions)

  0%|          | 0/20 [00:00<?, ?it/s]

In [6]:
user_item_matrix = (
    reviews
    [['id', 'author', 'rating']]
    .drop_duplicates()
    .pivot(index='author', columns='id', values='rating')
)
memusage = user_item_matrix.memory_usage(deep=True).sum() / 1024 / 1024
print(f'{memusage:.2f} Mb')

601.00 Mb


In [10]:
reviews

Unnamed: 0,index,id,text,rating,title,author,upvotes,total_votes,review_date
0,0,/title/tt0113142/,I liked this movie the best out of all the '90...,9.0,A modern day classic kaijyu movie,/user/ur3225430/,24,26,2005-10-10
1,1,/title/tt0113142/,When Gamera first appeared in Japanese theater...,10.0,a magnificently entertaining monster movie; t...,/user/ur6321000/,13,13,2009-11-23
2,2,/title/tt0113142/,"Sooner than I expected, GAMERA Trilogy COMPLET...",9.0,Didn't Disappoint !!,/user/ur20916867/,10,10,2009-04-09
3,3,/title/tt0113142/,This film is the first of the newer Gamera mon...,8.0,Good Introduction to Revive Gamera Trilogy.,/user/ur0437174/,6,6,2007-06-13
4,4,/title/tt0113142/,"This is a good, definitely 90ish monster movie...",8.0,good 90ish monster movie,/user/ur5156469/,7,7,2005-12-04
...,...,...,...,...,...,...,...,...,...
204813,53,/title/tt0118691/,The Nanny is a great sitcom and I love it. Bu...,,The Nanny goes to Eastern Europe,/user/ur0398112/,0,2,1999-07-28
204814,54,/title/tt0118691/,Go into this movie for what it is and it's hum...,10.0,Forgot how funny this movie actually was,/user/ur119571184/,0,1,2020-06-10
204815,55,/title/tt0118691/,"Seen some poor reviews, so what, I love it to ...",10.0,Love it,/user/ur136147672/,0,0,2021-10-23
204816,56,/title/tt0118691/,Drescher has an excellent performance!!! And D...,8.0,Very kind movie!!!,/user/ur19378453/,0,0,2022-01-29


In [11]:
num_users, num_titles = user_item_matrix.shape
num_total_elements = num_users * num_titles
num_nonempty_elements = (~user_item_matrix.isna()).sum().sum()
sparsity = 100 * (num_total_elements - num_nonempty_elements) / num_total_elements
print(sparsity)

99.88075755787395


In [None]:
memusage_f32 = user_item_matrix.memory_usage(deep=True).sum() / 1024 / 1024
memusage_f16 = user_item_matrix.astype('float16').memory_usage(deep=True).sum() / 1024 / 1024
print(
    f'Float32 memory footprint: {memusage_f32:.1f} Mb',
    f'Float16 memory footprint: {memusage_f16:.1f} Mb',
    sep='\n'
)

In [23]:
vals = metadata.nlargest(20, 'rating').index
relevant_metadata = metadata[metadata.index.isin(vals)]
relevant_metadata

Unnamed: 0,main_genre,original_title,poster_url,actors,imdb_recommendations,reviews_collected_flg,rating,num_votes,user_review_num,critic_review_num,...,original_language,production_company_1,production_company_2,production_company_3,filming_location,filming_country,budget,boxoffice_gross_domestic,boxoffice_gross_opening,boxoffice_gross_worldwide
/title/tt0050083/,Crime,12 Angry Men,https://m.media-amazon.com/images/M/MV5BMWU4N2...,"{'1': '/name/nm0000020', '2': '/name/nm0002011...","{'1': '/title/tt0108052/', '2': '/title/tt0111...",1,9.0,771000,1900.0,152.0,...,English,Orion-Nova Productions,,,"New York County Courthouse - 60 Centre Street,...",USA,"$350,000 (estimated)",,,$955
/title/tt0068646/,Crime,The Godfather,https://m.media-amazon.com/images/M/MV5BM2MyNj...,"{'1': '/name/nm0000008', '2': '/name/nm0000199...","{'1': '/title/tt0071562/', '2': '/title/tt0111...",1,9.2,1800000,5000.0,197.0,...,English,Paramount Pictures,Albert S. Ruddy Productions,Alfran Productions,"Forza d'Agrò, Messina, Sicily, Italy",Italy,"$6,000,000 (estimated)","$136,381,073","$302,393","$250,341,816"
/title/tt0071562/,Crime,The Godfather: Part II,https://m.media-amazon.com/images/M/MV5BMWMwMG...,"{'1': '/name/nm0000199', '2': '/name/nm0000134...","{'1': '/title/tt0068646/', '2': '/title/tt0099...",1,9.0,1200000,1200.0,187.0,...,English,Paramount Pictures,The Coppola Company,American Zoetrope,"Kaiser Estate, 4000 W Lake Blvd, Homewood, Lak...",USA,"$13,000,000 (estimated)","$47,834,595","$171,417","$47,961,919"
/title/tt0108052/,Biography,Schindler's List,https://m.media-amazon.com/images/M/MV5BNDE4OT...,"{'1': '/name/nm0000553', '2': '/name/nm0000146...","{'1': '/title/tt0110912/', '2': '/title/tt0111...",1,9.0,1300000,2100.0,173.0,...,English,Universal Pictures,Amblin Entertainment,,"Auschwitz-Birkenau Concentration Camp, Oswieci...",Poland,"$22,000,000 (estimated)","$96,898,818","$656,636","$322,161,245"
/title/tt0111161/,Drama,The Shawshank Redemption,https://m.media-amazon.com/images/M/MV5BMDFkYT...,"{'1': '/name/nm0000209', '2': '/name/nm0000151...","{'1': '/title/tt0468569/', '2': '/title/tt0109...",1,9.3,2600000,10200.0,164.0,...,English,Castle Rock Entertainment,,,"127A Smithfield Road, Frederiksted, Virgin Isl...",Islands,"$25,000,000 (estimated)","$28,767,189","$727,327","$28,884,504"
/title/tt0167260/,Action,The Lord of the Rings: The Return of the King,https://m.media-amazon.com/images/M/MV5BNzA5ZD...,"{'1': '/name/nm0000704', '2': '/name/nm0001557...","{'1': '/title/tt0167261/', '2': '/title/tt0120...",1,9.0,1800000,4000.0,342.0,...,English,New Line Cinema,WingNut Films,The Saul Zaentz Company,"Hinuera Valley, Matamata, Waikato, New Zealand",Zealand,"$94,000,000 (estimated)","$378,251,207","$72,629,713","$1,146,436,214"
/title/tt0232152/,Drama,Nagara Haavu,https://m.media-amazon.com/images/M/MV5BMTEyYW...,"{'1': '/name/nm0024259', '2': '/name/nm0033175...","{'1': '/title/tt1339248/', '2': '/title/tt0315...",1,9.0,1100,10.0,1.0,...,Kannada,Sri Eswari Productions,,,,,,,,
/title/tt0249795/,Comedy,Mayabazar,https://m.media-amazon.com/images/M/MV5BMmQwNj...,"{'1': '/name/nm0004417', '2': '/name/nm0710036...","{'1': '/title/tt0311594/', '2': '/title/tt0249...",1,9.1,5000,34.0,1.0,...,Telugu,Vijaya Pictures,,,"Vauhini Studios, Chennai, Tamil Nadu, India",India,,,,
/title/tt0252487/,Comedy,The Chaos Class,https://m.media-amazon.com/images/M/MV5BOWI4NG...,"{'1': '/name/nm0839017', '2': '/name/nm0654805...","{'1': '/title/tt0252488/', '2': '/title/tt0252...",1,9.3,40000,72.0,1.0,...,Turkish,Arzu Film,,,"Camlica, Istanbul, Turkey",Turkey,,,,
/title/tt0259534/,Animation,Ramayana: The Legend of Prince Rama,https://m.media-amazon.com/images/M/MV5BOTk4NG...,"{'1': '/name/nm4468244', '2': '/name/nm1336047...","{'1': '/title/tt0488836/', '2': '/title/tt1674...",1,9.1,4800,29.0,,...,Hindi,Nippon Ramayana Film Co.,TEM,,,,,,,


In [34]:
relevant_metadata[['genre_1', 'genre_2', 'genre_3']].apply(lambda x: ', '.join(x) if x else '', axis=1)

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [24]:
relevant_metadata.columns

Index(['main_genre', 'original_title', 'poster_url', 'actors',
       'imdb_recommendations', 'reviews_collected_flg', 'rating', 'num_votes',
       'user_review_num', 'critic_review_num', 'metascore', 'genre_1',
       'genre_2', 'genre_3', 'release_date', 'also_known_as', 'runtime',
       'country_of_origin_1', 'country_of_origin_2', 'country_of_origin_3',
       'original_language', 'production_company_1', 'production_company_2',
       'production_company_3', 'filming_location', 'filming_country', 'budget',
       'boxoffice_gross_domestic', 'boxoffice_gross_opening',
       'boxoffice_gross_worldwide'],
      dtype='object')

In [43]:
metadata['release_date'].dt.strftime('%B %Y')

/title/tt0002130/        July 1911
/title/tt0003740/         May 1914
/title/tt0004635/    February 1914
/title/tt0004707/    December 1914
/title/tt0004972/       March 1915
                         ...      
/title/tt9898858/       April 2020
/title/tt9900782/     October 2019
/title/tt9907782/    February 2022
/title/tt9911196/    February 2020
/title/tt9916362/     October 2020
Name: release_date, Length: 23583, dtype: object

In [36]:
metadata['genre_1']

/title/tt0002130/    Adventure
/title/tt0003740/    Adventure
/title/tt0004635/       Action
/title/tt0004707/       Comedy
/title/tt0004972/        Drama
                       ...    
/title/tt9898858/       Action
/title/tt9900782/       Action
/title/tt9907782/      Fantasy
/title/tt9911196/       Comedy
/title/tt9916362/        Drama
Name: genre_1, Length: 23583, dtype: object

In [None]:
relevant_metadata[['original_title', 'poster_url', 'rating', '']]

In [204]:
from abc import ABCMeta
from typing import List, Dict, Any
from itertools import chain


class BaseRecommender(metaclass=ABCMeta):
    RELEVANT_METADATA = [
        'original_title',
        'genre',
        'release_date',
        'country_of_origin_1',
        'production_company_1',
        'rating',
        'poster_url'
    ]
    def __init__(self, metadata: pd.DataFrame, **model_params):
        self.metadata = metadata

    def _recommend(self, **user_input) -> Dict[str, float]:
        pass

    def _format_recommendations(
        self,
        identifiers: Dict[str, float]
    ) -> Dict[str, Any]:
        """
        Returns relevant metadata for each predicted movie.
        """
        metadata_ = self.metadata[self.metadata.index.isin(identifiers)]
        metadata_.loc[identifiers.keys(), 'relevance'] = list(identifiers.values())
        metadata_['genre'] = (
            metadata_[['genre_1', 'genre_2', 'genre_3']]
            .apply(lambda x: ', '.join(x.dropna()), axis=1)
        )
        metadata_['release_date'] = (
            metadata_['release_date']
            .dt.strftime('%B %Y')
        )
        metadata_['rating'] = metadata_['rating'].round(1)

        return (
            metadata_
            .sort_values('relevance', ascending=False)
            [self.RELEVANT_METADATA]
            .to_dict(orient='records')
        )
    
    def recommend(
        self,
        user_liked_movies: List[str] = None,
        user_preferences: Dict[str, Any] = None
    ) -> Dict[str, Any]:
        user_input = {
            'user_liked_movies': user_liked_movies,
            'user_preferences': user_preferences
        }
        raw_recommendations = self._recommend(**user_input)
        return self._format_recommendations(raw_recommendations)


class PopularRecommender(BaseRecommender):
    def __init__(self, metadata: pd.DataFrame, top_n: int):
        BaseRecommender.__init__(self, metadata)
        self.popular_titles = (
            self.metadata
            .query('num_votes > 50000')
            .nlargest(top_n, 'rating')
        )

    def _recommend(self, **user_input) -> Dict[str, float]:
        return self.popular_titles['rating'].to_dict()


class IMDBRecommender(BaseRecommender):
    def __init__(self, metadata: pd.DataFrame, top_n: int):
        BaseRecommender.__init__(self, metadata)
        self.metadata['imdb_recommendations_set'] = (
            self.metadata['imdb_recommendations']
            .apply(lambda x: set(x.values()))
        )
        self.top_n = top_n
        
    def _recommend(self, user_liked_movies: List[str], **user_input)\
            -> List[str]:
        # metadata about movies that user liked
        user_liked_movies_metadata = self.metadata[
            self.metadata.index.isin(user_liked_movies)
        ]
        # set of recommendations for each liked movie
        all_imdb_recommendations = (
            user_liked_movies_metadata['imdb_recommendations_set']
            .tolist()
        )
        # metadata for unique recommendations for all liked movies
        # recommend only movies with highest rating
        unique_recommendations_metadata = self.metadata[
            self.metadata.index.isin(set(chain(*all_imdb_recommendations)))
        ]
        return (
            unique_recommendations_metadata
            .nlargest(self.top_n, 'rating')
            ['rating']
            .to_dict()
        )

In [206]:
liked_movies = np.random.choice(metadata.index, 5).tolist()

In [209]:
%%time
poprec = PopularRecommender(metadata, 10)
poprec.recommend()[:2]

CPU times: user 15 ms, sys: 674 µs, total: 15.7 ms
Wall time: 15.1 ms


[{'original_title': 'The Shawshank Redemption',
  'genre': 'Drama',
  'release_date': 'October 1994',
  'country_of_origin_1': 'United States',
  'production_company_1': 'Castle Rock Entertainment',
  'rating': 9.3,
  'poster_url': 'https://m.media-amazon.com/images/M/MV5BMDFkYTc0MGEtZmNhMC00ZDIzLWFmNTEtODM1ZmRlYWMwMWFmXkEyXkFqcGdeQXVyMTMxODk2OTU@._V1_QL75_UX190_CR0,0,190,281_.jpg'},
 {'original_title': 'The Godfather',
  'genre': 'Crime, Drama',
  'release_date': 'March 1972',
  'country_of_origin_1': 'United States',
  'production_company_1': 'Paramount Pictures',
  'rating': 9.2,
  'poster_url': 'https://m.media-amazon.com/images/M/MV5BM2MyNjYxNmUtYTAwNi00MTYxLWJmNWYtYzZlODY3ZTk3OTFlXkEyXkFqcGdeQXVyNzkwMjQ5NzM@._V1_QL75_UY281_CR4,0,190,281_.jpg'}]

In [208]:
%%time
imdbrec = IMDBRecommender(metadata, 10)
imdbrec.recommend(liked_movies)[:2]

CPU times: user 52.1 ms, sys: 1.17 ms, total: 53.2 ms
Wall time: 52.5 ms


[{'original_title': 'Rocky',
  'genre': 'Drama, Sport',
  'release_date': 'December 1976',
  'country_of_origin_1': 'United States',
  'production_company_1': 'Chartoff-Winkler Productions',
  'rating': 8.1,
  'poster_url': 'https://m.media-amazon.com/images/M/MV5BNTBkMjg2MjYtYTZjOS00ODQ0LTg0MDEtM2FiNmJmOGU1NGEwXkEyXkFqcGdeQXVyMjUzOTY1NTc@._V1_QL75_UX190_CR0,4,190,281_.jpg'},
 {'original_title': 'First Blood',
  'genre': 'Action, Adventure, Thriller',
  'release_date': 'October 1982',
  'country_of_origin_1': 'United States',
  'production_company_1': 'Anabasis N.V.',
  'rating': 7.7,
  'poster_url': 'https://m.media-amazon.com/images/M/MV5BODBmOWU2YWMtZGUzZi00YzRhLWJjNDAtYTUwNWVkNDcyZmU5XkEyXkFqcGdeQXVyNDk3NzU2MTQ@._V1_QL75_UX190_CR0,2,190,281_.jpg'}]

In [211]:
metadata.columns

Index(['main_genre', 'original_title', 'poster_url', 'actors',
       'imdb_recommendations', 'reviews_collected_flg', 'rating', 'num_votes',
       'user_review_num', 'critic_review_num', 'metascore', 'genre_1',
       'genre_2', 'genre_3', 'release_date', 'also_known_as', 'runtime',
       'country_of_origin_1', 'country_of_origin_2', 'country_of_origin_3',
       'original_language', 'production_company_1', 'production_company_2',
       'production_company_3', 'filming_location', 'filming_country', 'budget',
       'boxoffice_gross_domestic', 'boxoffice_gross_opening',
       'boxoffice_gross_worldwide', 'imdb_recommendations_set'],
      dtype='object')

In [218]:
metadata.groupby('original_language')['main_genre'].count().sort_values(ascending=False)[:10]

original_language
English     15589
French       1122
Hindi        1066
Japanese      633
Italian       515
Spanish       500
Turkish       452
German        368
Tamil         268
Korean        244
Name: main_genre, dtype: int64

In [212]:
metadata['genre_1']

/title/tt0002130/    Adventure
/title/tt0003740/    Adventure
/title/tt0004635/       Action
/title/tt0004707/       Comedy
/title/tt0004972/        Drama
                       ...    
/title/tt9898858/       Action
/title/tt9900782/       Action
/title/tt9907782/      Fantasy
/title/tt9911196/       Comedy
/title/tt9916362/        Drama
Name: genre_1, Length: 23583, dtype: object

In [3]:
top_n_actors = 5
actors = [
    (list(item.values()) + ['']*(top_n_actors - len(item)))[:top_n_actors]
    for item in metadata['actors']
]
actors_df = pd.DataFrame.from_records(actors)

In [5]:
actors_df

Unnamed: 0,0,1,2,3,4
0,/name/nm0660139,/name/nm0685283,/name/nm0209738,/name/nm3942815,/name/nm1375863
1,/name/nm0021935,/name/nm0702894,/name/nm0656034,/name/nm0146028,/name/nm0544842
2,/name/nm0267914,/name/nm0758457,/name/nm0455612,/name/nm0277317,/name/nm0298243
3,/name/nm0000122,/name/nm0237597,/name/nm0635667,/name/nm0841501,/name/nm0071658
4,/name/nm0001273,/name/nm0550615,/name/nm0910400,/name/nm0178270,/name/nm0017488
...,...,...,...,...,...
23578,/name/nm1159180,/name/nm0378245,/name/nm10067359,/name/nm2365811,/name/nm0498165
23579,/name/nm1912683,/name/nm1230844,/name/nm10183124,/name/nm6998719,/name/nm6649988
23580,/name/nm2933542,/name/nm0717709,/name/nm0677944,/name/nm3646923,/name/nm0079451
23581,/name/nm0277932,/name/nm0824373,/name/nm3362584,/name/nm2558112,/name/nm10877188


In [7]:
from sklearn.preprocessing import OneHotEncoder

In [38]:
main_actors_categories = 20
actor_encoder = OneHotEncoder(max_categories=main_actors_categories)
actor_encoder.fit(actors_df[0].to_frame())

In [56]:
def get_actor_categories(
    actors: pd.Series,
    max_categories: int,
    prefix: str = ''
) -> pd.DataFrame:
    actor_encoder = OneHotEncoder(max_categories=max_categories)
    actor_encoder.fit(actors.to_frame())

    actor_encoded = pd.DataFrame(
        data=actor_encoder.transform(actors.to_frame()).todense(),
        columns=actor_encoder.get_feature_names(),
        dtype=bool
    )
    actor_encoded.columns = actor_encoded.columns.str.removeprefix('x0_')
    return actor_encoded.add_prefix(prefix)

get_actor_categories(actors_df[0], 20, 'first_')

Unnamed: 0,first_/name/nm0000044,first_/name/nm0000078,first_/name/nm0000115,first_/name/nm0000134,first_/name/nm0000142,first_/name/nm0000158,first_/name/nm0000230,first_/name/nm0000243,first_/name/nm0000246,first_/name/nm0000323,first_/name/nm0000329,first_/name/nm0000821,first_/name/nm0001191,first_/name/nm0006795,first_/name/nm0222426,first_/name/nm0451321,first_/name/nm0474774,first_/name/nm0482320,first_/name/nm0839017,first_infrequent_sklearn
0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True
1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True
2,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True
3,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True
4,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23578,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True
23579,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True
23580,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True
23581,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True


In [43]:
actor_encoder.get_feature_names()

array(['x0_/name/nm0000044', 'x0_/name/nm0000078', 'x0_/name/nm0000115',
       'x0_/name/nm0000134', 'x0_/name/nm0000142', 'x0_/name/nm0000158',
       'x0_/name/nm0000230', 'x0_/name/nm0000243', 'x0_/name/nm0000246',
       'x0_/name/nm0000323', 'x0_/name/nm0000329', 'x0_/name/nm0000821',
       'x0_/name/nm0001191', 'x0_/name/nm0006795', 'x0_/name/nm0222426',
       'x0_/name/nm0451321', 'x0_/name/nm0474774', 'x0_/name/nm0482320',
       'x0_/name/nm0839017', 'x0_infrequent_sklearn'], dtype=object)

In [57]:
metadata.columns

Index(['main_genre', 'original_title', 'poster_url', 'actors',
       'imdb_recommendations', 'reviews_collected_flg', 'rating', 'num_votes',
       'user_review_num', 'critic_review_num', 'metascore', 'genre_1',
       'genre_2', 'genre_3', 'release_date', 'also_known_as', 'runtime',
       'country_of_origin_1', 'country_of_origin_2', 'country_of_origin_3',
       'original_language', 'production_company_1', 'production_company_2',
       'production_company_3', 'filming_location', 'filming_country', 'budget',
       'boxoffice_gross_domestic', 'boxoffice_gross_opening',
       'boxoffice_gross_worldwide'],
      dtype='object')

In [490]:
from typing import List, Any


def find_boxoffice_quantile_range(x, q):
    for i, j, l, r in zip(q, q[1:], q.index, q.index[1:]):
        if i <= x < j:
            return f'boxoffice_from_q{l}_to_q{r}'
    return 'boxoffice_unknown_q'


def concat_genres(row):
    genre = [row['genre_1']]
    if row['genre_2']:
        genre += [row['genre_2']]
    if row['genre_3']:
        genre += [row['genre_3']]
    return ', '.join(sorted(genre))


def get_runtime_bin_labels(bins: List[float]) -> List[str]:
    return [
        f'runtime_from_q{100*l:.0f}_to_q{100*r:.0f}'
        for l, r in zip(bins, bins[1:])
    ]



class FeatureTransformer:
    def __init__(
        self,
        min_num_votes: int = None,
        n_actor_feat: int = 10,
        n_genre_feat: int = 20,
        n_prod_comp_feat: int = 20,
        n_country_feat: int = 20,
        n_lang_feat: int = 20,
        boxoffice_quantiles: List[float] = [0, 0.1, 0.25, 0.5, 0.75, 0.9, 1],
        runtime_quantiles: List[float] = [0, 0.1, 0.25, 0.5, 0.75, 0.9, 1],
        rating_bins: List[int] = [0, 4, 6, 8, 10]
    ):
        self.min_num_votes = min_num_votes
        self.n_actor_feat = n_actor_feat
        self.n_genre_feat = n_genre_feat
        self.n_prod_comp_feat = n_prod_comp_feat
        self.n_country_feat = n_country_feat
        self.n_lang_feat = n_lang_feat
        self.boxoffice_quantiles = boxoffice_quantiles
        self.runtime_quantiles = runtime_quantiles
        self.rating_bins = rating_bins

    @staticmethod
    def get_genre_features(
        metadata: pd.DataFrame,
        n_categories: int
    ) -> pd.DataFrame:
        genre_comb = (
            metadata[['genre_1', 'genre_2', 'genre_3']]
            .apply(concat_genres, axis=1)
            .to_frame()
        )
        encoder = OneHotEncoder(max_categories=n_categories)
        encoder.fit(genre_comb)
        features = pd.DataFrame(
            data=encoder.transform(genre_comb).todense(),
            index=metadata.index,
            columns=encoder.get_feature_names()
        )
        features.columns = features.columns.str.removeprefix('x0_')
        return features

    @staticmethod
    def get_release_decade_features(metadata: pd.DataFrame) -> pd.DataFrame:
        release_decade = (
            metadata['release_date']
            .dt.year.round(-1)
            .to_frame()
        )
        encoder = OneHotEncoder()
        encoder.fit(release_decade)
        features = pd.DataFrame(
            data=encoder.transform(release_decade).todense(),
            index=metadata.index,
            columns=encoder.get_feature_names()
        )
        features.columns = features.columns.str.removeprefix('x0_')
        features.columns = features.columns.str[:-2]
        features = features.add_prefix('released_in_').add_suffix('s')
        features.columns.values[-1] = 'unknown_release_decade'
        return features.astype('int8')

    @staticmethod
    def get_actor_features(
        metadata: pd.DataFrame,
        n_categories: int
    ) -> pd.DataFrame:
        top_n = 3
        actors = [
            (list(item.values()) + ['']*(top_n - len(item)))[:top_n]
            for item in metadata['actors']
        ]
        actors = pd.DataFrame.from_records(actors, index=metadata.index)

        encoder = OneHotEncoder(max_categories=n_categories)

        features = []
        for i in range(top_n):
            encoder.fit(actors[i].to_frame())
            feat = pd.DataFrame(
                data=encoder.transform(actors[i].to_frame()).todense(),
                index=metadata.index,
                columns=encoder.get_feature_names()
            ).drop('x0_infrequent_sklearn', axis=1)
            features.append(feat)

        features = pd.concat(features, axis=1)
        features = features[features.columns.unique()]
        features['another'] = features.sum(axis=1) == 0
        features.columns = features.columns.str.removeprefix('x0_/name/')
        return (
            features
            .astype('int8')
            .add_prefix('actor_')
            .drop(columns='actor_x0_')
        )
        
    @staticmethod
    def get_country_features(
        metadata: pd.DataFrame,
        n_categories: int
    ) -> pd.DataFrame:
        country = metadata['country_of_origin_1'].to_frame()
        encoder = OneHotEncoder(max_categories=n_categories)
        encoder.fit(country)
        features = pd.DataFrame(
            data=encoder.transform(country).todense(),
            index=metadata.index,
            columns=encoder.get_feature_names()
        )
        features.columns = features.columns.str.removeprefix('x0_')
        features.columns.values[-1] = 'infrequent_country'
        return features.add_prefix('originated_in_').astype('int8')

    @staticmethod
    def get_language_features(metadata: pd.DataFrame, n_categories: int) -> pd.DataFrame:
        language = metadata['original_language'].to_frame()
        encoder = OneHotEncoder(max_categories=n_categories)
        encoder.fit(language)
        features = pd.DataFrame(
            data=encoder.transform(language).todense(),
            index=metadata.index,
            columns=encoder.get_feature_names()
        )
        features.columns = features.columns.str.removeprefix('x0_')
        features.columns.values[-1] = 'infrequent'
        return features.add_suffix('_language').astype('int8')

    @staticmethod
    def get_rating_features(metadata: pd.DataFrame, bins: List[int]) -> pd.DataFrame:
        rating_cat_mapping = {
            f'({l}, {r}]': f'rating_from_{l}_to_{r}'
            for l, r in zip(bins, bins[1:])
        }
        rating = (
            pd.cut(metadata['rating'], bins)
            .astype(str)
            .apply(lambda x: rating_cat_mapping.get(x))
            .to_frame()
        )
        encoder = OneHotEncoder()
        encoder.fit(rating)
        features = pd.DataFrame(
            data=encoder.transform(rating).todense(),
            index=metadata.index,
            columns=encoder.get_feature_names()
        )
        features.columns = features.columns.str.removeprefix('x0_')
        return features.astype('int8')

    @staticmethod
    def get_prod_company_features(metadata: pd.DataFrame, n_categories: int) -> pd.DataFrame:
        company = metadata['production_company_1'].to_frame()
        encoder = OneHotEncoder(max_categories=n_categories)
        encoder.fit(company)
        features = pd.DataFrame(
            data=encoder.transform(company).todense(),
            index=metadata.index,
            columns=encoder.get_feature_names()
        )
        features.columns = features.columns.str.removeprefix('x0_')
        features.columns.values[-1] = 'infrequent_company'
        if 'None' in features.columns:
            features = features.rename({'None': 'unknown_company'}, axis=1)
        return features.add_prefix('produced_by_').astype('int8')        

    @staticmethod
    def get_boxoffice_features(
        metadata: pd.DataFrame,
        quantiles: List[float]
    ) -> pd.DataFrame:
        dollars = metadata[metadata['boxoffice_gross_worldwide'].str[0] == '$']
        dollars['decade'] = dollars['release_date'].dt.year.round(-1)
        dollars['boxoffice'] = (
            dollars['boxoffice_gross_worldwide']
            .str.replace('$', '')
            .str.replace(',', '')
            .astype('int')
        )
        dollars_pivot = (
            dollars
            .groupby('decade')
            ['boxoffice']
            .quantile(quantiles, interpolation='nearest')
            .reset_index()
            .assign(quantile=lambda x: (100*x['level_1']).astype(int))
            .pivot(index='decade', columns='quantile', values='boxoffice')
        )
        dollars_pivot[100] += 1
        
        boxoffice_cat = (
            dollars[['boxoffice', 'decade']]
            .apply(lambda x: find_boxoffice_quantile_range(
                x['boxoffice'], dollars_pivot.loc[x['decade']]
            ), axis=1)
        )
        boxoffice_cat.name = 'boxoffice_category'
        unknown_boxoffice_cat = (
            metadata[~metadata.index.isin(boxoffice_cat.index)]
            .assign(boxoffice_category='boxoffice_unknown_q')
            ['boxoffice_category']
        )
        boxoffice = pd.concat([boxoffice_cat, unknown_boxoffice_cat])
        return pd.get_dummies(boxoffice)

    @staticmethod
    def get_runtime_features(metadata, quantiles: List[float]) -> pd.DataFrame:
        labels = get_runtime_bin_labels(quantiles)
        runtime_features = pd.qcut(metadata['runtime'], quantiles, labels=labels)
        return pd.get_dummies(runtime_features)

    def transform(self, metadata: pd.DataFrame) -> pd.DataFrame:
        metadata_ = metadata.copy(deep=False)
        if self.min_num_votes:
            metadata_ = metadata_.query(f'num_votes > {self.min_num_votes}')

        features = pd.concat([
            self.get_boxoffice_features(metadata, self.boxoffice_quantiles),
            self.get_actor_features(metadata, self.n_actor_feat),
            self.get_country_features(metadata, self.n_country_feat),
            self.get_genre_features(metadata, self.n_genre_feat),
            self.get_language_features(metadata, self.n_lang_feat),
            self.get_prod_company_features(metadata, self.n_prod_comp_feat),
            self.get_rating_features(metadata, self.rating_bins),
            self.get_release_decade_features(metadata),
            self.get_runtime_features(metadata, self.runtime_quantiles)
        ], axis=1)
        return features.astype('uint8')

In [491]:
feats = FeatureTransformer(5000).transform(metadata)
feats.shape

(23583, 137)

In [492]:
feats.memory_usage(deep=True).sum()/1024/1024

4.74636173248291