# Set Up

Ideas for features:
- Genre
- Rating: (adult vs. kid)
- Meta score: (critics' rating)
- IMDB score: (people's rating)
- Gross: (box office success)
- Release year

In [103]:
# Data comes from: https://www.kaggle.com/datasets/harshitshankhdhar/imdb-dataset-of-top-1000-movies-and-tv-shows
import pandas as pd
# Load the raw dataset; the path is relative, so the CSV must sit in the
# notebook's working directory.
df = pd.read_csv('imdb_top_1000.csv')

In [104]:
# Collapse the raw certificate labels into two audience buckets plus 'Unknown'.
# Keys are inserted in the same order as before so `rating_map` is unchanged.
rating_map = dict.fromkeys(
    ['U', 'G', 'Approved', 'Passed', 'UA', 'PG', 'TV-PG', 'GP'],
    'Family-Friendly',
)
rating_map.update(
    dict.fromkeys(['PG-13', 'TV-14', 'A', 'R', 'TV-MA', '16'], 'Adult')
)
rating_map['U/A'] = 'Family-Friendly'
rating_map['Unrated'] = 'Unknown'

# Certificates that are missing or absent from the map end up as 'Unknown'.
df['Normalized_Certificate'] = (
    df['Certificate']
    .map(rating_map)
    .fillna('Unknown')
)

In [105]:
# Build one boolean indicator column per genre found in the comma-separated
# `Genre` field.
df['Genre'] = df['Genre'].fillna('')

# Split each row once; empty tokens (from a missing Genre) are dropped so that
# no column named '' is ever created.
genre_lists = df['Genre'].apply(
    lambda text: [token.strip() for token in text.split(',') if token.strip()]
)
unique_genres = {token for tokens in genre_lists for token in tokens}

# Iterate in sorted order: the iteration order of a set of strings is not
# stable between interpreter runs, which would shuffle the new columns on
# every kernel restart.
for genre in sorted(unique_genres):
    df[genre] = genre_lists.apply(lambda tokens, genre=genre: genre in tokens)

In [106]:
def map_release_year(year):
    """Bucket a release year into a coarse era label.

    year: Any value ``int()`` can convert (int, numeric string, float).

    Returns: 'Classic' for years before 1980, 'Modern' for 1980-2009,
        'Recent' for 2010 onwards, and 'Unknown' when the value cannot be
        converted to an integer (e.g. None, NaN, or a non-numeric string).
    """
    try:
        year = int(year)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures mean "unknown". A bare `except:` would also
        # swallow KeyboardInterrupt / SystemExit and hide unrelated bugs.
        return 'Unknown'
    if year < 1980:
        return 'Classic'
    elif year < 2010:
        return 'Modern'
    else:
        return 'Recent'

# Label every movie with its era bucket, one element at a time.
df['Release_Category'] = df['Released_Year'].map(map_release_year)

In [None]:
# Pull the first run of digits out of `Runtime` and store it as a float
# (float so that rows without a match can hold NaN).
# The pattern is a raw string because '\d' is an invalid escape sequence in a
# normal string literal; expand=False makes extract() return a Series rather
# than a one-column DataFrame.
df['Runtime_int'] = df['Runtime'].str.extract(r'(\d+)', expand=False).astype(float)

In [108]:
# Scale the critics' Meta_score down by a factor of ten.
df['Normalized_Meta_Score'] = df['Meta_score'].div(10)

In [110]:
# List every column currently on `df`, including the engineered ones.
df.columns

Index(['Poster_Link', 'Series_Title', 'Released_Year', 'Certificate',
       'Runtime', 'Genre', 'IMDB_Rating', 'Overview', 'Meta_score', 'Director',
       'Star1', 'Star2', 'Star3', 'Star4', 'No_of_Votes', 'Gross',
       'Normalized_Certificate', 'Western', 'Comedy', 'Film-Noir', 'Sport',
       'Musical', 'Thriller', 'Sci-Fi', 'Mystery', 'Animation', 'Music',
       'Drama', 'Action', 'Family', 'Crime', 'Romance', 'Fantasy', 'Biography',
       'Horror', 'History', 'War', 'Adventure', 'Release_Category',
       'Runtime_int', 'Normalized_Meta_Score'],
      dtype='object')

In [112]:
# Keep only the columns the recommender needs. `.copy()` makes `final` an
# independent frame, so adding or updating columns on it later neither touches
# `df` nor raises SettingWithCopyWarning.
final = df[[
    'Poster_Link',
    'Series_Title',
    'Overview',
    'Director',
    'Release_Category',
    'Runtime_int',
    'Genre',
    'IMDB_Rating',
    'Normalized_Meta_Score',
    'Normalized_Certificate',
]].copy()

In [115]:
# Preview the first two rows of the trimmed frame.
final.head(2)

Unnamed: 0,Poster_Link,Series_Title,Overview,Director,Release_Category,Runtime_int,Genre,IMDB_Rating,Normalized_Meta_Score,Normalized_Certificate
0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,Two imprisoned men bond over a number of years...,Frank Darabont,Modern,142.0,Drama,9.3,8.0,Adult
1,https://m.media-amazon.com/images/M/MV5BM2MyNj...,The Godfather,An organized crime dynasty's aging patriarch t...,Francis Ford Coppola,Classic,175.0,"Crime, Drama",9.2,10.0,Adult


In [118]:
def rank_movies(df, inputs_1, inputs_2, genre_weights=None):
    """
    Score and rank movies against the preferences of two users.

    df: DataFrame of movies. Must contain the 'Genre' (comma-separated genre
        names) and 'Normalized_Certificate' columns. It is not modified.
    inputs_1: Dictionary with user 1's preferences.
    inputs_2: Dictionary with user 2's preferences.
    genre_weights: Optional mapping from preference key to the points a movie
        earns when it has that genre. Defaults to {'genre1': 3}, i.e. only
        each user's first-choice genre is scored.
    Example:
        {'content_rating': 'Adult', 'genre1': 'Comedy', 'genre2': 'Drama', 'genre3': 'Adventure'}

    Returns: A new DataFrame with an added 'score' column, sorted from highest
        to lowest score (ties keep their original order). When both users ask
        for the same content rating, only movies with that rating are kept;
        otherwise no rating filter is applied.
    """
    if genre_weights is None:
        genre_weights = {'genre1': 3}

    # Always work on a private copy: the caller's frame must not gain a
    # 'score' column, and writing into a filtered slice is chained assignment.
    if inputs_1['content_rating'] == inputs_2['content_rating']:
        same_rating = df['Normalized_Certificate'] == inputs_1['content_rating']
        ranked = df[same_rating].copy()
    else:
        # The users disagree on content rating, so nothing is filtered out.
        ranked = df.copy()

    # Tokenise each Genre string once. Matching whole tokens (case-insensitive)
    # avoids substring/regex false positives such as 'Music' hitting 'Musical'.
    genre_tokens = ranked['Genre'].fillna('').map(
        lambda text: {
            token.strip().casefold()
            for token in str(text).split(',')
            if token.strip()
        }
    )

    total = 0
    for preferences in (inputs_1, inputs_2):
        for key, weight in genre_weights.items():
            wanted = preferences.get(key)
            if not wanted:
                continue  # this user did not supply that preference
            wanted = str(wanted).strip().casefold()
            hits = genre_tokens.map(
                lambda tokens, wanted=wanted: wanted in tokens
            ).astype(bool)
            total = total + hits * weight

    ranked['score'] = total

    # mergesort is stable, so equally scored movies keep their input order.
    return ranked.sort_values('score', ascending=False, kind='mergesort')



In [116]:
# Sample preferences for the two users: one content rating plus three genres
# each, keyed 'genre1' through 'genre3'.
inputs_1 = dict(
    content_rating='Adult',
    genre1='Comedy',
    genre2='Drama',
    genre3='Adventure',
)

inputs_2 = dict(
    content_rating='Adult',
    genre1='Drama',
    genre2='Thriller',
    genre3='Action',
)

In [None]:
# Score the trimmed movie table against both users' sample preferences.
ranked_df = rank_movies(df=final, inputs_1=inputs_1, inputs_2=inputs_2)