# Set Up

Ideas for features:
- Genre
- Rating (adult vs. kid)
- Meta score (critics' rating)
- IMDB score (people's rating)
- Gross (box-office success)
- Release year

In [1]:
# Data comes from: https://www.kaggle.com/datasets/harshitshankhdhar/imdb-dataset-of-top-1000-movies-and-tv-shows
# The CSV is read through a relative path, so it has to sit in the directory the
# notebook is run from.
import pandas as pd
df = pd.read_csv('imdb_top_1000.csv')

In [2]:
# Collapse the many certificate labels into two audience buckets.
# Certificates that are missing, or not listed here, end up as 'Unknown'.
rating_map = {
    **dict.fromkeys(
        ['U', 'G', 'Approved', 'Passed', 'UA', 'PG', 'TV-PG', 'GP', 'U/A'],
        'Family-Friendly',
    ),
    **dict.fromkeys(['PG-13', 'TV-14', 'A', 'R', 'TV-MA', '16'], 'Adult'),
    'Unrated': 'Unknown',
}
df['Normalized_Certificate'] = df['Certificate'].map(rating_map).fillna('Unknown')

In [46]:
def map_release_year(year):
    """
    Bucket a release year into a coarse era label.

    year: Release year as an int or as a numeric string.

    Returns: 'Classic' for years before 1990, 'Modern' for 1990 and later,
    and 'Unknown' when the value cannot be converted to an integer.
    """
    try:
        year = int(year)
    except (TypeError, ValueError, OverflowError):
        # Non-numeric strings, None, NaN and infinities all mean "no usable year".
        return 'Unknown'
    # The label has to be 'Modern': that is the value rank_movies filters on.
    return 'Classic' if year < 1990 else 'Modern'

# Label every movie with its era bucket, one release year at a time.
df['Release_Category'] = df['Released_Year'].map(map_release_year)

In [None]:
# Pull the first run of digits out of the runtime text.
# NOTE(review): presumably the values look like '142 min' - confirm against the CSV.
# The pattern is a raw string so that \d reaches the regex engine untouched; in a
# plain string literal it is an invalid escape sequence that Python warns about.
# expand=False makes str.extract return a Series rather than a one-column frame.
# The result is stored as float, so rows without any digits become NaN.
df['Runtime_int'] = df['Runtime'].str.extract(r'(\d+)', expand=False).astype(float)

In [34]:
# Scale Meta_score down by a factor of 10 (presumably to put it on the same range
# as IMDB_Rating - confirm); movies without a Meta_score fall back to their
# IMDB_Rating.
df['Normalized_Meta_Score'] = df['Meta_score'].div(10).fillna(df['IMDB_Rating'])

In [35]:
# Keep only the columns the recommender works with. The explicit .copy() gives
# `final` its own data, so adding a column to it later neither triggers pandas'
# SettingWithCopyWarning nor depends on whether the selection is a view of `df`.
final = df[[
    'Poster_Link',
    'Series_Title',
    'Overview',
    'Director',
    'Release_Category',
    'Runtime_int',
    'Genre',
    'IMDB_Rating',
    'Normalized_Meta_Score',
    'Normalized_Certificate',
]].copy()

In [58]:
def get_runtime_category(runtime, user_preference):
    """
    Rate how closely a runtime fits the length bucket a user asked for.

    runtime: Numeric movie length (the bucket bounds are presumably minutes).
    user_preference: One of 'Very Short', 'Short', 'Medium', 'Long' or
        'Very Long'; any other value raises KeyError.

    Returns: 3 when the runtime lies inside the bucket (bounds inclusive),
    2 when it lies within 30 of either bucket bound, otherwise 1.
    """
    bounds_by_preference = {
        'Very Short': (0, 45),
        'Short': (45, 90),
        'Medium': (90, 120),
        'Long': (120, 150),
        'Very Long': (150, float('inf')),
    }
    lower, upper = bounds_by_preference[user_preference]

    # Exact hit: the runtime falls inside the requested bucket.
    if lower <= runtime <= upper:
        return 3

    # Near miss: the runtime is no more than 30 away from one of the bounds.
    near_lower = lower - 30 <= runtime <= lower + 30
    near_upper = upper - 30 <= runtime <= upper + 30
    return 2 if (near_lower or near_upper) else 1


In [65]:
def apply_genre_score(row, user_inputs):
    """
    Score one movie against a single user's ranked genre preferences.

    row: Mapping or Series whose 'Genre' entry is a comma-separated genre
        string; a missing (NaN/None) value scores 0.
    user_inputs: Dictionary that may hold 'genre1', 'genre2' and 'genre3',
        the user's first, second and third genre choice.

    Returns: 5, 3 or 1 for the highest-ranked choice found among the movie's
    genres, or 0 when none of them is present.
    """
    if pd.isna(row['Genre']):
        return 0

    # Compare whole genre names, case-insensitively. Substring matching would
    # let a preference for 'Music' also match every 'Musical'.
    movie_genres = {genre.strip().lower() for genre in row['Genre'].split(',')}
    genre_points = {'genre1': 5, 'genre2': 3, 'genre3': 1}

    # The dict is ordered from most to fewest points, so the first hit is the
    # maximum the movie can earn.
    for genre_key, points in genre_points.items():
        preferred_genre = user_inputs.get(genre_key)
        if preferred_genre and preferred_genre.strip().lower() in movie_genres:
            return points
    return 0

def rank_movies(df, inputs_1, inputs_2):
    """
    Score every movie against two users' preferences and return a few top picks.

    df: DataFrame of movies with the columns 'Release_Category',
        'Normalized_Meta_Score', 'IMDB_Rating', 'Genre' and 'Runtime_int'.
        It is left untouched; all scoring happens on a copy.
    inputs_1: Dictionary with user 1's preferences.
    inputs_2: Dictionary with user 2's preferences.
    Example:
        {'review_preference': 'Audience',
        'genre1':'Comedy', 'genre2':'Drama', 'genre3':'Adventure',
        'release_category': 'Modern',
        'runtime_preference': 'Medium',}
    Every key is optional; a missing preference simply contributes nothing.

    Scoring:
      - When both users ask for the same era ('Modern' or 'Classic'), only
        movies of that era are considered, provided at least one exists.
      - Critic and audience ratings are blended with weights chosen from the
        two 'review_preference' values.
      - Each user adds genre points (apply_genre_score) and runtime points
        (get_runtime_category).

    Returns: DataFrame with an added 'score' column holding up to three movies
    drawn, with a fixed seed, from the ten highest-scoring ones.
    """
    # Work on a private copy so the caller's frame never gains a 'score' column
    # and pandas has no reason to emit SettingWithCopyWarning.
    df = df.copy()
    df['score'] = 0.0
    if df.empty:
        return df

    release_category_1 = inputs_1.get('release_category')
    release_category_2 = inputs_2.get('release_category')
    if release_category_1 == release_category_2 and release_category_1 in ('Modern', 'Classic'):
        in_category = df['Release_Category'] == release_category_1
        # Skip the filter instead of ranking an empty frame when no movie
        # carries the requested label.
        if in_category.any():
            df = df[in_category].copy()

    # (critic_weight, audience_weight) keyed by the unordered pair of
    # preferences; any other combination falls back to an even split.
    review_weights = {
        frozenset({'Critics'}): (0.7, 0.3),
        frozenset({'Audience'}): (0.3, 0.7),
        frozenset({'Critics', 'Neutral'}): (0.6, 0.4),
        frozenset({'Audience', 'Neutral'}): (0.4, 0.6),
    }
    review_preferences = frozenset((
        inputs_1.get('review_preference', 'Neutral'),
        inputs_2.get('review_preference', 'Neutral'),
    ))
    critic_weight, audience_weight = review_weights.get(review_preferences, (0.5, 0.5))

    df['score'] += (critic_weight * df['Normalized_Meta_Score']) + (audience_weight * df['IMDB_Rating'])

    for user_inputs in (inputs_1, inputs_2):
        df['score'] += df.apply(lambda row: apply_genre_score(row, user_inputs), axis=1)

    for user_inputs in (inputs_1, inputs_2):
        runtime_preference = user_inputs.get('runtime_preference')
        if runtime_preference is None:
            # No runtime wish from this user: nothing to add.
            continue
        df['score'] += df['Runtime_int'].apply(
            lambda runtime: get_runtime_category(runtime, runtime_preference)
        )

    top_10 = df.sort_values(by='score', ascending=False).head(10)
    # sample() raises when asked for more rows than exist, so cap the request.
    # The fixed random_state keeps the pick reproducible between runs.
    return top_10.sample(n=min(3, len(top_10)), random_state=42)



In [70]:
# Sample preferences for two viewers; both ask for 'Modern' releases.
inputs_1 = dict(
    review_preference='Audience',
    genre1='Comedy',
    genre2='Drama',
    genre3='Adventure',
    release_category='Modern',
    runtime_preference='Short',
)

inputs_2 = dict(
    review_preference='Critics',
    genre1='Thriller',
    genre2='Drama',
    genre3='Action',
    release_category='Modern',
    runtime_preference='Long',
)

In [None]:
# Run the recommender for the two sample viewers defined above.
ranked_df = rank_movies(df=final, inputs_1=inputs_1, inputs_2=inputs_2)

In [72]:
# Show the suggested movies, best score first.
ranked_df.sort_values('score', ascending=False)

Unnamed: 0,Poster_Link,Series_Title,Overview,Director,Release_Category,Runtime_int,Genre,IMDB_Rating,Normalized_Meta_Score,Normalized_Certificate,score
826,https://m.media-amazon.com/images/M/MV5BMTgxMD...,Barton Fink,A renowned New York playwright is enticed to C...,Joel Coen,Modern,116.0,"Comedy, Drama, Thriller",7.7,6.9,Family-Friendly,21.3
392,https://m.media-amazon.com/images/M/MV5BY2QzMT...,Secrets & Lies,"Following the death of her adoptive parents, a...",Mike Leigh,Modern,136.0,"Comedy, Drama",8.0,9.1,Family-Friendly,20.55
514,https://m.media-amazon.com/images/M/MV5BMzY1Zj...,Almost Famous,A high-school boy is given the chance to write...,Cameron Crowe,Modern,122.0,"Adventure, Comedy, Drama",7.9,9.0,Adult,20.45


In [52]:
# Summary statistics of the parsed runtimes (useful for sanity-checking the
# bucket boundaries used in get_runtime_category).
df['Runtime_int'].describe()

count    1000.000000
mean      122.891000
std        28.093671
min        45.000000
25%       103.000000
50%       119.000000
75%       137.000000
max       321.000000
Name: Runtime_int, dtype: float64

In [24]:
# Which raw certificates ended up in the 'Family-Friendly' bucket, and how often.
df.loc[df['Normalized_Certificate'] == 'Family-Friendly', 'Certificate'].value_counts()

Certificate
U           234
UA          175
PG           37
Passed       34
G            12
Approved     11
TV-PG         3
GP            2
U/A           1
Name: count, dtype: int64

In [26]:
# Spot-check the first 30 titles classified as 'Family-Friendly'.
df.loc[df['Normalized_Certificate'] == 'Family-Friendly', ['Series_Title', 'Certificate']].head(30)

Unnamed: 0,Series_Title,Certificate
2,The Dark Knight,UA
4,12 Angry Men,U
5,The Lord of the Rings: The Return of the King,U
8,Inception,UA
10,The Lord of the Rings: The Fellowship of the Ring,U
11,Forrest Gump,UA
13,The Lord of the Rings: The Two Towers,UA
16,Star Wars: Episode V - The Empire Strikes Back,UA
20,Soorarai Pottru,U
21,Interstellar,UA
