In [1]:
import pandas as pd

# Load the CSV into `df`, the DataFrame that every question function below
# receives as its argument. The path is relative, so it is resolved against
# the directory the notebook kernel is running in.
file_path = 'IMDB-Movie-Data.csv'
df = pd.read_csv(file_path)

In [2]:
# Q(1)Top-3 movies with the highest ratings in 2016? 
def top_3_movies_2016(df):
    """Print a ranked table of the best-rated movies released in 2016.

    The three highest *distinct* ratings among rows whose ``Year`` is 2016
    are selected; every movie holding one of those ratings is printed, so
    movies tied on a rating share the same rank.

    Args:
        df: DataFrame with ``Year``, ``Rating`` and ``Title`` columns.

    Returns:
        None. The table is written to stdout.
    """
    # Keep only 2016 releases, best rating first.
    ranked = df.loc[df['Year'] == 2016].sort_values(by='Rating', ascending=False)
    # unique() keeps first-seen order, so these are the three best ratings.
    best_ratings = ranked['Rating'].unique()[:3]

    print("Rank    Rating     Movies")
    print("--------------------------------------")

    for position, score in enumerate(best_ratings, start=1):
        tied = ranked.loc[ranked['Rating'] == score]
        for _, movie in tied.iterrows():
            print(f"{position:<7} {movie['Rating']:<10} {movie['Title']}")

    return

In [3]:
# Q(2)The actor generating the highest average revenue? 
def actor_highest_avg_revenue(data):
    """Print the actor(s) whose movies have the highest average revenue.

    Average revenue for an actor is defined as
    (sum of the revenues of the actor's movies) / (number of those movies).
    Movies whose revenue is missing are left out of both the sum and the
    count.

    Args:
        data: DataFrame with an ``Actors`` column ('|'-separated names) and
            a ``Revenue (Millions)`` column.

    Returns:
        None. Each winning actor is printed on its own line, followed by a
        summary line with the winning average.
    """
    revenue_col = 'Revenue (Millions)'

    # Movies without a revenue figure do not take part in the average.
    with_revenue = data.dropna(subset=[revenue_col])

    # One row per (movie, actor), with surrounding whitespace removed.
    cast_lists = with_revenue['Actors'].str.split('|')
    per_actor = with_revenue.assign(Actors=cast_lists).explode('Actors')
    per_actor = per_actor.assign(Actors=per_actor['Actors'].str.strip())

    revenue_by_actor = per_actor.groupby('Actors')[revenue_col]
    averages = revenue_by_actor.sum() / revenue_by_actor.size()

    best_average = averages.max()
    # Several actors may share the top average; report all of them.
    winners = averages.index[averages == best_average].tolist()

    for name in winners:
        print(f"{name}")

    print(f"(with the highest average revenue: {best_average} Millions)")

    return


In [4]:
# Q(3) The average rating of Emma Watson’s movies
def avg_rating_emma_watson(df, actor='Emma Watson'):
    """Print the mean rating of the movies whose cast includes ``actor``.

    The ``Actors`` cell of each movie is split on '|' and every name is
    stripped before comparison, so only an exact name match counts. A plain
    substring test would also count a movie starring, for example,
    "Emma Watsonville".

    Args:
        df: DataFrame with an ``Actors`` column ('|'-separated names) and a
            ``Rating`` column. Rows whose ``Actors`` value is missing are
            ignored.
        actor: Name of the actor to look up. Defaults to 'Emma Watson'.

    Returns:
        None. The mean is printed with three decimals; ``nan`` is printed
        when no movie matches.
    """
    wanted = actor.strip()
    # Build the mask row by row so missing (non-string) casts are simply False.
    in_cast = pd.Series(
        [
            isinstance(cast, str)
            and wanted in (name.strip() for name in cast.split('|'))
            for cast in df['Actors']
        ],
        index=df.index,
        dtype=bool,
    )
    avg_rating = df.loc[in_cast, 'Rating'].mean()
    print(f"{avg_rating:.3f}")

In [5]:
# Q(4) Top-3 directors who collaborate with the most actors
def top_3_directors_most_actors(df):
    """Print the directors who have worked with the most distinct actors.

    For each director the set of distinct actor names across all of their
    movies is collected. The three largest *distinct* set sizes define the
    ranks; directors sharing a size are printed on one line, comma
    separated, in order of first appearance in ``df``.

    Args:
        df: DataFrame with ``Director`` and ``Actors`` columns, where
            ``Actors`` holds '|'-separated names.

    Returns:
        None. The table is written to stdout.
    """
    cast_by_director = {}
    for _, movie in df.iterrows():
        who = movie['Director']
        cast = {name.strip() for name in movie['Actors'].split('|')}
        cast_by_director.setdefault(who, set()).update(cast)

    # Number of distinct collaborators per director (dict keeps first-seen order).
    headcounts = {who: len(cast) for who, cast in cast_by_director.items()}
    leading_counts = sorted(set(headcounts.values()), reverse=True)[:3]

    print("Rank    # of Actors     Directors")
    print("--------------------------------------")
    for position, count in enumerate(leading_counts, start=1):
        tied = [str(who) for who, size in headcounts.items() if size == count]
        print(f"{position:<7} {count:<15} " + ", ".join(tied))

    return

In [6]:
# Q(5)Top-2 actors playing in the most genres of movies?
def top_2_actors_most_genres(df):
    """Print the actors who have appeared in the widest variety of genres.

    Every genre of a movie is credited to each of its actors. The two
    largest *distinct* genre counts define the ranks; actors sharing a count
    are printed on one line, comma separated, in order of first appearance
    in ``df``.

    Args:
        df: DataFrame with ``Actors`` and ``Genre`` columns, both holding
            '|'-separated values.

    Returns:
        None. The table is written to stdout.
    """
    genres_by_actor = {}
    for _, movie in df.iterrows():
        cast = [name.strip() for name in movie['Actors'].split('|')]
        labels = {label.strip() for label in movie['Genre'].split('|')}
        for name in cast:
            genres_by_actor.setdefault(name, set()).update(labels)

    # Distinct-genre count per actor (dict keeps first-seen order).
    breadth = {name: len(labels) for name, labels in genres_by_actor.items()}
    leading_counts = sorted(set(breadth.values()), reverse=True)[:2]

    print("Rank    # of Genres     Actors")
    print("--------------------------------------")
    for position, count in enumerate(leading_counts, start=1):
        tied = [name for name, size in breadth.items() if size == count]
        print(f"{position:<7} {count:<15} {', '.join(tied)}")
    return

In [7]:
# Q(6) Which actors have the largest gap in years between their earliest and latest movies?
def max_gap_years(df):
    """Print the actors with the longest span between their movies' years.

    An actor's gap is the difference between the latest and the earliest
    ``Year`` among the movies they appear in. All actors whose gap equals
    the overall maximum are printed, one per line, followed by a summary
    line with how many there are and the gap itself.

    Args:
        df: DataFrame with a ``Year`` column and an ``Actors`` column of
            '|'-separated names.

    Returns:
        None. The result is written to stdout.
    """
    years_by_actor = {}
    for _, movie in df.iterrows():
        release = movie['Year']
        cast = [name.strip() for name in movie['Actors'].split('|')]
        for name in cast:
            years_by_actor.setdefault(name, set()).add(release)

    # Span between the first and last year each actor appears in.
    spans = {name: max(seen) - min(seen) for name, seen in years_by_actor.items()}

    widest = max(spans.values())
    holders = [name for name, span in spans.items() if span == widest]

    for name in holders:
        print(name)

    print(F"({len(holders)} actors with the largest maximum gap of years: {widest})")

    return

In [8]:
# Q(7)Find all actors who collaborate with Johnny Depp in direct and indirect ways
def johnny_depp_collaborators(df):
    """Print every actor linked to Johnny Depp through shared movies.

    Two actors are directly linked when they appear in the same movie.
    Starting from 'Johnny Depp', a breadth-first search over those links
    collects everyone reachable directly or through a chain of co-stars.
    Johnny Depp himself is left out of the result.

    If Johnny Depp does not appear in ``df`` at all, the result is an empty
    set rather than an error.

    Args:
        df: DataFrame with an ``Actors`` column of '|'-separated names.

    Returns:
        None. The set of collaborators and its size are written to stdout.
    """
    from collections import deque

    target = 'Johnny Depp'

    # Map each actor to everyone who shares at least one movie with them
    # (the set also contains the actor; the visited check below absorbs that).
    actor_collabs = {}
    for _, row in df.iterrows():
        actors = [actor.strip() for actor in row['Actors'].split('|')]
        for actor in actors:
            actor_collabs.setdefault(actor, set()).update(actors)

    # Breadth-first search; `collaborators` doubles as the visited set.
    collaborators = set()
    queue = deque([target])

    while queue:
        current_actor = queue.popleft()  # O(1), unlike list.pop(0)
        if current_actor in collaborators:
            continue
        collaborators.add(current_actor)
        # Default to an empty set so an actor missing from the data (notably
        # the target itself) does not make extend() fail on None.
        queue.extend(actor_collabs.get(current_actor, set()) - collaborators)

    # The target is not their own collaborator.
    collaborators.discard(target)
    print(collaborators)
    print(f"(with {len(collaborators)} collaborators in total)")
    return


In [9]:
# Answer every question in order, printing a "Qn:" label before each result.
question_solvers = [
    top_3_movies_2016,            # 1
    actor_highest_avg_revenue,    # 2
    avg_rating_emma_watson,       # 3
    top_3_directors_most_actors,  # 4
    top_2_actors_most_genres,     # 5
    max_gap_years,                # 6
    johnny_depp_collaborators,    # 7
]
for number, solver in enumerate(question_solvers, start=1):
    print(f'Q{number}:')
    solver(df)


Q1:
Rank    Rating     Movies
--------------------------------------
1       8.8        Dangal
2       8.6        Kimi no na wa
3       8.4        Koe no katachi
Q2:
Daisy Ridley
John Boyega
(with the highest average revenue: 936.63 Millions)
Q3:
7.175
Q4:
Rank    # of Actors     Directors
--------------------------------------
1       28              Ridley Scott
2       24              M. Night Shyamalan
3       20              Danny Boyle, Paul W.S. Anderson
Q5:
Rank    # of Genres     Actors
--------------------------------------
1       14              Brad Pitt
2       13              Hugh Jackman, Scarlett Johansson, Amy Adams, Chloe Grace Moretz, Johnny Depp
Q6:
Christian Bale
Anne Hathaway
Hugh Jackman
Scarlett Johansson
Matt Damon
Mark Wahlberg
Brad Pitt
Christopher Plummer
Tom Hanks
Bryce Dallas Howard
Chiwetel Ejiofor
Ben Kingsley
Gerard Butler
Eva Green
Judi Dench
Will Smith
Jennifer Connelly
Tom Cruise
Emily Blunt
Kevin Spacey
Samuel L. Jackson
Steve Carell
Edward Norton
