In [None]:
# import packages
import pandas as pd

import ast

from sklearn.preprocessing import MultiLabelBinarizer

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

In [None]:
# Load the cleaned movie table and the genre id -> name lookup sheet.
# cleaned_data_path stays a plain string (with trailing slash) so it can be
# concatenated with file names.
cleaned_data_path = 'cleaned_data/'

movies_cleaned = pd.read_csv(f'{cleaned_data_path}movies_cleaned.csv')

genre_map = pd.read_excel(
    f'{cleaned_data_path}movie_metadata_supporting.xlsx',
    sheet_name='genres',
)

In [None]:
# map every item of a list-like column value through a lookup dict
def col_list_map(original_data, mapping_dict):
    """Translate each item of ``original_data`` through ``mapping_dict``.

    Parameters
    ----------
    original_data : iterable
        Items to look up (e.g. a list of ids).
    mapping_dict : mapping
        Lookup table indexed by the items of ``original_data``.

    Returns
    -------
    list or None
        The mapped values in input order, or ``None`` when the row cannot be
        mapped: an item is missing from ``mapping_dict`` or
        ``original_data`` / ``mapping_dict`` is not usable (e.g. ``None`` or NaN).
    """
    try:
        return [mapping_dict[item] for item in original_data]
    except (LookupError, TypeError):
        # Only "this row cannot be mapped" errors are turned into None.
        # Anything else (KeyboardInterrupt, real bugs) must propagate instead
        # of being silently swallowed by a bare except.
        return None

In [None]:
# get dummy variables of genres (and derive release_year for the later plots)

# rows whose release_date cannot be parsed are coerced to a missing year
movies_cleaned['release_year'] = pd.to_datetime(movies_cleaned['release_date'], errors='coerce').dt.year

# genre_list is parsed from its string form with ast.literal_eval; the
# isinstance guard leaves already-parsed (or missing) values untouched so the
# cell can be re-run without raising
movies_cleaned['genre_list'] = movies_cleaned['genre_list'].apply(
    lambda raw: ast.literal_eval(raw) if isinstance(raw, str) else raw
)

# build the id -> name lookup once instead of once per row
genre_id_to_name = dict(zip(genre_map['id'], genre_map['name']))
movies_cleaned['genre_name_list'] = movies_cleaned['genre_list'].apply(
    lambda ids: col_list_map(ids, genre_id_to_name)
)

# col_list_map returns None for rows it cannot map; MultiLabelBinarizer needs
# an iterable per row, so such rows are encoded as "no genre" (all zeros)
genre_labels = [
    names if names is not None else []
    for names in movies_cleaned['genre_name_list']
]

mlb = MultiLabelBinarizer()
genre_dummies = pd.DataFrame(
    mlb.fit_transform(genre_labels),
    columns = [f"genre_{c}" for c in mlb.classes_],
    index = movies_cleaned.index
)

# drop dummy columns left over from a previous run so re-running the cell
# does not create duplicate genre_* columns
movies_cleaned = pd.concat(
    [movies_cleaned.drop(columns=genre_dummies.columns, errors='ignore'), genre_dummies],
    axis=1
)

Genre

In [None]:
# get the genre dummy columns: every genre_* column except the helper columns
# whose names contain 'name' or 'list' (genre_list, genre_name_list)
genre_columns = [col for col in movies_cleaned.columns if col.startswith('genre_') and 'name' not in col and 'list' not in col]

# average vote per genre, keyed by the genre name.
# The 'genre_' prefix is stripped rather than using split('_')[1], which would
# truncate any genre name that itself contains an underscore.
genre_avg_votes = {
    genre[len('genre_'):]: movies_cleaned.loc[movies_cleaned[genre] == 1, 'vote_average'].mean()
    for genre in genre_columns
}

# change the dictionary to a dataframe, highest average first
genre_avg_df = pd.DataFrame.from_dict(genre_avg_votes, orient='index', columns=['avg_vote'])
genre_avg_df = genre_avg_df.sort_values(by='avg_vote', ascending=False)

# graph
ax = genre_avg_df.plot(kind='barh', figsize=(6, 4), legend=False)
ax.set_title('Average Vote by Genre')
ax.set_xlabel('Average Vote')
ax.set_ylabel('Genre')
plt.tight_layout()
plt.show()

Release Year

In [None]:
# prepare the data: number of movies and summed vote_count per release year
year_count_df = pd.DataFrame.from_dict(dict(movies_cleaned.groupby('release_year').size()), orient='index', columns=['count'])
year_vote_count_df = movies_cleaned.groupby('release_year').agg({'vote_count': 'sum'})

# draw the graph
fig, ax1 = plt.subplots(figsize=(6, 4))

# left axis: movie count
ax1.set_xlabel('Year')
ax1.set_ylabel('Movie Count', color='tab:blue')
ax1.plot(year_count_df.index, year_count_df['count'], color='tab:blue')
ax1.tick_params(axis='y', labelcolor='tab:blue')

# right axis: total votes.
# The twin axis has its own independent y-scale, so the raw totals are plotted
# directly; dividing by a constant and multiplying it back in the tick
# formatter does not change the curve and only risks float truncation in labels.
ax2 = ax1.twinx()
ax2.set_ylabel('Total Votes', color='tab:red')
ax2.plot(year_vote_count_df.index, year_vote_count_df['vote_count'], color='tab:red')
ax2.tick_params(axis='y', labelcolor='tab:red')
# keep plain integer tick labels (no offset / scientific notation)
ax2.yaxis.set_major_formatter(ticker.FuncFormatter(lambda value, _: f'{int(round(value))}'))

# show the graph
ax1.set_title('Yearly Movie Count and Total Votes')
fig.tight_layout()
plt.show()

In [None]:
# mean vote_average per release year.
# NOTE: the single column is named 'count' although it holds the mean rating;
# the name is kept as-is in case later cells reference it.
rating_by_year = movies_cleaned.groupby('release_year')['vote_average'].mean()
year_movie_rating_df = pd.DataFrame.from_dict(
    rating_by_year.to_dict(),
    orient='index',
    columns=['count'],
)

# line chart of the yearly mean rating
year_movie_rating_df.plot(kind='line', figsize=(6, 4), legend=False)
plt.title('Movie Rating by Year')
plt.xlabel('Year')
plt.ylabel('Movie Rating')
plt.tight_layout()
plt.show()