In [None]:
# import packages
import pandas as pd
from datetime import datetime

import ast

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

In [None]:
# Load the raw CSV tables from the local data directory.
# NOTE: the variable name `orginal_data_path` (sic) is kept unchanged so that
# any other cell referring to it keeps working.
orginal_data_path = 'original_data/'

# Read the five tables in a fixed order and unpack each into its own name.
movies_metadata, ratings, links, keywords, credits = (
    pd.read_csv(orginal_data_path + csv_name)
    for csv_name in (
        'movies_metadata.csv',
        'ratings.csv',
        'links.csv',
        'keywords.csv',
        'credits.csv',
    )
)

# Convert the `timestamp` column (interpreted as seconds) into datetimes.
rating_times = pd.to_datetime(ratings['timestamp'], unit='s')
ratings['date_time'] = rating_times

Genre

In [None]:
# Parse the stringified `genres` column into Python objects, then tally how
# many rows mention each genre name.
movies_metadata['genres_list'] = movies_metadata['genres'].apply(ast.literal_eval)

# Plain dict (not Counter) so keys stay in first-seen order and the printed
# representation is an ordinary dict.
all_genres = {}
for parsed_genres in movies_metadata['genres_list']:
    for entry in parsed_genres:
        genre_name = entry['name']
        all_genres[genre_name] = all_genres.get(genre_name, 0) + 1

print(f"Total unique genres: {len(all_genres)}")
print(all_genres)

In [None]:
# Collect the genre names whose tally in `all_genres` is exactly one.
onetime_genres = [name for name, occurrences in all_genres.items() if occurrences == 1]

print(onetime_genres)

In [None]:
# Add one 0/1 indicator column per genre, named "genre_<name>": 1 when any
# entry of the row's `genres_list` carries that name, 0 otherwise.
for genre_name in all_genres:
    movies_metadata[f"genre_{genre_name}"] = movies_metadata['genres_list'].apply(
        lambda entries: int(any(entry['name'] == genre_name for entry in entries))
    )

# Discard the indicator columns that belong to genres seen only once.
onetime_genre_columns = [f"genre_{name}" for name in onetime_genres]
movies_metadata = movies_metadata.drop(columns=onetime_genre_columns)

In [None]:
# Collect the genre indicator columns (named "genre_<name>").
genre_prefix = 'genre_'
genre_columns = [col for col in movies_metadata.columns if col.startswith(genre_prefix)]

# Mean `vote_average` over the rows flagged with each genre.
# Only the leading prefix is stripped to recover the genre name:
# `split('_')[1]` would keep just the text between the first and second
# underscore, truncating (and possibly colliding) names that contain "_".
genre_avg_votes = {
    col[len(genre_prefix):]: movies_metadata.loc[movies_metadata[col] == 1, 'vote_average'].mean()
    for col in genre_columns
}

# Turn the mapping into a one-column frame, highest average first.
genre_avg_df = pd.DataFrame.from_dict(genre_avg_votes, orient='index', columns=['avg_vote'])
genre_avg_df = genre_avg_df.sort_values(by='avg_vote', ascending=False)

# Horizontal bar chart of the per-genre averages.
genre_avg_df.plot(kind='barh', figsize=(6, 4), legend=False)
plt.title('Average Vote by Genre')
plt.xlabel('Average Vote')
plt.ylabel('Genre')
plt.tight_layout()
plt.show()

Release Year

In [None]:
# Parse `release_date` into datetimes (unparseable values are coerced to NaT)
# and derive the calendar year from the parsed values.
parsed_release_dates = pd.to_datetime(movies_metadata['release_date'], errors='coerce')
movies_metadata['release_date'] = parsed_release_dates
movies_metadata['release_year'] = parsed_release_dates.dt.year

In [None]:
# Per-release-year aggregates: number of rows and summed `vote_count`.
movies_by_year = movies_metadata.groupby('release_year')
year_count_df = pd.DataFrame.from_dict(
    movies_by_year.size().to_dict(), orient='index', columns=['count']
)
year_vote_count_df = movies_by_year.agg({'vote_count': 'sum'})

# The vote totals are plotted divided by this factor; the tick formatter
# multiplies it back, so the right-hand labels show the real totals.
vote_scale = 20


def _format_vote_tick(tick_value, _position):
    """Render a right-axis tick as the unscaled integer vote total."""
    return str(int(tick_value * vote_scale))


fig, ax1 = plt.subplots(figsize=(6, 4))

# Left axis (blue): movie count per year.
ax1.plot(year_count_df.index, year_count_df.values, color='tab:blue')
ax1.set_xlabel('Year')
ax1.set_ylabel('Movie Count', color='tab:blue')
ax1.tick_params(axis='y', labelcolor='tab:blue')

# Right axis (red): total votes per year, sharing the same x axis.
ax2 = ax1.twinx()
ax2.plot(year_vote_count_df.index, year_vote_count_df.values / vote_scale, color='tab:red')
ax2.set_ylabel('Total Votes', color='tab:red')
ax2.tick_params(axis='y', labelcolor='tab:red')
ax2.yaxis.set_major_formatter(ticker.FuncFormatter(_format_vote_tick))

# Title, layout and display.
plt.title('Yearly Movie Count and Total Votes')
plt.tight_layout()
plt.show()

In [None]:
# Mean `vote_average` per release year.
# NOTE(review): the column is labelled 'count' although it holds a mean
# rating; the label is kept as-is so existing references keep working.
mean_rating_by_year = movies_metadata.groupby('release_year')['vote_average'].mean()
year_movie_rating_df = pd.DataFrame.from_dict(
    mean_rating_by_year.to_dict(), orient='index', columns=['count']
)

# Line chart of the yearly mean rating.
rating_ax = year_movie_rating_df.plot(kind='line', figsize=(6, 4), legend=False)
rating_ax.set_title('Movie Rating by Year')
rating_ax.set_xlabel('Year')
rating_ax.set_ylabel('Movie Rating')
plt.tight_layout()
plt.show()

Save CSV file

In [None]:
# Write the enriched movie table to disk without the row index.
cleaned_csv_path = 'movies_cleaned.csv'
movies_metadata.to_csv(cleaned_csv_path, index=False)