# Exploration of IMDb Principals dataset

<ul>
<li>
    The principals dataset lists, for each film title, its principal cast and crew members, along with their role category, job, and the characters they played.
</li>
</ul>

In [2]:
import pandas as pd
import numpy as np

### Import the dataset

In [3]:
# Path to the IMDb title-principals CSV, relative to this notebook.
data_file = '../Datasets/IMDb title_principals.csv'

# Read the whole file into a DataFrame using pandas' default parsing options.
principals = pd.read_csv(filepath_or_buffer=data_file)

In [4]:
# Preview the first five rows to check the columns and their contents.
principals.head(n=5)

Unnamed: 0,imdb_title_id,ordering,imdb_name_id,category,job,characters
0,tt0000009,1,nm0063086,actress,,"[""Miss Geraldine Holbrook (Miss Jerry)""]"
1,tt0000009,2,nm0183823,actor,,"[""Mr. Hamilton""]"
2,tt0000009,3,nm1309758,actor,,"[""Chauncey Depew - the Director of the New Yor..."
3,tt0000009,4,nm0085156,director,,
4,tt0000574,1,nm0846887,actress,,"[""Kate Kelly""]"


In [5]:
# Count how many rows carry each distinct `characters` value, most frequent first.
# value_counts() leaves out missing values by default.
char_names = principals.loc[:, "characters"].value_counts()

# Display the resulting counts.
char_names

["Self"]                        720
["Anna"]                        539
["Alex"]                        460
["David"]                       455
["Sam"]                         415
                               ... 
["Arch - Rambaldo Melandri"]      1
["Charlotte Chang"]               1
["Thorsten Akrell"]               1
["Cpl. Mel Avakian"]              1
["Joe Krozac"]                    1
Name: characters, Length: 212899, dtype: int64

## Visualizing the unique character names: A Tangent

* I realized that, since this dataset contains nearly every film on IMDb, the number of times each unique character name appears in it should indicate which character names are most common across all films on IMDb.

In [45]:
from pathlib import Path

from plotly import express as px

# A character name must occur more than this many times to be plotted.
MIN_OCCURRENCES = 50

# Filter the counts computed earlier instead of running value_counts() over
# the whole column a second time.
dropped = char_names[char_names > MIN_OCCURRENCES]

# Turn the filtered counts into a two-column frame with explicit column names,
# so the `labels` mapping below matches by name rather than relying on
# implicitly assigned names such as "index" / "y".
name_counts = dropped.rename_axis("character_name").reset_index(name="occurrences")

# Scatter of occurrences per character name, with a logarithmic y-axis.
fig = px.scatter(name_counts, x="character_name", y="occurrences", log_y=True,
                 title="Popularity of Character Names on IMDb",
                 labels={"character_name": "Character Names",
                         "occurrences": "Number of Occurrences"},
                 template='ggplot2', width=1000, height=500)
fig.update_xaxes(tickangle=60, tickmode='auto')

# Make sure the output directory exists so the export also works on a fresh checkout.
Path('./res').mkdir(parents=True, exist_ok=True)
fig.write_html('./res/principals_vis.html')

In [46]:
from IPython.display import IFrame

# Embed the HTML chart exported to ./res so it is shown inline in the notebook.
IFrame(
    src='./res/principals_vis.html',
    width=1000,
    height=500,
)