In [19]:
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt 
from IPython.display import display
sb.set() 

## Analysis of *cast*, *production company*, *director* and *genre* against the success factors *(profit and popularity)*

In [20]:
# NOTE: unpickling can execute arbitrary code, so these files must come from a
# trusted source (presumably an earlier notebook in this project -- confirm).

# Success measures; supplies the popularity and profit columns used below.
success = pd.read_pickle("success.pkl")

# One frame per candidate factor, each combined with `success` further down.
cast = pd.read_pickle("cast.pkl")
production_companies = pd.read_pickle("production_companies.pkl")
director = pd.read_pickle("director.pkl")
genres = pd.read_pickle("genres.pkl")

# Loaded but not referenced in the cells visible here.
factors = pd.read_pickle("factors.pkl")


#### *cast*

In [21]:
# Put the success measures next to every cast row by aligning on the index.
# Index labels repeat in the displayed result, so the same popularity/profit
# pair appears once per cast member (presumably one label per film).
cast_combined = pd.concat([success, cast], axis="columns")
cast_combined

Unnamed: 0,popularity,profit,cast
0,32.985763,1.363529e+09,Chris Pratt
0,32.985763,1.363529e+09,Bryce Dallas Howard
0,32.985763,1.363529e+09,Irrfan Khan
0,32.985763,1.363529e+09,Vincent D'Onofrio
0,32.985763,1.363529e+09,Nick Robinson
...,...,...,...
1286,1.157930,1.383000e+08,John Belushi
1286,1.157930,1.383000e+08,Tim Matheson
1286,1.157930,1.383000e+08,John Vernon
1286,1.157930,1.383000e+08,Verna Bloom


#### *production company*

In [22]:
# Put the success measures next to every production-company row by aligning
# on the index. Index labels repeat in the displayed result, so the same
# popularity/profit pair appears once per company (presumably one label per film).
prod_combined = pd.concat([success, production_companies], axis="columns")
prod_combined

Unnamed: 0,popularity,profit,production_companies
0,32.985763,1.363529e+09,Universal Studios
0,32.985763,1.363529e+09,Amblin Entertainment
0,32.985763,1.363529e+09,Legendary Pictures
0,32.985763,1.363529e+09,Fuji Television Network
0,32.985763,1.363529e+09,Dentsu
...,...,...,...
1285,1.198849,6.970000e+07,Compass International Pictures
1285,1.198849,6.970000e+07,Falcon International Productions
1286,1.157930,1.383000e+08,Universal Pictures
1286,1.157930,1.383000e+08,Oregon Film Factory


#### *director*

In [23]:
# Put the success measures next to the director column by aligning on the
# index. The displayed result shows one row per index label here.
dir_combined = pd.concat([success, director], axis="columns")
dir_combined

Unnamed: 0,popularity,profit,director
0,32.985763,1.363529e+09,Colin Trevorrow
1,28.419936,2.284364e+08,George Miller
2,13.112507,1.852382e+08,Robert Schwentke
3,11.173104,1.868178e+09,J.J. Abrams
4,9.335014,1.316249e+09,James Wan
...,...,...,...
1282,1.549139,1.547778e+08,Guy Hamilton
1283,1.910465,1.301957e+08,Terence Young
1284,1.778746,7.497449e+07,Peter R. Hunt
1285,1.198849,6.970000e+07,John Carpenter


#### *genre*

In [24]:
# Put the success measures next to every genre row by aligning on the index.
# Index labels repeat in the displayed result, so the same popularity/profit
# pair appears once per genre (presumably one label per film).
genres_combined = pd.concat([success, genres], axis="columns")
genres_combined


Unnamed: 0,popularity,profit,genres
0,32.985763,1.363529e+09,Action
0,32.985763,1.363529e+09,Adventure
0,32.985763,1.363529e+09,Science Fiction
0,32.985763,1.363529e+09,Thriller
1,28.419936,2.284364e+08,Action
...,...,...,...
1284,1.778746,7.497449e+07,Action
1284,1.778746,7.497449e+07,Thriller
1285,1.198849,6.970000e+07,Horror
1285,1.198849,6.970000e+07,Thriller


### Removing noisy data by dropping values that appear fewer than 5 times (2 for directors)

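To make the analysis more reliable, we remove cast members, production companies, directors and genres that appear only a few times: with so few observations, the profit and popularity associated with them may not accurately depict their relationship with the success factors. The cut-off is 5 appearances, except for directors, where the code below uses 2.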

#### *cast*

In [25]:
# Number of rows in which each cast member appears.
cast_counts = cast_combined["cast"].value_counts()

# Names of the cast members that appear in at least 5 rows.
valid_casts = cast_counts.loc[cast_counts.ge(5)].index.tolist()

# Keep only the rows whose cast member passed the cut-off.
cast_filtered = cast_combined.loc[cast_combined["cast"].isin(valid_casts)]

# Distinct index labels left after filtering (presumably the number of films
# that still have at least one retained cast member).
num_unique_indexes = cast_filtered.index.nunique()

print("Number of unique index values in the dataframe: ", num_unique_indexes)
cast_filtered


Number of unique index values in the dataframe:  1051


Unnamed: 0,popularity,profit,cast
0,32.985763,1.363529e+09,Chris Pratt
1,28.419936,2.284364e+08,Tom Hardy
1,28.419936,2.284364e+08,Charlize Theron
2,13.112507,1.852382e+08,Kate Winslet
2,13.112507,1.852382e+08,Miles Teller
...,...,...,...
1279,2.508235,7.639876e+07,Sean Connery
1280,0.960984,-1.400000e+07,Kurt Russell
1282,1.549139,1.547778e+08,Roger Moore
1283,1.910465,1.301957e+08,Sean Connery


#### *production companies*

In [26]:
# Number of rows in which each production company appears.
prod_counts = prod_combined["production_companies"].value_counts()

# Names of the production companies that appear in at least 5 rows.
valid_prod = prod_counts.loc[prod_counts.ge(5)].index.tolist()

# Keep only the rows whose production company passed the cut-off.
prod_filtered = prod_combined.loc[
    prod_combined["production_companies"].isin(valid_prod)
]

# Distinct index labels left after filtering (presumably the number of films
# that still have at least one retained production company).
num_unique_indexes = prod_filtered.index.nunique()

print("Number of unique index values in the dataframe: ", num_unique_indexes)
prod_filtered

Number of unique index values in the dataframe:  1077


Unnamed: 0,popularity,profit,production_companies
0,32.985763,1.363529e+09,Amblin Entertainment
0,32.985763,1.363529e+09,Legendary Pictures
0,32.985763,1.363529e+09,Dentsu
1,28.419936,2.284364e+08,Village Roadshow Pictures
2,13.112507,1.852382e+08,Summit Entertainment
...,...,...,...
1283,1.910465,1.301957e+08,Metro-Goldwyn-Mayer (MGM)
1284,1.778746,7.497449e+07,Eon Productions
1284,1.778746,7.497449e+07,Metro-Goldwyn-Mayer (MGM)
1284,1.778746,7.497449e+07,Danjaq


#### *director*

In [27]:
# Number of rows in which each director appears.
dir_counts = dir_combined["director"].value_counts()

# Names of the directors that appear in at least 2 rows.
# NOTE(review): the cast, production-company and genre filters use a cut-off
# of 5; the 2 used here is kept unchanged -- confirm it is intentional.
valid_dir = dir_counts.loc[dir_counts.ge(2)].index.tolist()

# Keep only the rows whose director passed the cut-off.
dir_filtered = dir_combined.loc[dir_combined["director"].isin(valid_dir)]

# Distinct index labels left after filtering (presumably the number of films
# whose director was retained).
num_unique_indexes = dir_filtered.index.nunique()

print("Number of unique index values in the dataframe: ", num_unique_indexes)
dir_filtered

Number of unique index values in the dataframe:  814


Unnamed: 0,popularity,profit,director
1,28.419936,2.284364e+08,George Miller
2,13.112507,1.852382e+08,Robert Schwentke
3,11.173104,1.868178e+09,J.J. Abrams
4,9.335014,1.316249e+09,James Wan
5,9.110700,3.979505e+08,Alejandro GonzÃ¡lez IÃ±Ã¡rritu
...,...,...,...
1280,0.960984,-1.400000e+07,John Carpenter
1281,2.010733,4.333061e+08,William Friedkin
1282,1.549139,1.547778e+08,Guy Hamilton
1283,1.910465,1.301957e+08,Terence Young


#### *genre*


In [30]:
# Number of rows in which each genre appears.
genres_counts = genres_combined["genres"].value_counts()

# Names of the genres that appear in at least 5 rows.
valid_genres = genres_counts.loc[genres_counts.ge(5)].index.tolist()

# Keep only the rows whose genre passed the cut-off.
genres_filtered = genres_combined.loc[genres_combined["genres"].isin(valid_genres)]

# Distinct index labels left after filtering (presumably the number of films
# that still have at least one retained genre).
num_unique_indexes = genres_filtered.index.nunique()

print("Number of unique index values in the dataframe: ", num_unique_indexes)
genres_filtered

Number of unique index values in the dataframe:  1287


Unnamed: 0,popularity,profit,genres
0,32.985763,1.363529e+09,Action
0,32.985763,1.363529e+09,Adventure
0,32.985763,1.363529e+09,Science Fiction
0,32.985763,1.363529e+09,Thriller
1,28.419936,2.284364e+08,Action
...,...,...,...
1284,1.778746,7.497449e+07,Action
1284,1.778746,7.497449e+07,Thriller
1285,1.198849,6.970000e+07,Horror
1285,1.198849,6.970000e+07,Thriller


In [31]:
# Write each filtered frame to its own pickle file in the working directory
# (presumably so later notebooks can load them -- confirm).
for _frame, _path in (
    (cast_filtered, 'cast_filtered.pkl'),
    (prod_filtered, 'prod_filtered.pkl'),
    (dir_filtered, 'dir_filtered.pkl'),
    (genres_filtered, 'genres_filtered.pkl'),
):
    _frame.to_pickle(_path)
