In [189]:
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt 
from scipy.stats import f_oneway
from IPython.display import display
sb.set() 

## analysis of *casts* and success factors *(profit and popularity)*

In [197]:
factors = pd.read_pickle('factors.pkl')
cast = pd.read_pickle('cast.pkl')
director = pd.read_pickle('director.pkl')
production_companies = pd.read_pickle('production_companies.pkl')
success = pd.read_pickle('success.pkl')


#### *cast*

In [198]:
cast_combined = pd.concat([success, cast], axis=1)
cast_combined

Unnamed: 0,popularity,profit,cast
0,32.985763,1.363529e+09,Chris Pratt
0,32.985763,1.363529e+09,Bryce Dallas Howard
0,32.985763,1.363529e+09,Irrfan Khan
0,32.985763,1.363529e+09,Vincent D'Onofrio
0,32.985763,1.363529e+09,Nick Robinson
...,...,...,...
1286,1.157930,1.383000e+08,John Belushi
1286,1.157930,1.383000e+08,Tim Matheson
1286,1.157930,1.383000e+08,John Vernon
1286,1.157930,1.383000e+08,Verna Bloom


#### *production company*

In [199]:
prod_combined = pd.concat([success, production_companies], axis=1)
prod_combined

Unnamed: 0,popularity,profit,production_companies
0,32.985763,1.363529e+09,Universal Studios
0,32.985763,1.363529e+09,Amblin Entertainment
0,32.985763,1.363529e+09,Legendary Pictures
0,32.985763,1.363529e+09,Fuji Television Network
0,32.985763,1.363529e+09,Dentsu
...,...,...,...
1285,1.198849,6.970000e+07,Compass International Pictures
1285,1.198849,6.970000e+07,Falcon International Productions
1286,1.157930,1.383000e+08,Universal Pictures
1286,1.157930,1.383000e+08,Oregon Film Factory


### Removing noisy data by dropping variables that appeared less than 5 times

This is to improve accuracy of our analysis by removing casts that appeared less than 5 times as the corresponding values presented by them may not be accurate in depicting the relationship with the success factors.

#### *cast*

In [191]:
# count the number of occurrences of each cast member
cast_counts = cast_combined["cast"].value_counts()

# filter out casts that appear less than 5 times
valid_casts = cast_counts[cast_counts >= 5].index.tolist()

# create a new dataframe with only the valid casts
cast_filtered = cast_combined[cast_combined["cast"].isin(valid_casts)]

num_unique_indexes = cast_filtered.index.nunique()

# display the result
print("Number of unique index values in the dataframe: ", num_unique_indexes)
cast_filtered


Number of unique index values in the dataframe:  1051


Unnamed: 0,popularity,profit,cast
0,32.985763,1.363529e+09,Chris Pratt
1,28.419936,2.284364e+08,Tom Hardy
1,28.419936,2.284364e+08,Charlize Theron
2,13.112507,1.852382e+08,Kate Winslet
2,13.112507,1.852382e+08,Miles Teller
...,...,...,...
1279,2.508235,7.639876e+07,Sean Connery
1280,0.960984,-1.400000e+07,Kurt Russell
1282,1.549139,1.547778e+08,Roger Moore
1283,1.910465,1.301957e+08,Sean Connery


#### *production companies*

In [200]:
# count the number of occurrences of each cast member
prod_counts = prod_combined["production_companies"].value_counts()

# filter out casts that appear less than 5 times
valid_prod = prod_counts[prod_counts >= 5].index.tolist()

# create a new dataframe with only the valid casts
prod_filtered = prod_combined[prod_combined["production_companies"].isin(valid_prod)]

num_unique_indexes = prod_filtered.index.nunique()

# display the result
print("Number of unique index values in the dataframe: ", num_unique_indexes)
prod_filtered

Number of unique index values in the dataframe:  1077


Unnamed: 0,popularity,profit,production_companies
0,32.985763,1.363529e+09,Amblin Entertainment
0,32.985763,1.363529e+09,Legendary Pictures
0,32.985763,1.363529e+09,Dentsu
1,28.419936,2.284364e+08,Village Roadshow Pictures
2,13.112507,1.852382e+08,Summit Entertainment
...,...,...,...
1283,1.910465,1.301957e+08,Metro-Goldwyn-Mayer (MGM)
1284,1.778746,7.497449e+07,Eon Productions
1284,1.778746,7.497449e+07,Metro-Goldwyn-Mayer (MGM)
1284,1.778746,7.497449e+07,Danjaq


### Depiction of mean of *popularity* and *profit* for each variable

#### *cast*

In [192]:
mean_profit_by_cast = cast_filtered.groupby("cast")["profit"].mean()
mean_popularity_by_cast = cast_filtered.groupby("cast")["popularity"].mean()
display(mean_profit_by_cast, mean_popularity_by_cast)

cast
Aaron Eckhart        1.768422e+08
Abbie Cornish        4.404650e+07
Abigail Breslin      5.270388e+07
Adam Sandler         1.015443e+08
Al Pacino            1.228691e+08
                         ...     
Winona Ryder         6.675413e+07
Woody Harrelson      2.287871e+08
Zach Galifianakis    1.469285e+08
Zoe Saldana          6.349191e+08
Zooey Deschanel      3.853914e+07
Name: profit, Length: 321, dtype: float64

cast
Aaron Eckhart        2.018311
Abbie Cornish        1.379922
Abigail Breslin      1.463021
Adam Sandler         1.677745
Al Pacino            2.152814
                       ...   
Winona Ryder         1.689443
Woody Harrelson      2.796703
Zach Galifianakis    1.534692
Zoe Saldana          5.319022
Zooey Deschanel      1.341595
Name: popularity, Length: 321, dtype: float64

#### *production company*

In [203]:
mean_profit_by_prod = prod_filtered.groupby("production_companies")["profit"].mean()
mean_popularity_by_prod = prod_filtered.groupby("production_companies")["popularity"].mean()
display(mean_profit_by_prod, mean_popularity_by_prod)

production_companies
1492 Pictures                        2.580206e+08
20th Century Fox                     9.376905e+07
21 Laps Entertainment                1.021079e+08
3 Arts Entertainment                 6.062205e+07
Affirm Films                         3.315747e+07
                                         ...     
Warner Bros. Animation               9.523564e+07
Warner Independent Pictures (WIP)    1.202267e+07
Weed Road Pictures                   1.637005e+08
WingNut Films                        5.280403e+08
Working Title Films                  9.564426e+07
Name: profit, Length: 155, dtype: float64

production_companies
1492 Pictures                        2.648557
20th Century Fox                     1.527272
21 Laps Entertainment                1.343531
3 Arts Entertainment                 1.819087
Affirm Films                         0.492822
                                       ...   
Warner Bros. Animation               1.234895
Warner Independent Pictures (WIP)    0.688099
Weed Road Pictures                   1.439762
WingNut Films                        5.098463
Working Title Films                  1.562140
Name: popularity, Length: 155, dtype: float64

### Analysing correlation between *cast* and success factors

The code below shows the correlation between cast and the success factors(profit and popularity).<br>
This is done through the use of one-hot encoding by coverting categorical data to numerical data.


In [201]:
# use one hot encoding to create binary indicator variables for the cast
cast_dummies = pd.get_dummies(cast_filtered['cast'])

cast_success = pd.concat([cast_dummies, cast_filtered], axis=1)

# drop the original 'cast' column since it is no longer needed
cast_data = cast_success.drop('cast', axis=1)
cast_popularity = cast_data.drop('profit', axis=1)
cast_profit = cast_data.drop('popularity', axis=1)

cast_profit = cast_profit.groupby(cast_profit.index).sum()
cast_popularity = cast_popularity.groupby(cast_popularity.index).sum()


In [205]:
corr_matrix_cast_profit = cast_profit.corr()
target_column = 'profit'
corr_cast_profit = corr_matrix_cast_profit[target_column]

# Sort the correlations in descending order
corr_cast_profit = corr_cast_profit.sort_values(ascending=False)

print(corr_cast_profit)

profit            1.000000
Orlando Bloom     0.259912
Rupert Grint      0.243882
Ian McKellen      0.221418
Emma Watson       0.221074
                    ...   
Paul Giamatti    -0.037661
Colin Farrell    -0.042037
Kurt Russell     -0.042403
Susan Sarandon   -0.044264
Julianne Moore   -0.045652
Name: profit, Length: 322, dtype: float64


In [206]:
corr_matrix_cast_pop = cast_popularity.corr()
target_column = 'popularity'
corr_cast_pop = corr_matrix_cast_pop[target_column]

# Sort the correlations in descending order
corr_cast_pop = corr_cast_pop.sort_values(ascending=False)

print(corr_cast_pop)

popularity          1.000000
Michael Caine       0.281095
Orlando Bloom       0.228840
Ian McKellen        0.203125
Casey Affleck       0.199023
                      ...   
Milla Jovovich     -0.036162
Roger Moore        -0.036238
Donald Pleasence   -0.036757
Susan Sarandon     -0.037712
Paul Giamatti      -0.038377
Name: popularity, Length: 322, dtype: float64


### Analysing correlation between *production company* and success factors

The code below shows the correlation between production company and the success factors(profit and popularity).<br>
This is done through the use of one-hot encoding by coverting categorical data to numerical data.

In [204]:
# use one hot encoding to create binary indicator variables for the cast
prod_dummies = pd.get_dummies(prod_filtered['production_companies'])

prod_success = pd.concat([prod_dummies, prod_filtered], axis=1)

# drop the original 'cast' column since it is no longer needed
prod_data = prod_success.drop('production_companies', axis=1)
prod_popularity = prod_data.drop('profit', axis=1)
prod_profit = prod_data.drop('popularity', axis=1)

prod_profit = prod_profit.groupby(prod_profit.index).sum()
prod_popularity = prod_popularity.groupby(prod_popularity.index).sum()


In [207]:
corr_matrix_prod_profit = prod_profit.corr()
target_column = 'profit'
corr_prod_profit = corr_matrix_prod_profit[target_column]

# Sort the correlations in descending order
corr_prod_profit = corr_prod_profit.sort_values(ascending=False)

print(corr_prod_profit)

profit                      1.000000
Syncopy                     0.223022
Heyday Films                0.221612
Dentsu                      0.189915
Amblin Entertainment        0.180829
                              ...   
Screen Gems                -0.046014
Morgan Creek Productions   -0.048024
BBC Films                  -0.048527
Dimension Films            -0.053268
Miramax Films              -0.070913
Name: profit, Length: 156, dtype: float64


In [208]:
corr_matrix_prod_pop = prod_popularity.corr()
target_column = 'popularity'
corr_prod_pop = corr_matrix_prod_pop[target_column]

# Sort the correlations in descending order
corr_prod_pop = corr_prod_pop.sort_values(ascending=False)

print(corr_prod_pop)

popularity            1.000000
Syncopy               0.379647
Legendary Pictures    0.300930
Dentsu                0.219249
Warner Bros.          0.154385
                        ...   
BBC Films            -0.039286
Lions Gate Films     -0.040814
UK Film Council      -0.041714
Dimension Films      -0.045426
Miramax Films        -0.059953
Name: popularity, Length: 156, dtype: float64
