In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
import numpy as np

In [53]:
# Load movie+book data
import pandas as pd
all_data_df = pd.read_pickle('../dump/all_correctRT_data')

In [54]:
all_data_df.columns

Index(['movie_title', 'rating', 'vote', 'certificate', 'genre', 'release_date',
       'metascore', 'keywords', 'budget', 'opening_weekend_usa', 'gross_usa',
       'gross_world', 'runtime', 'director', 'link_d', 'writer', 'link_w',
       'star', 'link_s', 'distributor', 'language', 'country', 'book_title',
       'author', 'rating_value', 'rating_count', 'review_count', 'page',
       'year', 'title', 'book_popularity', 'author_popularity'],
      dtype='object')

In [55]:
all_df = all_data_df.drop(columns=['vote','metascore','keywords',\
                                   'link_d','link_s','link_w',\
                                   'book_title','rating_value','rating_count','review_count','title']).\
                    drop_duplicates(subset=['movie_title','director'])
all_df.rename(columns = {'certificate':'MPAA','star':'actor','year':'publish_year'},inplace=True)

In [56]:
all_df = all_df.dropna().reset_index()

In [57]:
all_df.head()

Unnamed: 0,index,movie_title,rating,MPAA,genre,release_date,budget,opening_weekend_usa,gross_usa,gross_world,...,writer,actor,distributor,language,country,author,page,publish_year,book_popularity,author_popularity
0,1,The Godfather,9.2,R,"[Crime, Drama]",1972-03-24,6000000.0,302393.0,134966411.0,246121000.0,...,"[Mario Puzo, Francis Ford Coppola]","[Marlon Brando, Al Pacino, James Caan]",ParamountPictures,"[English, Italian, Latin]",[USA],Mario Puzo,448.0,1969.0,0.16,0.0699
1,2,Harry Potter and the Sorcerer's Stone,7.6,PG,"[Adventure, Family, Fantasy]",2001-11-16,125000000.0,90294621.0,318087620.0,1006918000.0,...,"[J.K. Rowling, Steve Kloves]","[Daniel Radcliffe, Rupert Grint, Richard Harris]",WarnerBros.,[English],"[UK, USA]",J.K. Rowling,309.0,1997.0,0.14,0.0149
2,4,Little Women,7.8,PG,"[Drama, Romance]",2019-12-25,40000000.0,16755310.0,108101214.0,216601200.0,...,"[Greta Gerwig, Louisa May Alcott]","[Saoirse Ronan, Emma Watson, Florence Pugh]",ColumbiaPictures,"[English, French]",[USA],Louisa May Alcott,449.0,1868.0,0.0,0.0151
3,8,Little Women,7.3,PG,"[Drama, Family, Romance]",1994-12-25,18000000.0,2411247.0,50083616.0,50083620.0,...,"[Louisa May Alcott, Robin Swicord]","[Susan Sarandon, Winona Ryder, Kirsten Dunst]",ColumbiaPictures,"[English, German, French]","[USA, Canada]",Louisa May Alcott,449.0,1868.0,0.0,0.0151
4,20,Ready Player One,7.5,PG-13,"[Action, Adventure, Sci-Fi]",2018-03-29,175000000.0,41764050.0,137690172.0,582893700.0,...,"[Zak Penn, Ernest Cline]","[Tye Sheridan, Olivia Cooke, Ben Mendelsohn]",WarnerBros.,[English],"[USA, India]",Ernest Cline,374.0,2011.0,0.0,0.0012


In [58]:
all_data_df.shape

(1575, 32)

In [59]:
all_df.shape

(581, 22)

In [60]:
all_df.columns

Index(['index', 'movie_title', 'rating', 'MPAA', 'genre', 'release_date',
       'budget', 'opening_weekend_usa', 'gross_usa', 'gross_world', 'runtime',
       'director', 'writer', 'actor', 'distributor', 'language', 'country',
       'author', 'page', 'publish_year', 'book_popularity',
       'author_popularity'],
      dtype='object')

## Collect more features

### 1. release time of the movie

In [61]:
# Divide release_date into year, month, day of week
all_df['release_year'] = all_df['release_date'].dt.year
all_df['release_month'] = all_df['release_date'].dt.month
all_df['dow'] = all_df['release_date'].dt.weekday

### 2. director value

In [62]:
# Find corresponding value of each director when movie is released
# ['avg_rating','avg_gross'] (per movie directed)
director_df = pd.read_pickle('../dump/director_data')

In [63]:
# Find the mean rating, gross of all directors
director_rating_mean = int(director_df.rating.mean())
director_gross_mean = int(director_df.gross_usa.mean())

director_rating_mean, director_gross_mean

(6, 32634196)

In [64]:
def director_value(movie):
    """
    input: movie (each row in all_data_df)
    output: 
        - film_count: number of movies directed before the movie of interest
        - avg_rating: average rating of movies directed before the movie of interest
        - avg_gross: gross per movie before the movie of interest
    """
    
    movie_title = movie.movie_title
    director = movie.director
    year = movie.release_date

    headers = ['movie_title','director','film_count_d','avg_rating_d','avg_gross_d']
    
    # Assign default values
    film_count,avg_rating,avg_gross = 0, director_rating_mean, director_gross_mean
    
    doi_df = director_df[(director_df.director == director) & (director_df.year < year)]
    
    # Fill NaN with director's mean
    doi_df[['rating','gross_usa']].apply(lambda x: x.fillna(x.mean(),axis=0))
    
    # If there's still NaN, fill with all directors' mean
    doi_df[['rating']] = doi_df[['rating']].apply(lambda x: x.fillna(director_rating_mean,axis=0))
    doi_df[['gross_usa']] = doi_df[['gross_usa']].apply(lambda x: x.fillna(director_gross_mean,axis=0))
    
   
    if doi_df.shape[0] == 0:
        film_count,avg_rating,avg_gross = 0, director_rating_mean, director_gross_mean
    else:
        
    
    
        film_count = doi_df.shape[0]

        
        avg_rating = doi_df['rating'].mean()
        if avg_rating == np.nan:
            avg_rating = director_rating_mean

        try:
            avg_gross = int(doi_df['gross_usa'].mean())
        except ValueError:
            avg_gross = director_gross_mean
        
    
    director_value = dict(zip(headers, [movie_title,director,film_count,avg_rating,avg_gross]))
    
    return director_value

In [65]:
# Create df including columns of director film_count, avg_rating, avg_gross
# Rows correspond to rows in all_df
movie_director_df = all_df.dropna().apply(director_value,axis=1).apply(pd.Series)

In [66]:
movie_director_df

Unnamed: 0,movie_title,director,film_count_d,avg_rating_d,avg_gross_d
0,The Godfather,Francis Ford Coppola,7,5.285714,32634196
1,Harry Potter and the Sorcerer's Stone,Chris Columbus,9,6.622222,106588545
2,Little Women,Greta Gerwig,2,6.700000,24481851
3,Little Women,Gillian Armstrong,11,6.663636,24220795
4,Ready Player One,Steven Spielberg,34,7.202941,132444136
...,...,...,...,...,...
576,Never Die Alone,Ernest R. Dickerson,7,6.157143,20446469
577,Radio Free Albemuth,John Alan Simon,0,6.000000,32634196
578,The Golden Bowl,James Ivory,23,6.317391,18103663
579,The Good Mother,Leonard Nimoy,3,6.700000,117988379


In [67]:
# Merge all_df and director value
all_d_df = pd.merge(all_df, movie_director_df, left_index=True, right_index=True ,how='left')

In [68]:
# Merge all_df and director value
all_d_df = pd.merge(all_df, movie_director_df, on=['movie_title','director'] ,how='left')

In [69]:
all_d_df

Unnamed: 0,index,movie_title,rating,MPAA,genre,release_date,budget,opening_weekend_usa,gross_usa,gross_world,...,page,publish_year,book_popularity,author_popularity,release_year,release_month,dow,film_count_d,avg_rating_d,avg_gross_d
0,1,The Godfather,9.2,R,"[Crime, Drama]",1972-03-24,6000000.0,302393.0,134966411.0,2.461210e+08,...,448.0,1969.0,0.16,0.0699,1972,3,4,7,5.285714,32634196
1,2,Harry Potter and the Sorcerer's Stone,7.6,PG,"[Adventure, Family, Fantasy]",2001-11-16,125000000.0,90294621.0,318087620.0,1.006918e+09,...,309.0,1997.0,0.14,0.0149,2001,11,4,9,6.622222,106588545
2,4,Little Women,7.8,PG,"[Drama, Romance]",2019-12-25,40000000.0,16755310.0,108101214.0,2.166012e+08,...,449.0,1868.0,0.00,0.0151,2019,12,2,2,6.700000,24481851
3,8,Little Women,7.3,PG,"[Drama, Family, Romance]",1994-12-25,18000000.0,2411247.0,50083616.0,5.008362e+07,...,449.0,1868.0,0.00,0.0151,1994,12,6,11,6.663636,24220795
4,20,Ready Player One,7.5,PG-13,"[Action, Adventure, Sci-Fi]",2018-03-29,175000000.0,41764050.0,137690172.0,5.828937e+08,...,374.0,2011.0,0.00,0.0012,2018,3,3,34,7.202941,132444136
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
576,1552,Never Die Alone,5.7,R,"[Action, Crime, Drama]",2004-03-26,3000000.0,3089993.0,5645298.0,5.923000e+06,...,480.0,2015.0,0.26,0.0093,2004,3,4,7,6.157143,20446469
577,1554,Radio Free Albemuth,5.7,R,"[Drama, Sci-Fi]",2014-06-27,3600000.0,5553.0,9365.0,9.365000e+03,...,214.0,1985.0,0.00,0.0039,2014,6,4,0,6.000000,32634196
578,1559,The Golden Bowl,5.9,R,"[Drama, Romance]",2001-05-25,15000000.0,90170.0,3050532.0,5.753678e+06,...,591.0,1904.0,0.36,0.7878,2001,5,4,23,6.317391,18103663
579,1567,The Good Mother,5.8,R,"[Drama, Romance]",1988-11-04,14000000.0,1804288.0,4764606.0,4.764606e+06,...,320.0,1986.0,1.56,0.2796,1988,11,4,3,6.700000,117988379


In [70]:
all_d_df.columns

Index(['index', 'movie_title', 'rating', 'MPAA', 'genre', 'release_date',
       'budget', 'opening_weekend_usa', 'gross_usa', 'gross_world', 'runtime',
       'director', 'writer', 'actor', 'distributor', 'language', 'country',
       'author', 'page', 'publish_year', 'book_popularity',
       'author_popularity', 'release_year', 'release_month', 'dow',
       'film_count_d', 'avg_rating_d', 'avg_gross_d'],
      dtype='object')

### 3. cast (actors) value

In [74]:
# Find corresponding value of each director when movie is released
# ['avg_rating','avg_gross'] (per movie directed)
actor_df = pd.read_pickle('../dump/actor_correct_data')
actor_df.shape

(86790, 15)

In [75]:
actor_df.head()

Unnamed: 0,actor,title,year,rating,vote,genre_list,budget,opening,gross_usa,gross_cw,runtime,director,writer,star,distributor
0,Josephine Langford,Ray Meets Helen,2018-05-04,5.3,151.0,"[Drama, Fantasy, Romance]",,,,,100,Alan Rudolph,[Alan Rudolph],"[Keith Carradine, Sondra Locke, Keith David]",
1,Josephine Langford,The Prophet's Game,2001-04-10,5.0,913.0,"[Crime, Drama, Thriller]",28000000.0,,,,106,David Worth,[Carol Chrest],"[Dennis Hopper, Stephanie Zimbalist, Robert Yo...",MoonstoneEntertainment
2,Josephine Langford,Clean and Narrow,2000-01-21,5.3,129.0,"[Crime, Drama]",2500000.0,,,,82,William Katt,"[Kermit Christman, William Katt]","[Jack Noseworthy, Laura Leighton, Jackie Belvin]",DelMarProductions
3,Josephine Langford,Ratboy,1986-10-17,3.7,599.0,"[Comedy, Crime, Drama]",,,,,104,Sondra Locke,[Rob Thompson],"[Sondra Locke, Robert Townsend, Christopher He...",TheMalpasoCompany
4,Josephine Langford,Amazing,NaT,7.4,5130.0,"[Adventure, Comedy, Drama]",,,,,30,Steven Spielberg,"[Charles Durning, Douglas Seale, Louis Giambal...",,AmblinEntertainment


In [288]:
actor_df.director.nunique()

26

In [77]:
# Find the mean rating, gross of all directors
actor_rating_mean = int(actor_df.rating.mean())
actor_gross_mean = int(actor_df.gross_usa.mean())

actor_rating_mean, actor_gross_mean

(6, 46460405)

In [279]:
def actor_value(actor,year):
    """
    input: actor name and (release) year of the movie of interest
    output: 
        - film_count: number of movies the actor was in before the movie of interest
        - avg_rating: average rating of movies the actor was in before the movie of interest
        - avg_gross: gross per movie before the movie of interest
    """
    
    aoi_df = actor_df[(actor_df.actor == actor) & (actor_df.year.dt.year < year)].copy()
    
    # Fill NaN with actor's mean
    values={'rating':aoi_df.rating.mean(), 'gross_usa':aoi_df.gross_usa.mean()}
    aoi_df.fillna(value=values,inplace=True)
    
    # If there's still NaN, fill with all actors' mean
    values={'rating':actor_rating_mean, 'gross_usa':actor_gross_mean}
    aoi_df.fillna(value=values,inplace=True)
        
    
    # If there's no movie prior to movie of interest  
    if aoi_df.shape[0] == 0:
        film_count,avg_rating,avg_gross = 0, actor_rating_mean, actor_gross_mean
        
    else:
        
        film_count = aoi_df.shape[0]
        
        avg_rating = aoi_df['rating'].mean()
        avg_gross = aoi_df['gross_usa'].mean()

        
    
    actor_value = [film_count, avg_rating, avg_gross]
    
    print(actor,actor_value)
    
    return actor_value,aoi_df

In [247]:
def get_cast(movie):
    """
    input: movie (each row in all_data_df)
    output: 
        - film_count: number of movies directed before the movie of interest
        - avg_rating: average rating of movies directed before the movie of interest
        - avg_gross: gross per movie before the movie of interest
    """
    
    movie_title = movie.movie_title
    year = movie.release_year
    actors = movie.actor
    lead = actors[0]
    
    film_counts = []
    ratings = []
    grosses = []
    
    for actor in actors:
        result = actor_value(actor,year)
        film_counts.append(result[0])
        ratings.append(result[1])
        grosses.append(result[2])
        
    avg_film_count = np.mean(film_counts)
    avg_rating = np.mean(ratings)
    avg_gross = np.mean(grosses)    
    
    
    lead_result = actor_value(lead,year)
    

    headers = ['movie_title','cast','avg_film_count_c','avg_rating_c','avg_gross_c',\
              'avg_film_count_l','avg_rating_l','avg_gross_l']

        
    
    cast_info = dict(zip(headers, [movie_title,actors,avg_film_count,avg_rating,avg_gross,\
                                  lead_result[0],lead_result[1],lead_result[2]]))
    
    return cast_info

In [248]:
# Create df including columns of director film_count, avg_rating, avg_gross
# Rows correspond to rows in all_df
movie_actor_df = all_df.iloc[:2].apply(get_cast,axis=1).apply(pd.Series)

Marlon Brando [3, 6.066666666666667, 37378936]
[3, 6.066666666666667, 37378936]
Al Pacino [3, 6.066666666666667, 37378936]
[3, 6.066666666666667, 37378936]
James Caan [3, 6.066666666666667, 37378936]
[3, 6.066666666666667, 37378936]
Marlon Brando [3, 6.066666666666667, 37378936]
Daniel Radcliffe [15, 5.840000000000001, 46460405]
[15, 5.840000000000001, 46460405]
Rupert Grint [15, 5.840000000000001, 46460405]
[15, 5.840000000000001, 46460405]
Richard Harris [15, 5.840000000000001, 46460405]
[15, 5.840000000000001, 46460405]
Daniel Radcliffe [15, 5.840000000000001, 46460405]


In [67]:
# Merge all_df and cast info
all_dc_df = pd.merge(all_d_df, movie_actor_df, lon=['movie_title','director'] ,how='left')

### 3. book and author popularity