In [2]:
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt 
# Apply seaborn's default theme so every figure drawn later shares one style.
sb.set() 

In [3]:
# Raw TMDB export; the file is expected to sit next to this notebook.
MOVIE_DATA_CSV = 'TMBD Movie Dataset.csv'
movie_data = pd.read_csv(MOVIE_DATA_CSV)

We will use profit and popularity as the two main measures of a movie's success.

In [4]:
# Success measures, each kept as its own single-column DataFrame.
profit = movie_data['profit'].to_frame()
popularity = movie_data['popularity'].to_frame()

We select the factors that we identified as relevant to the success of a movie.

In [5]:
# Candidate predictors, each pulled out of the raw table as a single-column DataFrame.
budget = movie_data['budget'].to_frame()
cast = movie_data['cast'].to_frame()
director = movie_data['director'].to_frame()
genres = movie_data['genres'].to_frame()
runtime = movie_data['runtime'].to_frame()
production_companies = movie_data['production_companies'].to_frame()
release_year = movie_data['release_year'].to_frame()
release_date = movie_data['release_date'].to_frame()

# Working table: every predictor followed by the two success measures.
selected_columns = [
    'budget',
    'cast',
    'director',
    'genres',
    'production_companies',
    'release_year',
    'release_date',
    'runtime',
    'popularity',
    'profit',
]
allfactors = movie_data[selected_columns]

Removing NaN values

In [6]:
# dropna() returns a new frame rather than modifying in place, so its result
# must be assigned; otherwise no rows are removed. The index is rebuilt
# afterwards so row labels run 0..n-1 without gaps.
allfactors = allfactors.dropna().reset_index(drop=True)
print(f"The shape of the new dataset: {allfactors.shape}")

# Sanity check: should display False once every row containing a NaN is gone.
allfactors.isnull().values.any()

The shape of the new dataset: (1287, 10)


False

In [7]:
# release_date is a 'YYYY-MM-DD' string; split it into its three parts.
date_parts = allfactors['release_date'].str.split('-', expand=True)
date_parts.columns = ['year', 'month', 'day']

# Only the month is kept as a new feature: release_year already carries the
# year, and the year embedded in release_date can disagree with it for older
# titles (e.g. 2065-12-16 next to release_year 1965 in the displayed table).
allfactors['month'] = date_parts['month']

# Separate the predictors from the success measures.
factor_columns = [
    'budget',
    'cast',
    'director',
    'genres',
    'production_companies',
    'release_year',
    'release_date',
    'runtime',
    'month',
]
success_columns = ['popularity', 'profit']
factors = allfactors[factor_columns]
success = allfactors[success_columns]


In [13]:
def _explode_column(frame, column, separator="|"):
    """Return a one-column DataFrame with one row per separated entry.

    Each value of ``frame[column]`` is split on ``separator`` and every piece
    gets its own row. The original row label is repeated for each piece, so
    the result can still be matched back to the row it came from. ``frame``
    itself is not modified.
    """
    return frame[column].str.split(separator).explode().to_frame()


# Build the long-format tables from the cleaned `allfactors` table (not from
# the raw `movie_data` columns) so their row labels line up with `factors`
# and `success`.
cast = _explode_column(allfactors, "cast")
director = _explode_column(allfactors, "director")
production_companies = _explode_column(allfactors, "production_companies")

In [9]:
def encodetable(y, separator):
    """Multi-hot encode a collection of separator-delimited strings.

    Every element of ``y`` is converted with ``str()`` and split on
    ``separator``; each distinct piece found anywhere in ``y`` becomes one
    column of the result.

    Parameters
    ----------
    y : iterable
        Values to encode, e.g. a pandas Series of ``'Action|Drama'`` strings.
        It is consumed in a single pass, so a generator works as well.
        Because values go through ``str()``, a missing value (NaN/None)
        produces its own ``'nan'``/``'None'`` column.
    separator : str
        Delimiter between the options inside one value.

    Returns
    -------
    pandas.DataFrame
        One row per element of ``y``, labelled 0..n-1 by position (the index
        of ``y`` is not carried over), and one int64 column per distinct
        option in sorted order. A cell is 1 when the option occurs in that
        row and 0 otherwise.
    """
    # Split each entry exactly once; a set per row gives fast membership tests.
    tokens_per_row = [set(str(val).split(separator)) for val in y]

    # Union of all options, sorted so the column order is deterministic.
    options = sorted(set().union(*tokens_per_row))

    # Build the whole 0/1 matrix up front instead of zero-filling through
    # `.values` (a read-only array under pandas Copy-on-Write) and then
    # setting cells one at a time.
    rows = [
        [int(option in tokens) for option in options]
        for tokens in tokens_per_row
    ]
    return pd.DataFrame(rows, index=range(len(rows)), columns=options, dtype="int64")

In [10]:
# One 0/1 indicator column per genre; a movie can belong to several genres,
# which are '|'-separated in the `genres` column.
encoded_genre = encodetable(y=factors['genres'], separator='|')
encoded_genre

Unnamed: 0,Action,Adventure,Animation,Comedy,Crime,Documentary,Drama,Family,Fantasy,Foreign,History,Horror,Music,Mystery,Romance,Science Fiction,Thriller,War,Western
0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0
1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0
2,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0
3,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0
4,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1282,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1283,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1284,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1285,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0


In [None]:
# Persist every prepared table as '<name>.pkl' in the working directory.
tables_to_save = {
    'factors': factors,
    'success': success,
    'cast': cast,
    'director': director,
    'encoded_genre': encoded_genre,
    'production_companies': production_companies,
}
for table_name, table in tables_to_save.items():
    table.to_pickle(f'{table_name}.pkl')

In [12]:
# Display the predictor table, including the derived `month` column.
factors

Unnamed: 0,budget,cast,director,genres,production_companies,release_year,release_date,runtime,month
0,150000000.0,Chris Pratt|Bryce Dallas Howard|Irrfan Khan|Vi...,Colin Trevorrow,Action|Adventure|Science Fiction|Thriller,Universal Studios|Amblin Entertainment|Legenda...,2015,2015-06-09,124,06
1,150000000.0,Tom Hardy|Charlize Theron|Hugh Keays-Byrne|Nic...,George Miller,Action|Adventure|Science Fiction|Thriller,Village Roadshow Pictures|Kennedy Miller Produ...,2015,2015-05-13,120,05
2,110000000.0,Shailene Woodley|Theo James|Kate Winslet|Ansel...,Robert Schwentke,Adventure|Science Fiction|Thriller,Summit Entertainment|Mandeville Films|Red Wago...,2015,2015-03-18,119,03
3,200000000.0,Harrison Ford|Mark Hamill|Carrie Fisher|Adam D...,J.J. Abrams,Action|Adventure|Science Fiction|Fantasy,Lucasfilm|Truenorth Productions|Bad Robot,2015,2015-12-15,136,12
4,190000000.0,Vin Diesel|Paul Walker|Jason Statham|Michelle ...,James Wan,Action|Crime|Thriller,Universal Pictures|Original Film|Media Rights ...,2015,2015-04-01,137,04
...,...,...,...,...,...,...,...,...,...
1282,7000000.0,Roger Moore|Yaphet Kotto|Jane Seymour|Clifton ...,Guy Hamilton,Adventure|Action|Thriller,Eon Productions|Metro-Goldwyn-Mayer (MGM),1973,1973-07-05,121,07
1283,11000000.0,Sean Connery|Claudine Auger|Adolfo Celi|Lucian...,Terence Young,Adventure|Action|Thriller,Eon Productions|Metro-Goldwyn-Mayer (MGM),1965,2065-12-16,130,12
1284,7000000.0,George Lazenby|Diana Rigg|Telly Savalas|Gabrie...,Peter R. Hunt,Adventure|Action|Thriller,Eon Productions|Metro-Goldwyn-Mayer (MGM)|Danjaq,1969,2069-12-12,142,12
1285,300000.0,Donald Pleasence|Jamie Lee Curtis|P.J. Soles|N...,John Carpenter,Horror|Thriller,Compass International Pictures|Falcon Internat...,1978,1978-10-25,91,10
