In [1]:
import pandas as pd
import re

## Reading in data and looking at values - no changes

In [2]:
## Read the training data from train.csv (path is relative to the working directory)
## and preview the first three rows

movie_training_data = pd.read_csv('train.csv')

movie_training_data.head(3)

Unnamed: 0,id,belongs_to_collection,budget,genres,homepage,imdb_id,original_language,original_title,overview,popularity,...,release_date,runtime,spoken_languages,status,tagline,title,Keywords,cast,crew,revenue
0,1,"[{'id': 313576, 'name': 'Hot Tub Time Machine ...",14000000,"[{'id': 35, 'name': 'Comedy'}]",,tt2637294,en,Hot Tub Time Machine 2,"When Lou, who has become the ""father of the In...",6.575393,...,2/20/15,93.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The Laws of Space and Time are About to be Vio...,Hot Tub Time Machine 2,"[{'id': 4379, 'name': 'time travel'}, {'id': 9...","[{'cast_id': 4, 'character': 'Lou', 'credit_id...","[{'credit_id': '59ac067c92514107af02c8c8', 'de...",12314651
1,2,"[{'id': 107674, 'name': 'The Princess Diaries ...",40000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,tt0368933,en,The Princess Diaries 2: Royal Engagement,Mia Thermopolis is now a college graduate and ...,8.248895,...,8/6/04,113.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,It can take a lifetime to find true love; she'...,The Princess Diaries 2: Royal Engagement,"[{'id': 2505, 'name': 'coronation'}, {'id': 42...","[{'cast_id': 1, 'character': 'Mia Thermopolis'...","[{'credit_id': '52fe43fe9251416c7502563d', 'de...",95149435
2,3,,3300000,"[{'id': 18, 'name': 'Drama'}]",http://sonyclassics.com/whiplash/,tt2582802,en,Whiplash,"Under the direction of a ruthless instructor, ...",64.29999,...,10/10/14,105.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The road to greatness can take you to the edge.,Whiplash,"[{'id': 1416, 'name': 'jazz'}, {'id': 1523, 'n...","[{'cast_id': 5, 'character': 'Andrew Neimann',...","[{'credit_id': '54d5356ec3a3683ba0000039', 'de...",13092000


In [3]:
## Looking at the "belongs_to_collection" value of the first record. It provides additional data about the
## collection (id, name, poster/backdrop paths), which might be useful for finding out how sequels impact revenue.
## The column is selected by name rather than by position so the lookup keeps working if the column order changes.

movie_training_data['belongs_to_collection'].iloc[0]

"[{'id': 313576, 'name': 'Hot Tub Time Machine Collection', 'poster_path': '/iEhb00TGPucF0b4joM1ieyY026U.jpg', 'backdrop_path': '/noeTVcgpBiD48fDjFVic1Vz7ope.jpg'}]"

In [4]:
## Check the dimensions: the training data has 3000 records (rows) and 23 columns.

movie_training_data.shape

(3000, 23)

In [5]:
## Column dtypes and non-null counts. Several columns are sparsely populated:
## belongs_to_collection (604 of 3000), homepage (946) and tagline (2403).
movie_training_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 23 columns):
id                       3000 non-null int64
belongs_to_collection    604 non-null object
budget                   3000 non-null int64
genres                   2993 non-null object
homepage                 946 non-null object
imdb_id                  3000 non-null object
original_language        3000 non-null object
original_title           3000 non-null object
overview                 2992 non-null object
popularity               3000 non-null float64
poster_path              2999 non-null object
production_companies     2844 non-null object
production_countries     2945 non-null object
release_date             3000 non-null object
runtime                  2998 non-null float64
spoken_languages         2980 non-null object
status                   3000 non-null object
tagline                  2403 non-null object
title                    3000 non-null object
Keywords             

In [6]:
## Pairwise correlation between the initial numeric columns. Budget correlates strongly with revenue (~0.75)
## and popularity moderately (~0.46); runtime is weak (~0.22) and id is effectively uncorrelated.
## The numeric columns are selected explicitly: pandas >= 2.0 no longer drops the text columns
## silently in .corr() and raises on them instead.

corr_matrix = movie_training_data.select_dtypes(include='number').corr()

corr_matrix.style.background_gradient(cmap = 'coolwarm')

Unnamed: 0,id,budget,popularity,runtime,revenue
id,1.0,0.0197323,-0.00747,0.0107498,0.000609564
budget,0.0197323,1.0,0.342356,0.238373,0.752965
popularity,-0.00747,0.342356,1.0,0.13369,0.46146
runtime,0.0107498,0.238373,0.13369,1.0,0.21638
revenue,0.000609564,0.752965,0.46146,0.21638,1.0


## Expanding features

#### Looking at film language - heavily skewed to English titles

In [7]:
## Total revenue per original language, as a two-column frame (original_language, revenue)
language_pct_of_revenue = (
    movie_training_data
    .groupby(['original_language'])['revenue']
    .sum()
    .reset_index()
)

In [8]:
## Calculating total revenue, then figuring out the share of total revenue per language.
## The share is kept as a float (not a '%f'-formatted string) so it sorts and aggregates numerically.

total_rev = language_pct_of_revenue['revenue'].sum()

language_pct_of_revenue['pct_of_total'] = language_pct_of_revenue['revenue'] / total_rev

In [9]:
## Rank languages by their share of revenue - English accounts for 96% of all revenue in our training data.
## Sorting on the numeric revenue column gives the same ranking as the share (share = revenue / total)
## and does not depend on how pct_of_total is stored.

language_pct_of_revenue.sort_values(['revenue'], ascending = False).reset_index(drop = True).head()

Unnamed: 0,original_language,revenue,pct_of_total
0,en,192264714932,0.960471
1,zh,1337151014,0.00668
2,fr,1336340121,0.006676
3,ja,1134116556,0.005666
4,hi,1064547487,0.005318


#### Exploring cast - the idea that certain actors/actresses bring in revenue

In [10]:
def meta_data_search(regex, column_searched, data=None):
    """Run a regex over every value of one column and collect the matches per row.

    Parameters
    ----------
    regex : str
        Pattern passed to ``re.compile``. With a single capturing group,
        ``findall`` returns the captured text of each match.
    column_searched : str
        Name of the column whose values are searched.
    data : pandas.DataFrame, optional
        Frame to search. Defaults to the notebook-level ``movie_training_data``,
        so existing two-argument calls behave as before.

    Returns
    -------
    list of list
        One inner list per row, in row order, holding that row's matches.
        Rows with no match (including missing values) yield an empty list.
    """
    if data is None:
        # Fall back to the global training frame, looked up at call time.
        data = movie_training_data
    pattern = re.compile(regex)
    # str() so non-string cells (e.g. NaN for missing metadata) can be searched without raising.
    return [pattern.findall(str(value)) for value in data[column_searched]]

In [11]:
## Cast IDs look to be listed in the order the cast members are credited.
## The pattern is a raw string so the regex escapes (\s, \d) are not treated as
## (invalid) Python string escapes; the quote before id keeps 'cast_id' and 'credit_id' from matching.

movie_training_data['cast_ids'] = meta_data_search(r"'id':\s(\d+)", "cast")

In [12]:
## Find actor/actress names and append them to a list - accounting for full names made of 1-4 words.
## Backup option in case id# isn't reliable.
## The first word is matched with \w+ (a bare \w only matched one-letter names, so single-word names were dropped).
## Names containing punctuation (hyphens, periods, apostrophes) are still not matched by \w.

movie_training_data['cast_names'] = meta_data_search(r"'name':\s'(\w+(?:\s\w+){0,3})'", "cast")