#### Authors: Laurent Mundell, Justin Fleury
#### Date: 12/3/2019

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
%matplotlib inline

# Genres average profit Margin 
Results: ~85% success rate in predicting tomorrow's rain outcome (still plenty of potential)
- The most important features to predict rain are locations and wind directions. Whether it rained today doesn't seem very important from the model (however it might just be correlated with another variable)¶

# Global/Misc Variable Declaration

In [64]:
regex = re.compile('[@_!#$%^&*()<>?/\|}{~:]') 

#The features we'll keep in the ultimate dataframe
ultimate_features = ['tconst',  'start_year', 'runtime_minutes', 'title_CLEANED', 'averagerating',
                     'numvotes', 'directors', 'nconst', 'category', 'release_date', 'domestic_gross',
                     'production_budget','worldwide_gross', 'primary_name', 'genres_CLEANED', 'Action', 'Animation', 
                     'Comedy', 'Romance', 'Drama', 'Thriller', 'Western', 'Crime', 'Mystery', 'Sci-Fi', 'Fantasy',
                     'Horror', 'Adventure', 'Music', 'Biography', 'Sport', 'Family', 'War', 'History', 'Musical', 
                     'Documentary', 'News']
genres_labels = ['Action', 'Animation',  'Comedy', 'Romance', 'Drama', 'Thriller', 'Western', 'Crime', 'Mystery', 'Sci-Fi', 
          'Fantasy','Horror', 'Adventure', 'Music', 'Biography', 'Sport', 'Family', 'War', 'History', 'Musical', 
           'Documentary', 'News']

money = ['production_budget','domestic_gross','worldwide_gross']

# Methods/Functions

# Load Data

In [4]:
df_budgets = pd.read_csv('data/tn.movie_budgets.csv.gz') # GROSS & NAMES
df_imdb_basics = pd.read_csv('data/imdb.title.basics.csv.gz') # GENRES & NAMES
df_imdb_ratings = pd.read_csv('data/imdb.title.ratings.csv.gz') # T constant #n constant #rating
df_imdb_name = pd.read_csv('data/imdb.name.basics.csv.gz') # director/actor names, # n constant, # t constant
df_imdb_crew = pd.read_csv('data/imdb.title.crew.csv.gz') # Director n constant, t constant 
df_imdb_principals = pd.read_csv('data/imdb.title.principals.csv.gz')
df_tmdb = pd.read_csv('data/tmdb.movies.csv.gz') # POPULAITY, NAMES
df_rt_info = pd.read_csv('data/rt.movie_info.tsv.gz', delimiter = '\t') # Director names

# Visualize/Explore Data

### <span style='color:red'>------BUDGETS</span>

In [5]:
print(df_budgets.shape, '\n')
print(df_budgets.info())
df_budgets.head()

(5782, 6) 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5782 entries, 0 to 5781
Data columns (total 6 columns):
id                   5782 non-null int64
release_date         5782 non-null object
movie                5782 non-null object
production_budget    5782 non-null object
domestic_gross       5782 non-null object
worldwide_gross      5782 non-null object
dtypes: int64(1), object(5)
memory usage: 271.1+ KB
None


Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross
0,1,"Dec 18, 2009",Avatar,"$425,000,000","$760,507,625","$2,776,345,279"
1,2,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,"$410,600,000","$241,063,875","$1,045,663,875"
2,3,"Jun 7, 2019",Dark Phoenix,"$350,000,000","$42,762,350","$149,762,350"
3,4,"May 1, 2015",Avengers: Age of Ultron,"$330,600,000","$459,005,868","$1,403,013,963"
4,5,"Dec 15, 2017",Star Wars Ep. VIII: The Last Jedi,"$317,000,000","$620,181,382","$1,316,721,747"


### <span style='color:red'>------IMDB_BASICS</span>

In [6]:
print(df_imdb_basics.shape, '\n')
print(df_imdb_basics.info())
df_imdb_basics.head()

(146144, 6) 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 146144 entries, 0 to 146143
Data columns (total 6 columns):
tconst             146144 non-null object
primary_title      146144 non-null object
original_title     146123 non-null object
start_year         146144 non-null int64
runtime_minutes    114405 non-null float64
genres             140736 non-null object
dtypes: float64(1), int64(1), object(4)
memory usage: 6.7+ MB
None


Unnamed: 0,tconst,primary_title,original_title,start_year,runtime_minutes,genres
0,tt0063540,Sunghursh,Sunghursh,2013,175.0,"Action,Crime,Drama"
1,tt0066787,One Day Before the Rainy Season,Ashad Ka Ek Din,2019,114.0,"Biography,Drama"
2,tt0069049,The Other Side of the Wind,The Other Side of the Wind,2018,122.0,Drama
3,tt0069204,Sabse Bada Sukh,Sabse Bada Sukh,2018,,"Comedy,Drama"
4,tt0100275,The Wandering Soap Opera,La Telenovela Errante,2017,80.0,"Comedy,Drama,Fantasy"


### <span style='color:red'>------IMDB_RATING</span>

In [7]:
print(df_imdb_ratings.shape, '\n')
print(df_imdb_ratings.info())
df_imdb_ratings.head()

(73856, 3) 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73856 entries, 0 to 73855
Data columns (total 3 columns):
tconst           73856 non-null object
averagerating    73856 non-null float64
numvotes         73856 non-null int64
dtypes: float64(1), int64(1), object(1)
memory usage: 1.7+ MB
None


Unnamed: 0,tconst,averagerating,numvotes
0,tt10356526,8.3,31
1,tt10384606,8.9,559
2,tt1042974,6.4,20
3,tt1043726,4.2,50352
4,tt1060240,6.5,21


### <span style='color:red'>------IMDB_NAME</span>

In [8]:
print(df_imdb_name.shape, '\n')
print(df_imdb_name.info())
df_imdb_name.head()

(606648, 6) 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 606648 entries, 0 to 606647
Data columns (total 6 columns):
nconst                606648 non-null object
primary_name          606648 non-null object
birth_year            82736 non-null float64
death_year            6783 non-null float64
primary_profession    555308 non-null object
known_for_titles      576444 non-null object
dtypes: float64(2), object(4)
memory usage: 27.8+ MB
None


Unnamed: 0,nconst,primary_name,birth_year,death_year,primary_profession,known_for_titles
0,nm0061671,Mary Ellen Bauder,,,"miscellaneous,production_manager,producer","tt0837562,tt2398241,tt0844471,tt0118553"
1,nm0061865,Joseph Bauer,,,"composer,music_department,sound_department","tt0896534,tt6791238,tt0287072,tt1682940"
2,nm0062070,Bruce Baum,,,"miscellaneous,actor,writer","tt1470654,tt0363631,tt0104030,tt0102898"
3,nm0062195,Axel Baumann,,,"camera_department,cinematographer,art_department","tt0114371,tt2004304,tt1618448,tt1224387"
4,nm0062798,Pete Baxter,,,"production_designer,art_department,set_decorator","tt0452644,tt0452692,tt3458030,tt2178256"


### <span style='color:red'>------IMDB_CREW</span>

In [9]:
print(df_imdb_crew.shape, '\n')
print(df_imdb_crew.info())
df_imdb_crew.head()

(146144, 3) 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 146144 entries, 0 to 146143
Data columns (total 3 columns):
tconst       146144 non-null object
directors    140417 non-null object
writers      110261 non-null object
dtypes: object(3)
memory usage: 3.3+ MB
None


Unnamed: 0,tconst,directors,writers
0,tt0285252,nm0899854,nm0899854
1,tt0438973,,"nm0175726,nm1802864"
2,tt0462036,nm1940585,nm1940585
3,tt0835418,nm0151540,"nm0310087,nm0841532"
4,tt0878654,"nm0089502,nm2291498,nm2292011",nm0284943


### <span style='color:red'>------IMDB_PRINCIPALS</span>

In [10]:
print(df_imdb_principals.shape, '\n')
print(df_imdb_principals.info())
df_imdb_principals.head()

(1028186, 6) 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1028186 entries, 0 to 1028185
Data columns (total 6 columns):
tconst        1028186 non-null object
ordering      1028186 non-null int64
nconst        1028186 non-null object
category      1028186 non-null object
job           177684 non-null object
characters    393360 non-null object
dtypes: int64(1), object(5)
memory usage: 47.1+ MB
None


Unnamed: 0,tconst,ordering,nconst,category,job,characters
0,tt0111414,1,nm0246005,actor,,"[""The Man""]"
1,tt0111414,2,nm0398271,director,,
2,tt0111414,3,nm3739909,producer,producer,
3,tt0323808,10,nm0059247,editor,,
4,tt0323808,1,nm3579312,actress,,"[""Beth Boothby""]"


# Clean Data
(Parsing the names of movies to be joined later)

    -df_budgets
    -df_imdb_basics
    -df_tmdb

### <span style='color:red'>------BUDGETS</span>

In [11]:
df_budgets['title_CLEANED'] = df_budgets['movie'].apply(lambda x: x.lower())
df_budgets['title_CLEANED'] = df_budgets['title_CLEANED'].apply(lambda x: re.sub('[@_!#$%^&*().<>?/\|}{~:]', '', x))

### <span style='color:red'>------IMDB_BASICS</span>

In [12]:
df_imdb_basics['title_CLEANED'] = df_imdb_basics['original_title'].apply(lambda x: x.lower() if type(x) == str else '')
df_imdb_basics['title_CLEANED'] = df_imdb_basics['title_CLEANED'].apply(lambda x: re.sub('[@_!#$%^&*().<>?/\|}{~:]', '', x))

# Merge Data
(Merging the data by T constant values)

In [13]:
df_basics_ratings_crew_principals = pd.merge(df_imdb_basics, df_imdb_ratings,  on="tconst")
df_basics_ratings_crew_principals = pd.merge(df_basics_ratings_crew_principals, df_imdb_crew,  on="tconst")
df_basics_ratings_crew_principals = pd.merge(df_basics_ratings_crew_principals, df_imdb_principals,  on="tconst")
print(df_basics_ratings_crew_principals.shape, '\n')
print(df_basics_ratings_crew_principals.info())
df_basics_ratings_crew_principals.head()

(629755, 16) 

<class 'pandas.core.frame.DataFrame'>
Int64Index: 629755 entries, 0 to 629754
Data columns (total 16 columns):
tconst             629755 non-null object
primary_title      629755 non-null object
original_title     629755 non-null object
start_year         629755 non-null int64
runtime_minutes    567802 non-null float64
genres             624615 non-null object
title_CLEANED      629755 non-null object
averagerating      629755 non-null float64
numvotes           629755 non-null int64
directors          626240 non-null object
writers            558935 non-null object
ordering           629755 non-null int64
nconst             629755 non-null object
category           629755 non-null object
job                124945 non-null object
characters         248129 non-null object
dtypes: float64(2), int64(3), object(11)
memory usage: 81.7+ MB
None


Unnamed: 0,tconst,primary_title,original_title,start_year,runtime_minutes,genres,title_CLEANED,averagerating,numvotes,directors,writers,ordering,nconst,category,job,characters
0,tt0063540,Sunghursh,Sunghursh,2013,175.0,"Action,Crime,Drama",sunghursh,7.0,77,nm0712540,"nm0023551,nm1194313,nm0347899,nm1391276",10,nm0006210,composer,,
1,tt0063540,Sunghursh,Sunghursh,2013,175.0,"Action,Crime,Drama",sunghursh,7.0,77,nm0712540,"nm0023551,nm1194313,nm0347899,nm1391276",1,nm0474801,actor,,"[""Kundan S. Prasad"",""Bajrangi""]"
2,tt0063540,Sunghursh,Sunghursh,2013,175.0,"Action,Crime,Drama",sunghursh,7.0,77,nm0712540,"nm0023551,nm1194313,nm0347899,nm1391276",2,nm0904537,actress,,"[""Munni"",""Laila-E-Aasmaan""]"
3,tt0063540,Sunghursh,Sunghursh,2013,175.0,"Action,Crime,Drama",sunghursh,7.0,77,nm0712540,"nm0023551,nm1194313,nm0347899,nm1391276",3,nm0756379,actor,,"[""Ganeshi N. Prasad""]"
4,tt0063540,Sunghursh,Sunghursh,2013,175.0,"Action,Crime,Drama",sunghursh,7.0,77,nm0712540,"nm0023551,nm1194313,nm0347899,nm1391276",4,nm0474876,actor,,"[""Dwarka N. Prasad""]"


(Merging the data by T constant values)

In [27]:
df_final = pd.merge(df_basics_ratings_crew_principals, df_budgets,  on="title_CLEANED")
df_final = pd.merge(df_final, df_imdb_name,  on="nconst")
print(df_final.shape, '\n')
print(df_final.info())

(26024, 27) 

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26024 entries, 0 to 26023
Data columns (total 27 columns):
tconst                26024 non-null object
primary_title         26024 non-null object
original_title        26024 non-null object
start_year            26024 non-null int64
runtime_minutes       25127 non-null float64
genres                25974 non-null object
title_CLEANED         26024 non-null object
averagerating         26024 non-null float64
numvotes              26024 non-null int64
directors             25977 non-null object
writers               25104 non-null object
ordering              26024 non-null int64
nconst                26024 non-null object
category              26024 non-null object
job                   8726 non-null object
characters            10534 non-null object
id                    26024 non-null int64
release_date          26024 non-null object
movie                 26024 non-null object
production_budget     26024 non-null object


# Cleaning/Parsing Part 2!
(Cleaning the fully merged dataframe!)

In [34]:
df_final['genres_CLEANED'] = df_final['genres']
df_final = df_final.dropna(subset = ['genres_CLEANED'])
df_final['genres_CLEANED'] = df_final['genres_CLEANED'].apply(lambda x: x.split(','))

In [35]:
#Get all unique genres.
genres = {}
for index, value in df_final['genres_CLEANED'].items():
    if(type(value)) == list:
        for _ in value:
            if _ not in genres:
                genres[_] = True

# Adding columns for every genre. values are all false
for genre in list(genres.keys()):
    df_final[genre] = False
    
for genre in list(genres.keys()):
    df_final[genre] = df_final['genres_CLEANED'].apply(lambda x: True if genre in x else False)

In [36]:
# Remove Unwanted Features and be the final version of the dataframe
df_ultimate = df_final[ultimate_features]

In [37]:
for item in money:
     df_ultimate[f"{item}_CLEANED"] = df_ultimate[item].apply(lambda x: float(x.replace('$','').replace(',','_')))
df_ultimate['profit_margin'] = df_ultimate.worldwide_gross_CLEANED*100/df_ultimate.production_budget_CLEANED

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [80]:
df_ultimate.head()

Unnamed: 0,tconst,start_year,runtime_minutes,title_CLEANED,averagerating,numvotes,directors,nconst,category,release_date,...,Family,War,History,Musical,Documentary,News,production_budget_CLEANED,domestic_gross_CLEANED,worldwide_gross_CLEANED,profit_margin
0,tt0249516,2012,91.0,foodfight,1.9,8248,nm0440415,nm0257258,producer,"Dec 31, 2012",...,False,False,False,False,False,False,45000000.0,0.0,73706.0,0.163791
1,tt0249516,2012,91.0,foodfight,1.9,8248,nm0440415,nm0240381,actress,"Dec 31, 2012",...,False,False,False,False,False,False,45000000.0,0.0,73706.0,0.163791
2,tt0249516,2012,91.0,foodfight,1.9,8248,nm0440415,nm0240380,actress,"Dec 31, 2012",...,False,False,False,False,False,False,45000000.0,0.0,73706.0,0.163791
3,tt0249516,2012,91.0,foodfight,1.9,8248,nm0440415,nm0000221,actor,"Dec 31, 2012",...,False,False,False,False,False,False,45000000.0,0.0,73706.0,0.163791
4,tt0249516,2012,91.0,foodfight,1.9,8248,nm0440415,nm0519456,actress,"Dec 31, 2012",...,False,False,False,False,False,False,45000000.0,0.0,73706.0,0.163791


In [87]:
for i in genres_labels:
    print(i, df_ultimate.groupby(i).mean()['profit_margin'][1]) 

Action 335.936835028575
Animation 414.7936075654013
Comedy 354.8172848326081
Romance 397.27097372554454
Drama 368.7339804452555
Thriller 587.972700484635
Western 121.54909646339661
Crime 230.16882437468828
Mystery 776.0593049829189
Sci-Fi 303.2017814495809
Fantasy 422.06551508963724
Horror 913.7171733851573
Adventure 312.9679877109322
Music 328.7618969452663
Biography 411.34715418806127
Sport 368.1059352242902
Family 446.32472800094325
War 164.26437342130504
History 352.65006823760234
Musical 323.5896826096663
Documentary 377.5422852602764
News 153.97579607206018


In [88]:
df_ultimate.groupby(i).mean()['rating'] 

KeyError: 'rating'

##### Visual Analysis 

In [None]:
sns.pairplot(df_ultimate)
plt.show()

In [None]:
ax = df_ultimate.plot.bar(x='lab', y='profit_margin', rot=0)

# Deliverables
    (Presentation Software/Resource)
    -RISE (Jupyter plug in)
    -Reveal.js.slides (Jupyter notebook download option)
    -Canva
    -Power point
    -Google Slides
    
    (DOs and DON'Ts)
    -DO: Short text. (To keep the viewer attention)
    -DON'T: Small sentences
    -DO: Bonues Slides
    -DON'T
    -DO: Graphics
    -DON'T: Text
    -DON'T: Use feature names in presentation.
    -DO: Readme(Let them read the most important thing)