## I imported all the necessary libraries and loaded a file of profit data for movies

In [521]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
from IPython.display import display, Markdown
import json
import sqlite3
import numpy as np
%matplotlib inline

# Load the gzipped movie-budgets CSV into a dataframe.
thenums_df = pd.read_csv('data/zippedData/tn.movie_budgets.csv.gz')


## I made a function that turned object columns into integer columns ###

I ran this function on the money columns of this dataframe.


In [312]:
# This function turns currency-formatted text columns into integers
def make_column_integer(df, series):
    """Convert a currency-formatted text column of ``df`` to integers, in place.

    Dollar signs and thousands separators (commas) are stripped from every
    value before the column is cast to ``int``. Calling the function again on
    a column that is already numeric is harmless, so the notebook cell can be
    re-run without restarting the kernel.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column; it is modified in place.
    series : str
        Name of the column to convert.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If a value still is not a valid integer after stripping.
    """
    column = df[series]
    # Numeric columns have no ``.str`` accessor, so only strip text columns.
    if not pd.api.types.is_numeric_dtype(column):
        # regex=False: '$' is the regex end-of-string anchor, so it must be
        # matched literally regardless of the pandas version's default.
        column = (
            column.str.replace('$', '', regex=False)
                  .str.replace(',', '', regex=False)
        )
    df[series] = column.astype(int)

# Convert each money column of this dataframe to integers
for money_column in ('production_budget', 'domestic_gross', 'worldwide_gross'):
    make_column_integer(thenums_df, money_column)


## I then made 2 new columns: 

One column called "total_gross" which was the sum of the domestic and worldwide grossings, and another column called "total_profit" that is the total gross minus the production cost.



In [313]:
# Combined gross: domestic takings plus worldwide takings.
# NOTE(review): if worldwide_gross already includes the domestic figure, this
# sum counts domestic revenue twice — confirm against the data source.
thenums_df['total_gross'] = thenums_df['domestic_gross'].add(thenums_df['worldwide_gross'])
# Profit: combined gross less the production budget.
thenums_df['total_profit'] = thenums_df['total_gross'].sub(thenums_df['production_budget'])
# Order the frame in place, most profitable film first, then display it.
thenums_df.sort_values(by='total_profit', ascending=False, inplace=True)
thenums_df

Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross,total_gross,total_profit
0,1,"Dec 18, 2009",Avatar,425000000,760507625,2776345279,3536852904,3111852904
5,6,"Dec 18, 2015",Star Wars Ep. VII: The Force Awakens,306000000,936662225,2053311220,2989973445,2683973445
42,43,"Dec 19, 1997",Titanic,200000000,659363944,2208208395,2867572339,2667572339
6,7,"Apr 27, 2018",Avengers: Infinity War,300000000,678815482,2048134200,2726949682,2426949682
33,34,"Jun 12, 2015",Jurassic World,215000000,652270625,1648854864,2301125489,2086125489
...,...,...,...,...,...,...,...,...
480,81,"Dec 31, 2019",Army of the Dead,90000000,0,0,0,-90000000
479,80,"Dec 13, 2017",Bright,90000000,0,0,0,-90000000
341,42,"Jun 14, 2019",Men in Black: International,110000000,3100000,3100000,6200000,-103800000
194,95,"Dec 31, 2020",Moonfall,150000000,0,0,0,-150000000


## I imported a new dataframe from the IMDB database that contained the genres of the movies.

In [314]:
# Load the gzipped IMDb title-basics CSV (it carries the genres column used below).
imdb_title_basics = pd.read_csv('data/zippedData/imdb.title.basics.csv.gz')

In [315]:
# Display the frame to inspect its columns.
imdb_title_basics

Unnamed: 0,tconst,primary_title,original_title,start_year,runtime_minutes,genres
0,tt0063540,Sunghursh,Sunghursh,2013,175.0,"Action,Crime,Drama"
1,tt0066787,One Day Before the Rainy Season,Ashad Ka Ek Din,2019,114.0,"Biography,Drama"
2,tt0069049,The Other Side of the Wind,The Other Side of the Wind,2018,122.0,Drama
3,tt0069204,Sabse Bada Sukh,Sabse Bada Sukh,2018,,"Comedy,Drama"
4,tt0100275,The Wandering Soap Opera,La Telenovela Errante,2017,80.0,"Comedy,Drama,Fantasy"
...,...,...,...,...,...,...
146139,tt9916538,Kuambil Lagi Hatiku,Kuambil Lagi Hatiku,2019,123.0,Drama
146140,tt9916622,Rodolpho Teóphilo - O Legado de um Pioneiro,Rodolpho Teóphilo - O Legado de um Pioneiro,2015,,Documentary
146141,tt9916706,Dankyavar Danka,Dankyavar Danka,2013,,Comedy
146142,tt9916730,6 Gunn,6 Gunn,2017,116.0,




## I merged the profits dataframe and the title basics dataframe



In [501]:
# Inner-join the IMDb title metadata to the budget data on an exact title match
# (IMDb 'primary_title' against the budget table's 'movie').
# NOTE(review): a title alone does not identify a film — in the displayed result the
# 1997 "Titanic" budget row is paired with a title whose genre is "Family". Consider
# also matching on release year.
gandp_df = pd.merge(imdb_title_basics, thenums_df, how='inner', left_on='primary_title', right_on='movie')

In [502]:
# Order by highest profit first, then discard the redundant columns
redundant_columns = ['original_title', 'start_year', 'id', 'movie']
gandp_df = (
    gandp_df
    .sort_values('total_profit', ascending=False)
    .drop(columns=redundant_columns)
)

In [503]:
# Keep only the first row seen for each title; later rows with the same title are discarded
repeated_title = gandp_df.duplicated(subset='primary_title', keep='first')
gandp_df = gandp_df[~repeated_title]

In [504]:
# Re-index the rows by their IMDb identifier (tconst)
gandp_df = gandp_df.set_index('tconst')

In [505]:
# Discard every row whose genres value is missing
gandp_df = gandp_df.dropna(subset=['genres'])

In [506]:
# Manually overwrite the genres stored for the row indexed tt1775309 (the "Avatar" row).
# NOTE(review): presumably the title-only merge matched a different film that is also
# called "Avatar"; verify that this tconst is the intended title.
gandp_df.at['tt1775309', 'genres'] = 'Action,Adventure,Fantasy'
# Display the cleaned frame.
gandp_df

Unnamed: 0_level_0,primary_title,runtime_minutes,genres,release_date,production_budget,domestic_gross,worldwide_gross,total_gross,total_profit
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
tt1775309,Avatar,93.0,"Action,Adventure,Fantasy","Dec 18, 2009",425000000,760507625,2776345279,3536852904,3111852904
tt8852130,Titanic,115.0,Family,"Dec 19, 1997",200000000,659363944,2208208395,2867572339,2667572339
tt4154756,Avengers: Infinity War,149.0,"Action,Adventure,Sci-Fi","Apr 27, 2018",300000000,678815482,2048134200,2726949682,2426949682
tt0369610,Jurassic World,124.0,"Action,Adventure,Sci-Fi","Jun 12, 2015",215000000,652270625,1648854864,2301125489,2086125489
tt0848228,The Avengers,143.0,"Action,Adventure,Sci-Fi","May 4, 2012",225000000,623279547,1517935897,2141215444,1916215444
...,...,...,...,...,...,...,...,...,...
tt7504726,Call of the Wild,,"Adventure,Animation,Family","Feb 21, 2020",82000000,0,0,0,-82000000
tt1305591,Mars Needs Moms,88.0,"Adventure,Animation,Family","Mar 11, 2011",150000000,21392758,39549758,60942516,-89057484
tt5519340,Bright,117.0,"Action,Crime,Fantasy","Dec 13, 2017",90000000,0,0,0,-90000000
tt2283336,Men in Black: International,115.0,"Action,Adventure,Comedy","Jun 14, 2019",110000000,3100000,3100000,6200000,-103800000


## I made a list of all possible genres for each movie

In [510]:
# Break each movie's comma-separated genres string into a list of genre names
list_of_genres = gandp_df['genres'].str.split(',')

In [511]:
# Flatten the per-movie genre lists into one list (duplicates included)
nunique_genres = [
    genre
    for genres_of_movie in list_of_genres
    for genre in genres_of_movie
]

# Collapse to the distinct genre names and display them
unique_genres = set(nunique_genres)
unique_genres

{'Action',
 'Adventure',
 'Animation',
 'Biography',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Family',
 'Fantasy',
 'History',
 'Horror',
 'Music',
 'Musical',
 'Mystery',
 'News',
 'Reality-TV',
 'Romance',
 'Sci-Fi',
 'Sport',
 'Thriller',
 'War',
 'Western'}

## I cleaned up and separated the total movie dataframe
### I moved all the movies with no gross earnings into a dataframe called "stream_movies", and I took all the others and named the new dataframe "theater_movies"

In [512]:
# Movies with no recorded gross at all, biggest budget first; the gross and
# profit columns carry no information for these rows, so they are dropped.
# NOTE(review): a zero gross may also mean the film was unreleased when the data
# was collected — confirm that these are all streaming releases.
gross_columns = ['domestic_gross', 'worldwide_gross', 'total_gross', 'total_profit']
stream_movies = (
    gandp_df[gandp_df['total_gross'].eq(0)]
    .sort_values(by='production_budget', ascending=False)
    .drop(columns=gross_columns)
)

In [513]:
# Every movie that recorded some gross earnings
has_gross = gandp_df['total_gross'].ne(0)
theater_movies = gandp_df.loc[has_gross]

## I then broke up the dataframe into sub-dataframes, for each genre we are analyzing.
#### I then broke up the genre dataframes into theatrical releases and streaming releases


In [526]:
# One sub-dataframe of theatrical releases per genre group.
# 'Action|Adventure' is a regex alternation: a row matches when its genres mention
# either name. (The Python expression `'Action' or 'Adventure'` evaluates to just
# 'Action', so it would silently ignore Adventure.)
action_movies = theater_movies[theater_movies['genres'].str.contains('Action|Adventure')]
drama_movies = theater_movies[theater_movies['genres'].str.contains('Drama')]
comedy_movies = theater_movies[theater_movies['genres'].str.contains('Comedy')]
horror_movies = theater_movies[theater_movies['genres'].str.contains('Horror')]

In [527]:
# One sub-dataframe of streaming releases per genre group.
# 'Action|Adventure' is a regex alternation: a row matches when its genres mention
# either name. (The Python expression `'Action' or 'Adventure'` evaluates to just
# 'Action', so it would silently ignore Adventure.)
action_movies_st = stream_movies[stream_movies['genres'].str.contains('Action|Adventure')]
drama_movies_st = stream_movies[stream_movies['genres'].str.contains('Drama')]
comedy_movies_st = stream_movies[stream_movies['genres'].str.contains('Comedy')]
horror_movies_st = stream_movies[stream_movies['genres'].str.contains('Horror')]

## This web-scrapes the keywords from IMDb for each movie
### It then organizes the first five keywords into a list for each movie

In [533]:
def get_tags(data, max_tags=5, timeout=10):
    """Scrape the leading IMDb keywords for every title in ``data``.

    One HTTP request is made per row, to the IMDb keywords page of the
    title whose identifier is the row's index value.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose index holds the title identifiers (tconst values).
    max_tags : int, optional
        Maximum number of keywords kept per title (default 5).
    timeout : float, optional
        Seconds to wait for each HTTP response (default 10), so that one
        stalled request cannot hang the whole scrape.

    Returns
    -------
    list of list
        One entry per row of ``data``, in index order. Each entry is the list
        of keyword strings found (at most ``max_tags``), or ``[None]`` when
        the page could not be fetched or held no keyword table.
    """
    list_of_tags = []
    for tconst in data.index:
        url = f'https://www.imdb.com/title/{tconst}/keywords?ref_=tt_stry_kw'
        try:
            response = requests.get(url, timeout=timeout)
            soup = BeautifulSoup(response.text, 'lxml')
            # IndexError here means the page has no keyword table.
            table = soup.find_all('table', {'class': 'dataTable'})[0]
            cells = table.find_all('td', {'class': 'soda sodavote'})
            # AttributeError here means a cell has no <a> element.
            tags = [cell.find('a').text for cell in cells[:max_tags]]
        except (requests.RequestException, IndexError, AttributeError):
            # Best effort: keep one placeholder per title so the result stays
            # aligned with data.index.
            tags = [None]
        list_of_tags.append(tags)

    return list_of_tags

In [590]:
# Scrape the keywords for the theatrical horror movies (one web request per row).
get_tags(horror_movies)

[['killer clown',
  'based on novel',
  'evil clown',
  'balloon',
  'supernatural power'],
 [None],
 ['nova scotia', 'zombie', 'walled city', 'severed arm', 'south korea'],
 ['survival', 'sign language', 'deafness', 'silence', 'post apocalypse'],
 ['love interest',
  'chinawoman',
  'tattooed trash',
  'cleavage',
  'prehistoric animal']]

In [None]:
# Scrape the keywords for the theatrical action movies (one web request per row).
get_tags(action_movies)

In [None]:
# Scrape the keywords for the theatrical comedy movies (one web request per row).
get_tags(comedy_movies)

In [None]:
# Scrape the keywords for the theatrical drama movies, the one genre frame not yet
# scraped. get_tags requires a dataframe argument; calling it with none raises TypeError.
get_tags(drama_movies)