# Movie_DataQuest - Search For The Best Single Factor

In [1]:
# Dependencies
import pandas as pd
import requests
import json
from pprint import pprint
import matplotlib.pyplot as plt
from scipy.stats import linregress
import random
import numpy as np
from config import api_key
from config import api_key_txt
from config import tmdb_api_key

### Project description and outline:

Description: How do various movie characteristics affect movie success?

Definition of Success:  Total U.S. Box Office Receipts

Characteristics Considered: 
  - Genre
  - Run time
  - Budget
  - Critical reviews
  - User reviews
  - Awards / nominations
  - MPAA Rating

Hypothesis: The more closely box office receipts correlate with a given movie characteristic, 
the stronger the influence of that characteristic on movie success.



### Steve's Code Goes Here

### Introduction to IMDb Database Preparation   
Movie Data Sets Utilized:  Internet Movie Database (IMDb) & Open Movie Database (OMDb).

The IMDb contains a greater range of quantified movie characteristics than does the OMDb.  For 
this reason the IMDb was used to provide the bulk of our testing criteria.  In order to limit 
the scope of the data we pulled only information regarding movies that were feature-length, 
non-"adult", and were released between 2000 and 2019.


In [None]:
### INITIAL DATA SET ###

# Source file: a CSV of film data derived from the datasets published at
# https://www.imdb.com/interfaces/, holding feature-length films released between 2000 and 2019.
imdb_csv = "Resources/trimmed_IMDB_data.csv"

# Read the CSV into a dataframe
imdb_df = pd.read_csv(filepath_or_buffer=imdb_csv, encoding="UTF-8")

# Display the loaded frame as the cell output
imdb_df

In [None]:
### REFINING DATA SET ###

# The IMDB data holds hundreds of thousands of movies, many of which had no theatrical release.
# It is narrowed down using data constructed from https://movielens.org/, which focuses on
# released films.
movielens_csv = "Resources/movielens_links.csv"

# Read the movielens CSV into a dataframe
movielens_df = pd.read_csv(filepath_or_buffer=movielens_csv, encoding="UTF-8")

# Display the loaded frame as the cell output
movielens_df

In [None]:
# Combining the dataframes to find films that are in both

# Renaming the movielens_df "imdbTTID" column to "tconst" so it matches the key column of imdb_df.
# (Re-running this cell is harmless: renaming a column that no longer exists is a no-op.)

movielens_df = movielens_df.rename(columns={"imdbTTID": "tconst"})

# Merging dataframes: inner join keeps only films present in both sources.
# The join key is named explicitly so the merge cannot silently pick up any other
# column that the two frames might happen to share.

big_df = imdb_df.merge(movielens_df, how="inner", on="tconst")

big_df


In [None]:
### GETTING ADDITIONAL DATA FROM TMDB ###

# Selecting a sample of films for the TMDB API call

# Assigning random number
# Each row gets a reproducible pseudo-random value derived from its index label.
# A private generator seeded with the label produces the same value as seeding the
# global generator would, but the whole column is built in one assignment (instead of
# one .loc write per row) and the global `random` state is left untouched.
big_df["randomNumber"] = [random.Random(label).random() for label in big_df.index]

In [None]:
# Shuffle the films by ordering them on their random key
big_df = big_df.sort_values(by="randomNumber")

# Keep the first 5,000 rows of the shuffled frame as the working sample
smaller_df = pd.DataFrame(big_df.iloc[:5000])

# Display the sample as the cell output
smaller_df

In [None]:
# Pulling additional data via a TMDB API call

# Setting base URL
url = "https://api.themoviedb.org/3/movie/"

# Looping through dataframe: one TMDB lookup per film, storing its budget and revenue.
for index, row in smaller_df.iterrows():
    movie_id = row["tmdbId"]

    # "N/A" marks a film for which TMDB gave no usable answer; later cells filter on this sentinel.
    budget = "N/A"
    box_office = "N/A"

    # Films without a TMDB id cannot be looked up, so no request is made for them.
    if pd.notna(movie_id):
        try:
            # tmdbId may be held as a float (e.g. 483184.0); int() keeps ".0" out of the URL path.
            full_url = url + str(int(movie_id))
            # The key is passed via `params` so requests handles the query-string encoding,
            # and a timeout stops one stalled request from hanging the whole loop.
            movie_data = requests.get(full_url, params={"api_key": tmdb_api_key}, timeout=10).json()
        except (requests.RequestException, TypeError, ValueError):
            # Network failure, non-JSON body, or an id that is not a number.
            movie_data = {}

        # Error responses lack "budget"/"revenue"; .get falls back to the sentinel.
        if isinstance(movie_data, dict):
            budget = movie_data.get("budget", "N/A")
            box_office = movie_data.get("revenue", "N/A")

    smaller_df.loc[index, "Budget"] = budget
    smaller_df.loc[index, "Box Office"] = box_office

smaller_df

In [None]:
### TRIMMING DATA FOR OMDB API CALLS ###

# Trimming dataframe to only items with Box Office data
# Anything that is not a number (such as the "N/A" sentinel) becomes NaN, so a single
# mask removes both missing and zero box office figures.

revenue_numeric = pd.to_numeric(smaller_df["Box Office"], errors="coerce")
has_revenue = revenue_numeric.notna() & (revenue_numeric != 0)

# .copy() gives an independent frame, so the column assignments below do not write
# onto a filtered view of smaller_df.
boxoffice_df = smaller_df.loc[has_revenue].copy()

# Recasting Box Office as integers
# An explicit 64-bit type is used because the platform default `int` can be 32-bit,
# which is too small for large box office totals.

boxoffice_df["Box Office"] = revenue_numeric.loc[has_revenue].astype("int64")

# Recasting Budget as a number and replacing missing values with NaN
# A budget of 0 is treated as missing. Only the Budget column is touched, so
# legitimate zeros in unrelated columns are left alone.

budget_numeric = pd.to_numeric(boxoffice_df["Budget"], errors="coerce")
boxoffice_df["Budget"] = budget_numeric.replace(0, np.nan)

boxoffice_df

In [None]:
# Keep the first 900 rows of the box office frame as the analysis sample
sample_df = pd.DataFrame(boxoffice_df.iloc[:900])

# Write the sample to CSV (without the index) so the data survives between sessions
sample_df.to_csv(path_or_buf="Resources/sample.csv", index=False)

In [2]:
# Reload the saved sample from CSV so the data is available between sessions
omdb_csv = "Resources/sample.csv"

omdb_api_df = pd.read_csv(filepath_or_buffer=omdb_csv, encoding="UTF-8")

# Display the reloaded sample as the cell output
omdb_api_df

Unnamed: 0,tconst,primaryTitle,originalTitle,startYear,runtimeMinutes,genres,movieId,imdbId,tmdbId,randomNumber,Budget,Box Office
0,tt6768578,Dogman,Dogman,2018,103,"Crime,Drama,Thriller",188675,6768578,483184.0,0.000094,,5080147
1,tt1637706,Our Idiot Brother,Our Idiot Brother,2011,90,"Comedy,Drama",88672,1637706,59968.0,0.000106,5000000.0,24816118
2,tt1486185,Red Riding Hood,Red Riding Hood,2011,100,"Fantasy,Horror,Mystery",85397,1486185,49730.0,0.000177,42000000.0,89162162
3,tt1373243,Tarif Novogodniy,Tarif Novogodniy,2008,83,"Comedy,Fantasy,Romance",171549,1373243,50530.0,0.000196,3200000.0,3877492
4,tt0337879,Blackball,Blackball,2003,96,"Comedy,Drama,Sport",32128,337879,15443.0,0.000279,,48000
...,...,...,...,...,...,...,...,...,...,...,...,...
895,tt1347007,Hidden,Skjult,2009,95,"Horror,Thriller",144412,1347007,24821.0,0.133336,2215773.0,1489526
896,tt0195714,Final Destination,Final Destination,2000,98,"Horror,Thriller",3409,195714,9532.0,0.133878,23000000.0,112880294
897,tt1457767,The Conjuring,The Conjuring,2013,112,"Horror,Mystery,Thriller",103688,1457767,138843.0,0.133941,13000000.0,319494638
898,tt0280486,Bad Company,Bad Company,2002,116,"Action,Comedy,Thriller",5414,280486,3132.0,0.134483,70000000.0,65977295


#### Conclusion/Segue
The size of the IMDb data set was....   And contained Genre, Run time, Budget, Critical Reviews, User Review, Awards/Nominations and MPAA Ratings.

After the database was acquired and cleaned, it was merged with a sample of movies from the OMDb.

### Jim's Code Goes Here

## OMDB Preparation:
    
Although the IMDb does not contain some of the movie characteristics we wished to test, it does contain
the box office information required for our analysis.  We will take a 900 movie sample from the IMDb and
merge it with matching movie information in the OMDb database to form the basis of our analysis.

In [3]:
# Collect the IMDB ids (the "tconst" column) into a plain Python list
movie_ids = omdb_api_df.loc[:, "tconst"].tolist()

# Display the ids as the cell output
movie_ids

['tt6768578',
 'tt1637706',
 'tt1486185',
 'tt1373243',
 'tt0337879',
 'tt1375666',
 'tt9795368',
 'tt1446714',
 'tt2345759',
 'tt0324127',
 'tt0247638',
 'tt4044364',
 'tt4291600',
 'tt0342258',
 'tt0165982',
 'tt2948356',
 'tt4537362',
 'tt2194499',
 'tt2121382',
 'tt0381270',
 'tt0880578',
 'tt0238948',
 'tt1999987',
 'tt4129428',
 'tt1377278',
 'tt1926313',
 'tt1410063',
 'tt1410051',
 'tt0419749',
 'tt3508112',
 'tt7818580',
 'tt1270835',
 'tt0381681',
 'tt5938084',
 'tt0486655',
 'tt0266391',
 'tt1841642',
 'tt1179069',
 'tt0200550',
 'tt8359816',
 'tt5838806',
 'tt1216491',
 'tt0279112',
 'tt3569356',
 'tt2671706',
 'tt5221894',
 'tt1656186',
 'tt7016254',
 'tt1151309',
 'tt3064298',
 'tt1034415',
 'tt1313104',
 'tt0180093',
 'tt3179568',
 'tt2372678',
 'tt0492466',
 'tt7158430',
 'tt2823054',
 'tt1592525',
 'tt0449086',
 'tt4466894',
 'tt4520364',
 'tt0467197',
 'tt1235189',
 'tt6556670',
 'tt7967412',
 'tt0430912',
 'tt0390109',
 'tt0859635',
 'tt1423995',
 'tt2926810',
 'tt31

In [4]:
# One empty accumulator list per OMDb field collected by the request loop.
# The generator builds a separate list object for every name.
(
    movie_name,
    imdb_id,
    awards,
    genre,
    rated,
    metascore,
    imdb_rating,
    imdb_votes,
    rotten_tomatoes,
) = ([] for _ in range(9))


# 'tt0090605', 'tt1285016'

In [5]:
# Base OMDb lookup URL; the IMDb id of the movie we want to search for is appended after "?i=".
# HTTPS is used so the API key in the query string is not sent in clear text.
url = "https://www.omdbapi.com/?i="
# Query-string fragment carrying the OMDb API key; appended after the movie id.
api_key_tx = "&apikey=" + api_key_txt
# response = requests.get(url+api_key_tx).json()
# print(json.dumps(response, indent=4, sort_keys=True))

In [27]:
# Performing a GET request for each movie and collecting its OMDb fields

# Empty the accumulators first so that re-running this cell does not append duplicate rows.
for _accumulator in (movie_name, imdb_id, awards, genre, rated, metascore,
                     imdb_rating, imdb_votes, rotten_tomatoes):
    _accumulator.clear()

# NOTE(review): only the first two ids are requested here; widen the slice to fetch the full sample.
for movie_id in movie_ids[0:2]:

    try:
        # The timeout stops one stalled request from hanging the whole loop.
        response = requests.get(url + movie_id + api_key_tx, timeout=10)
        data = response.json()
    except (requests.RequestException, ValueError) as error:
        # Report only the exception type: the full message can contain the request URL,
        # which includes the API key.
        print(f"Skipping {movie_id}: request failed ({type(error).__name__})")
        continue

    try:
        # Read every field before appending anything, so a movie with a missing field is
        # skipped as a whole and the accumulator lists always stay the same length.
        record = {key: data[key] for key in ('Title', 'imdbID', 'Awards', 'Genre', 'Rated',
                                             'Metascore', 'imdbRating', 'imdbVotes', 'Ratings')}
    except (KeyError, TypeError):
        print(f"Skipping {movie_id}: response is missing expected fields")
        continue

    # Per the OMDb response format, 'Ratings' is a list of {"Source": ..., "Value": ...}
    # entries, so the Rotten Tomatoes score has to be searched for rather than indexed by name.
    # Movies without such an entry get None.
    rotten_tomatoes_value = None
    if isinstance(record['Ratings'], list):
        for entry in record['Ratings']:
            if isinstance(entry, dict) and entry.get('Source') == 'Rotten Tomatoes':
                rotten_tomatoes_value = entry.get('Value')
                break

    movie_name.append(record['Title'])
    imdb_id.append(record['imdbID'])
    awards.append(record['Awards'])          #String response - not numerical
    genre.append(record['Genre'])            #Multiple genres per movie
    rated.append(record['Rated'])
    metascore.append(record['Metascore'])
    imdb_rating.append(record['imdbRating'])
    imdb_votes.append(record['imdbVotes'])
    rotten_tomatoes.append(rotten_tomatoes_value)

In [25]:
# print(rotten_tomatoes)

[]


In [29]:
# Assemble the collected OMDb fields into a dataframe, one column per accumulator list

movie_df = pd.DataFrame(
    data={
        'Movie Name': movie_name,
        'IMDB ID': imdb_id,
        'Awards': awards,
        'Genre': genre,
        'Rated': rated,
        'Metascore': metascore,
        'IMDB Rating': imdb_rating,
        'IMDB Votes': imdb_votes,
        'Rotten Tomatoes Rating': rotten_tomatoes,
    }
)

In [None]:
# Display the dataframe built from the OMDb responses
movie_df


In [None]:
# Merge movie_df with omdb_api_df to get 900 movies by IMDB ID
# The two frames name the id column differently ("tconst" in omdb_api_df, "IMDB ID" in
# movie_df), so each side's key is given explicitly. Neither frame has a column called
# "imbdbid", which made the previous `on=` argument fail with a KeyError.
# NOTE(review): this assumes OMDb's imdbID uses the same "tt..."-prefixed form as tconst - confirm.
movie_sample = pd.merge(omdb_api_df, movie_df, left_on="tconst", right_on="IMDB ID")
movie_sample

#### Conclusion/Segue
The size of the IMDb data set was....   And contained Genre, Run time, Budget, Critical Reviews, User Review, Awards/Nominations and MPAA Ratings.

Once our dataset was established we were ready to run our analysis.

Segue to Box Office vs. Runtime Correlations (Kelly's stuff)

### Kelly's Code Here

## Box Office vs. Runtime Correlations
In completing the box office/runtime correlation we had the following considerations....
and faced certain issues in cleaning the data, etc. etc. ...

In [None]:
# Small hand-made frame used to develop the correlation plots.
# Run time, box office and budget are stored as numbers rather than strings so that the
# scatter plots get numeric axes and linregress can do arithmetic on the columns.
test_data_df = pd.DataFrame({"Movie Title": ["Ghostbusters", "Ghostbusters II", "Hocus Pocus"],
                             "Run Time": [100, 110, 120],
                             "Box Office Sales": [200, 250, 300],
                             "Budget": [150, 250, 350]})
test_data_df


In [None]:
# Correlate Box Office v. Run time

# pd.to_numeric guards against the columns holding numbers as text, which matplotlib
# would plot as categories instead of on a numeric scale. It is a no-op for numeric columns.
x_values = pd.to_numeric(test_data_df["Box Office Sales"])
y_values = pd.to_numeric(test_data_df["Run Time"])

plt.scatter(x_values, y_values, marker = "o", facecolors = "purple", edgecolors="black")
plt.xlabel("Box Office Sales")
plt.ylabel("Movie Run Time (in minutes)")
plt.title("Movie Run time vs Box Office Sales")

plt.show()


In [None]:
# Linear regression of Run Time on Box Office Sales, drawn over the scatter plot

# pd.to_numeric guards against the columns holding numbers as text, which linregress
# cannot do arithmetic on. It is a no-op for numeric columns.
x_values = pd.to_numeric(test_data_df["Box Office Sales"])
y_values = pd.to_numeric(test_data_df["Run Time"])

(slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)
regress_values = x_values * slope + intercept

# The sign is written out explicitly so a negative intercept reads "x - 1.5", not "x +-1.5".
line_eq = f"y = {round(slope, 2)}x {'+' if intercept >= 0 else '-'} {abs(round(intercept, 2))}"

plt.scatter(x_values, y_values, marker = "o", facecolors="purple", edgecolors="black")
plt.plot(x_values,regress_values,"r-")

plt.xlabel("Box Office Sales")
plt.ylabel("Movie Run Time")
plt.title("Movie Run Time vs Box Office Sales")
# Axes-fraction coordinates pin the equation to the upper-left corner of the plot
# whatever the data range is; fixed data coordinates can fall outside the plotted values.
plt.annotate(line_eq,(0.05,0.9),xycoords="axes fraction",fontsize=15,color="red")


# rvalue**2 is the coefficient of determination, so it is labelled as r-squared.
print(f"The r-squared is: {rvalue**2}")


plt.show()

In [None]:
# Correlate Box Office v. Budget

# pd.to_numeric guards against the columns holding numbers as text, which matplotlib
# would plot as categories instead of on a numeric scale. It is a no-op for numeric columns.
x_values = pd.to_numeric(test_data_df["Box Office Sales"])
y_values = pd.to_numeric(test_data_df["Budget"])

plt.scatter(x_values, y_values, marker = "o", facecolors = "green", edgecolors="black")
plt.xlabel("Box Office Sales")
plt.ylabel("Movie Budget Amount")
plt.title("Movie Budget Amount vs Box Office Sales")

plt.show()

# Awards / nominations
# MPAA Rating


In [None]:
# Linear regression of Budget on Box Office Sales, drawn over the scatter plot

# pd.to_numeric guards against the columns holding numbers as text, which linregress
# cannot do arithmetic on. It is a no-op for numeric columns.
x_values = pd.to_numeric(test_data_df["Box Office Sales"])
y_values = pd.to_numeric(test_data_df["Budget"])

(slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)
regress_values = x_values * slope + intercept

# The sign is written out explicitly so a negative intercept reads "x - 1.5", not "x +-1.5".
line_eq = f"y = {round(slope, 2)}x {'+' if intercept >= 0 else '-'} {abs(round(intercept, 2))}"

plt.scatter(x_values, y_values, marker = "o", facecolors="green", edgecolors="black")
plt.plot(x_values,regress_values,"r-")

# Labels describe the budget data plotted here (they previously said "Run Time").
plt.xlabel("Box Office Sales")
plt.ylabel("Movie Budget Amount")
plt.title("Movie Budget Amount vs Box Office Sales")
# Axes-fraction coordinates pin the equation to the upper-left corner of the plot
# whatever the data range is; fixed data coordinates can fall outside the plotted values.
plt.annotate(line_eq,(0.05,0.9),xycoords="axes fraction",fontsize=15,color="red")


# rvalue**2 is the coefficient of determination, so it is labelled as r-squared.
print(f"The r-squared is: {rvalue**2}")


plt.show()

#### Conclusion/Segue
Conclusions about MPAA Rating analysis
Box Office vs. Critical Reviews (Phillips' stuff)


### Philips' Code Goes Here

## Box Office vs. Critical Reviews 
In completing the box office vs. critical reviews analysis we had the following considerations....
and faced certain issues in cleaning the data, etc. etc. ...

In [None]:
# Correlate Box Office v. Critical reviews

In [None]:
# Correlate Box Office v. User reviews

#### Conclusion/Segue
Conclusions from the Critical & User reviews analysis
Segue to Awards / Nominations (Jim's Stuff).




### Jim's Correlation Section Goes Here

## Box Office vs. Awards / Nominations
In completing the box office vs. awards / nominations analysis we had the following considerations....
and faced certain issues in cleaning the data, etc. etc. ...

In [None]:
# Small hand-made frame used to develop the awards / nominations correlation.
test_df = pd.DataFrame({"Movie Title": ["True Lies", "Ghostbusters II", "Hocus Pocus"],
                             "Awards": ["3 nominations", "2 nominations", "5 nominations"],
                        "Box Office Receipts" :["32,000", "143,000", "75,000"]
                             })
# Convert the receipts to numbers. The thousands separators are stripped first, because
# pd.to_numeric cannot parse "32,000" and, with errors='coerce', would turn every value into NaN.
test_df['Box Office Receipts'] = pd.to_numeric(
    test_df['Box Office Receipts'].str.replace(",", "", regex=False), errors='coerce')
test_df

In [None]:
# Separate numbers from strings
# Pull the first run of digits out of each Awards string and store it as a float.
# The pattern is a raw string so "\d" reaches the regex engine without Python treating it
# as an (invalid) string escape; expand=False returns a Series for the single capture group.

test_df['Nominations'] = test_df['Awards'].str.extract(r'(\d+)', expand=False).astype(float)
test_df

In [None]:
# Scatter plot of Box Office Receipts against Nominations

x_values, y_values = test_df["Nominations"], test_df["Box Office Receipts"]

# Draw the points first, then label the axes and title the figure
plt.scatter(x_values, y_values, marker="o", facecolors="purple", edgecolors="black")
plt.title("Award Nominations vs Box Office Sales")
plt.ylabel("Box Office Receipts ($)")
plt.xlabel("Nominations")

plt.show()


In [None]:
# Do linear Regression and plot
# Fits Box Office Receipts against Nominations and draws the fitted line over the scatter plot.

x_values = test_df["Nominations"]
y_values = test_df["Box Office Receipts"]

(slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)
regress_values = x_values * slope + intercept

# The sign is written out explicitly so a negative intercept reads "x - 1.5", not "x +-1.5".
line_eq = f"y = {round(slope, 2)}x {'+' if intercept >= 0 else '-'} {abs(round(intercept, 2))}"

plt.scatter(x_values, y_values, marker = "o", facecolors="purple", edgecolors="black")
plt.plot(x_values,regress_values,"r-")


plt.xlabel("Award Nominations")
plt.ylabel("Box Office Receipts")
plt.title("Award Nominations vs Box Office Receipts")
# Axes-fraction coordinates pin the equation to the upper-left corner of the plot
# whatever the data range is; fixed data coordinates can fall outside the plotted values.
plt.annotate(line_eq,(0.05,0.9),xycoords="axes fraction",fontsize=15,color="red")


# rvalue**2 is the coefficient of determination, so it is labelled as r-squared.
print(f"The r-squared is: {rvalue**2}")


plt.show()

#### Conclusion/Segue
Conclusions from the Awards/Nominations analysis
Segue from linear regression to T-testing the final variables.

In [None]:
# option  Correlate Box Office v MPAA Rating
#genre

## Steve's T-test (Chi Square? ANOVA?)
I looked at movie genres & MPAA ratings.  With genres we look at 
correlations across categories and thus performed a different analysis
than the linear regression correlations...

In [None]:
# option: Steve performs t-test across movie genre 
# and Jim does both Awards & MPAA Rating correlations

#### Conclusion/Segue
Conclusions of genre and MPAA analysis
Segue to project conclusion

## Project Conclusion
In completing the box office vs. awards / nominations analysis we had the following considerations....
and faced certain issues in cleaning the data, etc. etc. ...