# Movie_DataQuest - Search For The Best Single Factor

In [1]:
# Dependencies
import json
import random
from pprint import pprint

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from scipy.stats import linregress

from config import api_key_txt
from config import tmdb_api_key

ModuleNotFoundError: No module named 'config'

### Project description and outline:

Description: How do various movie characteristics affect movie success?

Definition of Success:  Total U.S. Box Office Receipts

Characteristics Considered: 
  - Genre
  - Run time
  - Budget
  - Critical reviews
  - User reviews
  - Awards / nominations
  - MPAA Rating

Hypothesis: The more closely box office receipts correlate with a given movie characteristic, 
the stronger the influence of that characteristic on movie success.



### Steve's Code Goes Here

### Introduction to IMDb Database Preparation   
Movie Data Sets Utilized:  Internet Movie Database (IMDb) & Online Media Database (OMDb).

The IMDb contains a greater range of quantified movie characteristics than does the OMDb.  For 
this reason the IMDb was used to provide the bulk of our testing criteria.  In order to limit 
the scope of the data we pulled only information regarding movies that were feature-length, 
non-"adult", and were released between 2000 and 2019.


In [2]:
### INITIAL DATA SET ###

# The starting point is a CSV of film data built from the datasets published
# at https://www.imdb.com/interfaces/. It holds feature-length films released
# between 2000 and 2019.

# Read the CSV into a dataframe and display it

imdb_csv = "Resources/trimmed_IMDB_data.csv"
imdb_df = pd.read_csv(filepath_or_buffer=imdb_csv, encoding="UTF-8")

imdb_df

Unnamed: 0,tconst,primaryTitle,originalTitle,startYear,runtimeMinutes,genres
0,tt0011216,Spanish Fiesta,La fête espagnole,2019,67,Drama
1,tt0016906,Frivolinas,Frivolinas,2014,80,"Comedy,Musical"
2,tt0018295,El puño de hierro,El puño de hierro,2004,40,"Action,Drama"
3,tt0035423,Kate & Leopold,Kate & Leopold,2001,118,"Comedy,Fantasy,Romance"
4,tt0036177,Muhomatsu no issho,Muhomatsu no issho,2008,100,"Action,Adventure"
...,...,...,...,...,...,...
180341,tt9916186,Illenau - die Geschichte einer ehemaligen Heil...,Illenau - die Geschichte einer ehemaligen Heil...,2017,84,Documentary
180342,tt9916538,Kuambil Lagi Hatiku,Kuambil Lagi Hatiku,2019,123,Drama
180343,tt9916622,Rodolpho Teóphilo - O Legado de um Pioneiro,Rodolpho Teóphilo - O Legado de um Pioneiro,2015,57,Documentary
180344,tt9916680,De la ilusión al desconcierto: cine colombiano...,De la ilusión al desconcierto: cine colombiano...,2007,100,Documentary


In [3]:
### REFINING DATA SET ###

# The IMDB data holds hundreds of thousands of movies, many of which had no
# theatrical release. It is narrowed down with data built from
# https://movielens.org/, which focuses on released films.

# Read the movielens CSV into a dataframe and display it

movielens_csv = "Resources/movielens_links.csv"
movielens_df = pd.read_csv(filepath_or_buffer=movielens_csv, encoding="UTF-8")

movielens_df

Unnamed: 0,movieId,imdbId,tmdbId,imdbTTID
0,88674,8,105158.0,tt0000008
1,140539,3,88013.0,tt0000003
2,172063,1,16612.0,tt0000001
3,180695,7,159895.0,tt0000007
4,98981,12,160.0,tt0000012
...,...,...,...,...
62418,209051,11108064,642749.0,tt11108064
62419,209085,10768348,631420.0,tt10768348
62420,209133,10192640,519334.0,tt10192640
62421,209145,10199670,595924.0,tt10199670


In [4]:
# Combining the dataframes to find films that are in both

# Renaming the movielens_df "imdbTTID" column to match our imdb_df

movielens_df = movielens_df.rename(columns={"imdbTTID": "tconst"})

# Merging dataframes. The join key is named explicitly: without `on`, pandas
# joins on every column name the two frames share, so an extra shared column
# in either CSV would silently change which rows match.

big_df = imdb_df.merge(movielens_df, how="inner", on="tconst")

big_df


Unnamed: 0,tconst,primaryTitle,originalTitle,startYear,runtimeMinutes,genres,movieId,imdbId,tmdbId
0,tt0035423,Kate & Leopold,Kate & Leopold,2001,118,"Comedy,Fantasy,Romance",4992,35423,11232.0
1,tt0069049,The Other Side of the Wind,The Other Side of the Wind,2018,122,Drama,159163,69049,299782.0
2,tt0110476,Master i Margarita,Master i Margarita,2006,125,"Drama,Fantasy",167420,110476,63163.0
3,tt0113026,The Fantasticks,The Fantasticks,2000,86,"Musical,Romance",4193,113026,62127.0
4,tt0114722,3 noches,3 noches,2001,105,"Crime,Thriller",185453,114722,276251.0
...,...,...,...,...,...,...,...,...,...
27873,tt9866700,Paranormal Investigation,Paranormal Investigation,2018,92,"Horror,Thriller",200550,9866700,585997.0
27874,tt9872556,Ordinary Happiness,Momenti di trascurabile felicità,2019,93,"Comedy,Fantasy",200102,9872556,583029.0
27875,tt9876160,Convoy 48,Koridor bessmertiya,2019,140,"Drama,War",203980,9876160,575333.0
27876,tt9900060,Lupin the Third: Fujiko Mine's Lie,Lupin the IIIrd: Mine Fujiko no Uso,2019,58,"Adventure,Animation,Crime",205539,9900060,587870.0


In [5]:
### GETTING ADDITIONAL DATA FROM TMDB ###

# Selecting a sample of films for the TMDB API call

# Assigning a reproducible random number to every film: each row gets the
# first value produced by a generator seeded with that row's index label, so
# the same film always receives the same number. The whole column is built in
# one assignment (instead of one .loc write per row), and a private generator
# is used per row so the global `random` state is left untouched.
big_df["randomNumber"] = [
    random.Random(index).random() for index in big_df.index
]

In [6]:
# Shuffle the films by ordering them on their random number
big_df = big_df.sort_values(by="randomNumber")

# Keep the first 5,000 films of the shuffled frame as the working sample

smaller_df = pd.DataFrame(big_df.iloc[:5000])

smaller_df

Unnamed: 0,tconst,primaryTitle,originalTitle,startYear,runtimeMinutes,genres,movieId,imdbId,tmdbId,randomNumber
22338,tt4545832,Les oiseaux de passage,Les oiseaux de passage,2015,84,Family,154270,4545832,332880.0,0.000013
12214,tt1592265,Maladies,Maladies,2012,96,Drama,137982,1592265,167153.0,0.000088
26073,tt6768578,Dogman,Dogman,2018,103,"Crime,Drama,Thriller",188675,6768578,483184.0,0.000094
12515,tt1637706,Our Idiot Brother,Our Idiot Brother,2011,90,"Comedy,Drama",88672,1637706,59968.0,0.000106
18883,tt3130302,Sucker,Sucker,2015,90,Comedy,172185,3130302,349177.0,0.000122
...,...,...,...,...,...,...,...,...,...,...
23189,tt4974396,Stretch and Bobbito: Radio That Changed Lives,Stretch and Bobbito: Radio That Changed Lives,2015,99,"Documentary,Music",158491,4974396,362268.0,0.175399
27654,tt8908002,Luka Chuppi,Luka Chuppi,2019,126,"Comedy,Romance",199436,8908002,585062.0,0.175405
23044,tt4902260,The Eyes,The Eyes,2017,95,"Crime,Drama,Mystery",187079,4902260,450975.0,0.175408
7485,tt0892318,Letters to Juliet,Letters to Juliet,2010,105,"Adventure,Comedy,Drama",78316,892318,37056.0,0.175422


In [7]:
# Pulling additional data via a TMDB API call

# Setting base URL
url = "https://api.themoviedb.org/3/movie/"

# Looping through dataframe
for index, row in smaller_df.iterrows():
    movie_data = {}
    movie_id = row["tmdbId"]

    # tmdbId is held as a float (e.g. 11232.0), presumably because some ids
    # are missing. Cast it to a plain integer for the URL path and skip the
    # request entirely when there is no id.
    if pd.notna(movie_id):
        try:
            response = requests.get(
                url + str(int(movie_id)),
                params={"api_key": tmdb_api_key},
                timeout=10,
            )
            movie_data = response.json()
        except (requests.RequestException, ValueError):
            # Network failure or a non-JSON body: treat as "no data" for this film
            movie_data = {}

    # "N/A" marks films for which TMDB returned no value; the trimming step
    # that follows filters on exactly this marker.
    smaller_df.loc[index, "Budget"] = movie_data.get("budget", "N/A")
    smaller_df.loc[index, "Box Office"] = movie_data.get("revenue", "N/A")

smaller_df

NameError: name 'tmdb_api_key' is not defined

In [None]:
### TRIMMING DATA FOR OMDB API CALLS ###

# Trimming dataframe to only items with Box Office data ("N/A" and 0 both
# mean no figure is available). An explicit copy is taken so the assignments
# below act on an independent frame rather than on a slice of smaller_df.

has_box_office = (smaller_df["Box Office"] != "N/A") & (smaller_df["Box Office"] != 0.0)
boxoffice_df = smaller_df.loc[has_box_office].copy()

# Recasting Budget and Box Office as integers

boxoffice_df[['Budget', 'Box Office']] = boxoffice_df[['Budget', 'Box Office']].astype(int)

# Replacing missing values (stored as 0) with NaN, without mutating in place

boxoffice_df = boxoffice_df.replace(0, np.nan)

boxoffice_df

In [None]:
# Keep the first 900 films that have box office data

sample_df = pd.DataFrame(boxoffice_df.iloc[:900])

# Write the sample to disk so it survives between sessions

sample_df.to_csv(path_or_buf="Resources/sample.csv", index=False)

In [None]:
# Reload the saved sample from its CSV so later cells work from the file on disk

omdb_csv = "Resources/sample.csv"
omdb_api_df = pd.read_csv(filepath_or_buffer=omdb_csv, encoding="UTF-8")

omdb_api_df

#### Conclusion/Segue
The size of the IMDb data set was....   And contained Genre, Run time, Budget, Critical Reviews, User Review, Awards/Nominations and MPAA Ratings.

After the database was acquired and cleaned, it was merged with a sample of movies from the OMDb.

### Jim's Code Goes Here

## OMDB Preparation:
    
Although the IMDb does not contain some of the movie characteristics we wished to test, it does contain
the box office information required for our analysis.  We will take a 900 movie sample from the IMDb and
merge it with matching movie information in the OMDb database to form the basis of our analysis.

In [None]:
# Collect the IMDB ids (the "tconst" column) of the sample as a plain list
movie_ids = omdb_api_df["tconst"].to_list()
movie_ids

In [8]:
# One empty accumulator list per field collected from OMDb
tconst, rated, metascore = [], [], []
imdb_rating, imdb_votes, rotten_tomatoes = [], [], []


# 'tt0090605', 'tt1285016'

In [9]:
# Base URL for looking a movie up by its IMDB id. HTTPS is used so the API
# key appended to every request is not sent in clear text.
url = "https://www.omdbapi.com/?i="

# Query-string fragment carrying the API key; appended after the movie id
api_key_tx = "&apikey=" + api_key_txt

NameError: name 'api_key_txt' is not defined

In [10]:
# Performing a GET request for each film and collecting its rating fields

# NOTE(review): only the first 10 ids are requested, although the merge that
# follows is described as covering 900 movies -- confirm whether this slice is
# a leftover test limit before widening it.
for movie_id in movie_ids[0:10]:

    response = requests.get(url + movie_id + api_key_tx, timeout=10)
    data = response.json()
    tconst.append(movie_id)

    # A field that is absent from the response is recorded as 'N/A'
    rated.append(data.get('Rated', 'N/A'))
    metascore.append(data.get('Metascore', 'N/A'))
    imdb_rating.append(data.get('imdbRating', 'N/A'))
    imdb_votes.append(data.get('imdbVotes', 'N/A'))

    # Take the first "Rotten Tomatoes" entry from the "Ratings" list.
    # Defaulting to 'N/A' also covers an empty or missing list, which would
    # otherwise leave an empty string that cannot be converted to a number.
    rt_score = 'N/A'
    for rating in data.get('Ratings') or []:
        if rating.get('Source') == "Rotten Tomatoes":
            rt_score = rating.get('Value', 'N/A')
            break
    rotten_tomatoes.append(rt_score)
        


NameError: name 'movie_ids' is not defined

In [11]:
# Create new DataFrame from the lists filled by the OMDb loop

movie_df = pd.DataFrame({'tconst' : tconst, 'Rated' : rated, 'Metascore' :  metascore,  'IMDB Rating' : imdb_rating, 
                         'IMDB Votes' : imdb_votes, "Rotten Tomatoes" : rotten_tomatoes })

# Strip the formatting characters that keep the text from parsing as numbers:
# thousands separators in the vote counts and the percent sign in the scores.
movie_df["IMDB Votes"] = movie_df["IMDB Votes"].replace(',', "", regex=True)
movie_df["Rotten Tomatoes"] = movie_df["Rotten Tomatoes"].replace('%', "", regex=True)

# Convert the score columns to floats. errors="coerce" turns 'N/A' and any
# other non-numeric leftover (such as an empty string) into NaN instead of
# raising, which a plain astype(float) would do.
for column in ["Metascore", "IMDB Rating", "IMDB Votes", "Rotten Tomatoes"]:
    movie_df[column] = pd.to_numeric(movie_df[column], errors="coerce").astype(float)

movie_df.dtypes


tconst             float64
Rated              float64
Metascore          float64
IMDB Rating        float64
IMDB Votes         float64
Rotten Tomatoes    float64
dtype: object

In [12]:
# Join the OMDb fields onto the sampled films, matching rows on the IMDB id
movie_sample = omdb_api_df.merge(movie_df, on="tconst")
movie_sample

NameError: name 'omdb_api_df' is not defined

#### Conclusion/Segue
The size of the IMDb data set was....   And contained Genre, Run time, Budget, Critical Reviews, User Review, Awards/Nominations and MPAA Ratings.

Once our dataset was established we were ready to run our analysis.

Segue to Box Office vs. Runtime Correlations (Kelly's stuff)

### Kelly's Code Here

## Box Office vs. Runtime Correlations
In completing the box office/runtime correlation we had the following considerations....
and faced certain issues in cleaning the data, etc. etc. ...

In [13]:
# Scatter plot: box office sales (x) against run time in minutes (y)

x_values = movie_sample["Box Office"]
y_values = movie_sample["runtimeMinutes"]

plt.scatter(x=x_values, y=y_values, marker="o", facecolors="purple", edgecolors="black")
plt.title("Movie Run time vs Box Office Sales")
plt.xlabel("Box Office Sales")
plt.ylabel("Movie Run Time (in minutes)")

plt.show()


NameError: name 'movie_sample' is not defined

In [14]:
# Linear regression of run time (y) on box office sales (x), drawn over the scatter

# Keep only films where both values are present: linregress does not skip
# NaN values, so a single missing entry would make every statistic NaN.
paired = movie_sample[["Box Office", "runtimeMinutes"]].dropna()
x_values = paired["Box Office"]
y_values = paired["runtimeMinutes"]

(slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)
regress_values = x_values * slope + intercept

# Significant-figure formatting keeps a very small slope (minutes per dollar)
# from being displayed as 0.0
line_eq = f"y = {slope:.3g}x + {intercept:.3g}"

plt.scatter(x_values, y_values, marker = "o", facecolors="purple", edgecolors="black")
plt.plot(x_values,regress_values,"r-")

plt.xlabel("Box Office Sales")
plt.ylabel("Movie Run Time")
plt.title("Movie Run Time vs Box Office Sales")
plt.annotate(line_eq,(8,130),fontsize=15,color="red")

# rvalue is the correlation coefficient; its square is the r-squared reported here
print(f"The r-squared is: {rvalue**2}")

plt.show()

NameError: name 'movie_sample' is not defined

In [15]:
# Scatter plot: box office sales (x) against budget (y)

x_values = movie_sample["Box Office"]
y_values = movie_sample["Budget"]

plt.scatter(x=x_values, y=y_values, marker="o", facecolors="green", edgecolors="black")
plt.title("Movie Budget Amount vs Box Office Sales")
plt.xlabel("Box Office Sales")
plt.ylabel("Movie Budget Amount")

plt.show()




NameError: name 'movie_sample' is not defined

In [16]:
# Linear regression of budget (y) on box office sales (x), drawn over the scatter

# Keep only films where both values are present: linregress does not skip
# NaN values, so a single missing budget would make every statistic NaN.
paired = movie_sample[["Box Office", "Budget"]].dropna()
x_values = paired["Box Office"]
y_values = paired["Budget"]

(slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)
regress_values = x_values * slope + intercept

# Significant-figure formatting keeps the equation readable for dollar-scale values
line_eq = f"y = {slope:.3g}x + {intercept:.3g}"

plt.scatter(x_values, y_values, marker = "o", facecolors="green", edgecolors="black")
plt.plot(x_values,regress_values,"r-")

plt.xlabel("Box Office Sales")
plt.ylabel("Budget Amount")
plt.title("Movie Budget Amount vs Box Office Sales")
plt.annotate(line_eq,(4,15),fontsize=15,color="red")

# rvalue is the correlation coefficient; its square is the r-squared reported here
print(f"The r-squared is: {rvalue**2}")

plt.show()

NameError: name 'movie_sample' is not defined

#### Conclusion/Segue
Conclusions about MPAA Rating analysis
Segue to Box Office vs. Critical Reviews (Phillips' stuff)


### Philips' Code Goes Here

## Box Office vs. Critical Reviews 
In completing the box office vs. critical reviews analysis we had the following considerations....
and faced certain issues in cleaning the data, etc. etc. ...

In [17]:
# Display the merged sample dataframe
movie_sample

NameError: name 'movie_sample' is not defined

In [None]:
# Scatter plot: box office sales (x) against Metascore (y)

x_values = movie_sample["Box Office"]
y_values = movie_sample["Metascore"]

plt.title("Box Office Sales vs. Metascore")
plt.xlabel("Box Office Sales")
plt.ylabel("Metascore")
plt.scatter(x=x_values, y=y_values, marker="o", facecolors="blue", edgecolors="black")

plt.show()


In [None]:
# Linear regression of Metascore (y) on box office sales (x), drawn over the scatter

# Keep only films where both values are present: linregress does not skip
# NaN values, so a single missing Metascore would make every statistic NaN.
paired = movie_sample[["Box Office", "Metascore"]].dropna()
x_values = paired["Box Office"]
y_values = paired["Metascore"]

(slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)
regress_values = x_values * slope + intercept

# Significant-figure formatting keeps a very small slope (points per dollar)
# from being displayed as 0.0
line_eq = f"y = {slope:.3g}x + {intercept:.3g}"

plt.scatter(x_values, y_values)
plt.plot(x_values,regress_values,"r-")

plt.xlabel("Box Office Sales")
plt.ylabel("Metascore")
plt.title("Box Office Sales vs. Metascore")
plt.annotate(line_eq,(10,15),fontsize=15,color="red")

# rvalue is the correlation coefficient; its square is the r-squared reported here
print(f"The r-squared is: {rvalue**2}")

plt.show()


In [None]:
# Scatter plot: box office sales (x) against Rotten Tomatoes score (y)

x_values = movie_sample["Box Office"]
y_values = movie_sample["Rotten Tomatoes"]

plt.title("Box Office Sales vs. Rotten Tomatoes Score")
plt.xlabel("Box Office Sales")
plt.ylabel("Rotten Tomatoes Score")
plt.scatter(x=x_values, y=y_values, marker="o", facecolors="blue", edgecolors="black")

plt.show()


In [None]:
# Linear regression of Rotten Tomatoes score (y) on box office sales (x),
# drawn over the scatter

# Keep only films where both values are present: linregress does not skip
# NaN values, so a single missing score would make every statistic NaN.
paired = movie_sample[["Box Office", "Rotten Tomatoes"]].dropna()
x_values = paired["Box Office"]
y_values = paired["Rotten Tomatoes"]

(slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)
regress_values = x_values * slope + intercept

# Significant-figure formatting keeps a very small slope (points per dollar)
# from being displayed as 0.0
line_eq = f"y = {slope:.3g}x + {intercept:.3g}"

plt.scatter(x_values, y_values)
plt.plot(x_values,regress_values,"r-")

plt.xlabel("Box Office Sales")
plt.ylabel("Rotten Tomatoes Score")
plt.title("Box Office Sales vs. Rotten Tomatoes Score")

plt.annotate(line_eq,(10,15),fontsize=15,color="red")

# rvalue is the correlation coefficient; its square is the r-squared reported here
print(f"The r-squared is: {rvalue**2}")

plt.show()


#### Conclusion/Segue
Conclusions from the Critical & User reviews analysis
Segue to Awards / Nominations (Jim's Stuff).




### Jim's Correlation Section Goes Here

## Box Office vs. Awards / Nominations
In completing the box office vs. awards / nominations analysis we had the following considerations....
and faced certain issues in cleaning the data, etc. etc. ...

In [None]:
# Scatter plot: IMDB rating (x) against box office receipts (y)

# float() cannot convert a whole column (it raises TypeError on a Series of
# more than one element); astype(float) converts element-wise instead.
x_values = movie_sample["IMDB Rating"].astype(float)
y_values = movie_sample["Box Office"].astype(float)
plt.xlabel("IMDB Ratings")
plt.ylabel("Box Office Receipts ($)")
plt.title("IMDB Rating vs Box Office Sales")
plt.scatter(x_values, y_values, marker = "o", facecolors = "purple", edgecolors="black")

plt.show()

In [None]:
# Linear regression of box office receipts (y) on IMDB rating (x), drawn over
# the scatter

# Keep only films where both values are present: linregress does not skip
# NaN values, so a single missing rating would make every statistic NaN.
paired = movie_sample[["IMDB Rating", "Box Office"]].dropna()
x_values = paired["IMDB Rating"]
y_values = paired["Box Office"]

(slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)
regress_values = x_values * slope + intercept

# Significant-figure formatting keeps the dollar-scale coefficients readable
line_eq = f"y = {slope:.3g}x + {intercept:.3g}"

plt.scatter(x_values, y_values, marker = "o", facecolors="purple", edgecolors="black")
plt.plot(x_values,regress_values,"r-")

plt.xlabel("IMDB Rating (ratings from 0 to 10)")
plt.ylabel("Box Office Receipts")
plt.title("IMDB Rating vs Box Office Receipts")
# Placed in axes-fraction coordinates (upper-left corner): the former data
# position x=10 is the top of the rating scale, so the text fell on or beyond
# the right edge of the plot and was not drawn.
plt.annotate(line_eq,(0.05,0.9),xycoords="axes fraction",fontsize=15,color="red")

# rvalue is the correlation coefficient; its square is the r-squared reported here
print(f"The r-squared is: {rvalue**2}")

plt.show()


In [None]:
# Scatter plot: number of IMDB votes (x) against box office receipts (y)

x_values = movie_sample["IMDB Votes"]
y_values = movie_sample["Box Office"]
plt.xlabel("IMDB Votes")
# Vote counts are long numbers; rotate the tick labels so they do not overlap
plt.xticks(rotation=45)
plt.ylabel("Box Office Receipts ($)")
# Title previously misspelled the site name as "IMBD"
plt.title("IMDB Votes vs Box Office Sales")
plt.scatter(x_values, y_values, marker = "o", facecolors = "purple", edgecolors="black")

plt.show()



In [None]:
# Linear regression of box office receipts (y) on number of IMDB votes (x),
# drawn over the scatter

# Keep only films where both values are present: linregress does not skip
# NaN values, so a single missing vote count would make every statistic NaN.
paired = movie_sample[["IMDB Votes", "Box Office"]].dropna()
x_values = paired["IMDB Votes"]
y_values = paired["Box Office"]

(slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)
regress_values = x_values * slope + intercept

# Significant-figure formatting keeps the dollar-scale coefficients readable
line_eq = f"y = {slope:.3g}x + {intercept:.3g}"

# Single scatter with the circle marker "o"; the earlier duplicate call used
# the string "0", which is not a valid matplotlib marker.
plt.scatter(x_values, y_values, marker = "o", facecolors="purple", edgecolors="black")
plt.plot(x_values,regress_values,"r-")

plt.xlabel("IMDB Votes")
# Vote counts are long numbers; rotate the tick labels so they do not overlap
plt.xticks(rotation=45)
plt.ylabel("Box Office Receipts")
plt.title("IMDB Votes vs Box Office Receipts")
plt.annotate(line_eq,(10,15),fontsize=15,color="red")

# rvalue is the correlation coefficient; its square is the r-squared reported here
print(f"The r-squared is: {rvalue**2}")

plt.show()

#### Conclusion/Segue
Conclusions from the Awards/Nominations analysis
Segue from linear regression to T-testing final variables.

## Steve's T-test (Chi Square? ANOVA?)
I looked at movie genres & MPAA ratings.  With genres we look at 
correlations across categories and thus performed a different analysis
than linear regression correlations...

In [None]:
# TODO (option): Steve performs a t-test across movie genres,
# and Jim does both the Awards and MPAA Rating correlations

#### Conclusion/Segue
Conclusions of genre and MPAA analysis
Segue to project conclusion

## Project Conclusion
In completing the box office vs. awards / nominations analysis we had the following considerations....
and faced certain issues in cleaning the data, etc. etc. ...