In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import requests

import wikipedia

In [6]:
# Load the merged dataset, discard the columns that are not used, and
# give the remaining IMDb-style headers snake_case names, all in one chain.
# NOTE(review): the dtypes listing later in the notebook shows nconst_x /
# nconst_y and no cast_crew column, so the 'nconst' entry looks like a
# no-op -- confirm against the CSV header.
movies = (
    pd.read_csv('./data/final.csv')
    # 'originalTitle' is intentionally not dropped; it is renamed below
    .drop(columns=['region', 'isOriginalTitle', 'endYear'])
    .rename(columns={
        'startYear': 'release_year',
        'runtimeMinutes': 'runtime',
        'averageRating': 'rating',
        'numVotes': 'votes',
        'nconst': 'cast_crew',
        'primaryTitle': 'primary_title',
        'originalTitle': 'original_title',
    })
)

In [7]:
# (rows, columns) of the frame at this point in the notebook
movies.shape

(1634416, 19)

In [8]:
# A row is only useful when none of its key fields holds the '\N'
# placeholder, so filter on each of those columns in turn.
for required_column in ('release_year', 'rating', 'runtime',
                        'genres', 'directors', 'votes'):
    movies = movies[movies[required_column] != '\\N']

# Any row that still contains a null is discarded as well, which further
# reduces the size of the movie pool.
movies = movies.dropna()

In [9]:
# (rows, columns) after removing the '\N' placeholder rows and the nulls
movies.shape

(1113625, 19)

In [10]:
# release_year, runtime and votes should all hold ints
# (votes would otherwise be a float)
for numeric_column in ('release_year', 'runtime', 'votes'):
    movies[numeric_column] = movies[numeric_column].map(int)

# release_year is turned into text so it can be concatenated with the title
movies['release_year'] = movies['release_year'].map(str)

In [11]:
# Build the "<title> (<year> film)" string used when looking a movie up on
# Wikipedia. release_year must already be text at this point.
year_suffix = " (" + movies['release_year'] + ' film)'
movies['wiki_title'] = movies['primary_title'] + year_suffix

In [12]:
# genres is one comma-separated string per movie; store it as a list of the
# individual categories instead
movies['genres'] = movies['genres'].map(lambda genre_text: genre_text.split(','))

# convert release_year back to an int
movies['release_year'] = movies['release_year'].map(int)

In [13]:
# column dtypes after the conversions; release_year, runtime and votes
# should all be int64 here
movies.dtypes

Unnamed: 0          int64
tconst             object
ordering_x          int64
title              object
primary_title      object
original_title     object
release_year        int64
runtime             int64
genres             object
directors          object
writers            object
rating            float64
votes               int64
ordering_y        float64
nconst_x           object
category           object
job                object
characters         object
nconst_y           object
wiki_title         object
dtype: object

In [14]:
# keep only the movies released after 1940
released_after_1940 = movies['release_year'] > 1940
movies = movies.loc[released_after_1940]

In [15]:
# keep only the movies whose runtime is strictly between 60 and 240 minutes
runtime_minutes = movies['runtime']
movies = movies[(runtime_minutes > 60) & (runtime_minutes < 240)]

In [16]:
# rank by vote count, highest first, and keep the first 10,000 rows
movies = movies.sort_values(by='votes', ascending=False).iloc[:10000]

In [17]:
# expect (10000, 20): the 10,000 most-voted movies plus the wiki_title column
movies.shape

(10000, 20)

In [18]:
# test_list = list(scrape_test_data['wiki_title'])

NameError: name 'scrape_test_data' is not defined

In [None]:
# #just title
# test_list[0][:-12]
# #title + (film)
# test_list[0][:-10] + test_list[0][-5:]
# #title +(year film)
# test_list[0]

In [None]:
# plot_list = []
# for title in test_list:
#     try:
#         #just the title
#         plot_list.append((wikipedia.WikipediaPage(title[:-12]).section('Plot')))
#     except wikipedia.DisambiguationError:
#         try:
#             #title + (film)
#             plot_list.append((wikipedia.WikipediaPage(title[:-10] + title[-5:]).section('Plot')))
#         except wikipedia.DisambiguationError:
#             try:
#                 #title + (year film)
#                 plot_list.append((wikipedia.WikipediaPage(title).section('Plot')))
#             except:
#                 plot_list.append('ERROR')
            

In [None]:
from time import sleep

In [None]:
# #Using IMDB api to get plot synopses
# movies_partial = movies[:1000]

# plot_list = []

# for movie_id in movies_partial['tconst']:
#     sleep(.05)
#     url = "https://imdb8.p.rapidapi.com/title/get-synopses"
#     querystring = {"tconst":movie_id}
#     headers = {
#         'x-rapidapi-host': "imdb8.p.rapidapi.com",
#         'x-rapidapi-key': "<REDACTED - load the key from an environment variable; never commit it>"
#         }
#     response = requests.request("GET", url, headers=headers, params=querystring)

#     plot_list.append(response.text)

# movies_partial['synopsis'] = plot_list
# movies_partial.to_csv('./data/movies_partial_1.csv')

Using an IMDB alternative API for pulling data. This cell will not run as-is, since I have redacted the paid API key.

In [None]:
import os

import requests

# Fetch one full-plot JSON record per movie from the IMDB alternative API.
# The raw response text of every request is appended to output_list, in the
# same order as movies['tconst'], so the list lines up row-for-row with the
# dataframe.

# The endpoint and headers are identical for every request, so they are
# built once instead of on every iteration.
url = "https://movie-database-imdb-alternative.p.rapidapi.com/"

headers = {
    'x-rapidapi-host': "movie-database-imdb-alternative.p.rapidapi.com",
    # Read the paid key from the RAPIDAPI_KEY environment variable so it never
    # has to be written into the notebook; the redacted placeholder is only a
    # fallback and will not authenticate.
    'x-rapidapi-key': os.environ.get(
        'RAPIDAPI_KEY',
        "-----------------------------------------------",
    ),
    }

output_list = []

# A single Session re-uses the underlying connection across all requests.
with requests.Session() as session:
    for movie_id in movies['tconst']:

        querystring = {"plot":"full","r":"json","i":movie_id}

        # Without a timeout a single stalled connection would block the
        # whole run forever.
        response = session.get(url, headers=headers, params=querystring, timeout=30)

        output_list.append(response.text)

In [None]:
# saving the raw API responses to a new column of the dataframe
movies['scraped_data'] = output_list

In [None]:
# write the current movies frame to a CSV file
# (pandas also writes the index as the first column by default)
movies.to_csv('./data/movies_with_rapid_api_data.csv')

In [None]:
#getting the imdb id
# NOTE(review): `a` is not assigned in any visible cell -- presumably it is
# one raw API response string; confirm and define it before this cell.
text_after_key = a.split('\"imdbID\":\"')[1]
# the id runs up to the next double quote
text_after_key.split('\"')[0]

In [None]:
#creating empty lists to store relevant info from api calls as columns
id_list = []
tomato_score_list = []
metacritic_score_list = []
mpaa_rating_list = []
# NOTE(review): `movies2` is not defined in any visible cell -- presumably it
# is the frame that carries the 'scraped_data' column; confirm where it comes
# from. `test` is only an alias: it refers to the same object as `movies2`.
test = movies2


def _rating_value(description, source_marker):
    """Return the "Value" text that follows `source_marker` in `description`.

    The text is located by plain string splitting: everything after the
    marker, then after the next ',"Value":', up to the next '}', with double
    quotes removed. Returns np.nan when either the marker or the "Value"
    field is missing.
    """
    try:
        return description.split(source_marker)[1].split(',\"Value":')[1].split('}')[0].replace('\"', '')
    except IndexError:
        # A missing marker leaves split() with a single element, so [1]
        # raises IndexError. Only that case is treated as "no score";
        # anything else (including KeyboardInterrupt) now propagates
        # instead of being silently swallowed.
        return np.nan


#parsing through the text blocks to pull out the relevant information for each variable
#the two review scores fall back to NaN since some movies do not have them.
for movie_description in test['scraped_data']:
    #getting imdb ids
    id_list.append(movie_description.split('\"imdbID\":\"')[1].split('\"')[0])
    #getting rotten tomatoes scores
    tomato_score_list.append(_rating_value(movie_description, '\"Rotten Tomatoes\"'))
    #getting metacritic scores
    metacritic_score_list.append(_rating_value(movie_description, '\"Metacritic"'))
    #getting the MPAA rating for the film
    mpaa_rating_list.append(movie_description.split('\"Rated\":')[1].split(',\"Released\"')[0].replace('\"', ''))

#adding the variables to the dataframe
movies2['tconst'] = id_list
movies2['tomato_score'] = tomato_score_list
movies2['metacritic_score'] = metacritic_score_list
movies2['mpaa_rating'] = mpaa_rating_list

In [None]:
# write movies2 to a CSV file (the file name suggests it feeds a later
# Wikipedia scraping step)
movies2.to_csv('./data/for_wiki_scraping.csv')