# Gathering - Rotten Tomatoes

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import os

In [2]:
# Import the Rotten Tomatoes bestofrt TSV file into a DataFrame
df = pd.read_csv('bestofrt.tsv',sep='\t')

In [3]:
# Check whether the file is read correctly
df.head()

Unnamed: 0,ranking,critic_score,title,released_year,number_of_critic_ratings
0,1,96,Black Panther,2018,519
1,2,99,Citizen Kane,1941,115
2,3,98,Parasite (Gisaengchung),2019,458
3,4,94,Avengers: Endgame,2019,538
4,5,99,Casablanca,1942,121


In [4]:
# Get response from 'url' to process
url = 'https://www.rottentomatoes.com/top/bestofrt/'
response = requests.get(url)

In [5]:
## download as file
#with open("main_page.html",mode = 'wb') as file:
#    file.write(response.content)

In [6]:
# Access directly from web
soup = BeautifulSoup(response.content,'lxml')
# Fetch top 100 movie list
rows = soup.find('table',class_='table').find_all('tr')
# or find_all('tr')
rows.pop(0)
rows

[<tr>
 <td class="bold">1.</td>
 <td>
 <span class="tMeterIcon tiny">
 <span class="icon tiny certified_fresh"></span>
 <span class="tMeterScore"> 96%</span>
 </span>
 </td>
 <td>
 <a class="unstyled articleLink" href="/m/black_panther_2018">
             Black Panther (2018)</a>
 </td>
 <td class="right hidden-xs">519</td>
 </tr>,
 <tr>
 <td class="bold">2.</td>
 <td>
 <span class="tMeterIcon tiny">
 <span class="icon tiny certified_fresh"></span>
 <span class="tMeterScore"> 99%</span>
 </span>
 </td>
 <td>
 <a class="unstyled articleLink" href="/m/citizen_kane">
             Citizen Kane (1941)</a>
 </td>
 <td class="right hidden-xs">115</td>
 </tr>,
 <tr>
 <td class="bold">3.</td>
 <td>
 <span class="tMeterIcon tiny">
 <span class="icon tiny certified_fresh"></span>
 <span class="tMeterScore"> 98%</span>
 </span>
 </td>
 <td>
 <a class="unstyled articleLink" href="/m/parasite_2019">
             Parasite (Gisaengchung) (2019)</a>
 </td>
 <td class="right hidden-xs">458</td>
 </tr>,


In [7]:
#Link for each movie page 
movie_link = []
for row in rows:    
    movie_link.append('https://www.rottentomatoes.com'+row.a['href'])

In [8]:
# For testing how the value is returned
aud = BeautifulSoup(requests.get(movie_link[0]).content,'lxml').find('score-board')
aud

<score-board audiencescore="79" audiencestate="upright" class="scoreboard" data-qa="score-panel" hidden="" rating="PG-13" tomatometerscore="96" tomatometerstate="certified-fresh">
<h1 class="scoreboard__title" data-qa="score-panel-movie-title" slot="title">Black Panther</h1>
<p class="scoreboard__info" slot="info">2018, Adventure/Action, 2h 14m</p>
<a class="scoreboard__link scoreboard__link--tomatometer" data-qa="tomatometer-review-count" href="/m/black_panther_2018/reviews?intcmp=rt-scorecard_tomatometer-reviews" slot="critics-count">519 Reviews</a>
<a class="scoreboard__link scoreboard__link--audience" data-qa="audience-rating-count" href="/m/black_panther_2018/reviews?type=user&amp;intcmp=rt-scorecard_audience-score-reviews" slot="audience-count">50,000+ Ratings</a>
</score-board>

In [9]:
#create a Audience rating info table using html for individual movie
aud_list = []
for rank in range (1,len(df) + 1):
    values = BeautifulSoup(requests.get(movie_link[rank-1]).content,'lxml').find('score-board')
    #Append list to dict
    aud_list.append({
        'title' : df['title'][rank-1],
        'audience_score': int(values['audiencescore']),
        'number_of_audience_ratings': values.find_all('a')[1].string
    })
    
aud_list

[{'title': 'Black Panther',
  'audience_score': 79,
  'number_of_audience_ratings': '50,000+ Ratings'},
 {'title': 'Citizen Kane',
  'audience_score': 90,
  'number_of_audience_ratings': '100,000+ Ratings'},
 {'title': 'Parasite (Gisaengchung)',
  'audience_score': 90,
  'number_of_audience_ratings': '5,000+ Ratings'},
 {'title': 'Avengers: Endgame',
  'audience_score': 90,
  'number_of_audience_ratings': '50,000+ Ratings'},
 {'title': 'Casablanca',
  'audience_score': 95,
  'number_of_audience_ratings': '250,000+ Ratings'},
 {'title': 'Knives Out',
  'audience_score': 92,
  'number_of_audience_ratings': '25,000+ Ratings'},
 {'title': 'Us',
  'audience_score': 59,
  'number_of_audience_ratings': '10,000+ Ratings'},
 {'title': 'Toy Story 4',
  'audience_score': 94,
  'number_of_audience_ratings': '50,000+ Ratings'},
 {'title': 'Lady Bird',
  'audience_score': 79,
  'number_of_audience_ratings': '10,000+ Ratings'},
 {'title': 'Mission: Impossible - Fallout',
  'audience_score': 88,
  'nu

In [10]:
df_aud = pd.DataFrame(aud_list, columns = ['title', 'audience_score', 'number_of_audience_ratings'])
df_aud

Unnamed: 0,title,audience_score,number_of_audience_ratings
0,Black Panther,79,"50,000+ Ratings"
1,Citizen Kane,90,"100,000+ Ratings"
2,Parasite (Gisaengchung),90,"5,000+ Ratings"
3,Avengers: Endgame,90,"50,000+ Ratings"
4,Casablanca,95,"250,000+ Ratings"
...,...,...,...
95,Jaws,82,"10,000+ Ratings"
96,Shazam!,90,"250,000+ Ratings"
97,"The Godfather, Part II",97,"250,000+ Ratings"
98,The Dark Knight,94,"250,000+ Ratings"


In [11]:
soup.find('table',class_='table').find_all('tr')

[<tr>
 <th>Rank</th>
 <th><span class="hidden-xs">Rating</span><span class="visible-xs">Tomatometer</span></th>
 <th>Title</th>
 <th class="right hidden-xs">No. of Reviews</th>
 </tr>,
 <tr>
 <td class="bold">1.</td>
 <td>
 <span class="tMeterIcon tiny">
 <span class="icon tiny certified_fresh"></span>
 <span class="tMeterScore"> 96%</span>
 </span>
 </td>
 <td>
 <a class="unstyled articleLink" href="/m/black_panther_2018">
             Black Panther (2018)</a>
 </td>
 <td class="right hidden-xs">519</td>
 </tr>,
 <tr>
 <td class="bold">2.</td>
 <td>
 <span class="tMeterIcon tiny">
 <span class="icon tiny certified_fresh"></span>
 <span class="tMeterScore"> 99%</span>
 </span>
 </td>
 <td>
 <a class="unstyled articleLink" href="/m/citizen_kane">
             Citizen Kane (1941)</a>
 </td>
 <td class="right hidden-xs">115</td>
 </tr>,
 <tr>
 <td class="bold">3.</td>
 <td>
 <span class="tMeterIcon tiny">
 <span class="icon tiny certified_fresh"></span>
 <span class="tMeterScore"> 98%</sp

In [12]:
df = pd.merge(df,df_aud,on='title')
df

Unnamed: 0,ranking,critic_score,title,released_year,number_of_critic_ratings,audience_score,number_of_audience_ratings
0,1,96,Black Panther,2018,519,79,"50,000+ Ratings"
1,2,99,Citizen Kane,1941,115,90,"100,000+ Ratings"
2,3,98,Parasite (Gisaengchung),2019,458,90,"5,000+ Ratings"
3,4,94,Avengers: Endgame,2019,538,90,"50,000+ Ratings"
4,5,99,Casablanca,1942,121,95,"250,000+ Ratings"
...,...,...,...,...,...,...,...
95,96,98,Jaws,1975,90,82,"10,000+ Ratings"
96,97,90,Shazam!,2019,409,90,"250,000+ Ratings"
97,98,98,"The Godfather, Part II",1974,85,97,"250,000+ Ratings"
98,99,94,The Dark Knight,2008,344,94,"250,000+ Ratings"


# Downloading files from internet

In [13]:
# Make directory if it doesn't already exist
folder_name = 'ebert_reviews'
folder_path = os.path.join(os.getcwd(),folder_name)
if not os.path.exists(folder_path):
    os.makedirs(folder_path)

In [14]:
ebert_review_urls = ['https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9900_1-the-wizard-of-oz-1939-film/1-the-wizard-of-oz-1939-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9901_2-citizen-kane/2-citizen-kane.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9901_3-the-third-man/3-the-third-man.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9902_4-get-out-film/4-get-out-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9902_5-mad-max-fury-road/5-mad-max-fury-road.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9902_6-the-cabinet-of-dr.-caligari/6-the-cabinet-of-dr.-caligari.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9903_7-all-about-eve/7-all-about-eve.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9903_8-inside-out-2015-film/8-inside-out-2015-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9903_9-the-godfather/9-the-godfather.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9904_10-metropolis-1927-film/10-metropolis-1927-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9904_11-e.t.-the-extra-terrestrial/11-e.t.-the-extra-terrestrial.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9904_12-modern-times-film/12-modern-times-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9904_14-singin-in-the-rain/14-singin-in-the-rain.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9905_15-boyhood-film/15-boyhood-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9905_16-casablanca-film/16-casablanca-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9905_17-moonlight-2016-film/17-moonlight-2016-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9906_18-psycho-1960-film/18-psycho-1960-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9906_19-laura-1944-film/19-laura-1944-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9906_20-nosferatu/20-nosferatu.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9907_21-snow-white-and-the-seven-dwarfs-1937-film/21-snow-white-and-the-seven-dwarfs-1937-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9907_22-a-hard-day27s-night-film/22-a-hard-day27s-night-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9907_23-la-grande-illusion/23-la-grande-illusion.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9908_25-the-battle-of-algiers/25-the-battle-of-algiers.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9908_26-dunkirk-2017-film/26-dunkirk-2017-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9908_27-the-maltese-falcon-1941-film/27-the-maltese-falcon-1941-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9909_29-12-years-a-slave-film/29-12-years-a-slave-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9909_30-gravity-2013-film/30-gravity-2013-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9909_31-sunset-boulevard-film/31-sunset-boulevard-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad990a_32-king-kong-1933-film/32-king-kong-1933-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad990a_33-spotlight-film/33-spotlight-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad990a_34-the-adventures-of-robin-hood/34-the-adventures-of-robin-hood.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad990b_35-rashomon/35-rashomon.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad990b_36-rear-window/36-rear-window.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad990b_37-selma-film/37-selma-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad990c_38-taxi-driver/38-taxi-driver.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad990c_39-toy-story-3/39-toy-story-3.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad990c_40-argo-2012-film/40-argo-2012-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad990d_41-toy-story-2/41-toy-story-2.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad990d_42-the-big-sick/42-the-big-sick.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad990d_43-bride-of-frankenstein/43-bride-of-frankenstein.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad990d_44-zootopia/44-zootopia.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad990e_45-m-1931-film/45-m-1931-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad990e_46-wonder-woman-2017-film/46-wonder-woman-2017-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad990e_48-alien-film/48-alien-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad990f_49-bicycle-thieves/49-bicycle-thieves.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad990f_50-seven-samurai/50-seven-samurai.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad990f_51-the-treasure-of-the-sierra-madre-film/51-the-treasure-of-the-sierra-madre-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9910_52-up-2009-film/52-up-2009-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9910_53-12-angry-men-1957-film/53-12-angry-men-1957-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9910_54-the-400-blows/54-the-400-blows.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9911_55-logan-film/55-logan-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9911_57-army-of-shadows/57-army-of-shadows.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9912_58-arrival-film/58-arrival-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9912_59-baby-driver/59-baby-driver.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9913_60-a-streetcar-named-desire-1951-film/60-a-streetcar-named-desire-1951-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9913_61-the-night-of-the-hunter-film/61-the-night-of-the-hunter-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9913_62-star-wars-the-force-awakens/62-star-wars-the-force-awakens.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9913_63-manchester-by-the-sea-film/63-manchester-by-the-sea-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9914_64-dr.-strangelove/64-dr.-strangelove.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9914_66-vertigo-film/66-vertigo-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9914_67-the-dark-knight-film/67-the-dark-knight-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9915_68-touch-of-evil/68-touch-of-evil.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9915_69-the-babadook/69-the-babadook.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9915_72-rosemary27s-baby-film/72-rosemary27s-baby-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9916_73-finding-nemo/73-finding-nemo.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9916_74-brooklyn-film/74-brooklyn-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9917_75-the-wrestler-2008-film/75-the-wrestler-2008-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9917_77-l.a.-confidential-film/77-l.a.-confidential-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9918_78-gone-with-the-wind-film/78-gone-with-the-wind-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9918_79-the-good-the-bad-and-the-ugly/79-the-good-the-bad-and-the-ugly.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9918_80-skyfall/80-skyfall.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9919_82-tokyo-story/82-tokyo-story.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9919_83-hell-or-high-water-film/83-hell-or-high-water-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9919_84-pinocchio-1940-film/84-pinocchio-1940-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9919_85-the-jungle-book-2016-film/85-the-jungle-book-2016-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad991a_86-la-la-land-film/86-la-la-land-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad991b_87-star-trek-film/87-star-trek-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad991b_89-apocalypse-now/89-apocalypse-now.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad991c_90-on-the-waterfront/90-on-the-waterfront.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad991c_91-the-wages-of-fear/91-the-wages-of-fear.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad991c_92-the-last-picture-show/92-the-last-picture-show.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad991d_93-harry-potter-and-the-deathly-hallows-part-2/93-harry-potter-and-the-deathly-hallows-part-2.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad991d_94-the-grapes-of-wrath-film/94-the-grapes-of-wrath-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad991d_96-man-on-wire/96-man-on-wire.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad991e_97-jaws-film/97-jaws-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad991e_98-toy-story/98-toy-story.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad991e_99-the-godfather-part-ii/99-the-godfather-part-ii.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad991e_100-battleship-potemkin/100-battleship-potemkin.txt']

In [15]:
for url in ebert_review_urls:
    response = requests.get(url)
    with open(folder_name+'/'+url.split('/')[-1],mode = 'wb') as file:
        file.write(response.content)

In [16]:
len(os.listdir(folder_path))

88

# Text files in Python

In [17]:
# For reading files instead of os.path
import glob

In [18]:
review_list = []
for ebert_review in glob.glob(folder_name+'/*.txt'):
    with open(ebert_review, encoding='utf-8') as file:
        title = file.readline()[:-1]
        review_url = file.readline()[:-1]
        review_text = file.read()
        review_list.append({
            'title' : title,
            'review_url' : review_url,
            'review_text' : review_text
        })

df_review = pd.DataFrame(review_list, columns = ['title','review_url','review_text'])
df_review

Unnamed: 0,title,review_url,review_text
0,The Wizard of Oz (1939),http://www.rogerebert.com/reviews/great-movie-...,As a child I simply did not notice whether a m...
1,Metropolis (1927),http://www.rogerebert.com/reviews/great-movie-...,The opening shots of the restored “Metropolis”...
2,Battleship Potemkin (1925),http://www.rogerebert.com/reviews/great-movie-...,"""The Battleship Potemkin” has been so famous f..."
3,E.T. The Extra-Terrestrial (1982),http://www.rogerebert.com/reviews/great-movie-...,Dear Raven and Emil:\n\nSunday we sat on the b...
4,Modern Times (1936),http://www.rogerebert.com/reviews/modern-times...,"A lot of movies are said to be timeless, but s..."
...,...,...,...
83,The Grapes of Wrath (1940),http://www.rogerebert.com/reviews/great-movie-...,"John Ford's ""The Grapes of Wrath"" is a left-wi..."
84,Man on Wire (2008),http://www.rogerebert.com/reviews/man-on-wire-...,I am afraid of heights. Now you know. That is ...
85,Jaws (1975),http://www.rogerebert.com/reviews/great-movie-...,"""You're going to need a bigger boat.""\n\nSo th..."
86,Toy Story (1995),http://www.rogerebert.com/reviews/toy-story-1995,"""Toy Story"" creates a universe out of a couple..."


In [185]:
# For testing purpose
#df_solution = pd.read_pickle('df_solution.pkl')
#df.sort_values('title', inplace = True)
#df.reset_index(inplace = True, drop = True)
#df_solution.sort_values('title', inplace = True)
#df_solution.reset_index(inplace = True, drop = True)
#pd.testing.assert_frame_equal(df, df_solution)

# API in Python for wikipedia

In [19]:
# Access library for Wikipedia API
import wptools

In [20]:
page = wptools.page('E.T._the_Extra-Terrestrial')
page.get()

en.wikipedia.org (query) E.T._the_Extra-Terrestrial
en.wikipedia.org (query) E.T. the Extra-Terrestrial (&plcontinue=...
en.wikipedia.org (parse) 73441
www.wikidata.org (wikidata) Q11621
www.wikidata.org (labels) Q464522|Q130232|Q506198|P1981|P910|P263...
www.wikidata.org (labels) Q139184|P3803|Q1315008|Q830079|Q1057027...
www.wikidata.org (labels) P2346|Q723685|Q1757366|Q8555|Q1044183|P...
www.wikidata.org (labels) P3593|Q258064|Q56887384|Q20644795|Q2490...
www.wikidata.org (labels) P136|Q471839|P1237|P1258|Q103360|P2130|...
en.wikipedia.org (restbase) /page/summary/E.T._the_Extra-Terrestrial
en.wikipedia.org (imageinfo) File:ET logo 3.svg|File:E t the extr...
E.T. the Extra-Terrestrial (en) data
{
  aliases: <list(2)> E.T., ET
  assessments: <dict(4)> United States, Film, Science Fiction, Lib...
  claims: <dict(120)> P1562, P57, P272, P345, P31, P161, P373, P48...
  description: 1982 film by Steven Spielberg
  exhtml: <str(370)> <p><i><b>E.T. the Extra-Terrestrial</b></i> i...
  exre

<wptools.page.WPToolsPage at 0x2904eda7580>

In [21]:
page.data['image'][0]['url']

'https://upload.wikimedia.org/wikipedia/en/6/66/E_t_the_extra_terrestrial_ver3.jpg'

In [22]:
# Create title list for top 100 movies to download the images from wiki
title_list = [
    'Black_Panther_(film)',
    'Citizen_Kane',
    'Parasite_(2019_film)',
    'Avengers:_Endgame',
    'Casablanca_(film)',
    'Knives_Out',
    'Us_(2019_film)',
    'Toy_Story_4',
    'Lady_Bird_(film)',
    'Mission:_Impossible_–_Fallout',
    'BlacKkKlansman',
    'The_Wizard_of_Oz_(1939_film)',
    'Get_Out',
    'The_Irishman',
    'Mad_Max:_Fury_Road',
    'Spider-Man:_Into_the_Spider-Verse',
    'Moonlight_(2016_film)',
    'A_Star_Is_Born_(2018_film)',
    'Wonder_Woman_(2017_film)',
    'Inside_Out_(2015_film)',
    'The_Farewell_(2019_film)',
    'A_Quiet_Place_(film)',
    'Eighth_Grade_(film)',
    'Modern_Times_(film)',
    'Booksmart',
    'Roma_(2018_film)',
    'Portrait_of_a_Lady_on_Fire',
    'Dunkirk_(2017_film)',
    'Coco_(2017_film)',
    'A_Night_at_the_Opera_(film)',
    'Selma_(film)',
    'The_Godfather',
    'Spotlight_(film)',
    'It_Happened_One_Night',
    'La_Grande_Illusion',
    'The_Shape_of_Water',
    'Thor:_Ragnarok',
    'The_Third_Man',
    'The_Cabinet_of_Dr._Caligari',
    'Arrival_(film)',
    "Singin%27_in_the_Rain",
    'Logan_(film)',
    'The_Favourite',
    'Double_Indemnity',
    'Marriage_Story',
    'Snow_White_and_the_Seven_Dwarfs_(1937_film)',
    'E.T._the_Extra-Terrestrial',
    'The_Big_Sick',
    'Star_Wars:_The_Last_Jedi',
    'Star_Wars:_The_Force_Awakens',
    'All_About_Eve',
    'The_Kid_(1921_film)',
    'Boyhood_(2014_film)',
    'Paddington_2',
    'The_Adventures_of_Robin_Hood',
    'Once_Upon_a_Time_in_Hollywood',
    '12_Years_a_Slave_(film)',
    'Manchester_by_the_Sea_(film)',
    'Argo_(2012_film)',
    'Leave_No_Trace_(film)',
    'Nosferatu',
    'King_Kong_(1933_film)',
    'La_La_Land',
    'Spider-Man:_Far_From_Home',
    '1917_(2019_film)',
    'Alien_(film)',
    'Incredibles_2',
    'Laura_(1944_film)',
    'Zootopia',
    'Call_Me_by_Your_Name_(film)',
    'Psycho_(1960_film)',
    'Shadow_of_a_Doubt',
    'The_Invisible_Man_(2020_film)',
    'Sunset_Boulevard_(film)',
    'Gravity_(2013_film)',
    'The_Florida_Project',
    'The_Maltese_Falcon_(1941_film)',
    'War_for_the_Planet_of_the_Apes',
    'Widows_(2018_film)',
    "A_Hard_Day%27s_Night_(film)",
    'The_Battle_of_Algiers',
    'Rebecca_(1940_film)',
    'Spider-Man:_Homecoming',
    'Baby_Driver',
    'The_Philadelphia_Story_(film)',
    'Never_Rarely_Sometimes_Always',
    'Top_Hat',
    'Seven_Samurai',
    'North_by_Northwest',
    'Pain_and_Glory',
    'Shoplifters_(film)',
    'Metropolis_(1927_film)',
    'M_(1931_film)',
    'Hell_or_High_Water_(film)',
    'Up_(2009_film)',
    'Jaws_(film)',
    'Shazam!_(film)',
    'The_Godfather_Part_II',
    'The_Dark_Knight_(film)',
    'Won%27t_You_Be_My_Neighbor%3F_(film)'
]

In [23]:
len(title_list)

100

In [24]:
# To download images
from PIL import Image
from io import BytesIO

In [26]:
# Make directory if it doesn't already exist
img_folder_name = 'bestofrt_posters'
img_folder_path = os.path.join(os.getcwd(),img_folder_name)
if not os.path.exists(img_folder_path):
    os.makedirs(img_folder_path)

#### Note: the cell below, if correctly implemented, will likely take ~5 minutes to run.

In [27]:
# List of dictionaries to build and convert to a DataFrame later
images_list = []
image_errors = {}
for title in title_list:
    try:
        # This cell is slow so print ranking to gauge time remaining
        ranking = title_list.index(title) + 1
        print(ranking)
        page = wptools.page(title,silent = True)
        images = page.get().data['image']
        # First image is usually the poster
        first_image_url = images[0]['url']
        response = requests.get(first_image_url)
        # Download movie poster image
        image = Image.open(BytesIO(response.content))
        image_file_format = first_image_url.split('.')[-1]
        image.save(img_folder_name + "/" + str(ranking) + "_" + title + '.' + image_file_format)
        # Append to list of dictionaries
        images_list.append({'ranking': int(ranking),
                        'wiki_api_title': title,
                        'poster_url': first_image_url})
        
    # Not best practice to catch all exceptions but fine for this short script
    except Exception as e:
        print(str(ranking) + "_" + title + ": " + str(e))
        image_errors[str(ranking) + "_" + title] = images
    

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
35_La_Grande_Illusion: (28, 'Failed to connect to en.wikipedia.org port 443: Timed out')
36
36_The_Shape_of_Water: (28, 'Failed to connect to en.wikipedia.org port 443: Timed out')
37
37_Thor:_Ragnarok: (28, 'Failed to connect to en.wikipedia.org port 443: Timed out')
38
38_The_Third_Man: (28, 'Failed to connect to en.wikipedia.org port 443: Timed out')
39
39_The_Cabinet_of_Dr._Caligari: (28, 'Failed to connect to en.wikipedia.org port 443: Timed out')
40
40_Arrival_(film): (28, 'Failed to connect to en.wikipedia.org port 443: Timed out')
41
41_Singin%27_in_the_Rain: (28, 'Failed to connect to en.wikipedia.org port 443: Timed out')
42
42_Logan_(film): (28, 'Failed to connect to en.wikipedia.org port 443: Timed out')
43
43_The_Favourite: (28, 'Failed to connect to en.wikipedia.org port 443: Timed out')
44
44_Double_Indemnity: (28, 'Failed to connect to en.wikipedia.org port 443: Timed out')
4

API error: {'code': 'invalidtitle', 'info': 'Bad title "A_Hard_Day%27s_Night_(film)".', 'docref': 'See https://en.wikipedia.org/w/api.php for API usage. Subscribe to the mediawiki-api-announce mailing list at &lt;https://lists.wikimedia.org/mailman/listinfo/mediawiki-api-announce&gt; for notice of API deprecations and breaking changes.'}


80_A_Hard_Day%27s_Night_(film): https://en.wikipedia.org/w/api.php?action=parse&formatversion=2&contentmodel=text&disableeditsection=&disablelimitreport=&disabletoc=&prop=text|iwlinks|parsetree|wikitext|displaytitle|properties&redirects&page=A_Hard_Day%2527s_Night_%28film%29
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
95_Up_(2009_film): (56, 'Send failure: Connection was reset')
96
96_Jaws_(film): (6, 'Could not resolve host: en.wikipedia.org')
97
97_Shazam!_(film): (6, 'Could not resolve host: en.wikipedia.org')
98
98_The_Godfather_Part_II: (6, 'Could not resolve host: en.wikipedia.org')
99
99_The_Dark_Knight_(film): (6, 'Could not resolve host: en.wikipedia.org')
100
100_Won%27t_You_Be_My_Neighbor%3F_(film): (6, 'Could not resolve host: en.wikipedia.org')


One you have completed the above code requirements, read and run the three cells below and interpret their output.

In [30]:
for title in range(34,36):
    try:
        # This cell is slow so print ranking to gauge time remaining
        ranking = title + 1
        print(ranking)
        page = wptools.page(title_list[title],silent = True)
        images = page.get().data['image']
        # First image is usually the poster
        first_image_url = images[0]['url']
        response = requests.get(first_image_url)
        # Download movie poster image
        image = Image.open(BytesIO(response.content))
        image_file_format = first_image_url.split('.')[-1]
        image.save(img_folder_name + "/" + str(ranking) + "_" + title_list[title] + '.' + image_file_format)
        # Append to list of dictionaries
        images_list.append({'ranking': int(ranking),
                        'wiki_api_title': title_list[title],
                        'poster_url': first_image_url})
        
    # Not best practice to catch all exceptions but fine for this short script
    except Exception as e:
        print(str(ranking) + "_" + title_list[title] + ": " + str(e))
        image_errors[str(ranking) + "_" + title_list[title]] = images

35
36


In [31]:
for key in image_errors.keys():
    print(key)

35_La_Grande_Illusion
36_The_Shape_of_Water
37_Thor:_Ragnarok
38_The_Third_Man
39_The_Cabinet_of_Dr._Caligari
40_Arrival_(film)
41_Singin%27_in_the_Rain
42_Logan_(film)
43_The_Favourite
44_Double_Indemnity
45_Marriage_Story
46_Snow_White_and_the_Seven_Dwarfs_(1937_film)
47_E.T._the_Extra-Terrestrial
80_A_Hard_Day%27s_Night_(film)
95_Up_(2009_film)
96_Jaws_(film)
97_Shazam!_(film)
98_The_Godfather_Part_II
99_The_Dark_Knight_(film)
100_Won%27t_You_Be_My_Neighbor%3F_(film)


In [None]:
# Inspect unidentifiable images and download them individually
for rank_title, images in image_errors.items():
    if rank_title == '22_A_Hard_Day%27s_Night_(film)':
        url = 'https://upload.wikimedia.org/wikipedia/en/4/47/A_Hard_Days_night_movieposter.jpg'
    if rank_title == '53_12_Angry_Men_(1957_film)':
        url = 'https://upload.wikimedia.org/wikipedia/en/9/91/12_angry_men.jpg'
    if rank_title == '72_Rosemary%27s_Baby_(film)':
        url = 'https://upload.wikimedia.org/wikipedia/en/e/ef/Rosemarys_baby_poster.jpg'
    if rank_title == '93_Harry_Potter_and_the_Deathly_Hallows_–_Part_2':
        url = 'https://upload.wikimedia.org/wikipedia/en/d/df/Harry_Potter_and_the_Deathly_Hallows_%E2%80%93_Part_2.jpg'
    title = rank_title[3:]
    df_list.append({'ranking': int(title_list.index(title) + 1),
                    'wiki_api_title': title,
                    'poster_url': url})
    r = requests.get(url)
    # Download movie poster image
    i = Image.open(BytesIO(r.content))
    image_file_format = url.split('.')[-1]
    i.save(folder_name + "/" + rank_title + '.' + image_file_format)

In [None]:
# Create DataFrame from list of dictionaries
df_img = pd.DataFrame(df_list, columns = ['ranking', 'wiki_api_title', 'poster_url'])
df_img = df_img.sort_values('ranking').reset_index(drop=True)
df_img