# Project 2: Web Scraping and API access

In [3]:
pip install beautifulsoup4

Note: you may need to restart the kernel to use updated packages.


## Part 1: Explore the html for Wikipedia articles. 

### A. Using inspect element, copy the html code for a table.

### B. Using inspect element, find the html syntax for a link. 

### C. Using inspect element, find the html syntax for linking an image

## Part 2: Explore one Wikipedia page with the beautifulsoup package

In [4]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

In [5]:
#save and print the text content of a page with all tags removed
def save_page_text(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    text = soup.get_text()
    text = ' '.join(text.split())  # Remove all spaces
    return text

print(save_page_text('https://en.wikipedia.org/wiki/Pizza'))


Pizza - Wikipedia Jump to content Main menu Main menu move to sidebar hide Navigation Main pageContentsCurrent eventsRandom articleAbout WikipediaContact us Contribute HelpLearn to editCommunity portalRecent changesUpload fileSpecial pages Search Search Appearance Donate Create account Log in Personal tools Donate Create account Log in Pages for logged out editors learn more ContributionsTalk Contents move to sidebar hide (Top) 1 Etymology 2 History 3 Preparation Toggle Preparation subsection 3.1 Baking 3.2 Crust 3.3 Cheese 4 Varieties and styles Toggle Varieties and styles subsection 4.1 Varieties 4.2 Styles 4.3 By region of origin 4.3.1 Italy 4.3.2 United States 4.3.3 Argentina 4.4 Dessert pizza 5 Nutrition 6 Similar dishes 7 See also 8 Notes 9 References 10 Further reading Toggle the table of contents Pizza 143 languages AfrikaansAlemannischአማርኛÆngliscالعربيةAragonésԱրեւմտահայերէնAsturianuAzərbaycancaتۆرکجهBasa BaliবাংলাBanjar閩南語 / Bân-lâm-gúБашҡортсаБеларускаяБеларуская (тарашкевіц

In [6]:
#download an image with beautifulsoup and save it in this repository
source=requests.get('https://en.wikipedia.org/wiki/Pizza').text
soup=BeautifulSoup(source)

img_tags=soup.find_all('img')


img_url="https:" + img_tags[10]['src']
print(img_url)

img=requests.get(img_url)
with open('image.jpg','wb') as f:
    f.write(img.content)

https://upload.wikimedia.org/wikipedia/commons/thumb/2/2d/Neapolitan_pizza.jpg/305px-Neapolitan_pizza.jpg


In [6]:
#find all the links in a page with beautifulsoup
#print the first 100 characters of ten of these links
def find_links(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    links = soup.find_all('a')
    for link in links[:10]:
        print(link.get('href')[:100])
find_links("https://en.wikipedia.org/wiki/Pizza")

#bodyContent
/wiki/Main_Page
/wiki/Wikipedia:Contents
/wiki/Portal:Current_events
/wiki/Special:Random
/wiki/Wikipedia:About
//en.wikipedia.org/wiki/Wikipedia:Contact_us
/wiki/Help:Contents
/wiki/Help:Introduction
/wiki/Wikipedia:Community_portal


## Part 3: Downloading scripts

In [7]:
scripts_df=pd.read_csv('pudding_data.csv')

In [8]:
scripts_df

Unnamed: 0,imdb_id,script_id,title,year,gross (inflation-adjusted),link
0,tt0019777,4031,The Cocoanuts,1929,,http://www.pages.drexel.edu/~ina22/splaylib/Sc...
1,tt0021884,8521,Frankenstein,1931,298.0,Frankenstein (Florey & Fort) [1931-5-23] [Scan...
2,tt0022054,1086,The Last Flight,1931,,"film_20100519/all_imsdb_05_19_10/Last-Flight,-..."
3,tt0022626,1631,American Madness,1932,,http://www.imsdb.com/Movie Scripts/American Ma...
4,tt0022958,2438,Grand Hotel,1932,,http://www.imsdb.com/Movie Scripts/Grand Hotel...
...,...,...,...,...,...,...
1995,tt3733778,8533,Pay the Ghost,2015,,"Pay The Ghost (Dan Kay, 9-1-09).pdf"
1996,tt3808342,5499,Son of Saul,2015,0.0,http://gointothestory.blcklst.com/wp-content/u...
1997,tt3850214,8056,Dope,2015,18.0,Dope (2013.10.31) [Digital].pdf
1998,tt3859076,5507,Truth,2015,2.0,http://gointothestory.blcklst.com/wp-content/u...


In [11]:
# web_data is a dataframe and you want to add it to scripts_df. 
import requests

scripts_list = scripts_df['link']  # Process all URLs
web_data = []

for url in scripts_list:
    try:
        web_data.append(requests.get(url, timeout=5).text)
    except:
        web_data.append("NULL")

scripts_df['web_data'] = web_data  # Update the entire DataFrame
print(scripts_df)

        imdb_id  script_id             title  year  \
0     tt0019777       4031     The Cocoanuts  1929   
1     tt0021884       8521      Frankenstein  1931   
2     tt0022054       1086   The Last Flight  1931   
3     tt0022626       1631  American Madness  1932   
4     tt0022958       2438       Grand Hotel  1932   
...         ...        ...               ...   ...   
1995  tt3733778       8533     Pay the Ghost  2015   
1996  tt3808342       5499       Son of Saul  2015   
1997  tt3850214       8056              Dope  2015   
1998  tt3859076       5507             Truth  2015   
1999  tt4270516       5410           Grandma  2015   

      gross (inflation-adjusted)  \
0                            NaN   
1                          298.0   
2                            NaN   
3                            NaN   
4                            NaN   
...                          ...   
1995                         NaN   
1996                         0.0   
1997                       

In [12]:
#using the links in the "link" column, download the first 1000 characters of each script
#use requests and bs4, remember to remove all html tags

import requests
from bs4 import BeautifulSoup

scripts_list = scripts_df['link']  # Process all URLs
web_data = []

for url in scripts_list:
    try:
        response = requests.get(url, timeout=5)
        soup = BeautifulSoup(response.text, 'html.parser')
        text_snippet = soup.get_text()[:1000]  # Remove HTML and get first 1000 characters
        web_data.append(text_snippet)
    except:
        web_data.append("NULL")

scripts_df['web_data'] = web_data  # Update the entire DataFrame
print(scripts_df)


        imdb_id  script_id             title  year  \
0     tt0019777       4031     The Cocoanuts  1929   
1     tt0021884       8521      Frankenstein  1931   
2     tt0022054       1086   The Last Flight  1931   
3     tt0022626       1631  American Madness  1932   
4     tt0022958       2438       Grand Hotel  1932   
...         ...        ...               ...   ...   
1995  tt3733778       8533     Pay the Ghost  2015   
1996  tt3808342       5499       Son of Saul  2015   
1997  tt3850214       8056              Dope  2015   
1998  tt3859076       5507             Truth  2015   
1999  tt4270516       5410           Grandma  2015   

      gross (inflation-adjusted)  \
0                            NaN   
1                          298.0   
2                            NaN   
3                            NaN   
4                            NaN   
...                          ...   
1995                         NaN   
1996                         0.0   
1997                       

In [14]:
#add a new column to the df with the text downloaded
#save this new dataframe as "pudding_texts.csv"

scripts_list = scripts_df['link']
web_data = []

for url in scripts_list:
    try:
        web_data.append(requests.get(url, timeout=5).text)
    except:
        web_data.append("NULL")

scripts_df['web_data'] = web_data  # Add new column with downloaded text

scripts_df.to_csv("pudding_texts.csv", index=False)  # Save as CSV

print("Saved as pudding_texts.csv")



Saved as pudding_texts.csv


## Part 4: TMDB database

#### Browse the documentation at https://developer.themoviedb.org/reference/intro/getting-started. Create an account to authenticate

In [3]:
#create a dataset of the movies in theaters now. Include metadata fields you are interested in. 
import pandas as pd
import os

# TMDB API key
API_KEY = "f0694d916f8c06b27b42fb93e2ccfd37"
BASE_URL = "https://api.themoviedb.org/3"

# Fetch movies in theaters
def get_now_playing():
    url = f"{BASE_URL}/movie/now_playing?api_key={API_KEY}&language=en-US&page=1"
    response = requests.get(url)
    
    if response.status_code == 200:
        data = response.json()
        movies = data.get('results', [])
        
        movie_data = []
        for movie in movies:
            movie_data.append({
                "id": movie.get("id"),
                "title": movie.get("title"),
                "release_date": movie.get("release_date"),
                "overview": movie.get("overview"),
                "poster_path": f"https://image.tmdb.org/t/p/w500{movie.get('poster_path')}"
            })
        
        return pd.DataFrame(movie_data)
    else:
        print("Failed to retrieve movies:", response.status_code)
        return pd.DataFrame()

movies_df = get_now_playing()

# Ensure the dataframe is not empty before proceeding
if not movies_df.empty:
    movies_csv_path = "now_playing_movies.csv"  # Save in the current directory
    movies_df.to_csv(movies_csv_path, index=False)
    
    # Display the first few rows to confirm success
    print(movies_df.head())
else:
    print("No movies retrieved, skipping CSV save.")


        id                             title release_date  \
0   950396                         The Gorge   2025-02-13   
1  1126166                       Flight Risk   2025-01-22   
2   939243              Sonic the Hedgehog 3   2024-12-19   
3   822119  Captain America: Brave New World   2025-02-12   
4  1084199                         Companion   2025-01-22   

                                            overview  \
0  Two highly trained operatives grow close from ...   
1  A U.S. Marshal escorts a government witness to...   
2  Sonic, Knuckles, and Tails reunite against a p...   
3  After meeting with newly elected U.S. Presiden...   
4  During a weekend getaway at a secluded lakesid...   

                                         poster_path  
0  https://image.tmdb.org/t/p/w500/7iMBZzVZtG0oBu...  
1  https://image.tmdb.org/t/p/w500/q0bCG4NX32iIEs...  
2  https://image.tmdb.org/t/p/w500/d8Ryb8AunYAuyc...  
3  https://image.tmdb.org/t/p/w500/pzIddUEMWhWzfv...  
4  https://image.tmdb

In [6]:
#download the movie posters for 10 of these movies and save them to this repository

import requests
import os

# Create a directory to save the posters
poster_dir = "movie_posters"
os.makedirs(poster_dir, exist_ok=True)

# Function to download and save a poster
def download_poster(movie_title, poster_url):
    try:
        response = requests.get(poster_url, stream=True)
        response.raise_for_status()  # Raise an error if request fails
        
        # Clean movie title for filename
        safe_title = "".join(c if c.isalnum() else "_" for c in movie_title)
        file_path = os.path.join(poster_dir, f"{safe_title}.jpg")
        
        # Save the image
        with open(file_path, "wb") as file:
            for chunk in response.iter_content(1024):
                file.write(chunk)
        
        print(f"Downloaded: {movie_title}")
    except requests.exceptions.RequestException as e:
        print(f"Error downloading {movie_title}: {e}")

# Select the first 10 movies
for index, row in movies_df.head(10).iterrows():
    if row["poster_path"]:  # Ensure there is a valid poster URL
        download_poster(row["title"], row["poster_path"])

print("All posters downloaded successfully.")


Downloaded: The Gorge
Downloaded: Flight Risk
Downloaded: Sonic the Hedgehog 3
Downloaded: Companion
Downloaded: Captain America: Brave New World
Downloaded: Panda Plan
Downloaded: Dog Man
Downloaded: My Fault: London
Downloaded: Kraven the Hunter
Downloaded: The Brutalist
All posters downloaded successfully.
