# Data Wrangling

## Extracting the top 100 E-books from gutenberg

This example uses BeautifulSoup tp parse the HTML and regular expression code to identify the Top 100 eBook file numbers.

In [3]:
# Import the necessary libraries, including regex and BeautifulSong

In [22]:
# Building your own movie database by reading an API

In [23]:
import urllib.request, urllib.parse, urllib.error
import json

In [24]:
# Load the secret API key (you have to get one from the OMDB website and use that; it has a 1,000 daily limit) from a JSON file, stored in the same folder into a variable, by using json.loads().
# Note The following cell will not be executed in the solution notebook because the author cannot give out their private API key. The students/users/instructor will need to obtain a key and store it in a JSON file. We are calling this file APIkeys.json.

In [25]:
# Open APIkeys.json file to read the key
with open('APIkeys.json') as f:
    keys = json.load(f)
    omdbapi = keys['OMDBapi']

In [26]:
serviceurl = 'http://www.omdbapi.com/?'


In [27]:
apikey = '&apikey='+omdbapi

In [28]:
# print the movie data from a JSON file.
def print_json(json_data):
    list_keys=['Title', 'Year', 'Rated', 'Released', 'Runtime', 'Genre', 'Director', 'Writer', 
               'Actors', 'Plot', 'Language', 'Country', 'Awards', 'Ratings', 
               'Metascore', 'imdbRating', 'imdbVotes', 'imdbID']
    print("-"*50)
    for k in list_keys:
        if k in list(json_data.keys()):
            print(f"{k}: {json_data[k]}")
    print("-"*50)

In [29]:
# function to download the poster movie based on the information from the JSON dataset and save it to local folder.
def save_poster(json_data):
    import os
    title = json_data['Title']
    poster_url = json_data['Poster']
    # Splits the poster url by '.' and picks up the last string as file extension
    poster_file_extension=poster_url.split('.')[-1]
    # Reads the image file from web
    poster_data = urllib.request.urlopen(poster_url).read()
        
    savelocation=os.getcwd()+'\\'+'Posters'+'\\'
    # Creates new directory if the directory does not exist. Otherwise, just use the existing path.
    if not os.path.isdir(savelocation):
        os.mkdir(savelocation)
    
    filename=savelocation+str(title)+'.'+poster_file_extension
    f=open(filename,'wb')
    f.write(poster_data)
    f.close()

In [30]:
# Function to search a moview by its name.
def search_movie(title):
    try:
        url = serviceurl + urllib.parse.urlencode({'t': str(title)})+apikey
        print(f'Retrieving the data of "{title}" now... ')
        print(url)
        uh = urllib.request.urlopen(url)
        data = uh.read()
        json_data=json.loads(data)
        
        if json_data['Response']=='True':
            print_json(json_data)
            # Asks user whether to download the poster of the movie
            if json_data['Poster']!='N/A':
                save_poster(json_data)
        else:
            print("Error encountered: ",json_data['Error'])
    
    except urllib.error.URLError as e:
        print(f"ERROR: {e.reason}")

In [31]:
#  Search the movie ith name "Titanic"
search_movie("Titanic")


Retrieving the data of "Titanic" now... 
http://www.omdbapi.com/?t=Titanic&apikey=856a3424
--------------------------------------------------
Title: Titanic
Year: 1997
Rated: PG-13
Released: 19 Dec 1997
Runtime: 194 min
Genre: Drama, Romance
Director: James Cameron
Writer: James Cameron
Actors: Leonardo DiCaprio, Kate Winslet, Billy Zane
Plot: A seventeen-year-old aristocrat falls in love with a kind but poor artist aboard the luxurious, ill-fated R.M.S. Titanic.
Language: English, Swedish, Italian, French
Country: United States, Mexico
Awards: Won 11 Oscars. 126 wins & 83 nominations total
Ratings: [{'Source': 'Internet Movie Database', 'Value': '7.9/10'}, {'Source': 'Rotten Tomatoes', 'Value': '88%'}, {'Source': 'Metacritic', 'Value': '75/100'}]
Metascore: 75
imdbRating: 7.9
imdbVotes: 1,322,762
imdbID: tt0120338
--------------------------------------------------


In [32]:
#  Search the movie ith name "Random_error"

search_movie("Random_error")


Retrieving the data of "Random_error" now... 
http://www.omdbapi.com/?t=Random_error&apikey=856a3424
Error encountered:  Movie not found!


## Connect to an API of your choice and do a simple data pull

Connect to the API and do a "Get" call/operation on the API to return a subset of data from the API

In [34]:
# Fetch Weather Data via API
# This script:

# Fetches the weather for New York.
# Extracts temperature and weather conditions.
# Handles errors if the API request fails.

In [35]:
import requests

# Define API Key and Endpoint
api_key = "5789916fb6656a55cb71542f2c08fb38"
city = "New York"
base_url = "https://api.openweathermap.org/data/2.5/weather"

# Construct the API request URL
url = f"{base_url}?q={city}&appid={api_key}&units=metric"

# Send a GET request
response = requests.get(url)

# Check if the request was successful
if response.status_code == 200:
    # Parse JSON response
    data = response.json()
    
    # Extract relevant information
    weather_info = {
        "City": data["name"],
        "Temperature (°C)": data["main"]["temp"],
        "Weather": data["weather"][0]["description"]
    }
    
    # Print extracted data
    print(weather_info)
else:
    print("Error fetching data. Check API key and city name.")


{'City': 'New York', 'Temperature (°C)': -0.02, 'Weather': 'clear sky'}
