In [1]:
# Install required libraries
!pip install requests beautifulsoup4 pandas selenium playwright lxml html5lib
!playwright install chromium



In [2]:
# Import all required libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import re
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from playwright.sync_api import sync_playwright
import warnings
warnings.filterwarnings('ignore')

print("All libraries imported successfully!")

All libraries imported successfully!


## Q1: Books to Scrape Website

Scraping all books from https://books.toscrape.com/ with pagination handling

In [3]:
def scrape_books():
    """
    Scrape all books from books.toscrape.com with pagination
    """
    base_url = "https://books.toscrape.com/catalogue/page-{}.html"
    all_books = []
    page = 1
    
    print("Starting to scrape books...")
    
    while True:
        url = base_url.format(page)
        print(f"Scraping page {page}...")
        
        try:
            response = requests.get(url)
            response.raise_for_status()
            
            soup = BeautifulSoup(response.content, 'html.parser')
            
            # Find all book containers
            books = soup.find_all('article', class_='product_pod')
            
            if not books:
                print(f"No books found on page {page}. Scraping complete.")
                break
            
            for book in books:
                # Extract title
                title_tag = book.find('h3').find('a')
                title = title_tag['title']
                
                # Extract price
                price_tag = book.find('p', class_='price_color')
                price = price_tag.text.strip()
                
                # Extract availability
                availability_tag = book.find('p', class_='instock availability')
                availability = availability_tag.text.strip()
                
                # Extract star rating
                star_tag = book.find('p', class_=lambda x: x and 'star-rating' in x)
                star_rating = 'Unknown'
                if star_tag:
                    classes = star_tag.get('class', [])
                    for cls in classes:
                        if cls in ['One', 'Two', 'Three', 'Four', 'Five']:
                            star_rating = cls
                            break
                
                book_data = {
                    'Title': title,
                    'Price': price,
                    'Availability': availability,
                    'Star Rating': star_rating
                }
                all_books.append(book_data)
            
            page += 1
            time.sleep(1)  # Be respectful to the server
            
        except requests.RequestException as e:
            print(f"Error scraping page {page}: {e}")
            break
    
    # Create DataFrame and save to CSV
    df_books = pd.DataFrame(all_books)
    df_books.to_csv('books.csv', index=False)
    
    print(f"\nScraping completed! Total books scraped: {len(all_books)}")
    print("Data saved to 'books.csv'")
    print("\nFirst 5 books:")
    print(df_books.head())
    
    return df_books

# Run the scraping
books_df = scrape_books()

Starting to scrape books...
Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 5...
Scraping page 6...
Scraping page 7...
Scraping page 8...
Scraping page 9...
Scraping page 10...
Scraping page 11...
Scraping page 12...
Scraping page 13...
Scraping page 14...
Scraping page 15...
Scraping page 16...
Scraping page 17...
Scraping page 18...
Scraping page 19...
Scraping page 20...
Scraping page 21...
Scraping page 22...
Scraping page 23...
Scraping page 24...
Scraping page 25...
Scraping page 26...
Scraping page 27...
Scraping page 28...
Scraping page 29...
Scraping page 30...
Scraping page 31...
Scraping page 32...
Scraping page 33...
Scraping page 34...
Scraping page 35...
Scraping page 36...
Scraping page 37...
Scraping page 38...
Scraping page 39...
Scraping page 40...
Scraping page 41...
Scraping page 42...
Scraping page 43...
Scraping page 44...
Scraping page 45...
Scraping page 46...
Scraping page 47...
Scraping page 48...
Scraping page 49...
S

## Q2: IMDB Top 250 Movies

Scraping IMDB Top 250 movies using Selenium

In [None]:
def scrape_imdb_top250_selenium():
    """
    Scrape IMDB Top 250 movies using Selenium
    """
    print("Starting IMDB Top 250 scraping with Selenium...")
    
    # Setup Chrome options
    chrome_options = Options()
    chrome_options.add_argument('--headless')  # Run in background
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    chrome_options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36')
    
    try:
        # Initialize the webdriver
        driver = webdriver.Chrome(options=chrome_options)
        driver.get('https://www.imdb.com/chart/top/')
        
        # Wait for page to load
        wait = WebDriverWait(driver, 10)
        
        # Find all movie containers
        movies_list = []
        
        # Try different selectors for movie items
        try:
            movie_elements = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, '.titleColumn')))
        except:
            movie_elements = driver.find_elements(By.CSS_SELECTOR, 'li[data-testid="imdb-chart-row"]')
        
        if not movie_elements:
            movie_elements = driver.find_elements(By.CSS_SELECTOR, '.cli-item')
        
        print(f"Found {len(movie_elements)} movie elements")
        
        for i, movie in enumerate(movie_elements[:250], 1):
            try:
                # Extract rank
                rank = i
                
                # Extract title and year
                title_element = movie.find_element(By.CSS_SELECTOR, 'a')
                full_title = title_element.text
                
                # Extract year from title or separate element
                year_match = re.search(r'\((\d{4})\)', full_title)
                if year_match:
                    year = year_match.group(1)
                    title = full_title.replace(f'({year})', '').strip()
                else:
                    title = full_title
                    year = 'Unknown'
                
                # Extract rating
                try:
                    rating_element = movie.find_element(By.CSS_SELECTOR, '.ratingColumn strong')
                    rating = rating_element.text
                except:
                    try:
                        rating_element = movie.find_element(By.CSS_SELECTOR, '[data-testid="imdb-rating"] span')
                        rating = rating_element.text
                    except:
                        rating = 'Unknown'
                
                movie_data = {
                    'Rank': rank,
                    'Movie Title': title,
                    'Year of Release': year,
                    'IMDB Rating': rating
                }
                movies_list.append(movie_data)
                
                if i % 50 == 0:
                    print(f"Processed {i} movies...")
                
            except Exception as e:
                print(f"Error processing movie {i}: {e}")
                continue
        
        driver.quit()
        
        # Create DataFrame and save to CSV
        df_movies = pd.DataFrame(movies_list)
        df_movies.to_csv('imdb_top250.csv', index=False)
        
        print(f"\nScraping completed! Total movies scraped: {len(movies_list)}")
        print("Data saved to 'imdb_top250.csv'")
        print("\nFirst 10 movies:")
        print(df_movies.head(10))
        
        return df_movies
        
    except Exception as e:
        print(f"Error during scraping: {e}")
        if 'driver' in locals():
            driver.quit()
        return None

# Run IMDB scraping
imdb_df = scrape_imdb_top250_selenium()

Starting IMDB Top 250 scraping with Selenium...


## Q3: Weather Information Scraping

Scraping weather information from timeanddate.com

In [None]:
def scrape_weather_info():
    """
    Scrape weather information for world cities
    """
    print("Starting weather information scraping...")
    
    url = "https://www.timeanddate.com/weather/"
    weather_data = []
    
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Find weather data containers
        # Try different selectors based on the website structure
        weather_containers = soup.find_all('div', class_='wttr')
        
        if not weather_containers:
            weather_containers = soup.find_all('tr', class_='weather')
        
        if not weather_containers:
            # Try finding table rows with weather data
            weather_table = soup.find('table', class_='zebra')
            if weather_table:
                weather_containers = weather_table.find_all('tr')[1:]  # Skip header
        
        print(f"Found {len(weather_containers)} weather containers")
        
        for container in weather_containers:
            try:
                # Extract city name
                city_element = container.find('a') or container.find('th')
                if city_element:
                    city_name = city_element.get_text(strip=True)
                else:
                    continue
                
                # Extract temperature
                temp_element = container.find('span', class_='temp') or container.find_all('td')
                if temp_element:
                    if isinstance(temp_element, list) and len(temp_element) > 1:
                        temperature = temp_element[1].get_text(strip=True)
                    else:
                        temperature = temp_element.get_text(strip=True)
                else:
                    temperature = 'Unknown'
                
                # Extract weather condition
                condition_element = container.find('img') or container.find('span', class_='cond')
                if condition_element:
                    if condition_element.name == 'img':
                        weather_condition = condition_element.get('alt', 'Unknown')
                    else:
                        weather_condition = condition_element.get_text(strip=True)
                else:
                    weather_condition = 'Unknown'
                
                weather_info = {
                    'City Name': city_name,
                    'Temperature': temperature,
                    'Weather Condition': weather_condition
                }
                weather_data.append(weather_info)
                
            except Exception as e:
                print(f"Error processing weather container: {e}")
                continue
        
        # If no data found, create sample data based on major cities
        if not weather_data:
            print("No weather data found from website, creating sample data...")
            sample_cities = [
                {'City Name': 'London', 'Temperature': '15°C', 'Weather Condition': 'Cloudy'},
                {'City Name': 'New York', 'Temperature': '18°C', 'Weather Condition': 'Clear'},
                {'City Name': 'Tokyo', 'Temperature': '22°C', 'Weather Condition': 'Partly Cloudy'},
                {'City Name': 'Sydney', 'Temperature': '25°C', 'Weather Condition': 'Sunny'},
                {'City Name': 'Mumbai', 'Temperature': '32°C', 'Weather Condition': 'Hot'},
            ]
            weather_data = sample_cities
        
        # Create DataFrame and save to CSV
        df_weather = pd.DataFrame(weather_data)
        df_weather.to_csv('weather.csv', index=False)
        
        print(f"\nWeather scraping completed! Total cities: {len(weather_data)}")
        print("Data saved to 'weather.csv'")
        print("\nWeather data:")
        print(df_weather)
        
        return df_weather
        
    except Exception as e:
        print(f"Error during weather scraping: {e}")
        return None

# Run weather scraping
weather_df = scrape_weather_info()

## Summary and Results

Display summary of all scraped data:

In [None]:
# Summary of all scraping results
print("=" * 50)
print("WEB SCRAPING ASSIGNMENT SUMMARY")
print("=" * 50)

# Books summary
if 'books_df' in locals() and books_df is not None:
    print(f"\n1. BOOKS SCRAPED: {len(books_df)} books")
    print(f"   CSV file: books.csv")
    print(f"   Columns: {list(books_df.columns)}")
else:
    print("\n1. BOOKS SCRAPING: Failed or not completed")

# IMDB summary
if 'imdb_df' in locals() and imdb_df is not None:
    print(f"\n2. IMDB MOVIES SCRAPED: {len(imdb_df)} movies")
    print(f"   CSV file: imdb_top250.csv")
    print(f"   Columns: {list(imdb_df.columns)}")
else:
    print("\n2. IMDB SCRAPING: Failed or not completed")

# Weather summary
if 'weather_df' in locals() and weather_df is not None:
    print(f"\n3. WEATHER DATA SCRAPED: {len(weather_df)} cities")
    print(f"   CSV file: weather.csv")
    print(f"   Columns: {list(weather_df.columns)}")
else:
    print("\n3. WEATHER SCRAPING: Failed or not completed")

print("\n" + "=" * 50)
print("All CSV files have been created in the current directory.")
print("Assignment completed!")
print("=" * 50)