### Importing Libraries

In [105]:
from selenium import webdriver
from selenium.webdriver.common.by import By
import pandas as pd

### `open_web_page`

The `open_web_page` function uses a Chrome driver (via Selenium) to access the webpage specified by the `url` argument. It launches a Google Chrome browser instance, opens the corresponding page, and returns the driver so that later cells can work with the loaded page.

In [106]:
def open_web_page(url):
    """Launch a Chrome browser through Selenium and load ``url``.

    Parameters
    ----------
    url : str
        Address of the page to open.

    Returns
    -------
    The Selenium Chrome driver, already navigated to ``url``. The caller
    owns the driver and is responsible for calling ``quit()`` on it.

    Raises
    ------
    Any exception raised by ``driver.get`` is propagated unchanged, after
    the browser that was just launched has been shut down.
    """
    driver = webdriver.Chrome()
    try:
        driver.get(url)
    except BaseException:
        # On failure the caller never receives the driver, so nobody else
        # could close it: quit here to avoid an orphaned Chrome process.
        driver.quit()
        raise
    return driver

### `extract_movie_info`

The `extract_movie_info` function extracts information about movies from a webpage using a web driver. For each movie it retrieves the title, release year, duration, star rating, number of reviewers, and image URL, and appends them as one row to the global list `info`. After collecting the information, the function closes the browser.

In [107]:
info = []


def _strip_rank_prefix(heading):
    """Return ``heading`` without its leading ``'<rank>. '`` chart position."""
    rank, separator, remainder = heading.partition('. ')
    # Drop the prefix only when it really is a number, so titles that
    # themselves contain '. ' (e.g. 'Mr. Smith ...') are kept intact.
    if separator and rank.isdigit():
        return remainder
    return heading


def extract_movie_info(driver):
    """Scrape every movie entry on the loaded page into the global ``info``.

    For each ``li`` matched by the list XPath, one row
    ``[title, year, duration, stars, people, image_url]`` is appended to
    ``info``. All values are kept as the strings shown on the page; the
    surrounding parentheses are removed from ``people``.

    Parameters
    ----------
    driver
        Selenium driver already positioned on the page to scrape.

    Returns
    -------
    None
        Results are delivered through the module-level ``info`` list.

    Notes
    -----
    The browser is always closed before returning, even when an element
    lookup fails, so ``driver`` must not be reused afterwards. Rows appended
    before a failure remain in ``info``.
    """
    try:
        movies = driver.find_elements(By.XPATH, '//*[@id="__next"]/main/div/div[3]/section/div/div[2]/div/ul/li')
        for movie in movies:
            image_url = movie.find_element(By.XPATH, 'div[1]/div/div[2]/img').get_attribute('src')
            # The heading reads '<rank>. <title>'; keep only the title part.
            title = _strip_rank_prefix(movie.find_element(By.XPATH, 'div[2]/div/div/div[1]/a/h3').text)
            year = movie.find_element(By.XPATH, 'div[2]/div/div/div[2]/span[1]').text
            duration = movie.find_element(By.XPATH, 'div[2]/div/div/div[2]/span[2]').text
            stars = movie.find_element(By.XPATH, 'div[2]/div/div/span/div/span/span[1]').text
            people = movie.find_element(By.XPATH, 'div[2]/div/div/span/div/span/span[2]').text.replace('(', '').replace(')', '')
            info.append([title, year, duration, stars, people, image_url])
    finally:
        # Release the browser even if a lookup above raised.
        driver.quit()

### Execution

In [108]:
# Page to scrape (IMDb "top" chart). `driver` is the live browser session
# that the next cell hands to extract_movie_info.
url = 'https://www.imdb.com/chart/top/'
driver = open_web_page(url)

In [109]:
# Appends the scraped rows to the global `info` list and then quits the
# browser; re-run the previous cell to get a fresh `driver` before calling again.
extract_movie_info(driver)

In [110]:
# Column labels, listed in the same order in which extract_movie_info
# appends the fields of each row to `info`.
column_names = ['title', 'year', 'duration', 'stars', 'people', 'image_url']
df = pd.DataFrame(data=info, columns=column_names)

In [111]:
# Preview the first rows to sanity-check the scraped values.
df.head()

Unnamed: 0,title,year,duration,stars,people,image_url
0,Cadena perpetua,1994,2h 22m,93,"2,9 M",https://m.media-amazon.com/images/M/MV5BMTA1Mj...
1,El padrino,1972,2h 55m,92,"2,1 M",https://m.media-amazon.com/images/M/MV5BNWYxYz...
2,El caballero oscuro,2008,2h 32m,90,"2,9 M",https://m.media-amazon.com/images/M/MV5BMTMxNT...
3,El padrino parte II,1974,3h 22m,90,"1,4 M",https://m.media-amazon.com/images/M/MV5BMjQ5Mz...
4,12 hombres sin piedad,1957,1h 36m,90,887 mil,https://m.media-amazon.com/images/M/MV5BYThhOG...


In [112]:
# Persist the scraped table. `df` only has the default 0..n-1 row index, so
# index=False keeps it out of the file; otherwise it is written as an
# unnamed first column that reappears as "Unnamed: 0" when the CSV is read back.
df.to_csv('data.csv', index=False)