In [None]:
# neccesary libraries
!pip install --upgrade pip
!pip install requests
!pip install beautifulsoup4
!pip install pandas

In [3]:
# import libraries
import requests
from bs4 import BeautifulSoup

def get_web_html(url: str) -> BeautifulSoup:
    """
    This method gets the HTML from a website using scrapping.

    Args:
        url: URL to scrape.

    Returns:
        A BeautifulSoup object with the HTML.
    """
    # headers to avoid 403 error, cos' IMDb blocks requests from bots
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
    }

    # get HTML and save it in a variable
    response = requests.get(url, headers=headers)
    return BeautifulSoup(response.text, "html.parser")

In [14]:
# import libraries
from bs4 import BeautifulSoup


def get_title_year(movie_data: BeautifulSoup) -> tuple:
    """
    This methog gets the title, year, and link of a movie data using scrapping.

    Args:
        movie_data: BeautifulSoup object with movie data.

    Returns:
        A tuple with title, year, and link.
    """
    # process main title of subdivission
    base_url = "https://www.imdb.com"
    ref_data = movie_data.find("a")
    title = ref_data.text.split("(")[0].strip()
    year = ref_data.text.split("(")[1].replace(")", "")
    link = base_url + ref_data["href"]
    return title, year, link


def get_genre_actors(movie_data: BeautifulSoup) -> list:
    """
    This method gets genre and actors from a movie data using scrapping.

    Args:
        movie_data: BeautifulSoup object with movie data.

    Returns:
        A list with genre and actors.
    """
    # process ul tags to get genre and actors
    data = []
    ul_tags = movie_data.find_all("ul")
    for ul_tag in ul_tags:
        temp = []
        for li_tag in ul_tag.find_all("li"):
            temp.append(li_tag.find("span").get_text())
        data.append(", ".join(temp))
    return data



In [17]:
# import libraries
from bs4 import BeautifulSoup
import pandas as pd


def generate_dataframe(url: str) -> pd.DataFrame:
    """
    This method generates a DataFrame with movie data from IMDb.

    Args:
        url: URL to scrape.

    Returns:
        A DataFrame with movie data.
    """
    # movies datastructure definition
    imdb = get_web_html(url)
    movies = []
    movies_metadata = ["Title", "Year", "Genre", "Actors"]

    # process HTML using scrapping going to each division with the class ipc-metadata-list-summary-item__tc
    movies_html = imdb.find_all("div", class_="ipc-metadata-list-summary-item__tc")
    for movie in movies_html:
        # get each movie data into a clean html structure
        movie_data = BeautifulSoup(str(movie), "html.parser")

        # get movie data
        title, year, link = get_title_year(movie_data)
        data = get_genre_actors(movie_data)

        # create a dictionary to have a nice data movie structure
        movie_clean_data = {
            "Title": title,
            "Year": year,
            "Genre": data[0],
            "Actors": data[1] if len(data) > 1 else "",
        }
        # create a list of dictionaries to create a DataFrame
        movies.append(movie_clean_data)

    # create movies dataframe
    return pd.DataFrame(movies, columns=movies_metadata)

# ================================ MAIN =================================== #
# url to scrape
url = "https://www.imdb.com/calendar/?ref_=rlm&region=US&type=MOVIE"
movies_df = generate_dataframe(url)
print(movies_df.head(3))


                     Title  Year                         Genre  \
0  Furiosa: A Mad Max Saga  2024     Action, Adventure, Sci-Fi   
1       The Garfield Movie  2024  Animation, Adventure, Comedy   
2                    Sight  2023     Biography, Drama, History   

                                              Actors  
0  Anya Taylor-Joy, Chris Hemsworth, Tom Burke, A...  
1  Chris Pratt, Samuel L. Jackson, Hannah Wadding...  
2  Terry Chen, Greg Kinnear, Natasha Mumba, Fionn...  


In [30]:
filtered_movies_df = movies_df[movies_df['Year'] == '2025']
print(filtered_movies_df.head(3))

           Title  Year           Genre  \
144   Screamboat  2025  Comedy, Horror   
145        Grind  2025          Horror   
146  In the Grey  2025          Action   

                                                Actors  
144                                                     
145  Ginger Lynn, Felissa Rose, Lynn Lowry, August ...  
146  Eiza González, Henry Cavill, Jake Gyllenhaal, ...  


In [38]:
generos_busqueda = ['Thriller', 'Drama', 'Terror', 'Black Humor', 'Documental', 'Biography']

# Utiliza la función any() para verificar si alguna de las palabras clave está presente en el género
filtered_movies_df = movies_df[movies_df['Genre'].apply(lambda x: any(gen in x for gen in generos_busqueda))]
print(filtered_movies_df.head(5))

                                         Title  Year  \
2                                        Sight  2023   
3  Kidnapped: The Abduction of Edgardo Mortara  2023   
4                           Queen of the Deuce  2022   
6                                Terra Infirma  2024   
7                          In a Violent Nature  2024   

                       Genre  \
2  Biography, Drama, History   
3             Drama, History   
4     Documentary, Biography   
6  Action, Thriller, Western   
7    Drama, Horror, Thriller   

                                              Actors  
2  Terry Chen, Greg Kinnear, Natasha Mumba, Fionn...  
3  Paolo Pierobon, Fausto Russo Alesi, Barbara Ro...  
4  Chelly Wilson, Bondi Walters, Don Walters, Pau...  
6  Bruce Greenwood, Adam Beach, Wes Studi, Gísli ...  
7  Ry Barrett, Andrea Pavlovic, Cameron Love, Ree...  


In [44]:
Message = """

Welcome to the choice movies 3000

To analyze will be your next movie to watch on the cinema

First, answer

Will you go with someone under 15 years?

1) Yes
2) No
3) Exit
"""


message_year = """

  There are so many movies incoming, please select the year of your preference

  1) 2022
  2) 2023
  3) 2024
  4) 2025
  5) 2026

  """





print(Message)
option = int(input("Please select an option: "))
if option == 1:
        print("""Some of the suggestions for you and the child are:
                  Animation
                  Adventure
                  Fantasy
                  Sci-fi
                  Familiar
              """)
        generos_busqueda = ['Animation', 'Adventure', 'Fantasy', 'Sci-fi', 'Familiar']
        print(message_year)
        option1 = int(input("Select Option:"))
        if option1 == 1:

            year = 2022

            # Utiliza la función any() para verificar si alguna de las palabras clave está presente en el género
            # También filtra por el año especificado
            filtered_movies_df = movies_df[(movies_df['Genre'].apply(lambda x: any(gen in x for gen in generos_busqueda))) &
                                            (movies_df['Year'] == year)]
            print(filtered_movies_df.head(5))
        elif option1 == 2:
            year = 2023

            # Utiliza la función any() para verificar si alguna de las palabras clave está presente en el género
            # También filtra por el año especificado
            filtered_movies_df = movies_df[(movies_df['Genre'].apply(lambda x: any(gen in x for gen in generos_busqueda))) &
                                            (movies_df['Year'] == year)]
            print(filtered_movies_df.head(5))
        elif option1 == 3:
            year = 2024

            # Utiliza la función any() para verificar si alguna de las palabras clave está presente en el género
            # También filtra por el año especificado
            filtered_movies_df = movies_df[(movies_df['Genre'].apply(lambda x: any(gen in x for gen in generos_busqueda))) &
                                            (movies_df['Year'] == year)]
            print(filtered_movies_df.head(5))
        elif option1 == 4:
            year = 2025

            # Utiliza la función any() para verificar si alguna de las palabras clave está presente en el género
            # También filtra por el año especificado
            filtered_movies_df = movies_df[(movies_df['Genre'].apply(lambda x: any(gen in x for gen in generos_busqueda))) &
                                            (movies_df['Year'] == year)]
            print(filtered_movies_df.head(5))
        elif option1 == 5:
            year = 2025

            # Utiliza la función any() para verificar si alguna de las palabras clave está presente en el género
            # También filtra por el año especificado
            filtered_movies_df = movies_df[(movies_df['Genre'].apply(lambda x: any(gen in x for gen in generos_busqueda))) &
                                            (movies_df['Year'] == year)]
            print(filtered_movies_df.head(5))
elif option == 2:
        print("""Some of the suggestions for you and your family are:
                  Thriller
                  Drama
                  Terror
                  Black Comedy
                  Documental
                  Biography
              """)
        generos_busqueda = ['Thriller', 'Drama', 'Terror', 'Black Humor', 'Documental', 'Biography']
        print(message_year)
        option2 = int(input("Select Option:"))
        if option2 == 1:

            year = 2022

            # Utiliza la función any() para verificar si alguna de las palabras clave está presente en el género
            # También filtra por el año especificado
            filtered_movies_df = movies_df[(movies_df['Genre'].apply(lambda x: any(gen in x for gen in generos_busqueda))) &
                                            (movies_df['Year'] == year)]
            print(filtered_movies_df.head(5))
        elif option2 == 2:
            year = 2023

            # Utiliza la función any() para verificar si alguna de las palabras clave está presente en el género
            # También filtra por el año especificado
            filtered_movies_df = movies_df[(movies_df['Genre'].apply(lambda x: any(gen in x for gen in generos_busqueda))) &
                                            (movies_df['Year'] == year)]
            print(filtered_movies_df.head(5))

        elif option2 == 3:
            year = 2024

            # Utiliza la función any() para verificar si alguna de las palabras clave está presente en el género
            # También filtra por el año especificado
            filtered_movies_df = movies_df[(movies_df['Genre'].apply(lambda x: any(gen in x for gen in generos_busqueda))) &
                                            (movies_df['Year'] == year)]
            print(filtered_movies_df.head(5))

        elif option2 == 4:
            year = 2025

            # Utiliza la función any() para verificar si alguna de las palabras clave está presente en el género
            # También filtra por el año especificado
            filtered_movies_df = movies_df[(movies_df['Genre'].apply(lambda x: any(gen in x for gen in generos_busqueda))) &
                                            (movies_df['Year'] == year)]
            print(filtered_movies_df.head(5))
        elif option2 == 5:
            year = 2025

            # Utiliza la función any() para verificar si alguna de las palabras clave está presente en el género
            # También filtra por el año especificado
            filtered_movies_df = movies_df[(movies_df['Genre'].apply(lambda x: any(gen in x for gen in generos_busqueda))) &
                                            (movies_df['Year'] == year)]
            print(filtered_movies_df.head(5))

elif option == 3:
        print("Have a nice day")



Welcome to the choice movies 3000

To analyze will be your next movie to watch on the cinema

First, answer

Will you go with someone under 15 years?

1) Yes
2) No
3) Exit

Please select an option: 1
Some of the suggestions for you and the child are:
                  Animation
                  Adventure
                  Fantasy
                  Sci-fi
                  Familiar
              


  There are so many movies incoming, please select the year of your preference

  1) 2022
  2) 2023
  3) 2024
  4) 2025
  5) 2026

  
Select Option:3
Empty DataFrame
Columns: [Title, Year, Genre, Actors]
Index: []
