<a href="https://colab.research.google.com/github/kebab27/Programming/blob/main/IMDB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
import requests
from bs4 import BeautifulSoup
def scrape_movie_data():
    """Scrape title, release year and rating from IMDb's top-rated chart page.

    Returns:
        A list of ``{'title': str, 'year': int, 'rating': float}`` dicts, one
        per chart row found, or ``None`` when the page could not be retrieved
        (network error or a non-200 response).
    """
    base_url = "https://www.imdb.com/chart/top"
    # NOTE(review): IMDb appears to refuse the default python-requests
    # User-Agent (HTTP 403), so send a browser-like one -- confirm.
    headers = {"User-Agent": "Mozilla/5.0"}

    try:
        # Without a timeout a stalled connection would block this call forever.
        response = requests.get(base_url, headers=headers, timeout=10)
    except requests.RequestException:
        # Treat connection/timeout errors like any other failed retrieval.
        print("Failed to retrieve data from IMDb.")
        return None

    if response.status_code != 200:
        print("Failed to retrieve data from IMDb.")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')
    movie_data = []
    # NOTE(review): these selectors target the table-based chart layout
    # (td.titleColumn / td.ratingColumn); if IMDb serves a different layout
    # the loop finds nothing and an empty list is returned -- verify.
    for movie in soup.select("td.titleColumn"):
        title = movie.find("a").get_text()
        # The year is rendered wrapped in parentheses, e.g. "(1994)"; strip them.
        year = int(movie.find("span").get_text()[1:-1])
        # The rating lives in the next rating cell after the title cell.
        rating_cell = movie.find_next("td", class_="ratingColumn imdbRating")
        rating = float(rating_cell.strong.get_text())
        movie_data.append({'title': title, 'year': year, 'rating': rating})
    return movie_data

      movies = scrape_movie_data()

      #Data Preprocessing and Feature Extraction

      import pandas as pd
def preprocess_and_extract_features(movie_data):
    """Build a DataFrame from scraped rows and attach placeholder features.

    The ``genres`` and ``runtime`` columns are filled with PLACEHOLDER sample
    values, repeated cyclically so that they always match the number of rows.
    They are not real per-movie data and should be replaced by values fetched
    from an actual data source.

    Args:
        movie_data: Iterable of row dicts (as produced by the scraper), or
            ``None``/empty, in which case an empty frame is returned.

    Returns:
        A new ``pandas.DataFrame`` with the original columns plus ``genres``
        (str) and ``runtime`` (int).
    """
    # Create a DataFrame from the scraped data
    df = pd.DataFrame(movie_data)

    # Sample values only. The previous literal lists ended in `...` and had a
    # fixed length, which raised a length-mismatch ValueError for real data.
    placeholder_genres = (
        'Action, Adventure',
        'Drama',
        'Action, Adventure, Sci-Fi',
        'Crime, Drama, Thriller',
    )
    placeholder_runtimes = (142, 195, 148, 175)

    row_count = len(df)
    df['genres'] = [
        placeholder_genres[i % len(placeholder_genres)] for i in range(row_count)
    ]
    df['runtime'] = [
        placeholder_runtimes[i % len(placeholder_runtimes)] for i in range(row_count)
    ]

    return df

# Turn the scraped rows into a DataFrame with the extra genres/runtime columns.
# `movies` is whatever scrape_movie_data() returned, which is None on failure.
movies_df = preprocess_and_extract_features(movies)

#Loading Data into SQLite Database

import sqlite3

def create_database(database_name):
    """Create the ``movies`` table in the given SQLite database if it is absent.

    Args:
        database_name: Path of the SQLite database file to open.

    The statement uses ``CREATE TABLE IF NOT EXISTS``, so calling this more
    than once on the same database is harmless.
    """
    conn = sqlite3.connect(database_name)
    try:
        cursor = conn.cursor()

        cursor.execute('''
            CREATE TABLE IF NOT EXISTS movies (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                title TEXT NOT NULL,
                year INTEGER NOT NULL,
                rating REAL NOT NULL,
                genres TEXT NOT NULL,
                runtime INTEGER NOT NULL
            );
        ''')
        conn.commit()
    finally:
        # Always release the connection, even if the DDL statement fails.
        conn.close()

def insert_data_to_database(dataframe, database_name):
    """Append the rows of ``dataframe`` to the ``movies`` table.

    Args:
        dataframe: ``pandas.DataFrame`` whose columns map onto the table's
            columns; the DataFrame index is not written.
        database_name: Path of the SQLite database file to open.

    Rows are appended (``if_exists='append'``), so calling this repeatedly
    with the same frame stores the same rows again.
    """
    conn = sqlite3.connect(database_name)
    try:
        dataframe.to_sql('movies', conn, if_exists='append', index=False)
        conn.commit()
    finally:
        # Always release the connection, even if the insert fails.
        conn.close()

# Path of the SQLite database file shared by the steps below.
database_name = 'movie_database.db'
# Make sure the movies table exists, then append the prepared rows to it.
# NOTE: the insert uses if_exists='append', so re-running this cell adds the
# same movies to the table again.
create_database(database_name)
insert_data_to_database(movies_df, database_name)

#Data Analysis and Answering Questions

def perform_data_analysis(database_name):
    """Return the ten highest-rated movies stored in the database.

    Args:
        database_name: Path of the SQLite database file to query.

    Returns:
        A ``pandas.DataFrame`` with columns ``title``, ``year`` and ``rating``,
        ordered by rating descending and limited to 10 rows.
    """
    query = '''
        SELECT title, year, rating
        FROM movies
        ORDER BY rating DESC
        LIMIT 10;
    '''
    conn = sqlite3.connect(database_name)
    try:
        top_rated_movies = pd.read_sql_query(query, conn)
    finally:
        # Always release the connection, even if the query fails
        # (for example when the movies table does not exist).
        conn.close()

    return top_rated_movies


# Query the ten highest-rated movies from the database and display them.
top_10_movies = perform_data_analysis(database_name)
print(top_10_movies)



IndentationError: ignored

This data pipeline project analyzes movie data. It uses web scraping to acquire movie information from a movie database (IMDb), preprocesses the data, extracts relevant features, and stores the result in a SQLite database. It then performs some data analysis on the dataset to answer questions of interest about the movies.

Python libraries used in this project:

Requests: To make HTTP requests to IMDb website for web scraping.
BeautifulSoup: For parsing the HTML content and extracting relevant data from web pages.
Pandas: For data manipulation and analysis.
SQLite3: For creating and managing the database.
Assumptions:

We are interested in movies released up to the knowledge cutoff date (September 2021).
We will focus on data related to movie titles, release years, IMDb ratings, genres, and runtime.


In this pipeline, we acquired movie data by scraping IMDb's top-rated movies, added the genres and runtime features using placeholder data (to be replaced with real API calls in practice), and loaded the result into a SQLite database. Finally, we ran a query to retrieve the 10 highest-rated movies.