# Making my movie and TV show recommendations site

#### Step 0: Import the packages needed to scrape my IMDb lists

In [98]:
import pandas as pd
from bs4 import BeautifulSoup
import requests

### Step 1: Scrape Movies

In [48]:
# Load the locally saved copy of the movies list page.
# The context manager closes the file handle even if reading fails; the
# explicit encoding assumes the saved page is UTF-8 (TODO confirm) so that
# non-ASCII titles are decoded the same way on every platform.
with open("movies_source.html", "r", encoding="utf-8") as HTMLFileToBeOpened:
    contents = HTMLFileToBeOpened.read()
# Parse the markup with the lxml parser
SoupText = BeautifulSoup(contents, 'lxml')

In [133]:
# URL of the movies list page to fetch
m_url = "https://www.imdb.com/list/ls521504128/?ref_=otl_2"

# Fetch the page. requests applies no timeout by default, so without one this
# cell could hang indefinitely if the server never answers.
response = requests.get(m_url, timeout=30)

if response.status_code == 200:
    # Raw HTML of the page
    movies_contents = response.text

    # Parse the markup with the lxml parser; the following cells query SoupText
    SoupText = BeautifulSoup(movies_contents, 'lxml')
else:
    # SoupText is not reassigned on failure, so it keeps whatever value an
    # earlier cell gave it (if any).
    print("Failed to retrieve the webpage. Status code:", response.status_code)


In [134]:
# First <div class="lister-list"> in the page; the scraping cells below search inside it
bigsoup = SoupText.find('div', class_='lister-list')

In [135]:
# Titles: text of the first link inside every <h3 class="lister-item-header">
titles = [
    header.a.get_text(strip=True)
    for header in bigsoup.find_all('h3', class_='lister-item-header')
]

In [136]:
# Release years: stripped text of every <span class="lister-item-year">
year_out = [
    year_span.get_text(strip=True)
    for year_span in bigsoup.find_all('span', class_='lister-item-year')
]


In [137]:
# Film durations: stripped text of every <span class="runtime">
runtime = [
    runtime_span.get_text(strip=True)
    for runtime_span in bigsoup.find_all('span', class_='runtime')
]

In [138]:
# Descriptions: stripped text of the <p> tags matched by class_=''.
# NOTE(review): the empty class filter is kept exactly as written; it appears
# intended to select paragraphs that carry no class attribute — confirm.
description = [
    paragraph.get_text(strip=True)
    for paragraph in bigsoup.find_all('p', class_='')
]

In [139]:
# Genres: stripped text of every <span class="genre">
genre = [
    genre_span.get_text(strip=True)
    for genre_span in bigsoup.find_all('span', class_='genre')
]

In [140]:
# Director names: text of the first link in every <p class="text-muted">;
# paragraphs that contain no link are skipped.
# get_text() replaces .string.strip() because .string is None whenever the
# anchor's content is not a single string, and None.strip() would raise
# AttributeError. It also matches how the other cells extract text.
directors = [
    paragraph.a.get_text(strip=True)
    for paragraph in bigsoup.find_all('p', class_='text-muted')
    if paragraph.a
]


In [141]:
# Links to "/name/..." pages found inside elements that carry both the
# text-muted and text-small classes
star_elements = bigsoup.select('.text-muted.text-small a[href^="/name/"]')

# Names: text of each matched link. get_text() is used rather than
# .string.strip() because .string is None when a link's content is not a
# single string, which would raise AttributeError.
stars = [star.get_text(strip=True) for star in star_elements]

In [142]:
# Image references: the 'loadlate' attribute of every <img class="loadlate">
# (presumably the image URL — confirm); missing or empty values are dropped.
loadlate_values = (
    img.get('loadlate') for img in bigsoup.find_all('img', class_='loadlate')
)
images = [value for value in loadlate_values if value]

In [143]:
# Gather the scraped lists as named columns; keyword order sets column order.
# NOTE(review): "directors" was commented out of this mapping in the original
# and is still left out — confirm whether it should be included.
data = dict(
    titles=titles,
    year_released=year_out,
    runtime=runtime,
    description=description,
    genres=genre,
    images=images,
)

# Build the movies table; pandas needs all column lists to be the same length
movies_df = pd.DataFrame(data)

In [None]:
# Replace the Arabic-script title in row 0 of movies_df with its English name.
# A single .loc[row, column] assignment writes to movies_df itself; the chained
# form movies_df['titles'][0] = ... may write to a temporary copy
# (SettingWithCopyWarning) and has no effect under pandas copy-on-write.
movies_df.loc[0, 'titles'] = movies_df.loc[0, 'titles'].replace('بين النجوم', 'Interstellar')


#### Save my DataFrame to a CSV file

In [144]:
import os

# Folder (relative to the current working directory) that receives the CSV
folder_path = 'csv'

# exist_ok=True creates the folder only when it is missing and avoids the
# check-then-create race of a separate os.path.exists() test
os.makedirs(folder_path, exist_ok=True)

# Full path of the output file
csv_file_path = os.path.join(folder_path, 'movies.csv')

# index=False keeps the DataFrame index out of the CSV file
movies_df.to_csv(csv_file_path, index=False)


In [113]:
# URL of the series list page to fetch
url = "https://www.imdb.com/list/ls521599999/"

# Fetch the page. requests applies no timeout by default, so without one this
# cell could hang indefinitely if the server never answers.
response = requests.get(url, timeout=30)

if response.status_code == 200:
    # Raw HTML of the page
    series_contents = response.text

    # Parse the markup with the lxml parser
    series_bigsoup = BeautifulSoup(series_contents, 'lxml')
else:
    # series_bigsoup is not reassigned on failure, so it keeps whatever value
    # another cell gave it (if any).
    print("Failed to retrieve the webpage. Status code:", response.status_code)


### Step 2: Scrape the TV show list on IMDb

In [99]:
# Load the locally saved copy of the series list page.
# The context manager closes the file handle even if reading fails; the
# explicit encoding assumes the saved page is UTF-8 (TODO confirm) so that
# non-ASCII titles are decoded the same way on every platform.
# NOTE(review): the path is absolute and machine-specific.
with open("/Users/joaquinortega/code/joaquin-ortega84/video-recs/project/data/series_source.html", "r", encoding="utf-8") as series_html:
    series_contents = series_html.read()
# Parse the markup with the lxml parser
series_bigsoup = BeautifulSoup(series_contents, 'lxml')

In [114]:
# First <div class="lister-list"> in the page; the scraping cells below search inside it
series_text = series_bigsoup.find('div', class_='lister-list')

In [115]:
# Titles: text of the first link inside every <h3 class="lister-item-header">
s_titles = [
    header.a.get_text(strip=True)
    for header in series_text.find_all('h3', class_='lister-item-header')
]

In [116]:
# Release years: stripped text of every <span class="lister-item-year">
s_year_out = [
    year_span.get_text(strip=True)
    for year_span in series_text.find_all('span', class_='lister-item-year')
]


In [117]:
# Durations: stripped text of every <span class="runtime">
s_runtime = [
    runtime_span.get_text(strip=True)
    for runtime_span in series_text.find_all('span', class_='runtime')
]

In [118]:
# Descriptions: stripped text of the <p> tags matched by class_=''.
# NOTE(review): the empty class filter is kept exactly as written; it appears
# intended to select paragraphs that carry no class attribute — confirm.
s_description = [
    paragraph.get_text(strip=True)
    for paragraph in series_text.find_all('p', class_='')
]

In [119]:
# Genres: stripped text of every <span class="genre">
s_genre = [
    genre_span.get_text(strip=True)
    for genre_span in series_text.find_all('span', class_='genre')
]

In [120]:
# Director names: text of the first link in every <p class="text-muted">;
# paragraphs that contain no link are skipped.
# get_text() replaces .string.strip() because .string is None whenever the
# anchor's content is not a single string, and None.strip() would raise
# AttributeError. It also matches how the other cells extract text.
s_directors = [
    paragraph.a.get_text(strip=True)
    for paragraph in series_text.find_all('p', class_='text-muted')
    if paragraph.a
]


In [121]:
# Links to "/name/..." pages found inside elements that carry both the
# text-muted and text-small classes
s_star_elements = series_text.select('.text-muted.text-small a[href^="/name/"]')

# Names: text of each matched link. get_text() is used rather than
# .string.strip() because .string is None when a link's content is not a
# single string, which would raise AttributeError.
s_stars = [star.get_text(strip=True) for star in s_star_elements]

In [122]:
# Image references: the 'loadlate' attribute of every <img class="loadlate">
# (presumably the image URL — confirm); missing or empty values are dropped.
s_loadlate_values = (
    img.get('loadlate') for img in series_text.find_all('img', class_='loadlate')
)
s_images = [value for value in s_loadlate_values if value]

In [123]:
# Gather the scraped lists as named columns; keyword order sets column order.
# NOTE(review): a "directors" entry was commented out of this mapping in the
# original and is still left out — confirm whether it should be included.
s_data = dict(
    titles=s_titles,
    year_released=s_year_out,
    runtime=s_runtime,
    description=s_description,
    genres=s_genre,
    images=s_images,
)

# Build the series table; pandas needs all column lists to be the same length
series_df = pd.DataFrame(s_data)

In [130]:
# Replace the Arabic-script title in row 13 of series_df with its English name.
# A single .loc[row, column] assignment writes to series_df itself; the chained
# form series_df['titles'][13] = ... may write to a temporary copy
# (SettingWithCopyWarning) and has no effect under pandas copy-on-write.
series_df.loc[13, 'titles'] = series_df.loc[13, 'titles'].replace('صراع العروش', 'Game of Thrones')


#### Save my DataFrame to a CSV file

In [132]:
import os

# Folder that receives the CSV file
# NOTE(review): the path is absolute and machine-specific.
folder_path = '/Users/joaquinortega/code/joaquin-ortega84/video-recs/project/data/csv'

# exist_ok=True creates the folder only when it is missing and avoids the
# check-then-create race of a separate os.path.exists() test
os.makedirs(folder_path, exist_ok=True)

# Full path of the output file
csv_file_path = os.path.join(folder_path, 'series.csv')

# index=False keeps the DataFrame index out of the CSV file
series_df.to_csv(csv_file_path, index=False)
