Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
80 changes: 51 additions & 29 deletions Movie Scraper/scrape.py
Original file line number Diff line number Diff line change
@@ -1,65 +1,84 @@
from bs4 import BeautifulSoup
import requests
from fake_useragent import UserAgent

def requestUrl_and_bs4(url:str):
# All request and parser goes through here
agents = UserAgent().random
user_agent={"User-Agent": agents}

def getMovieDetails(movieName):
# Fixed the requests (changed status code from 403 to 200)
html = requests.get(url, headers=user_agent)
soup = BeautifulSoup(html.text, 'html.parser')

return soup


def getMovieDetails(movieName:str):
url = 'https://www.imdb.com'
query = '/search/title?title='
movieDetails = {}
movienamequery = query+'+'.join(movieName.strip().split(' '))
website_url = url+movienamequery+'&title_type=feature'

html = requests.get(url+movienamequery+'&title_type=feature')
bs = BeautifulSoup(html.text, 'html.parser')
result = bs.find('h3', {'class': 'lister-item-header'})
bs = requestUrl_and_bs4(website_url)

result = bs.find('a', {'class': 'ipc-title-link-wrapper'})
if result is None:
return None

movielink = url+result.a.attrs['href']
movieDetails['name'] = result.a.text
movielink = url+result.attrs['href']

html = requests.get(movielink)
bs = BeautifulSoup(html.text, 'html.parser')
bs = requestUrl_and_bs4(movielink)

# Fix the movie name
movieDetails['name'] = bs.find('h1', {'data-testid': 'hero__pageTitle'}).text

# Fix year, runtime
box_one = bs.find('div', {'class': 'sc-b7c53eda-0 dUpRPQ'}).ul
box = box_one.find_all('li')
try:
movieDetails['year'] = bs.find('span', {'id': 'titleYear'}).a.text
movieDetails['year'] = box[0].text
except AttributeError:
movieDetails['year'] = 'Not available'
subtext = bs.find('div', {'class': 'subtext'})

# Fix genres
box_two = bs.find('div', {'data-testid': "genres"})
movieDetails['genres'] = [
i.text for i in subtext.findAll('a', {'title': None})]
i.text for i in box_two.select('div.ipc-chip-list__scroller>a>span')]

# Fix ratings
try:
movieDetails['rating'] = bs.find(
'div', {'class': 'ratingValue'}).span.text
movieDetails['runtime'] = subtext.time.text.strip()
movieDetails['rating'] = f"{bs.find(
'div', {'data-testid': 'hero-rating-bar__aggregate-rating__score'}).span.text}/10 ({bs.find('div', {'class': 'sc-bde20123-3 gPVQxL'}).text})"
movieDetails['runtime'] = box[2].text.strip()
except AttributeError:
movieDetails['rating'] = 'Not yet rated'
movieDetails['runtime'] = 'Not available'
movieDetails['release_date'] = subtext.find(
'a', {'title': 'See more release dates'}).text.strip()

creditSummary = bs.findAll('div', {'class': 'credit_summary_item'})
# To get movie release date
movie_release_dates_url= f"{url}{box[0].a.attrs['href']}"
soup = requestUrl_and_bs4(movie_release_dates_url)

movieDetails['release_date'] = soup.select_one('#rel_1 > div > ul > li > span.ipc-metadata-list-item__list-content-item').text

movieDetails['directors'] = [i.text for i in creditSummary[0].findAll('a')]
movieDetails['writers'] = [i.text for i in creditSummary[1].findAll(
'a') if 'name' in i.attrs['href']]
creditSummary = bs.select('div.ipc-metadata-list-item__content-container > ul')

movieDetails['directors'] = [i.text for i in creditSummary[0].select('li>a')]

try:
movieDetails['cast'] = [i.text for i in creditSummary[2].findAll(
'a') if 'name' in i.attrs['href']]
movieDetails['cast'] = [i.text for i in creditSummary[2].select('li>a')]
movieDetails['writers'] = [i.text for i in creditSummary[1].select('li>a')]

except IndexError:
movieDetails['cast']=movieDetails['writers']
movieDetails['writers']='Not found'
html = requests.get(movielink+'plotsummary')
bs = BeautifulSoup(html.text, 'html.parser')

movieDetails['plot'] = bs.find(
'li', {'class': 'ipl-zebra-list__item'}).p.text.strip()

movieDetails['plot'] = bs.find('span', {'data-testid': 'plot-l'}).text.strip()

return movieDetails


if __name__ == "__main__":
def main():
movieName = input('Enter the movie name : \n')
movieDetails = getMovieDetails(movieName)
if movieDetails is None:
Expand All @@ -75,3 +94,6 @@ def getMovieDetails(movieName):
print('Writer:', ', '.join(movieDetails['writers']))
print('Cast:', ', '.join(movieDetails['cast']))
print('Plot Summary:\n', movieDetails['plot'])

if __name__ == "__main__":
main()