## Task:

1) In this project your task is to do web scrapping for the below link provided, and store the data in the CSV file.

2) After scrapping the data You are required to load the data in pandas and do some EDA and provide some useful graphical information.

Link to scrap data: https://www.themoviedb.org/movie

### Data

1. Name
2. Rating
3. Genre
4. Release date
5. Runtime
6. Director
7. Url

# Task 1 : Scraping

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
url = 'https://www.themoviedb.org'

In [3]:
header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36'}

In [4]:
response = requests.get(url+'/movie', headers = header)

In [5]:
response.status_code

200

# Page Urls

In [6]:
urls = []
for i in range(1,501):
    urls.append(url+'/movie?page='+str(i))

In [7]:
len(urls)

500

In [8]:
urls[0]

'https://www.themoviedb.org/movie?page=1'

In [9]:
url_response = requests.get(urls[0], headers=header)
url_html = url_response.text
url_soup = BeautifulSoup(url_html, 'lxml')

In [10]:
movies_links = []
for url_link in urls:
    url_response = requests.get(url_link, headers=header)
    url_html = url_response.text
    url_soup = BeautifulSoup(url_html, 'lxml')
    
    for link in url_soup.find('div', class_='page_wrapper').find_all('h2'):
        movies_links.append(url+link.a['href'])

In [11]:
len(movies_links)

10000

## Sample Movie

In [12]:
movies_links[0]

'https://www.themoviedb.org/movie/634649'

In [13]:
response2 = requests.get(movies_links[0], headers=header)

In [14]:
response2.status_code

200

In [15]:
response_html2 = response2.text

In [16]:
soup2 = BeautifulSoup(response_html2, 'lxml')

### Name

In [17]:
soup2.find('h2').a.text

'Spider-Man: No Way Home'

### Rating

In [18]:
soup2.find('div', class_='percent').span['class'][-1].replace('icon-r','')

'83'

### Genre

In [19]:
soup2.find('span', class_='genres').text.strip().replace('\xa0','')

'Action,Adventure,Science Fiction'

### Release date

In [20]:
soup2.find('span', class_='release').text.strip().split()[0]

'12/17/2021'

### Runtime

In [21]:
try:
    r = soup2.find('span', class_='runtime').text.strip()
except:
    r = 0
print(r)

2h 28m


### Director

In [22]:
for profile in soup2.find_all('li', class_='profile'):
    if 'Director' in profile.text:
        director = profile.a.text
director

'Jon Watts'

### Url

In [23]:
movies_links[0]

'https://www.themoviedb.org/movie/634649'

# Final Code

In [33]:
movie_list = []
def movies_scrape():
    url = 'https://www.themoviedb.org'
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36'}
    page_response = requests.get(url+'/movie', headers = header)
    page_html = page_response.text
    page_soup = BeautifulSoup(page_html, 'lxml')
    
    c = 0
    for link in movies_links:
        movie_url = link
        print(c,end = ' ')
        c+=1
        movie_response = requests.get(movie_url, headers = header)
        movie_html = movie_response.text
        movie_soup = BeautifulSoup(movie_html, 'lxml')
        
        name = movie_soup.find('h2').a.text #name

        rating = movie_soup.find('div', class_='percent').span['class'][-1].replace('icon-r','') #rating
        
        genre = movie_soup.find('span', class_='genres').text.strip().replace('\xa0','') #genre
        
        release_date = movie_soup.find('span', class_='release').text.strip().split()[0] #release_date
        
        try:
            runtime = movie_soup.find('span', class_='runtime').text.strip() #runtime
        except:
            runtime = 0
            
        for profile in movie_soup.find_all('li', class_='profile'): #director
            if 'Director' in profile.text:
                director = profile.a.text
        
        movie_dict = {
            'Name':name,
            'Rating':rating,
            'Genre':genre,
            'Release Date':release_date,
            'Runtime':runtime,
            'Url':movie_url
        }
        movie_list.append(movie_dict)

In [38]:
# movies_scrape()

In [39]:
len(movie_list)

10000

In [40]:
movies_df = pd.DataFrame(movie_list)

In [41]:
movies_df

Unnamed: 0,Name,Rating,Genre,Release Date,Runtime,Url
0,Spider-Man: No Way Home,83,"Action,Adventure,Science Fiction",12/17/2021,2h 28m,https://www.themoviedb.org/movie/634649
1,Turning Red,75,"Animation,Family,Comedy,Fantasy",03/10/2022,1h 40m,https://www.themoviedb.org/movie/508947
2,Blacklight,55,"Action,Thriller",02/10/2022,1h 44m,https://www.themoviedb.org/movie/823625
3,The Batman,80,"Crime,Mystery,Thriller",03/04/2022,2h 56m,https://www.themoviedb.org/movie/414906
4,Encanto,77,"Animation,Comedy,Family,Fantasy",11/24/2021,1h 42m,https://www.themoviedb.org/movie/568124
...,...,...,...,...,...,...
9995,Flyboys,65,"Action,Adventure,Drama,History,Romance,War",09/22/2006,2h 18m,https://www.themoviedb.org/movie/9664
9996,K-PAX,72,"Science Fiction,Drama",10/22/2001,2h,https://www.themoviedb.org/movie/167
9997,Seberg,60,"Thriller,Drama",01/10/2020,1h 43m,https://www.themoviedb.org/movie/510298
9998,How to Steal a Million,75,"Comedy,Crime",07/13/1966,2h 3m,https://www.themoviedb.org/movie/3001


### Save in csv file

In [44]:
movies_df.to_csv('movies_data.csv', index=None)