# WEB SCRAPING

In [19]:
from bs4 import BeautifulSoup
import requests
import numpy as np
import pandas as pd
from requests import get
from time import sleep
from random import randint
from warnings import warn
import matplotlib.pyplot as plt

import pymongo as pym      # Interface with Python <--> MongoDB 
import os                  # find files on system
import csv                 
import json                # convert file to json format


In [3]:
# Offsets fed to the `start=` query parameter of the search URL.
# NOTE(review): with stop=20 and step=50 this range contains a single value, [1],
# so only one results page is requested — raise the stop value to scrape more pages.
first_offset, stop_offset, offset_step = 1, 20, 50
pages = np.arange(first_offset, stop_offset, offset_step)

# Ask for English content (the original author observed Mandarin pages without it).
# This dict only has an effect when it is passed as `headers=` to the request call.
headers = dict([('Accept-Language', 'en-US,en;q=0.8')])

In [4]:
# Accumulators for the scraped fields: one independent, initially empty list per column.
titles, years, ratings, genres = [], [], [], []
runtimes, imdb_ratings, metascores, votes = [], [], [], []

In [14]:
def _span_text(parent, css_class):
    """Return the text of the first <span class=css_class> under `parent`, or None if absent."""
    if parent is None:
        return None
    span = parent.find('span', class_=css_class)
    return span.text if span is not None else None


for page in pages:

    # get request; `headers` carries Accept-Language so the page is served in English,
    # and the timeout keeps a stalled connection from hanging the notebook forever.
    response = get("https://www.imdb.com/search/title?genres=sci-fi&"
                   + "start="
                   + str(page)
                   + "&explore=title_type,genres&ref_=adv_prv",
                   headers=headers,
                   timeout=30)

    # random pause between requests so the server is not hit in a tight loop
    sleep(randint(8, 15))

    # warn about (and skip) responses that are not 200: there is nothing useful to parse
    if response.status_code != 200:
        warn('Request: {}; Status code: {}'.format(page, response.status_code))
        continue

    # parse the content of the current response
    page_html = BeautifulSoup(response.text, 'html.parser')

    movie_containers = page_html.find_all('div', class_='lister-item mode-advanced')

    # extract the movies listed on that page
    for container in movie_containers:

        # only movies that have a metascore are kept
        if container.find('div', class_='ratings-metascore') is None:
            continue

        # Read every field BEFORE appending anything: if one extraction fails, no list
        # has been touched yet, so the eight lists always keep the same length.
        title = container.h3.a.text
        # Optional spans (year, certificate, genre, runtime) yield None when missing
        # instead of raising AttributeError on `.text`.
        year = _span_text(container.h3, 'lister-item-year text-muted unbold')
        rating = _span_text(container.p, 'certificate')
        genre = _span_text(container.p, 'genre')
        runtime = _span_text(container.p, 'runtime')
        imdb = float(container.strong.text)
        m_score = int(container.find('span', class_='metascore').text)
        vote = int(container.find('span', attrs={'name': 'nv'})['data-value'])

        titles.append(title)
        years.append(year)
        ratings.append(rating)
        genres.append(genre)
        runtimes.append(runtime)
        imdb_ratings.append(imdb)
        metascores.append(m_score)
        votes.append(vote)

In [15]:
# Assemble the scraped lists into one table, one row per movie.
sci_fi_df = pd.DataFrame({'movie': titles,
                          'year': years,
                          'rating': ratings,
                          'genre': genres,
                          'runtime_min': runtimes,
                          'imdb': imdb_ratings,
                          'metascore': metascores,
                          'votes': votes})

# Keep the four characters just before the closing parenthesis of the scraped year label.
sci_fi_df['year'] = sci_fi_df['year'].str[-5:-1]

# IMDb rating multiplied by 10.
sci_fi_df['n_imdb'] = sci_fi_df['imdb'] * 10

# Some year labels do not end in a four-digit year, so the slice above leaves text
# such as 'ovie'. Coerce to numbers and drop every row whose year is not numeric,
# instead of filtering the single literal 'ovie'.
year_as_number = pd.to_numeric(sci_fi_df['year'], errors='coerce')
has_valid_year = year_as_number.notna()

# .copy() makes final_df an independent frame (no SettingWithCopyWarning), and plain
# column assignment replaces the column so its dtype really becomes integer.
final_df = sci_fi_df.loc[has_valid_year].copy()
final_df['year'] = year_as_number[has_valid_year].astype(int)

In [16]:
# Show the assembled table as the cell output.
# NOTE(review): this displays the whole frame; consider sci_fi_df.head() for a preview.
sci_fi_df

Unnamed: 0,movie,year,rating,genre,runtime_min,imdb,metascore,votes,n_imdb
0,Black Panther: Wakanda Forever,2022,PG-13,"\nAction, Adventure, Drama",161 min,7.3,67,109101,73.0
1,Strange World,2022,PG,"\nAnimation, Action, Adventure",102 min,4.8,65,8133,48.0
2,Black Adam,2022,PG-13,"\nAction, Adventure, Fantasy",125 min,6.7,41,151992,67.0
3,Everything Everywhere All at Once,2022,R,"\nAction, Adventure, Comedy",139 min,8.1,81,245790,81.0
4,Avatar,2009,PG-13,"\nAction, Adventure, Fantasy",162 min,7.8,83,1249905,78.0
5,Nope,2022,R,"\nHorror, Mystery, Sci-Fi",130 min,6.9,77,167335,69.0
6,Thor: Love and Thunder,2022,PG-13,"\nAction, Adventure, Comedy",118 min,6.3,57,317085,63.0
7,Black Panther,2018,PG-13,"\nAction, Adventure, Sci-Fi",134 min,7.3,88,768431,73.0
8,Rogue One,2016,PG-13,"\nAction, Adventure, Sci-Fi",133 min,7.8,65,636659,78.0
9,Guardians of the Galaxy,2014,PG-13,"\nAction, Adventure, Comedy",121 min,8.0,76,1175561,80.0


In [20]:
# Destination of the CSV export. The default is the original author's local folder;
# set the SCI_FI_CSV_PATH environment variable to write elsewhere without editing the cell.
csv_path = os.environ.get(
    'SCI_FI_CSV_PATH',
    r'C:\Users\ayari\OneDrive\Bureau\C-ISET\L3\Atelier_BaseDonnéeAvancé\MongoNotebook\DataSet.CSV',
)
# The row index is written too (pandas default), so the file starts with an unnamed column.
sci_fi_df.to_csv(csv_path)

In [21]:
# Convert the table to a list of per-row dicts (column name -> value) to send to MongoDB.
sci_fi_df_dict = sci_fi_df.to_dict(orient='records')

In [23]:
# Connection settings for the MongoDB instance, database and collection used below.
mongo_uri = 'mongodb://localhost:27017/'
database_name = "sci_fi_df"
collection_name = "films"

# Client for the MongoDB server at mongo_uri.
client = pym.MongoClient(mongo_uri)

# Database handle.
db = client[database_name]

# Collection handle (the MongoDB counterpart of a table).
user_info_table = db[collection_name]

In [24]:
# Bulk-insert the row dicts into the collection. insert_many() rejects an empty
# document list, so skip the write when the scrape produced no rows.
insert_result = user_info_table.insert_many(sci_fi_df_dict) if sci_fi_df_dict else None
# Last expression of the cell: displays the InsertManyResult (nothing when skipped).
insert_result


<pymongo.results.InsertManyResult at 0x1eba32bd280>

In [27]:
# List the database names visible through `client` as the cell output.
client.list_database_names()

['admin', 'config', 'local', 'sci_fi_df']

In [28]:
# Name of the database whose collections are listed as the cell output.
DBName = 'sci_fi_df'
selected_db = client[DBName]
selected_db.list_collection_names()

['films']