# Web Scraping the IMDB Website

In [2]:
#pip install selenium
#pip install beautifulsoup4

In [1]:
import requests
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

from time import sleep
from random import randint

In [2]:
# To get English-translated titles from all the movies:
headers = {"Accept-Language": "en-US, en;q=0.5"} 

In [34]:
# Request the page where the list of movies are present
page = requests.get("https://www.imdb.com/chart/moviemeter/?ref_=nv_mv_mpm", headers=headers)

In [145]:
soup = BeautifulSoup(page.text, 'html.parser')
movie_div = soup.find_all('td', class_="titleColumn")  # extract all movie containers
#movie_div

In [146]:
# Extract all movie ids which will be used to access each movie's reviews
movie_ids =[]
for cont in movie_div:
    movid = cont.a['href'].split('/')[2]
    movie_ids.append(movid)
#movie_ids

In [147]:
# Extract all movie names which will be used to group reviews
movie_names =[]
for cont in movie_div:
    movid = cont.a.text
    movie_names.append(movid)
#movie_names

In [126]:
#Create table where all movie reviews will be stored       
user_reviews = pd.DataFrame(columns=['name','year','reviews'])

# Start extracting the movies' reviews
for i in range(0, len(movie_ids)-1):
    
    # Request page where the movie's reviews are present
    page = requests.get("https://www.imdb.com/title/" + movie_ids[i] + "/reviews?ref_=tt_ov_rt", headers = headers) 
    
    soup = BeautifulSoup(page.text, 'html.parser')
    movie_div = soup.find_all('div', class_='lister-item-content')  # extract all movie review containers
    
    # Controlling the crawl rate to avoid disrupting the activity of website
    sleep(randint(2,10)) 
    
    names=[]
    years = []
    reviews = []
    
    #Extract reviews and dates on which the reviews were given by users
    for container in movie_div:
        rev = container.find('div', class_='text show-more__control').text if container.find('div', class_='text show-more__control') else '-'
        reviews.append(rev)
    for container in movie_div:
        date = container.find('span', class_='review-date').text if container.find('span', class_='review-date') else '-'
        years.append(date)
        
    names = [movie_names[i]] * len(years)
    
    
    #building our Pandas dataframe         
    df = pd.DataFrame({
    'name': names,
    'year': years,
    'reviews': reviews
    })
    
    user_reviews = user_reviews.append(df, ignore_index = True)
     

In [127]:
user_reviews

Unnamed: 0,name,year,reviews
0,The Old Guard,10 July 2020,"Love the concept, execution is not bad (there ..."
1,The Old Guard,10 July 2020,"There were just so many clichés, cringe, and n..."
2,The Old Guard,10 July 2020,The actors are all good. Well that's it. The s...
3,The Old Guard,10 July 2020,I didn't read the comic so I went in blind. I ...
4,The Old Guard,13 July 2020,"The beginning of movie is very interesting, bu..."
...,...,...,...
2154,The Wrong Missy,1 June 2020,"No words, it was horrible, not even laughed on..."
2155,The Wrong Missy,13 May 2020,It is a good movie to spend a happy time. Bett...
2156,The Wrong Missy,22 May 2020,"If you have brain cells, avoid this movie at a..."
2157,The Wrong Missy,16 May 2020,Then I'm Leslie Nielsen.I should've been disco...


In [128]:
# Export the table to a csv file
user_reviews.to_csv(r'C:\Users\Kriti.Biswas\Desktop\new projects\mined_reviews.csv')

In [76]:
# Import the table from csv to avoid mining the info again
#user_reviews= pd.read_csv (r'C:\Users\Kriti.Biswas\Desktop\new projects\mined_reviews.csv')

# Sentiment Analysis

In [87]:
import nltk
#nltk.download('vader_lexicon') # Stored in : C:\Users\Kriti.Biswas\AppData\Roaming\nltk_data...
#nltk.download('punkt')

In [88]:
# import the relevant modules from the NLTK library
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk import sentiment
from nltk import word_tokenize

In [89]:
# initialize VADER
sid = SentimentIntensityAnalyzer()

In [90]:
# initialize our 'english.pickle' function and give it a short name
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [130]:
df = user_reviews.copy()
# Use 3 quotes instead of double quotes for the text to be analysed. Otherwise, the string will end early due to the quotation marks within the text
df['reviews'] = df['reviews'].apply(lambda x: "'''" + str(x) + "'''")

In [131]:
# Group the reviews based on movie names
df1 = df.groupby('name')['reviews'].apply(list).reset_index(name='reviews')

In [132]:
df1

Unnamed: 0,name,reviews
0,1917,"['''Last night COL Ferry and I (COL Coldwell, ..."
1,365 Days,['''I honestly can't believe this is an actual...
2,Ad Astra,['''I generally like slow burn psychological d...
3,Amulet,['''The title says it all. The acting was good...
4,Aquaman,['''And it has nothing to do with the comic bo...
...,...,...
82,Trolls World Tour,['''Watched this high. Weird movie.... probabl...
83,Twilight,['''I think this movie suffered from lack of a...
84,Uncut Gems,['''That was INTENSE. Decided to watch this af...
85,Vivarium,['''Got a half decent idea for a short tv epis...


In [148]:
a=[]
v1=[]
v2=[]
v3=[]

# For each review in the list of reviews for a movie, find the sentiment score.
for review_list in df1['reviews']:

    pos_review =[]
    neg_review=[]
    neutral_review=[]
    
    for message_text in review_list:
        # The tokenize method breaks up the paragraph into a list of strings. 
        sentences = tokenizer.tokenize(message_text)
        positive=[]
        negative=[]
        neutral=[]
    
        # Iterating through the list of sentences and calculating polarity scores for each one.
        for sentence in sentences:
            #print(sentence)
            scores = sid.polarity_scores(sentence) # dictionary having the polarity scores
            #for key in sorted(scores):
                #print('{0}: {1}, '.format(key, scores[key]), end='') # To see the scores
            #print()       
            if scores['compound'] >= 0.05 : 
                positive.append(1)
            elif scores['compound'] <= - 0.05 : 
                negative.append(1)
            else:
                neutral.append(1)
                
        if sum(positive) > sum(neutral):
            if sum(positive) > sum(negative):
                pos_review.append(1)
        elif sum(negative) > sum(neutral):
            neg_review.append(1)
        else:
            neutral_review.append(1)
    
    v1.append(sum(pos_review))
    v2.append(sum(neg_review))
    v3.append(sum(neutral_review))
    
    if sum(pos_review) > sum(neutral_review):
        if sum(pos_review) > sum(neg_review):
             a.append("Movie is worth watching")
    elif sum(neg_review) > sum(neutral_review):
        a.append("Movie is a waste of time")
    else:
        a.append("Movie is okay")
      
  

In [149]:
df1['sentiment'] = a
df1['No. of positive reviews'] = v1
df1['No. of negative reviews'] = v2
df1['No. of neutral reviews'] = v3
df1

Unnamed: 0,name,reviews,sentiment,No. of positive reviews,No. of negative reviews,No. of neutral reviews
0,1917,"['''Last night COL Ferry and I (COL Coldwell, ...",Movie is worth watching,15,1,7
1,365 Days,['''I honestly can't believe this is an actual...,Movie is worth watching,13,2,7
2,Ad Astra,['''I generally like slow burn psychological d...,Movie is worth watching,9,6,5
3,Amulet,['''The title says it all. The acting was good...,Movie is worth watching,7,3,5
4,Aquaman,['''And it has nothing to do with the comic bo...,Movie is worth watching,9,4,8
...,...,...,...,...,...,...
82,Trolls World Tour,['''Watched this high. Weird movie.... probabl...,Movie is worth watching,11,2,10
83,Twilight,['''I think this movie suffered from lack of a...,Movie is worth watching,15,4,2
84,Uncut Gems,['''That was INTENSE. Decided to watch this af...,Movie is okay,6,5,11
85,Vivarium,['''Got a half decent idea for a short tv epis...,Movie is worth watching,11,1,8
