In [13]:
import os, sys, requests, itertools
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

# Absolute location of the local ./utils directory, echoed for the reader.
module_path = os.path.abspath('./utils')
print(module_path)

# Register the directory on the import path once; the updated path is only
# printed on the run that actually adds it.
already_registered = module_path in sys.path
if not already_registered:
    sys.path += [module_path]
    print(sys.path)

from utils import imdbUtils


/home/jade/repos/movie_recommender/utils


In [25]:
#!python -m pip install requests
#!python -m pip install numpy
#!python -m pip install beautifulsoup4
#!python -m pip install pandas
#!python -m pip install -q transformers
!python -m pip install tensorflow-gpu
#!python -m pip install torch

Collecting tensorflow-gpu
  Downloading tensorflow_gpu-2.9.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (511.8 MB)
[K     |████████████████████████████████| 511.8 MB 73 kB/s s eta 0:00:01     |████████████████▍               | 261.6 MB 35.1 MB/s eta 0:00:08
Installing collected packages: tensorflow-gpu
Successfully installed tensorflow-gpu-2.9.2


In [30]:
#!python -m pip uninstall tensorflow torch -y
!python -m pip install tensorflow torch

Collecting tensorflow
  Using cached tensorflow-2.9.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (511.8 MB)
Collecting torch
  Using cached torch-1.12.1-cp38-cp38-manylinux1_x86_64.whl (776.3 MB)
Installing collected packages: tensorflow, torch
Successfully installed tensorflow-2.9.2 torch-1.12.1


In [4]:
# Filters applied to the IMDb title search.
min_rating = 4.0
max_rating = 10.0
n_votes = 50000
genres = 'thriller'
title_count = 250

# Search URL assembled piece by piece, one query parameter per line.
# The trailing comma after the vote count is part of the query string.
SEARCH_URL = (
    "https://www.imdb.com/search/title/"
    "?title_type=feature"
    f"&user_rating={min_rating},{max_rating}"
    f"&num_votes={n_votes},"
    f"&genres={genres}"
    "&view=simple"
    "&sort=user_rating,desc"
    f"&count={title_count}"
)

In [5]:
# Fetch the search-results page through the project's imdbUtils.getSoup helper.
# Presumably returns a parsed BeautifulSoup document (a later cell calls
# .find_all on it) — confirm in utils/imdbUtils.
movies_soup = imdbUtils.getSoup(SEARCH_URL)

In [6]:
# Collect every <a> element that carries no class attribute.
anchor_tags = movies_soup.find_all('a', attrs={'class': None})

# Keep only hrefs of the form '/title/.../'.
# `.get` tolerates anchors without an href (the subscript form raised
# KeyError), and logical `and` replaces the bitwise `&` on booleans.
title_paths = [
    href
    for href in ((tag.attrs.get('href') or '') for tag in anchor_tags)
    if href.startswith('/title') and href.endswith('/')
]

# Remove duplicate links while preserving first-seen order.
movie_tags = list(dict.fromkeys(title_paths))

print(f"There are a total of {len(movie_tags)} movie titles")
print("Displaying 10 titles")
movie_tags[:10]

There are a total of 250 movie titles
Displaying 10 titles


['/title/tt0468569/',
 '/title/tt1375666/',
 '/title/tt0114369/',
 '/title/tt0102926/',
 '/title/tt6751668/',
 '/title/tt0482571/',
 '/title/tt0407887/',
 '/title/tt0114814/',
 '/title/tt0110413/',
 '/title/tt0054215/']

In [7]:
# Turn each relative title path into the absolute URL of its user-reviews page.
base_url = "https://www.imdb.com"
movie_links = ["".join((base_url, tag, 'reviews')) for tag in movie_tags]

link_count = len(movie_links)
print(f"There are a total of {link_count} movie user reviews")
print("Displaying 10 user reviews links")
movie_links[:10]

There are a total of 250 movie user reviews
Displaying 10 user reviews links


['https://www.imdb.com/title/tt0468569/reviews',
 'https://www.imdb.com/title/tt1375666/reviews',
 'https://www.imdb.com/title/tt0114369/reviews',
 'https://www.imdb.com/title/tt0102926/reviews',
 'https://www.imdb.com/title/tt6751668/reviews',
 'https://www.imdb.com/title/tt0482571/reviews',
 'https://www.imdb.com/title/tt0407887/reviews',
 'https://www.imdb.com/title/tt0114814/reviews',
 'https://www.imdb.com/title/tt0110413/reviews',
 'https://www.imdb.com/title/tt0054215/reviews']

In [8]:
# Download and parse the reviews page of every movie.
movie_soups = list(map(imdbUtils.getSoup, movie_links))

# Extract the review permalinks found on each page, then flatten the
# per-movie lists into one list (the recorded run produced 500 links).
reviews_per_movie = [imdbUtils.getReviews(movie_soup) for movie_soup in movie_soups]
movie_review_list = list(itertools.chain.from_iterable(reviews_per_movie))

review_count = len(movie_review_list)
print(review_count)

print(f"There are a total of {review_count} individual movie reviews")
print("Displaying 10 reviews")
movie_review_list[:10]

500
There are a total of 500 individual movie reviews
Displaying 10 reviews


['https://www.imdb.com/review/rw1917099/',
 'https://www.imdb.com/review/rw5478826/',
 'https://www.imdb.com/review/rw2286063/',
 'https://www.imdb.com/review/rw4692192/',
 'https://www.imdb.com/review/rw3399062/',
 'https://www.imdb.com/review/rw2251710/',
 'https://www.imdb.com/review/rw0299491/',
 'https://www.imdb.com/review/rw1198894/',
 'https://www.imdb.com/review/rw5965638/',
 'https://www.imdb.com/review/rw4978432/']

In [17]:
# The labels below alternate negative/positive, which needs an even number of
# review links. `len // 2` floors, so an odd count would yield one label too
# few and only fail later when the DataFrame is built. Fail fast here, before
# the per-review fetches start.
# NOTE(review): presumably getReviews returns one negative then one positive
# permalink per movie — confirm in utils/imdbUtils.
if len(movie_review_list) % 2 != 0:
    raise ValueError(
        f"Expected an even number of review links (negative/positive pairs), "
        f"got {len(movie_review_list)}"
    )

# get review text from the review link
review_texts = [imdbUtils.getReviewText(url) for url in movie_review_list]

# get movie name from the review link
movie_titles = [imdbUtils.getMovieTitle(url) for url in movie_review_list]

# label each review with negative or positive (alternating, one pair per movie)
review_sentiment = np.array(['negative', 'positive'] * (len(movie_review_list)//2))

KeyboardInterrupt: 

In [15]:
# construct a dataframe
df = pd.DataFrame({'movie': movie_titles, 'user_review_permalink': movie_review_list,
             'user_review': review_texts, 'sentiment': review_sentiment})

In [16]:
# Preview the first rows of the review table.
df.head()

Unnamed: 0,movie,user_review_permalink,user_review,sentiment
0,The Dark Knight,https://www.imdb.com/review/rw1917099/,"Dark, yes, complex, ambitious. Christopher Nol...",negative
1,The Dark Knight,https://www.imdb.com/review/rw5478826/,"Confidently directed, dark, brooding, and pack...",positive
2,Inception,https://www.imdb.com/review/rw2286063/,I have to say to make such an impressive trail...,negative
3,Inception,https://www.imdb.com/review/rw4692192/,"My 3rd time watching this movie! Yet, it still...",positive
4,Seven,https://www.imdb.com/review/rw3399062/,"Bleak, gruesome, brilliant. These are a few of...",negative


In [39]:
# Write the review table to test_set.csv in the working directory;
# index=False leaves the row index out of the file.
df.to_csv('test_set.csv', index=False)

In [33]:
# Smoke-test the default Hugging Face sentiment-analysis pipeline in a separate `python` subprocess.
# NOTE(review): `!python` may resolve to a different interpreter than this notebook's kernel,
# so success here does not prove the kernel can import the same packages — confirm.
!python -c "from transformers import pipeline; print(pipeline('sentiment-analysis')('we love you'))"

2022-09-04 00:07:49.963220: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2022-09-04 00:07:50.031033: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-09-04 00:07:50.031059: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Downloading pytorch_model.bin: 100%|███████

In [40]:
def reduce_size(sentence, max_len=512):
    """Truncate ``sentence`` to at most ``max_len`` characters.

    Inputs that are already short enough are returned unchanged. The
    previous version fell off the end of the function and returned ``None``
    for them.

    Args:
        sentence: Text (or any sliceable sequence) to shorten.
        max_len: Maximum number of characters to keep; expected to be
            non-negative. Defaults to 512. The limit counts characters,
            not model tokens.

    Returns:
        ``sentence`` itself when it has ``max_len`` or fewer characters,
        otherwise its first ``max_len`` characters.
    """
    # Slicing is a no-op for short inputs, so no length check is needed.
    return sentence[:max_len]

In [42]:
# Sentiment Analyser Model: https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english?text=I+like+you.+I+love+you
# FIXME: this cell fails inside Jupyter and Visual Studio with
# "RuntimeError: At least one of TensorFlow 2.0 or PyTorch should be installed",
# yet the same code works when run outside the notebook.
# NOTE(review): possibly the backends were installed into a different Python
# environment than the one the kernel runs in — confirm.
from sentiment import SentimentAnalyser
sent_anayser = SentimentAnalyser()
# Replace the 'sentiment' column with the analyser's result for each review text.
df['sentiment'] = df['user_review'].apply(sent_anayser)

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


RuntimeError: At least one of TensorFlow 2.0 or PyTorch should be installed. To install TensorFlow 2.0, read the instructions at https://www.tensorflow.org/install/ To install PyTorch, read the instructions at https://pytorch.org/.

In [None]:
# NOTE(review): `data` is not defined in any cell visible here — TODO confirm
# where it comes from; otherwise this raises NameError on a fresh kernel.
print(data)