# **This notebook is a breakdown of the required web scraper and analysis code used in the project.**


> Scrapy is used to crawl web page data.

In [None]:
!pip install crochet scrapy

In [None]:
import crochet
crochet.setup()     # initialize crochet

#All imports used in project
import json
import scrapy
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
# Initializations
crawl_runner = CrawlerRunner()      # used by scrape_with_crochet for every crawl; requires a running Twisted reactor (see crochet.setup() above)

### **Spider to extract basic course details**

In [None]:
# Course pages that MySpider2 requests.
urls = ['https://www.coursera.org/learn/introduction-to-data-analytics']
# Module-level accumulator that MySpider2.parse fills, keyed by course name.
# (A `global` statement has no effect at module level, so none is used here.)
final_data = {}

In [None]:
class MySpider2(scrapy.Spider):
    """Spider that collects basic details for every course page listed in `urls`.

    Each parsed page produces one dict of details, stored in the module-level
    `final_data` mapping under the course name.
    """
    name="course_details"

    def start_requests(self):
        """Yield one request per entry of the module-level `urls` list."""
        for url in urls:
            # dont_merge_cookies keeps this request out of Scrapy's cookie handling.
            yield scrapy.Request(url,callback=self.parse,meta={'dont_merge_cookies': True})

    @staticmethod
    def _token(text, index=0):
        """Return the `index`-th space-separated token of `text`, or None if it is missing."""
        if not text:
            return None
        parts = text.split(" ")
        return parts[index] if index < len(parts) else None

    def parse(self,response):
        """Extract the course details from `response` and store them in `final_data`.

        Every field is looked up defensively: when a selector matches nothing the
        field is stored as None (or an empty list) instead of raising, so one
        missing element no longer discards the whole record.
        """
        name = response.css('h1.banner-title::text').get()
        instructor_rating = response.css('span.avg-instructor-rating__total span::text').get()
        skills = response.css('div.Skills span::text').getall()

        # The rating block yields the rating value first and the ratings count second.
        rating_div = response.css('div.XDPRating span::text').getall()
        content_rating = rating_div[0] if rating_div else None
        no_of_ratings = rating_div[1] if len(rating_div) > 1 else None

        # The first matching link is skipped; the remaining ones are kept as domains.
        domain = response.css('a.color-white.font-weight-bold::text').getall()[1:]

        # .get() returns the first match, or None when nothing matches.
        no_of_reviews = response.xpath('.//div/div/div/div/span/strong/span/text()').get()
        time = response.css('div._16ni8zai.m-b-0.m-t-1s span::text').get()
        org = response.css('h3.headline-4-text.bold.rc-Partner__title::text').get()
        instructor = response.css('h3.instructor-name.headline-3-text.bold::text').get()

        course_details = {
            'name': name,
            'instructor_rating': instructor_rating,
            # Only the leading token (the number) of the text is kept.
            'no_of_reviews': self._token(no_of_reviews, 0),
            'skills': skills,
            'content_rating': content_rating,
            'no_of_ratings': self._token(no_of_ratings, 0),
            'domain': domain,
            # The second token of the duration text is kept.
            'time': self._token(time, 1),
            'org': org,
            # Previously extracted but never stored.
            'instructor': instructor,
        }

        if name is None:
            # Fall back to the URL so records without a title do not collide on None.
            self.logger.warning("No course title found on %s", response.url)
        final_data[name or response.url] = course_details


In [None]:
@crochet.run_in_reactor
def scrape_with_crochet(spider):
    """Start a crawl of `spider` on the shared `crawl_runner` inside the Twisted reactor.

    The crawl's Deferred is returned so that the EventualResult handed back by
    `crochet.run_in_reactor` tracks the crawl itself. Callers can then block
    until scraping has finished with `scrape_with_crochet(Spider).wait(timeout)`.
    Callers that ignore the return value behave as before.
    """
    eventual = crawl_runner.crawl(spider)
    eventual.addCallback(finished_scrape)
    return eventual

In [None]:
def finished_scrape(final_data):
    """Print the value this callback is invoked with.

    Inside this function the parameter shadows the module-level `final_data`
    dict, so the printed value is the callback argument.
    NOTE(review): when used as the crawl Deferred's callback this is presumably
    the Deferred's result rather than the scraped data - confirm.
    """
    callback_value = final_data
    print(callback_value)

In [None]:
# Emit Scrapy log records in a compact "LEVEL: message" format.
configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
# The call returns a crochet EventualResult right away while the crawl keeps
# running in the background, so `final_data` can still be empty immediately
# after this cell finishes.
scrape_with_crochet(MySpider2)

INFO:scrapy.crawler:Overridden settings:
{}
INFO: Overridden settings:
{}


<crochet._eventloop.EventualResult at 0x7fb32b0cae10>

In [None]:
# Inspect what the spider has stored so far; this stays empty until the
# background crawl has parsed a page.
final_data

{}

INFO:scrapy.extensions.telnet:Telnet Password: 8850aca59d03a31d
2022-05-20 20:32:13 [scrapy.extensions.telnet] INFO: Telnet Password: 8850aca59d03a31d
INFO:scrapy.middleware:Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.logstats.LogStats']
2022-05-20 20:32:14 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.logstats.LogStats']


### **Spider to extract course reviews**


> The reviews are stored in a DataFrame for this demonstration



In [None]:
# Alternative target, kept for reference:
# review_urls=['https://www.coursera.org/learn/introduction-to-data-analytics/reviews']
# Review listing pages that MySpider1 starts from.
review_urls=['https://www.coursera.org/learn/pythonforcybersecurity-introduction/reviews']

In [None]:
import pandas as pd
# Accumulator for scraped reviews; MySpider1.parse adds one row per review.
# (A `global` statement has no effect at module level, so none is used here.)
df = pd.DataFrame(columns=['review','star','upvote'])
df

Unnamed: 0,review,star,upvote


In [None]:
# Scratch dict; presumably meant for crawl status, though nothing visible in
# this notebook writes to it - TODO confirm.
# (A `global` statement has no effect at module level, so none is used here.)
res={}

In [None]:
class MySpider1(scrapy.Spider):
    """Spider that walks a paginated course review listing and collects every review.

    Each review becomes one row (`review`, `star`, `upvote`) in the module-level
    DataFrame `df`.
    """
    name="course_reviews"

    # Hex colour compared against characters 6..11 of each star's inline SVG
    # style; matching entries are counted as stars.
    FILLED_STAR_COLOUR = 'F2D049'

    def start_requests(self):
        """Yield one request per entry of the module-level `review_urls` list."""
        for url in review_urls:
            yield scrapy.Request(url,callback=self.parse,meta={'dont_merge_cookies': True})

    def parse(self,response):
        """Collect the reviews on this page, then follow the last pagination link."""
        global df

        review_list = response.css('div.rc-ReviewsList')
        rows = []
        for block in review_list.css('div.review.review-text.review-page-review'):
            text = block.css('div.reviewText p::text').get()
            styles = block.css('svg::attr(style)').getall()
            star = sum(1 for style in styles if style[6:12] == self.FILLED_STAR_COLOUR)
            # The helpful button's label may be missing; keep None instead of crashing.
            helpful_label = block.css('button.review-helpful-button').css('span::text').get()
            upvote = helpful_label.split(" ")[-1] if helpful_label else None
            rows.append({'review': text, 'star': star, 'upvote': upvote})

        if rows:
            # DataFrame.append was removed in pandas 2.0; add the page's rows with one concat.
            page_df = pd.DataFrame(rows, columns=['review', 'star', 'upvote'])
            df = pd.concat([df, page_df], ignore_index=True)

        # Guard against pages without pagination links (indexing [-1] on an
        # empty list would raise IndexError).
        page_links = response.css('ul.cui-buttonList a::attr(href)').getall()
        if page_links:
            next_page = response.urljoin(page_links[-1])
            # Do not re-request the page that was just parsed.
            if next_page != response.url:
                yield scrapy.Request(next_page,callback=self.parse)


### Wait a few minutes, or check the status of `res` to see which page number is being crawled

In [None]:
# Emit Scrapy log records in a compact "LEVEL: message" format.
configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
# The call returns a crochet EventualResult right away while the crawl keeps
# running in the background and fills `df` as pages are parsed.
scrape_with_crochet(MySpider1)

INFO:scrapy.crawler:Overridden settings:
{}
INFO: Overridden settings:
{}
INFO:scrapy.extensions.telnet:Telnet Password: 716ac697e9f865b0
2022-05-20 20:32:20 [scrapy.extensions.telnet] INFO: Telnet Password: 716ac697e9f865b0


<crochet._eventloop.EventualResult at 0x7fb163c74150>

INFO:scrapy.middleware:Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.logstats.LogStats']
2022-05-20 20:32:20 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.logstats.LogStats']
INFO:scrapy.middleware:Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.red

In [None]:
# Inspect the reviews collected so far; the frame stays empty until the
# background crawl has parsed a page.
df

INFO:scrapy.middleware:Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
 'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
 'scrapy.spidermiddlewares.referer.RefererMiddleware',
 'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
 'scrapy.spidermiddlewares.depth.DepthMiddleware']
2022-05-20 20:32:20 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
 'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
 'scrapy.spidermiddlewares.referer.RefererMiddleware',
 'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
 'scrapy.spidermiddlewares.depth.DepthMiddleware']
INFO:scrapy.middleware:Enabled item pipelines:
[]
2022-05-20 20:32:20 [scrapy.middleware] INFO: Enabled item pipelines:
[]
INFO:scrapy.core.engine:Spider opened


Unnamed: 0,review,star,upvote


2022-05-20 20:32:20 [scrapy.core.engine] INFO: Spider opened
INFO:scrapy.extensions.logstats:Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2022-05-20 20:32:20 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
INFO:scrapy.extensions.telnet:Telnet console listening on 127.0.0.1:6023
2022-05-20 20:32:20 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6023


### **Using BERT for sentiment analysis**



> After reading a few research papers and various articles, I found this one to be the most helpful, with a good explanation for beginners.


>[Article Link](https://towardsdatascience.com/sentiment-analysis-in-10-minutes-with-bert-and-hugging-face-294e8a04b671)





In [None]:
!pip install transformers

In [None]:
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures

# Load the pretrained "bert-base-uncased" checkpoint into a sequence-classification model.
model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased")
# Tokenizer loaded from the same checkpoint name as the model.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [None]:
# Print the model's layers and their parameter counts.
model.summary()

Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  109482240 
                                                                 
 dropout_37 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
Total params: 109,483,778
Trainable params: 109,483,778
Non-trainable params: 0
_________________________________________________________________


In [None]:
import tensorflow as tf

In [None]:
# Location of the "aclImdb_v1" sentiment dataset archive.
URL = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"

# Download the archive into the current directory (cache_dir='.', no sub-directory)
# and unpack it (untar=True).
dataset = tf.keras.utils.get_file(fname="aclImdb_v1.tar.gz", 
                                  origin=URL,
                                  untar=True,
                                  cache_dir='.',
                                  cache_subdir='')

Downloading data from https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz


In [None]:
import os
import shutil
# Path of the extracted dataset root ("<download dir>/aclImdb")
main_dir = os.path.join(os.path.dirname(dataset), 'aclImdb')
# Path of the training split ("<download dir>/aclImdb/train")
train_dir = os.path.join(main_dir, 'train')
# Remove the "unsup" folder since this is a supervised learning task.
# The existence check keeps the cell re-runnable: an unconditional rmtree
# raises FileNotFoundError once the folder has already been deleted.
remove_dir = os.path.join(train_dir, 'unsup')
if os.path.isdir(remove_dir):
    shutil.rmtree(remove_dir)
# View the final train folder
print(os.listdir(train_dir))

['neg', 'urls_pos.txt', 'labeledBow.feat', 'urls_unsup.txt', 'urls_neg.txt', 'unsupBow.feat', 'pos']


In [None]:
# We create a training dataset and a validation 
# dataset from our "aclImdb/train" directory with a 80/20 split.
# batch_size=30000 is larger than either subset (20000 / 5000 files), so each
# dataset consists of a single batch holding all of its examples.
# Both calls use the same seed and validation_split.
# Note: the variable named `test` holds the *validation* subset of the train folder.
train = tf.keras.preprocessing.text_dataset_from_directory(
    'aclImdb/train', batch_size=30000, validation_split=0.2, 
    subset='training', seed=123)
test = tf.keras.preprocessing.text_dataset_from_directory(
    'aclImdb/train', batch_size=30000, validation_split=0.2, 
    subset='validation', seed=123)

Found 25000 files belonging to 2 classes.
Using 20000 files for training.
Found 25000 files belonging to 2 classes.
Using 5000 files for validation.


In [None]:
import pandas as pd

In [None]:
# Take the first batch of the dataset and convert both of its tensors
# (element 0: texts, element 1: labels) to NumPy arrays.
for i in train.take(1):
  train_feat = i[0].numpy()
  train_lab = i[1].numpy()

# Stack texts and labels as two rows, then transpose so each becomes a column.
# NOTE: this rebinds `train` from the tf.data dataset to a DataFrame, so
# re-running this cell would operate on the DataFrame instead of the dataset.
train = pd.DataFrame([train_feat, train_lab]).T
train.columns = ['DATA_COLUMN', 'LABEL_COLUMN']
# The texts arrive as bytes; decode them to str.
train['DATA_COLUMN'] = train['DATA_COLUMN'].str.decode("utf-8")
train.head()

Unnamed: 0,DATA_COLUMN,LABEL_COLUMN
0,I loved this movie. It is rare to get a glimps...,1
1,How much could the general Hollywood director ...,1
2,This is a film which should be seen by anybody...,1
3,A mock documentary about a pair of Canadian pr...,1
4,This movie changed it all for me...I heard of ...,1


In [None]:
# Take the first batch of the dataset and convert both of its tensors
# (element 0: texts, element 1: labels) to NumPy arrays.
for j in test.take(1):
  test_feat = j[0].numpy()
  test_lab = j[1].numpy()

# Stack texts and labels as two rows, then transpose so each becomes a column.
# NOTE: this rebinds `test` from the tf.data dataset to a DataFrame, so
# re-running this cell would operate on the DataFrame instead of the dataset.
test = pd.DataFrame([test_feat, test_lab]).T
test.columns = ['DATA_COLUMN', 'LABEL_COLUMN']
# The texts arrive as bytes; decode them to str.
test['DATA_COLUMN'] = test['DATA_COLUMN'].str.decode("utf-8")
test.head()

Unnamed: 0,DATA_COLUMN,LABEL_COLUMN
0,I can't believe that so much talent can be was...,0
1,This movie blows - let's get that straight rig...,0
2,"The saddest thing about this ""tribute"" is that...",0
3,I'm only rating this film as a 3 out of pity b...,0
4,Something surprised me about this movie - it w...,1


In [None]:
def convert_data_to_examples(train, test, DATA_COLUMN, LABEL_COLUMN):
    """Wrap every row of two DataFrames in a transformers `InputExample`.

    Args:
        train: DataFrame with the training rows.
        test: DataFrame with the validation rows.
        DATA_COLUMN: name of the column holding the text.
        LABEL_COLUMN: name of the column holding the label.

    Returns:
        A tuple `(train_InputExamples, validation_InputExamples)`; each is the
        result of `DataFrame.apply(..., axis=1)`, i.e. one InputExample per row.
    """
    def to_example(row):
        # guid is a globally unique ID for bookkeeping, unused in this case;
        # text_b is None because each example is a single text.
        return InputExample(guid=None,
                            text_a=row[DATA_COLUMN],
                            text_b=None,
                            label=row[LABEL_COLUMN])

    train_InputExamples = train.apply(to_example, axis=1)
    validation_InputExamples = test.apply(to_example, axis=1)

    # The original had an unreachable recursive call after this return; it was
    # dead code and has been removed.
    return train_InputExamples, validation_InputExamples
  
def convert_examples_to_tf_dataset(examples, tokenizer, max_length=128):
    """Tokenize `examples` and expose them as a `tf.data.Dataset`.

    Args:
        examples: iterable of objects with `text_a` and `label` attributes.
        tokenizer: tokenizer providing `encode_plus`.
        max_length: every example is truncated and padded to this many tokens.

    Returns:
        A dataset yielding `(features, label)` pairs, where `features` is a dict
        with the keys `input_ids`, `attention_mask` and `token_type_ids`.
    """
    features = [] # -> will hold InputFeatures to be converted later

    for e in examples:
        input_dict = tokenizer.encode_plus(
            e.text_a,
            add_special_tokens=True,
            max_length=max_length, # truncates if len(s) > max_length
            return_token_type_ids=True,
            return_attention_mask=True,
            # `pad_to_max_length=True` is deprecated; padding='max_length' is
            # the equivalent setting in the current tokenizer API.
            padding='max_length',
            truncation=True
        )

        features.append(
            InputFeatures(
                input_ids=input_dict["input_ids"],
                attention_mask=input_dict['attention_mask'],
                token_type_ids=input_dict["token_type_ids"],
                label=e.label,
            )
        )

    def gen():
        # Yield (feature-dict, label) pairs in the structure declared below.
        for f in features:
            yield (
                {
                    "input_ids": f.input_ids,
                    "attention_mask": f.attention_mask,
                    "token_type_ids": f.token_type_ids,
                },
                f.label,
            )

    return tf.data.Dataset.from_generator(
        gen,
        # Element dtypes, followed by element shapes (1-D sequences, scalar label).
        ({"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32}, tf.int64),
        (
            {
                "input_ids": tf.TensorShape([None]),
                "attention_mask": tf.TensorShape([None]),
                "token_type_ids": tf.TensorShape([None]),
            },
            tf.TensorShape([]),
        ),
    )


# Column names of the train/test DataFrames, passed to convert_data_to_examples.
DATA_COLUMN = 'DATA_COLUMN'
LABEL_COLUMN = 'LABEL_COLUMN'

In [None]:
# Wrap every DataFrame row in an InputExample.
train_InputExamples, validation_InputExamples = convert_data_to_examples(train, test, DATA_COLUMN, LABEL_COLUMN)

# Tokenize the training examples, shuffle with a 100-element buffer, batch by 32
# and repeat the whole dataset twice.
train_data = convert_examples_to_tf_dataset(list(train_InputExamples), tokenizer)
train_data = train_data.shuffle(100).batch(32).repeat(2)

# Validation data is only tokenized and batched (no shuffle, no repeat).
validation_data = convert_examples_to_tf_dataset(list(validation_InputExamples), tokenizer)
validation_data = validation_data.batch(32)



In [None]:
# from_logits=True tells the loss to expect raw logits; softmax is applied
# separately at prediction time.
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0), 
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), 
              metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy')])

# NOTE(review): train_data is built with .repeat(2), so each of these 2 epochs
# iterates the training examples twice - confirm this is intended.
model.fit(train_data, epochs=2, validation_data=validation_data)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7fb3d620b2d0>

In [None]:
# Save the fine-tuned model to a hard-coded Google Drive path.
# NOTE(review): assumes Drive is already mounted at /content/drive; no mount
# call is visible in this part of the notebook - TODO confirm.
model.save('/content/drive/MyDrive/bert_model')



INFO:tensorflow:Assets written to: /content/drive/MyDrive/bert_model/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/bert_model/assets


In [None]:
# Review texts to classify. Reviews whose text was not found during scraping
# are stored as missing values; drop them, because the tokenizer only accepts strings.
pred_sentences = df.review.dropna().to_list()
pred_sentences

In [None]:
# Tokenize all review texts into one padded/truncated batch of TensorFlow tensors.
tf_batch = tokenizer(pred_sentences, max_length=128, padding=True, truncation=True, return_tensors='tf')
tf_outputs = model(tf_batch)
# Softmax over the last axis of the first model output gives per-class scores.
tf_predictions = tf.nn.softmax(tf_outputs[0], axis=-1)
labels = ['Negative','Positive']
# Index of the highest-scoring class for every sentence, as a NumPy array.
label = tf.argmax(tf_predictions, axis=1).numpy()
for sentence, class_index in zip(pred_sentences, label):
  print(sentence, ": \n", labels[class_index])

I will be super honest for this course, I had to give up, I was really wasting time in boring explanations, a course that is not challenging at all and they did not have imagination to create a lab environment or recommend something nice or challenge (There are really a lot of option out there) : 
 Negative
This introduction is really cool for beginners. Thanks! : 
 Positive
There is no hands-on and step-by-step instruction. It's a joke. : 
 Negative
Good course material. Code files should be provided in addition to the course material. : 
 Positive
Well i know this course gonna be this fun i have completed it early. I learn a lot and in truth i always want a course like this. : 
 Positive
Wanted to walk away from this with more code. : 
 Negative
Maybe the instructor is an incredible e-hacker, but it is a terrible communicator, so an awful teacher: an absolutely linear voice, without a body expression ... it is a zombie or a robot. The subject is somehow challenging, and that is why h