# Webscrape

### Import Packages

In [1]:
from random import randint
from time import sleep
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
from datetime import datetime
import cloudscraper

## Scraping Script

The scraping script below should be updated with the appropriate first and last page parameters ahead of each scrape attempt. The script will do the following:
- iterate through each page url, formatting in the page numbers from the first page parameter up to (but not including) the last page parameter (both int)
- sleep for a random period of 3, 4 or 5 seconds. This step was introduced as good practice to avoid any issue that may arise from excessive page requests from a single client (i.e. a captcha).
- parse page html text
- iterate through each of the page's 25 comments, grabbing the target html objects:
    - date 
    - title 
    - comments 
    - content info (containing the user name, price and opinion)
- append each of the 25 comments attributes to the lists initialised at the start

In [5]:
# Placeholder for the target page URL. The scrape loop calls url.format(page),
# so the real value needs a '{}' where the page number should be inserted;
# without one, every iteration requests the same address.
url = 'url'

In [4]:
# create_scraper() returns a CloudScraper instance; the scrape loop uses its
# .get() method for every page request.
# NOTE(review): presumably chosen over plain `requests` to get past the
# Cloudflare check the loop looks for - confirm.
scraper = cloudscraper.create_scraper()

In [14]:
# Accumulators for the scraped fields: one entry per comment, across all pages.
date = []
title = []
comment = []
user = []
price = []
opinion = []

# Page window for this run. NOTE: range() below is half-open, so the pages
# fetched are first_page .. last_page - 1 (last_page itself is NOT requested).
last_page = 400
first_page = 1

# <title> text of the Cloudflare challenge page that signals we were blocked.
CAPTCHA_TITLE = 'Please Wait... | Cloudflare'

for page in range(first_page, last_page):
    # random 3-5 second pause between requests to avoid hammering the site
    sleep(randint(3, 5))
    r = scraper.get(url.format(page))
    pagesoup = BeautifulSoup(r.text, 'html.parser')

    # If the website presents a captcha, end the scrape before touching the
    # accumulators. A page without any <title> tag is not treated as a captcha
    # (and no longer crashes with AttributeError on `.text`).
    page_title = pagesoup.find('title')
    if page_title is not None and page_title.text == CAPTCHA_TITLE:
        print('Captcha page indentified - datascrape ended')
        print('Failed on page:', page)
        print('Scraped Comments:', len(date))
        break

    date.extend(
        tag.text
        for tag in pagesoup.find_all(class_='share-chat-message__status-bar-time')
    )
    title.extend(
        tag.text
        for tag in pagesoup.find_all(class_='share-chat-message__status-bar')
    )
    comment.extend(
        tag.text
        for tag in pagesoup.find_all(class_='share-chat-message__message-text')
    )

    # The info boxes come in consecutive groups of three per comment:
    # user, price, opinion. zip() stops at the shortest slice, so an incomplete
    # trailing group is skipped instead of raising IndexError.
    content_info = [
        tag.text
        for tag in pagesoup.find_all(class_='share-chat-message__content-info-box')
    ]
    for who, cost, view in zip(content_info[0::3],
                               content_info[1::3],
                               content_info[2::3]):
        user.append(who)
        price.append(cost)
        opinion.append(view)
else:
    # Runs only when the loop was NOT interrupted by the captcha `break`, so a
    # blocked run is no longer reported as finished.
    print('Scrape finished')
    print('Last page scraped:', page)
    print('Scraped comments:', len(date))

Scrape finished
Last page scraped: 399
Scraped comments: 9975


In [25]:
# in case the scrape script fails, see how much has been scraped so far:
# print the length of every accumulator list (they should all match)

for scraped_field in (date, title, comment, user, price, opinion):
    print(len(scraped_field))

# comment count expressed in pages, at 25 comments per page
print(len(date) / 25)

9975
9975
9975
9975
9975
9975
399.0


In [15]:
# build the dataframe from the lists filled by the scrape script,
# one column per scraped field (keyword order sets the column order)

df = pd.DataFrame(dict(
    date=date,
    user=user,
    title=title,
    comment=comment,
    opinion=opinion,
    price=price,
))

In [16]:
# preview the first rows of the raw scrape before any cleaning
df.head()

Unnamed: 0,date,user,title,comment,opinion,price
0,Today 22:43,Dillon2019,RE: Any use to us folks here?Today 22:43,This was on the BBC yesterday but I didn’t c...,Strong Buy,Price: 13.10
1,Today 22:41,JiffyBag,RE: Any use to us folks here?Today 22:41,Russia’s Economy in Freefallhttps://www.yout...,No Opinion,Price: 13.10
2,Today 22:15,JackoDLad,Any use to us folks here?Today 22:15,Found the following on another board (EUA) and...,No Opinion,Price: 13.10
3,Today 20:00,Tymers,RE: Naughty boysToday 20:00,"Speedy, like Hydro at some point will be corre...",Strong Buy,Price: 13.10
4,Today 19:52,Tymers,RE: ChoicesToday 19:52,Choices is correct! Yet all comments are ones ...,Strong Buy,Price: 13.10


## Minor Cleaning

In [27]:
# drop the leftover 'Unnamed: 0' index column
# NOTE(review): presumably this column appears when a previously saved scrape
# CSV is read back in - confirm. errors='ignore' keeps this cell from raising
# KeyError when the column is absent, e.g. on a dataframe built directly from
# the scraped lists.
df.drop(columns='Unnamed: 0', inplace=True, errors='ignore')

In [28]:
# clean title column, removing the corresponding date info from the title.
# The two columns are walked in lockstep with zip() rather than looked up via
# df.loc[i, ...], so this works whatever the dataframe's index labels are
# (the list is assigned back positionally) and avoids a label lookup per cell.
df['title_clean'] = [
    raw_title.replace(posted, '')
    for raw_title, posted in zip(df['title'], df['date'])
]

In [29]:
# clean price column: strip the 'Price:' label together with however much
# whitespace follows it (the previous literal only matched exactly two spaces),
# trim what is left and convert it to float
df['price'] = (
    df['price']
    .str.replace(r'Price:\s*', '', regex=True)
    .str.strip()
    .astype(float)
)

In [30]:
# remove the original, uncleaned title column now that title_clean exists
df.drop(labels='title', axis=1, inplace=True)

In [31]:
# put the columns back into the original layout
column_order = ['date', 'user', 'title_clean', 'comment', 'opinion', 'price']
df = df[column_order]

In [32]:
# relabel all six columns positionally, which turns title_clean back into title
final_names = ('date', 'user', 'title', 'comment', 'opinion', 'price')
df.columns = list(final_names)

In [33]:
# preview the first rows after cleaning
df.head()

Unnamed: 0,date,user,title,comment,opinion,price
0,15 Dec '20,DGR1980,Good timing for it,Stroke of genius announcing the change in the ...,No Opinion,32.125
1,15 Dec '20,Quepensa,RE: GH,"The King is dead, long live the King.We March ...",No Opinion,32.125
2,15 Dec '20,EnricoPallazzo,RE: Gervaise,"How about that?GGP: Gervaise out, Shaun inNCM:...",Strong Buy,32.125
3,15 Dec '20,Jambo813,February,If GH is not allowed to retain his options the...,No Opinion,32.125
4,15 Dec '20,EnricoPallazzo,RE: Gervaise,"We'll miss you, Gervaise! Can you let us know ...",Strong Buy,31.875


## Save cleaned scrape to csv

Insert the scrape date into the CSV file name.

In [11]:
# scrape date as day-abbreviated month-two digit year, used in the csv file name
today = format(datetime.today(), '%d-%b-%y')
today

'30-Mar-22'

In [24]:
# save to csv
# df.to_csv('../../../GA/Capstone/scrapes/scrape_{}.csv'.format(today), index=False)