# Webscrape

### Import Packages

In [1]:
from random import randint
from time import sleep
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
from datetime import datetime

## Scraping Script

The scraping script below should be updated with the appropriate first and last page parameters ahead of each scrape attempt. The script will do the following:
- iterate through each page url with the page numbers between the first and last page parameters(int)
- sleep for a random period of 2, 3 or 4 seconds. This step was introduced as good practice to avoid any issue that may arise from sending excessive page requests from a single source (e.g. being served a captcha).
- parse page html text
- iterate through each of the page's 25 comments, grabbing the target html objects:
    - date 
    - title 
    - comments 
    - content info (containing the user name, price and opinion)
- append each of the 25 comments attributes to the lists initialised at the start

In [None]:
# URL template for the pages to scrape. The scraping loop calls
# url.format(page), so the real value needs a '{}' placeholder where the
# page number is substituted.
# NOTE(review): the literal 'url' looks like a stand-in for the real address
# and has no '{}' placeholder - replace it before running the scrape.
url = 'url'

In [None]:
# Accumulators for the scraped fields; each list gains one entry per comment.
date = []
title = []
comment = []
user = []
price = []
opinion = []
last_page = 10
first_page = 1

# range() stops before its end value, so add 1 to request last_page as well
for page in range(first_page, last_page + 1):
    # pause for a random 2-4 seconds between page requests
    sleep(randint(2, 4))
    # a timeout makes a stalled connection raise instead of hanging the scrape
    r = requests.get(url.format(page), timeout=30)
    pagesoup = BeautifulSoup(r.text, 'html.parser')

    # if website presents captcha then end scrape and print message
    # (find() returns None for a page without a <title> tag, hence the guard)
    page_title = pagesoup.find('title')
    if page_title is not None and page_title.text == 'Please Wait... | Cloudflare':
        print('Captcha page indentified - datascrape ended')
        print('Failed on page:', page)
        print('Scraped Comments:', len(date))
        # stop only when the captcha page is hit; every other page carries on
        break

    dates = [i.text for i in pagesoup.find_all(class_='share-chat-message__status-bar-time')]
    titles = [i.text for i in pagesoup.find_all(class_='share-chat-message__status-bar')]
    comments = [i.text for i in pagesoup.find_all(class_='share-chat-message__message-text')]
    content_info = [i.text for i in pagesoup.find_all(class_='share-chat-message__content-info-box')]

    date.extend(dates)
    title.extend(titles)
    comment.extend(comments)

    # The content info boxes are read as consecutive groups of three:
    # user, price, opinion. Only complete groups are taken, so a trailing
    # partial group cannot raise an IndexError.
    complete = len(content_info) - len(content_info) % 3
    user.extend(content_info[0:complete:3])
    price.extend(content_info[1:complete:3])
    opinion.extend(content_info[2:complete:3])

In [None]:
# in case the scrape script fails part-way, report how many items each list
# holds (one count per line, in the order: date, title, comment, user, price,
# opinion)

print(*map(len, (date, title, comment, user, price, opinion)), sep='\n')

In [None]:
# assemble the lists filled by the scrape script into a dataframe,
# one column per list

df = pd.DataFrame(dict(
    date=date,
    user=user,
    title=title,
    comment=comment,
    opinion=opinion,
    price=price,
))

In [2]:
# load the saved scrape from the resources folder
df = pd.read_csv(filepath_or_buffer='resources/scrape1.csv')

In [5]:
# preview the first five rows of the loaded scrape
df.head(n=5)

Unnamed: 0,date,user,title,comment,opinion,price
0,Today 23:55,JiffyBag,RE: SASToday 23:55,"For me, it was just a fun exercise to do, and ...",No Opinion,Price: 13.80
1,Today 23:51,JiffyBag,RE: SASToday 23:51,"Hi Walkabout,I fully acknoweldege your rationa...",No Opinion,Price: 13.80
2,Today 23:46,rosso123,Newcrest pulling a fast oneToday 23:46,"NCM wants this 5% done and dusted asap, otherw...",No Opinion,Price: 13.80
3,Today 23:04,JiffyBag,RE: Why would u short thisToday 23:04,"Merc,It would be fantastic if someone could ac...",No Opinion,Price: 13.80
4,Today 22:53,Philbrim,Ups n downs.Today 22:53,We go down 5% and most people are so volatile....,No Opinion,Price: 13.80


## Minor Cleaning

In [6]:
# clean title column, removing the corresponding date info from the title.
# Each title is paired with the date of the same row via zip, so no integer
# row labels are looked up and the step works whatever the dataframe's index is.
df['title_clean'] = [
    row_title.replace(row_date, '')
    for row_title, row_date in zip(df['title'], df['date'])
]

In [7]:
# clean price column: strip the leading 'Price:' label and convert to float.
# The pattern is an explicit regex that accepts any amount of whitespace
# around the label, so the result does not depend on the exact number of
# spaces in the scraped text or on the pandas default for `regex`.
df['price'] = (
    df['price']
    .str.replace(r'^\s*Price:\s*', '', regex=True)
    .astype(float)
)

In [8]:
# remove the old, uncleaned title column from the dataframe in place
del df['title']

In [9]:
# put the columns back into the original layout, with title_clean taking
# the place of the dropped title column
df = df.loc[:, [
    'date',
    'user',
    'title_clean',
    'comment',
    'opinion',
    'price',
]]

In [10]:
# relabel the columns so that title_clean is called title again; the labels
# are applied by position, the other five names are unchanged
restored_header = ['date', 'user', 'title', 'comment', 'opinion', 'price']
df.columns = restored_header

In [11]:
# preview the first five rows after cleaning
df.head(n=5)

Unnamed: 0,date,user,title,comment,opinion,price
0,Today 23:55,JiffyBag,RE: SAS,"For me, it was just a fun exercise to do, and ...",No Opinion,13.8
1,Today 23:51,JiffyBag,RE: SAS,"Hi Walkabout,I fully acknoweldege your rationa...",No Opinion,13.8
2,Today 23:46,rosso123,Newcrest pulling a fast one,"NCM wants this 5% done and dusted asap, otherw...",No Opinion,13.8
3,Today 23:04,JiffyBag,RE: Why would u short this,"Merc,It would be fantastic if someone could ac...",No Opinion,13.8
4,Today 22:53,Philbrim,Ups n downs.,We go down 5% and most people are so volatile....,No Opinion,13.8


## Save cleaned scrape to csv

In [12]:
# write the cleaned dataframe to csv, leaving the index out of the file
# NOTE(review): this is the same path the notebook loads the scrape from, so
# the loaded file is overwritten with the cleaned data - confirm that is
# intended before re-running the cleaning cells.
df.to_csv(path_or_buf='resources/scrape1.csv', index=False)