# Webscrape

### Import Packages

In [None]:
from random import randint
from time import sleep
import requests
import bs4
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
from datetime import datetime

## Scraping Script

The scraping script below is to be updated with the appropriate first and last page parameters ahead of each scrape attempt. The script will do the following:
- iterate through each page url with the page numbers between the first and last page parameters (int)
- sleep for a random period of 2, 3 or 4 seconds. This step may not be necessary, but was introduced as good practice to avoid any issue that may arise from excessive page requests from a single url.
- parse page html text
- iterate through each of the page's 25 comments, grabbing the target html objects:
    - date 
    - title 
    - comments 
    - content info (containing the user name, price and opinion)
- append each of the 25 comments attributes to the lists initialised at the start
- once all pages between the first and last have been iterated through, each list will be aggregated to a single dataframe

In [None]:
# Placeholder for the page url. The scraping loop calls url.format(page), so
# the real value needs a '{}' field where the page number should be inserted.
url = 'url'

In [None]:
# Accumulators for the scraped attributes. They live at the top level so that,
# if the loop stops part-way, whatever was captured so far is still available.
date = []
title = []
comment = []
user = []
price = []
opinion = []
last_page = 10
first_page = 1

# last_page is inclusive, hence the + 1 on the range end
for page in range(first_page, last_page + 1):
    # random pause of 2, 3 or 4 seconds between page requests
    sleep(randint(2, 4))
    # the timeout stops a stalled connection from blocking the scrape forever
    r = requests.get(url.format(page), timeout=30)
    # stop on an HTTP error instead of silently parsing an error page
    r.raise_for_status()
    pagesoup = BeautifulSoup(r.text, 'html.parser')
    dates = [i.text for i in pagesoup.find_all(class_='share-chat-message__status-bar-time')]
    titles = [i.text for i in pagesoup.find_all(class_='share-chat-message__status-bar')]
    comments = [i.text for i in pagesoup.find_all(class_='share-chat-message__message-text')]
    content_info = [i.text for i in pagesoup.find_all(class_='share-chat-message__content-info-box')]

    date.extend(dates)
    title.extend(titles)
    comment.extend(comments)

    # content_info is read as consecutive (user, price, opinion) entries.
    # zip stops at the shortest slice, so a trailing incomplete group is
    # skipped rather than raising IndexError half-way through the appends.
    for user_text, price_text, opinion_text in zip(content_info[0::3],
                                                   content_info[1::3],
                                                   content_info[2::3]):
        user.append(user_text)
        price.append(price_text)
        opinion.append(opinion_text)

# aggregate the lists into a single dataframe (pandas raises ValueError if the
# lists ended up with different lengths)
df = pd.DataFrame({'date':date, 
                   'user':user, 
                   'title':title, 
                   'comment':comment, 
                   'opinion':opinion, 
                   'price':price})

In [None]:
# in case the scrape script fails, see how much has been captured in each list

for captured in (date, title, comment, user, price, opinion):
    print(len(captured))

In [None]:
# in case the scrape script fails, build the dataframe from whatever the lists
# held at the point the script stopped

df = pd.DataFrame(dict(date=date,
                       user=user,
                       title=title,
                       comment=comment,
                       opinion=opinion,
                       price=price))

In [None]:
# preview the first rows of the scraped dataframe
df.head()

## Preliminary Cleaning

The date column labels the current day as "Today" and the 5 previous days by weekday only (the day of month is omitted). Scrapes that include the current date or the days just before it must therefore be pre-cleaned before concatenating, to account for the different scrape dates.

In [None]:
# clean title column, removing the corresponding date info from the title
# (the two columns are paired row by row by position, so this does not depend
# on the dataframe having a default 0..n-1 index)
df['title_clean'] = [raw_title.replace(raw_date, '')
                     for raw_title, raw_date in zip(df['title'], df['date'])]

In [None]:
# cleaning up date column - the relative labels ('Today' and the weekday names
# of the days before it) first need calendar dates relative to the scrape date

(Today, Mon, Sun, Sat, Fri, Thu) = (
    '22 Mar 2022',
    '21 Mar 2022',
    '20 Mar 2022',
    '19 Mar 2022',
    '18 Mar 2022',
    '17 Mar 2022',
)

############################
## UPDATE FOR EACH SCRAPE ##
############################

In [None]:
# swap every relative label in the date column for its calendar date

relative_dates = (
    ('Today', Today),
    ('Mon', Mon),
    ('Sun', Sun),
    ('Sat', Sat),
    ('Fri', Fri),
    ('Thu', Thu),
)
for label, calendar_date in relative_dates:
    # rows whose date text contains the label are overwritten with the date
    df.loc[df['date'].str.contains(label), 'date'] = calendar_date

############################
## UPDATE FOR EACH SCRAPE ##
############################

In [None]:
# clean price column: strip the 'Price:  ' label, then cast the rest to float
price_text = df['price'].str.replace('Price:  ', '')
df['price'] = price_text.astype(float)

In [None]:
# parse the cleaned date strings into a new datetime column
df['datetime'] = pd.to_datetime(df['date'])

In [None]:
# remove the previous, uncleaned title column in place
del df['title']

In [None]:
# put the columns back in the original layout; only the columns listed here
# are kept, so the 'datetime' column is not carried forward
column_order = ['date', 'user', 'title_clean', 'comment', 'opinion', 'price']
df = df.loc[:, column_order]

In [None]:
# rename title_clean column back to title; renaming by name leaves every other
# label alone, so it cannot mislabel data if the column order or count changes
df = df.rename(columns={'title_clean': 'title'})

## Save cleaned scrape to csv

In [None]:
# save to csv
# df.to_csv('resources/scrape1.csv', index=False)