# Web Scraping with [Python](https://www.python.org/) using [`BeautifulSoup`](https://www.crummy.com/software/BeautifulSoup/bs4/doc/) and [`requests`](https://requests.readthedocs.io/en/latest/)

The task is to scrape user reviews of a Beiersdorf product and analyse their content by generating a word cloud.

__Set up__

In [None]:
# Load IPython's autoreload extension and reload all modules before each cell
# runs, so edits to local modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

__Import Python libraries__

In [None]:
import sys
from bs4 import BeautifulSoup
import requests

**Add path to look for modules**

In [None]:
import sys

# Make the helper modules in ../src importable from this notebook.
# The membership check keeps the cell idempotent: re-running it does not
# pile up duplicate "../src" entries in sys.path.
if "../src" not in sys.path:
    sys.path.append("../src")

In [None]:
import helper_functions as hf

## Multiple (sub)websites

In [None]:
# Product page on beautyheaven.com.au whose user reviews are scraped below.
url = "https://www.beautyheaven.com.au/mum-baby/baby-skin/710-nivea-creme-nivea-creme"

In [None]:
# IPython introspection: `??` displays the source code of hf.generate_urls.
??hf.generate_urls

In [None]:
# Expand the product URL into the list of page URLs to fetch, then display it.
# pages=(1, 15) is presumably the (first, last) page range — see hf.generate_urls.
urls = hf.generate_urls(
    url=url,
    pages=(1, 15),
)
urls

## Fetching the content of several websites using `requests` and  `BeautifulSoup`

In [None]:
# Browser-like request headers sent with every page request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64;     x64; rv:66.0) Gecko/20100101 Firefox/66.0",
    "Accept-Encoding": "gzip, deflate",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "DNT": "1",
    "Connection": "close",
    "Upgrade-Insecure-Requests": "1",
}

# Accumulates the text extracted from every page, separated by single spaces.
# Reset here so that re-running the cell starts from an empty string.
reviews = ''

# The loop variable is deliberately not named `url`: that would overwrite the
# base product URL defined earlier, and re-running the URL-generation cell
# afterwards would then build its list from the last page URL instead.
for page_url in urls:
    print(f'Processing url {page_url}')

    # A timeout prevents the notebook from hanging forever on a stalled server.
    page = requests.get(page_url, headers=headers, timeout=30)

    # Do not parse error pages (4xx/5xx): their text would end up in the word cloud.
    if not page.ok:
        print(f'Skipping: server answered with HTTP status {page.status_code}.\n')
        continue

    soup = BeautifulSoup(page.content, 'html.parser')
    text = hf.extract_text_from_soup(soup)
    reviews = reviews + " " + text
    print(f'There are {len(text.split())} words extracted.\n')

In [None]:
# Uncomment to inspect the parsed HTML of the last page that was fetched:
#soup

## Generating a word cloud

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Build a word cloud with the library's default settings and fill it
# from the scraped review text.
wordcloud = WordCloud()
wordcloud.generate(reviews)

# Render the cloud as an image on a wide figure and hide the axes.
fig, ax = plt.subplots(figsize=(12, 6))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis("off");

### Playing around with wordclouds

In [None]:
#?WordCloud

In [None]:
# Configure a customised word cloud: white background, at most 60 words,
# and a fixed random_state so the layout is repeatable between runs.
wordcloud = WordCloud(
    background_color="white",
    max_words=60,
    mode="RGB",
    relative_scaling=0.5,
    random_state=42,
)
wordcloud.generate(reviews)

# Render the cloud as an image on a wide figure and hide the axes.
fig, ax = plt.subplots(figsize=(12, 6))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis("off");

#### Using masks

In [None]:
from PIL import Image
import numpy as np

# Read the mask image into an array; swap the file name to use another shape.
mask = np.array(Image.open("../data/images/world.png"))   #choose mask

# Configure a word cloud that is drawn inside the mask.
# NOTE: per the wordcloud documentation, width and height are ignored when a
# mask is supplied (the mask's shape is used instead); they are kept as-is.
wordcloud = WordCloud(
    background_color="white",
    mask=mask,
    max_words=150,
    width=800,
    height=400,
    mode="RGB",
    random_state=42,
)
wordcloud.generate(reviews)

# Render the cloud as an image on a taller figure and hide the axes.
fig, ax = plt.subplots(figsize=(12, 10))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis("off");

In [None]:
# Read the Hamburg outline into an array to use as the word cloud mask.
mask = np.array(Image.open("../data/images/Hamburg_Umriss.png"))   #choose mask

# Configure a word cloud that is drawn inside the mask.
# NOTE: per the wordcloud documentation, width and height are ignored when a
# mask is supplied (the mask's shape is used instead); they are kept as-is.
wordcloud = WordCloud(
    background_color="white",
    mask=mask,
    max_words=100,
    width=800,
    height=400,
    mode="RGB",
    random_state=42,
)
wordcloud.generate(reviews)

# Render the cloud as an image on a taller figure and hide the axes.
fig, ax = plt.subplots(figsize=(12, 10))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis("off");

***