In [None]:
# Mount Google Drive at /content/drive; the dataset paths read and written
# by later cells all live under this mount point.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
from requests import get
from bs4 import BeautifulSoup
from tqdm import tqdm
import pandas as pd

# Scraping of the HTML elements

In [None]:
# The URL to scrape the list of phrases from
url = 'https://www.phrases.org.uk/meanings/phrases-and-sayings-list.html'

# Send the GET request. The import cell binds `get` directly
# (`from requests import get`), so the bare module name `requests` is undefined.
response = get(url)

# Fail loudly on an HTTP error instead of silently parsing an error page
response.raise_for_status()

# Parse the page into a BeautifulSoup object (the class is imported by name,
# so it must not be qualified with `bs4.`)
html_soup = BeautifulSoup(response.text, 'html.parser')

In [None]:
# Each phrase on the index page sits in its own <p class="phrase-list"> element
quotes = html_soup.find_all(name='p', attrs={'class': 'phrase-list'})
size = len(quotes)
print(size)
quotes[:5]

2390


[<p class="phrase-list"><a href="a-bird-in-the-hand.html">A bird in the hand is worth two in the bush</a></p>,
 <p class="phrase-list"><a href="70700.html">A bolt from the blue</a></p>,
 <p class="phrase-list"><a href="a-bunch-of-fives.html">A bunch of fives</a></p>,
 <p class="phrase-list"><a href="the-weakest-link.html">A chain is only as strong as its weakest link</a></p>,
 <p class="phrase-list"><a href="a-change-is-as-good-as-a-rest.html">A change is as good as a rest</a></p>]

In [None]:
# Extract the plain-text phrase from every scraped <p> element.
# Iterating over `quotes` directly (instead of indexing with the separately
# stored `size`) keeps this cell correct even if `size` is stale.
cleaned_quotes = [quote.text for quote in quotes]
print(cleaned_quotes[:5])
len(cleaned_quotes)

['A bird in the hand is worth two in the bush', 'A bolt from the blue', 'A bunch of fives', 'A chain is only as strong as its weakest link', 'A change is as good as a rest']


2390

In [None]:
# Link target of the <a> inside each phrase element; iterating over `quotes`
# directly avoids depending on the separately stored `size`.
href_quotes = [quote.a['href'] for quote in quotes]

In [None]:
# Base URL that the phrase links are appended to
BASE_LINK = 'https://www.phrases.org.uk/meanings/'

def get_explanations(url, timeout=30):
    """Fetch a phrase's detail page and return its explanation text.

    Parameters
    ----------
    url : str
        Link of the phrase page, appended to ``BASE_LINK``
        (e.g. ``'a-bunch-of-fives.html'``).
    timeout : float, optional
        Seconds to wait for the server before giving up, so that a single
        stalled request cannot hang the whole scraping loop.

    Returns
    -------
    str
        Text of the first ``<p class="meanings-body">`` element on the page,
        or ``"NO INFORMATION"`` when the page contains no such element.

    Notes
    -----
    Exceptions raised by the HTTP request (connection failure, timeout) are
    not caught here and propagate to the caller.
    """
    response = get(BASE_LINK + url, timeout=timeout)
    page_soup = BeautifulSoup(response.text, 'html.parser')

    # Only the first match is used, so `find` avoids collecting every match.
    first_meaning = page_soup.find('p', class_='meanings-body')
    if first_meaning is None:
        return "NO INFORMATION"

    return str(first_meaning.text)

In [None]:
%%time
# Populate an array with explanations
number_of_quotes = size
assert number_of_quotes < len(quotes) + 1

explanations = [get_explanations(i) for i in tqdm(href_quotes[:number_of_quotes])]

100%|██████████| 2390/2390 [06:18<00:00,  6.32it/s]

CPU times: user 1min 47s, sys: 3.27 s, total: 1min 50s
Wall time: 6min 18s





# Data Integration

In [None]:
# Assemble the proverbs-and-sayings table in a single constructor call:
# the fetched explanation first, then the plain text of the phrase.
quotes_dataframe = pd.DataFrame({
    'explanation': explanations,
    'text': [tag.text for tag in quotes[:number_of_quotes]],
})
quotes_dataframe.head()

Unnamed: 0,explanation,text
0,The proverb 'A bird in the hand is worth two i...,A bird in the hand is worth two in the bush
1,"A\ncomplete surprise, like\na bolt of lightnin...",A bolt from the blue
2,'A bunch of fives' is a slang term for\n ...,A bunch of fives
3,The proverb 'A chain is only as strong as its ...,A chain is only as strong as its weakest link
4,A change is as good as a rest is a proverb th...,A change is as good as a rest


In [None]:
# Save the scraped sayings to Google Drive, leaving out the index column
quotes_dataframe.to_csv(
    path_or_buf='/content/drive/MyDrive/SloganGenerator/dataset/sayings.csv',
    index=False,
)

In [None]:
# Read the saved sayings back in and load the slogans dataset from the same folder
sayings_dataset = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/SloganGenerator/dataset/sayings.csv'
)
slogan_dataset = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/SloganGenerator/dataset/slogans.csv'
)

In [None]:
# Row counts of the two datasets, one per line (sayings first, then slogans)
print(len(sayings_dataset), len(slogan_dataset), sep='\n')

2390
9519


In [None]:
# Preview the first five slogan rows
slogan_dataset.head(n=5)

Unnamed: 0,company,slogan
0,"Eggland’s Best, farm fresh eggs",Better taste. Better nutrition. Better eggs.
1,"Eggland’s Best, farm fresh eggs",It's EB.
2,"Egg Beaters products, egg whites separated fro...",The real thing. Only better.
3,"Egg Beaters products, egg whites separated fro...",Egg Beaters. Unbeatable.
4,"Egg Beaters products, egg whites separated fro...",The egg perfected.


In [None]:
# Preview the first five sayings rows
sayings_dataset.head(n=5)

Unnamed: 0,explanation,text
0,The proverb 'A bird in the hand is worth two i...,A bird in the hand is worth two in the bush
1,"A\ncomplete surprise, like\na bolt of lightnin...",A bolt from the blue
2,'A bunch of fives' is a slang term for\n ...,A bunch of fives
3,The proverb 'A chain is only as strong as its ...,A chain is only as strong as its weakest link
4,A change is as good as a rest is a proverb th...,A change is as good as a rest


In [None]:
# Give the sayings the same column names as the slogans dataset
# (explanation -> company, text -> slogan) so the two frames can be stacked.
sayings_dataset = sayings_dataset.rename(
    columns={"explanation": "company", "text": "slogan"}
)
sayings_dataset.head()

Unnamed: 0,company,slogan
0,The proverb 'A bird in the hand is worth two i...,A bird in the hand is worth two in the bush
1,"A\ncomplete surprise, like\na bolt of lightnin...",A bolt from the blue
2,'A bunch of fives' is a slang term for\n ...,A bunch of fives
3,The proverb 'A chain is only as strong as its ...,A chain is only as strong as its weakest link
4,A change is as good as a rest is a proverb th...,A change is as good as a rest


In [None]:
# Stack the slogans and the renamed sayings into one frame (slogans first)
frames = [slogan_dataset, sayings_dataset]

result = pd.concat(objs=frames)

print(result.shape[0])
result.head()

11909


Unnamed: 0,company,slogan
0,"Eggland’s Best, farm fresh eggs",Better taste. Better nutrition. Better eggs.
1,"Eggland’s Best, farm fresh eggs",It's EB.
2,"Egg Beaters products, egg whites separated fro...",The real thing. Only better.
3,"Egg Beaters products, egg whites separated fro...",Egg Beaters. Unbeatable.
4,"Egg Beaters products, egg whites separated fro...",The egg perfected.


In [None]:
# Shuffle all rows so slogans and sayings are interleaved, then renumber the index.
# A fixed seed makes the shuffle reproducible from one run to the next.
SHUFFLE_SEED = 42
result = result.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)
print(len(result))
result.head()

11909


Unnamed: 0,company,slogan
0,Macungie Animal Hospital,"Come, sit, heal."
1,Aventis Pharma in India,Our challenge is life.
2,"Melrose Cheestrings, cheese snack for kids",Real cheese. Real fun.
3,"V8, vegetable juice brand",It Might Just Make You Feel Better
4,Canada Factoring,True funding solutions.


In [None]:
# Save the shuffled, merged dataset to Google Drive, leaving out the index column
result.to_csv(
    path_or_buf='/content/drive/MyDrive/SloganGenerator/dataset/merged.csv',
    index=False,
)