## Understand Web Scraping 

### BeautifulSoup and Requests

In [151]:
from bs4 import BeautifulSoup
import requests
# url = 'https://www.scrapethissite.com/pages/forms/'
url = 'https://webscraper.io/test-sites/e-commerce/allinone/phones'

page = requests.get(url)
soup = BeautifulSoup(page.text, 'html')
# print(soup.prettify())


In [None]:
print(soup.title)
print(soup.title.string)
print(soup.title.parent.name)

In [None]:
print(soup.find("h4", {"class": "price float-end card-title pull-right"}))
print(soup.find("h4", class_="price float-end card-title pull-right"))

all_titles = soup.find_all("a", {"class": "title"})
print([titles.text for titles in all_titles])

all_prices = soup.find_all("h4", {"class": "price float-end card-title pull-right"})
print([prices.text for prices in all_prices])

In [None]:
import pandas as pd

# pd.DataFrame(all_prices, all_titles, columns=["Devices"])

data = {"Devices": all_prices, "Titles": all_titles}
df = pd.DataFrame(data)
df

### Scraping Data from Table

In [6]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [4]:
url = 'https://www.worldometers.info/world-population/population-by-country/'

page = requests.get(url)
soup = BeautifulSoup(page.text, "html")
# print(soup.prettify())

In [5]:
table = soup.find("table", class_="table table-striped table-bordered")
world_columns = table.find_all("th")
columns_titles = [title.text for title in world_columns]
print(columns_titles)

['#', 'Country (or dependency)', 'Population (2024)', 'Yearly Change', 'Net Change', 'Density (P/Km²)', 'Land Area (Km²)', 'Migrants (net)', 'Fert. Rate', 'Med. Age', 'Urban Pop %', 'World Share']


In [7]:
df = pd.DataFrame(columns=columns_titles)
df

Unnamed: 0,#,Country (or dependency),Population (2024),Yearly Change,Net Change,Density (P/Km²),Land Area (Km²),Migrants (net),Fert. Rate,Med. Age,Urban Pop %,World Share


In [8]:
world_rows = table.find_all('tr')
# skipping the first row which is the header
for row in world_rows[1:]:
    row_data = row.find_all('td')
    # print(row_data)
    individual_row_data = [data.text for data in row_data]
    # print(individual_row_data)

    length = len(df)
    df.loc[length] = individual_row_data
df

Unnamed: 0,#,Country (or dependency),Population (2024),Yearly Change,Net Change,Density (P/Km²),Land Area (Km²),Migrants (net),Fert. Rate,Med. Age,Urban Pop %,World Share
0,1,India,1450935791,0.89 %,12866195,488,2973190,-630830,2.0,28,37 %,17.78 %
1,2,China,1419321278,-0.23 %,-3263655,151,9388211,-318992,1.0,40,66 %,17.39 %
2,3,United States,345426571,0.57 %,1949236,38,9147420,1286132,1.6,38,82 %,4.23 %
3,4,Indonesia,283487931,0.82 %,2297864,156,1811570,-38469,2.1,30,59 %,3.47 %
4,5,Pakistan,251269164,1.52 %,3764669,326,770880,-1401173,3.5,20,34 %,3.08 %
...,...,...,...,...,...,...,...,...,...,...,...,...
229,230,Montserrat,4389,-0.70 %,-31,44,100,-7,1.4,42,11 %,0.00 %
230,231,Falkland Islands,3470,-0.20 %,-7,0,12170,-13,1.7,42,68 %,0.00 %
231,232,Tokelau,2506,4.55 %,109,251,10,72,2.6,27,0 %,0.00 %
232,233,Niue,1819,0.11 %,2,7,260,10,2.5,36,44 %,0.00 %


In [9]:
df.to_csv("/Users/koapam/Desktop/Coding_Workspace/Python_Workspace/example/webscraping/world_population.csv", index=False)

## Re Module & RegEx Methods

### Understand basic syntax

[Link to Regex Cheatsheet](https://www.rexegg.com/regex-quickstart.php)

In [None]:
import re
quote = "There's only one thing, I hate more than lying: skim milk. Which is water, that's lying about being milk. - Ron Swanson"

print(re.search("milk",quote).group())
print(re.findall("milk",quote))
print(len(re.findall("milk",quote)))
print(re.split("\\,",quote))
print(re.split("\\.", quote, maxsplit=1))
print(re.sub("milk", "dairy", quote, count = 1))

In [None]:
string = 'I like the mountains in the spring. 234098'
re.findall('[a-hA-H0-9]',string)

In [None]:
string = 'I have 123,456 koalas! And 12 dogs!'
re.findall("\\!", string)
re.findall('[0-4]', string)

In [None]:
string = 'I hate that I love lovely balloon animals. They are beautiful and lovely.'
re.findall('lovely|beautiful', string)

In [None]:
string = 'You can see sea shells by the sea shore. sba'
re.findall('s.e', string)
re.findall("s.{1}a", string)

In [None]:
string = "Well well well... if it isn't Will Wilmer"
re.findall('W.{2}l', string)

In [None]:
string = 'Happy birthday to you. Happy birthday to you. Happy birthday dear Alex, happy birthday to you.'
re.findall('^Hap', string)
re.findall('you.$', string)

In [None]:
string = 'This Thing called a Thimble ha Thrice hurt me'
re.findall('Thi.*e', string)
re.findall("Thi.+e", string)
re.findall("Thi.{3}?e", string)

In [None]:
text = "The event is on November 15th, 2023."
re.findall(r"\d+", text)

In [None]:
quote = 'My name is Neo. My phone number is 534-342-1234. My email is TheMatrixMan@gmail.com'

re.findall('[A-Za-z]$',quote)

re.findall('[0-4]{3}',quote)
re.findall('\\d{3}',quote)

re.findall('\\D',quote)
re.findall("\\W", quote)
re.findall("\\S", quote)

In [None]:
quote = 'I love the Matrix is MatrixLove a word?'
re.findall(r'\bMatrix',quote)
re.findall(r'\Batrix',quote)

In [None]:
string = "It's a BEAUTIFUL day in the city. It's 75 degrees and sunny!"
result = re.search(r"(\b[A-Z]+\b).*(\b\d+\b)", string)
print(result)
result.span()
result.start()
result.end()

print(result.groups())
result.group()
result.group(2)

In [None]:
string = "My name is Mr. Khoa Pham. My phone number is 123-456-7890. My email is KhoaPham@gmail.com. "
re.findall(r"\b[a-z]+\b", string, re.I)
# re.findall(r"\b[a-zA-Z]+\b", string)


In [None]:
target_string = "I like ice cream. \nI like pie as well!"
target_string2 = """Alex likes the number 16
Christine likes the number 24
"""

re.findall(r"^\w{4}", target_string2)
re.findall(r"^\w{4}", target_string2, re.M)
re.findall(r"\d{2}$", target_string2, re.M)

result = re.search(r".+", target_string, re.S)
print(result.group())


### Regex Use Cases

In [None]:
import re

random_text = '''
My name is Mr. Neo. My phone number is 123-456-7890. My email is ChosenOne@gmail.com
My name is Mr. Morphius. My phone number is 413-234-2568. My email is Cool_Guy@yahoo.com.
My name is Mrs. Trinity. My phone number is 285-036-8215. My email is ChosenTwo-Girl2@apple-2.com.
'''


print(re.findall("@[a-z]+", random_text))
print(re.findall("@([a-z]+)", random_text))

# \\w for alphanumeric characters [a-zA-Z0-9_]
print(re.findall("@([\\w\\.]+)", random_text))
print(re.findall("[\\w]+@[\\w\\.]+", random_text))

In [None]:
print(re.findall("\\d{3}-\\d{3}-\\d{4}", random_text))

# [\w\.-]+: Matches one or more word characters (\w), dots (.), or hyphens (-) (for the local part of the email).
# @: Matches the @ symbol.
# \.\w+: Matches the dot (.) followed by one or more word characters (for the domain suffix, like .com).
email_list = re.findall("[\\w\\.-]+@[\\w\\.-]+\\.\\w+", random_text)
print(email_list)

domain_list = []
for email in email_list:
    # domains.append(re.findall("@([\\w\\.]+)", email))
    domain_list.append(re.findall("@([\\w\\.]+)", email)[0])
print(domain_list)

# domain_list2 = [re.findall("@([\\w\\.]+)", email)[0] for email in email_list]
# print(domain_list2)


In [None]:
phone_numbers = [
    "(123) 456-7890",
    "(555) 123-4567",
    "(987) 654-3210",
    "(800) 555-1212",
    "(123) 456-7890",
]

area_code_pattern = r"\((\d{3})\)"
area_codes = []

# Loop through each phone number and extract area codes:
for phone_number in phone_numbers:
    match = re.search(area_code_pattern, phone_number)
    if match:
        area_codes.append(match.group(1))

print(area_codes)


## Web Scraping + RegEx + Pandas

In [28]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re

In [47]:
url = "http://www.analytictech.com/mb021/mlk.htm"
page = requests.get(url)
soup = BeautifulSoup(page.text, "html")
# print(soup.prettify())


mlkj_speech = soup.find_all("p")
# print(type(mlkj_speech))

speech_combined = [p.text for p in mlkj_speech]
# print(type(speech_combined))
# print(speech_combined[: 10])
string_speech = ' '.join(speech_combined)
# print(string_speech[: 300])

string_speech_cleaned = string_speech.replace("\r\n", " ")
# print(string_speech_cleaned)

# speech_no_punt = re.sub("[^\\w\\s]", " ", string_speech_cleaned)
# no_punt_lower = speech_no_punt.lower()

speech_no_punt = re.sub("[^\\w\\s']", " ", string_speech_cleaned)  # Keep apostrophes
no_punt_lower = speech_no_punt.lower()

# print(speech_no_punt)
# print(no_punt_lower)
words_broken = re.split("\\s+", no_punt_lower)
print(type(words_broken))
print(len(words_broken))

<class 'list'>
883


In [52]:
# Avoid using this way because you'll need to use reset_index() to get the words and counts in separate columns
df = pd.DataFrame(words_broken).value_counts()
df.to_csv("/Users/koapam/Desktop/Coding_Workspace/Python_Workspace/example/webscraping/mlkj_speech1.csv", header = ["Counts"], index_label="Words")


df = df.reset_index()
df.columns = ["Words", "Count"]
# Once you reset, notice there's no (s) in 'Count'
# Now you can filter
df_filtered = df[df["Count"] > 25]
print(df_filtered)

  Words  Count
0   the     54
1    of     49
2    to     29
3   and     27


In [50]:
# Convert the list to a pandas Series
words_series = pd.Series(words_broken)
word_counts = words_series.value_counts()

# Convert to DataFrame with two columns: "words" and "counts"
df = pd.DataFrame({"Words": word_counts.index, "Counts": word_counts.values})

df.to_csv("/Users/koapam/Desktop/Coding_Workspace/Python_Workspace/example/webscraping/mlkj_speech2.csv", index=False)

df_filtered = df[df["Counts"] > 25]
print(df_filtered)


  Words  Counts
0   the      54
1    of      49
2    to      29
3   and      27


## Automated Web Scraper

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
from datetime import datetime
import os
import time

In [None]:
def automated_crypto_pull():
    url = 'https://coinmarketcap.com/currencies/bitcoin/'
    page = requests.get(url)
    soup = BeautifulSoup(page.text, "html")
    # print(soup.prettify())


    crypto_name = soup.find("span", {"data-role": "coin-name"})["title"]
    # print(crypto_name)
    cp = soup.find("span", class_="sc-65e7f566-0 WXGwg base-text").text
    crypto_price = cp.replace("$", "")
    # print(crypto_price)


    date_time = datetime.now()
    # print(date_time)

    dict = {"Crypto Name": crypto_name, "Crypto Price": crypto_price, "TimeStamp": date_time}
    df = pd.DataFrame([dict])


    if os.path.exists("/Users/koapam/Desktop/Coding_Workspace/Python_Workspace/example/webscraping/crypto_price.csv"):
        print("File exists")
        df.to_csv("/Users/koapam/Desktop/Coding_Workspace/Python_Workspace/example/webscraping/crypto_price.csv", mode='a', header=False, index=False)
        print(df)
    else:
        df.to_csv("/Users/koapam/Desktop/Coding_Workspace/Python_Workspace/example/webscraping/crypto_price.csv", index=False)


while True:
    automated_crypto_pull()
    time.sleep(10)

File exists
  Crypto Name Crypto Price                  TimeStamp
0     Bitcoin    66,368.73 2024-10-23 14:56:32.293498
File exists
  Crypto Name Crypto Price                  TimeStamp
0     Bitcoin    66,368.73 2024-10-23 14:56:42.459971
File exists
  Crypto Name Crypto Price                  TimeStamp
0     Bitcoin    66,368.73 2024-10-23 14:56:52.644207
File exists
  Crypto Name Crypto Price                  TimeStamp
0     Bitcoin    66,368.73 2024-10-23 14:57:02.835004
File exists
  Crypto Name Crypto Price                  TimeStamp
0     Bitcoin    66,375.43 2024-10-23 14:57:12.957403


KeyboardInterrupt: 