# Python Tutorial: Easy Python Projects

By: jcchouinard.com

-----

## Simple Python Projects

1. Read and store CSVs 
2. Web Scraping
3. Parse text with Regular Expressions
4. Parse URLs
5. Find Entities in Content with NLP
6. Wikipedia and Reddit APIs
7. Read sitemap


`$ pip3 install pandas advertools requests bs4`


## Web Scraping

In [15]:
import requests
from bs4 import BeautifulSoup

# Fetch URL. The timeout (in seconds) keeps the cell from hanging forever
# when the server never answers; requests has no timeout by default.
r = requests.get('https://www.scrapethissite.com/pages/simple/', timeout=10)

# Parse HTML with Python's built-in 'html.parser' backend
soup = BeautifulSoup(r.text, 'html.parser')

# First <title> tag of the page; as the last expression of the cell,
# the notebook displays it.
title = soup.find('title')
title

<title>Countries of the World: A Simple Example | Scrape This Site | A public sandbox for learning web scraping</title>

In [24]:
# Every <h3> tag found in the parsed page
h3s = soup.find_all('h3')

# Text of each heading with surrounding whitespace removed
results = [heading.text.strip() for heading in h3s]

# Preview the first ten entries
results[:10]

['Andorra',
 'United Arab Emirates',
 'Afghanistan',
 'Antigua and Barbuda',
 'Anguilla',
 'Albania',
 'Armenia',
 'Angola',
 'Antarctica',
 'Argentina']

In [17]:
def get_title(url, timeout=10):
    """Fetch a page and print the text of its <title> tag.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before requests gives up.

    Prints an empty line when the page contains no <title> tag.
    """
    # Without a timeout, requests waits indefinitely on a stalled server.
    r = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(r.text, 'html.parser')
    title = soup.find('title')
    # find() returns None when nothing matches; guard before reading .text
    print(title.text if title is not None else '')

In [25]:
import requests
from bs4 import BeautifulSoup

def get_title(url, timeout=10):
    """Fetch a page, print the text of its <title> tag and return it.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before requests gives up.

    Returns:
        The text of the first <title> tag, or an empty string when the
        page contains no <title> tag.
    """
    # Without a timeout, requests waits indefinitely on a stalled server.
    r = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(r.text, 'html.parser')
    title = soup.find('title')
    # find() returns None when nothing matches; guard before reading .text
    title_text = title.text if title is not None else ''
    print(title_text)
    return title_text

# Pages whose titles we want to collect
urls = [
    'https://www.scrapethissite.com/pages/ajax-javascript/',
    'https://www.scrapethissite.com/pages/forms/',
    'https://www.scrapethissite.com/pages/simple/'
]

# Call get_title() once per URL, in order, keeping each returned title
titles = [get_title(page_url) for page_url in urls]

Oscar Winning Films: AJAX and Javascript | Scrape This Site | A public sandbox for learning web scraping
Hockey Teams: Forms, Searching and Pagination | Scrape This Site | A public sandbox for learning web scraping
Countries of the World: A Simple Example | Scrape This Site | A public sandbox for learning web scraping


## Parse Text with Regular Expressions

In [37]:
import re 

titles = [
    'Oscar Winning Films: AJAX and Javascript | Scrape This Site | A public sandbox for learning web scraping',
    'Hockey Teams: Forms, Searching and Pagination | Scrape This Site | A public sandbox for learning web scraping',
    'Countries of the World: A Simple Example | Scrape This Site | A public sandbox for learning web scraping'
    ]

# Group 1: everything before the colon.
# Group 2: the run of word characters, whitespace or commas between the colon
# and the next pipe.
# Written as a raw string so that \w, \s and \| reach the regex engine
# untouched instead of being treated as (invalid) Python string escapes.
pattern = r'(.*):([\w\s,]+)\|.*'

for title in titles:
    extracted_text = re.search(pattern, title)
    # Print both captured groups as a tuple, trimmed of surrounding whitespace
    print((
            extracted_text.group(1).strip(),
            extracted_text.group(2).strip()
        ))

('Oscar Winning Films', 'AJAX and Javascript')
('Hockey Teams', 'Forms, Searching and Pagination')
('Countries of the World', 'A Simple Example')


## Parse URLs

In [74]:
from urllib.parse import urlparse, parse_qs

# URL used for the demonstration
url = "https://www.example.com/path/page?param1=value1&param2=value2#fragment"

# Split the URL into its named parts
parsed_url = urlparse(url)
scheme, host, path = parsed_url.scheme, parsed_url.netloc, parsed_url.path
query, fragment = parsed_url.query, parsed_url.fragment

# Turn the raw query string into a dict of parameter name -> list of values
query_params = parse_qs(query)

# Report everything that was extracted
print("Original URL:", url)
print("\nParsed Components:")
for label, value in (
    ("Scheme:", scheme),
    ("Host:", host),
    ("Path:", path),
    ("Query:", query),
    ("Fragment:", fragment),
):
    print(label, value)

print("\nParsed Query Parameters:")
print(query_params)

Original URL: https://www.example.com/path/page?param1=value1&param2=value2#fragment

Parsed Components:
Scheme: https
Host: www.example.com
Path: /path/page
Query: param1=value1&param2=value2
Fragment: fragment

Parsed Query Parameters:
{'param1': ['value1'], 'param2': ['value2']}


In [75]:
from urllib.parse import urljoin

# Resolve a root-relative path and an already-absolute URL against the same
# base address and print each result.
base = 'https://www.example.com/'
for target in ('/relative-path', 'https://www.example.com/relative-path'):
    print(urljoin(base, target))

https://www.example.com/relative-path
https://www.example.com/relative-path


## Permutations

In [70]:
import pandas as pd 

# Accommodation keywords
hotel_keywords = ['hotels', 'B&Bs', 'motels', 'lodges', 'villas']

# Star ratings: '3-stars', '4-stars', '5-stars'
hotel_classes = [f'{stars}-stars' for stars in (3, 4, 5)]

# Qualifiers: the two price tiers followed by the star ratings
hotel_types = ['cheap', 'luxury']
hotel_types.extend(hotel_classes)

# Target cities
geos = ['Boston', 'New-York', 'Las Vegas']

# One string per keyword / qualifier / city combination; the city varies
# fastest, then the qualifier, then the keyword.
results = [
    f'{hotel}: {hotel_type} {hotel} in {geo}'
    for hotel in hotel_keywords
    for hotel_type in hotel_types
    for geo in geos
]

# Single-column DataFrame holding the generated strings
df = pd.DataFrame(results)
df.head()

Unnamed: 0,0
0,hotels: cheap hotels in Boston
1,hotels: cheap hotels in New-York
2,hotels: cheap hotels in Las Vegas
3,hotels: luxury hotels in Boston
4,hotels: luxury hotels in New-York


## Manipulate DataFrames

In [71]:
# Walk through a series of column transformations on df, printing the first
# two rows after each step so the effect of every operation is visible.
print(df.head(2))

# Lowercase every string in column 0
df[0] = df[0].str.lower()
print(df.head(2))

# Split each string on ':' into separate columns (expand=True returns a
# DataFrame). Whitespace next to the colon is kept as-is, which is why the
# URL step further down strips query_cat.
df = df[0].str.split(':',expand=True)
print(df.head(2))

# Give the two resulting columns descriptive names
df.columns = ['hotel_type','query']
print(df.head(2))

# Regex extract: text before ' in ' goes to query_cat, text after it to geo.
# The first group is greedy, so the split happens at the last ' in '.
df[['query_cat','geo']] = df['query'].str.extract(r'(.*) in (.*)',expand=True)
print(df.head(2))

# Feature engineering: build a URL shaped like <domain><geo>/<query_cat>,
# replacing spaces with hyphens in both parts
domain = 'https://example.com/'
df['url'] = domain + df['geo'].str.replace(' ','-') + '/' + df['query_cat'].str.strip().str.replace(' ','-')

# Display the first ten rows of the final DataFrame
df.head(10)

                                  0
0    hotels: cheap hotels in Boston
1  hotels: cheap hotels in New-York
                                  0
0    hotels: cheap hotels in boston
1  hotels: cheap hotels in new-york
        0                          1
0  hotels     cheap hotels in boston
1  hotels   cheap hotels in new-york
  hotel_type                      query
0     hotels     cheap hotels in boston
1     hotels   cheap hotels in new-york
  hotel_type                      query      query_cat       geo
0     hotels     cheap hotels in boston   cheap hotels    boston
1     hotels   cheap hotels in new-york   cheap hotels  new-york


Unnamed: 0,hotel_type,query,query_cat,geo,url
0,hotels,cheap hotels in boston,cheap hotels,boston,https://example.com/boston/cheap-hotels
1,hotels,cheap hotels in new-york,cheap hotels,new-york,https://example.com/new-york/cheap-hotels
2,hotels,cheap hotels in las vegas,cheap hotels,las vegas,https://example.com/las-vegas/cheap-hotels
3,hotels,luxury hotels in boston,luxury hotels,boston,https://example.com/boston/luxury-hotels
4,hotels,luxury hotels in new-york,luxury hotels,new-york,https://example.com/new-york/luxury-hotels
5,hotels,luxury hotels in las vegas,luxury hotels,las vegas,https://example.com/las-vegas/luxury-hotels
6,hotels,3-stars hotels in boston,3-stars hotels,boston,https://example.com/boston/3-stars-hotels
7,hotels,3-stars hotels in new-york,3-stars hotels,new-york,https://example.com/new-york/3-stars-hotels
8,hotels,3-stars hotels in las vegas,3-stars hotels,las vegas,https://example.com/las-vegas/3-stars-hotels
9,hotels,4-stars hotels in boston,4-stars hotels,boston,https://example.com/boston/4-stars-hotels


## Read and Store CSVs


In [73]:
from pathlib import Path

# to_csv() does not create missing folders, so make sure 'files/' exists
csv_path = Path('files/example_file.csv')
csv_path.parent.mkdir(parents=True, exist_ok=True)

# Store the DataFrame as CSV (the index is written as the first column)
df.to_csv(csv_path)

# Read the file back, using that first column as the index again
a_csv = pd.read_csv(csv_path, index_col=0)
a_csv

Unnamed: 0,hotel_type,query,query_cat,geo,url
0,hotels,cheap hotels in boston,cheap hotels,boston,https://example.com/boston/cheap-hotels
1,hotels,cheap hotels in new-york,cheap hotels,new-york,https://example.com/new-york/cheap-hotels
2,hotels,cheap hotels in las vegas,cheap hotels,las vegas,https://example.com/las-vegas/cheap-hotels
3,hotels,luxury hotels in boston,luxury hotels,boston,https://example.com/boston/luxury-hotels
4,hotels,luxury hotels in new-york,luxury hotels,new-york,https://example.com/new-york/luxury-hotels
...,...,...,...,...,...
70,villas,4-stars villas in new-york,4-stars villas,new-york,https://example.com/new-york/4-stars-villas
71,villas,4-stars villas in las vegas,4-stars villas,las vegas,https://example.com/las-vegas/4-stars-villas
72,villas,5-stars villas in boston,5-stars villas,boston,https://example.com/boston/5-stars-villas
73,villas,5-stars villas in new-york,5-stars villas,new-york,https://example.com/new-york/5-stars-villas


## Conclusion

Help me and subscribe to this channel.

Stay tuned for my upcoming Python for SEO course.

### [jcchouinard.com](https://www.jcchouinard.com/)
### [youtube.com/@jcchouinard](https://www.youtube.com/@jcchouinard)
### [twitter.com/ChouinardJC](https://twitter.com/ChouinardJC)
### [linkedin.com/in/jeanchristophechouinard](https://www.linkedin.com/in/jeanchristophechouinard)
