In [132]:
# Import libraries
import requests
import pandas as pd
from bs4 import BeautifulSoup

In [133]:
# Target URL
url = "https://joshmadison.com/2008/04/20/fortune-cookie-fortunes/"

page = requests.get(url)

soup = BeautifulSoup(page.text, "html")

In [134]:
# Display the html content
print(soup)

<!DOCTYPE html>
<!--[if lt IE 9]> <html class="old-ie" lang="en"> <!![endif]--><!--[if gte IE 9|!(IE)]><!--><html lang="en"> <!--<![endif]-->
<head>
<meta charset="utf-8"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<meta content="40.780856, -73.948175" name="ICBM"/>
<title>Fortune Cookie Fortunes – joshmadison.com</title>
<link href="/apple-touch-icon-57x57.png" rel="apple-touch-icon-precomposed" sizes="57x57"/>
<link href="/apple-touch-icon-114x114.png" rel="apple-touch-icon-precomposed" sizes="114x114"/>
<link href="/apple-touch-icon-72x72.png" rel="apple-touch-icon-precomposed" sizes="72x72"/>
<link href="/apple-touch-icon-144x144.png" rel="apple-touch-icon-precomposed" sizes="144x144"/>
<link href="/apple-touch-icon-60x60.png" rel="apple-touch-icon-precomposed" sizes="60x60"/>
<link href="/apple-touch-icon-120x120.png" rel="apple-touch-icon-precomposed" sizes="120x120"/>
<link href="/apple-touch-icon

In [135]:
# Find the <li> elements
soup.find_all("li")

[<li><a class="first" href="/about/">About</a></li>,
 <li><a href="/archives/">Archives</a></li>,
 <li><a href="/featured/">Featured</a></li>,
 <li><a href="/photography/">Photos</a></li>,
 <li><a href="/convert-for-windows/">Convert</a></li>,
 <li>A beautiful, smart, and loving person will be coming into your life.</li>,
 <li>A dubious friend may be an enemy in camouflage.</li>,
 <li>A faithful friend is a strong defense.</li>,
 <li>A feather in the hand is better than a bird in the air. (2)</li>,
 <li>A fresh start will put you on your way.</li>,
 <li>A friend asks only for your time not your money.</li>,
 <li>A friend is a present you give yourself.</li>,
 <li>A funny coincidence will make your day.</li>,
 <li>A gambler not only will lose what he has, but also will lose what he doesn’t have.</li>,
 <li>A golden egg of opportunity falls into your lap this month.</li>,
 <li>A good friendship is often more important than a passionate romance.</li>,
 <li>A good time to finish up old tas

In [136]:
# Find all <li> without <a> tags
fortunes_text = []
for li in soup.find_all("li"):
    if not li.find("a"):
        fortunes_text.append(li.get_text(strip=True))

In [137]:
# Display the fortunes text in the list
print(fortunes_text)

['A beautiful, smart, and loving person will be coming into your life.', 'A dubious friend may be an enemy in camouflage.', 'A faithful friend is a strong defense.', 'A feather in the hand is better than a bird in the air. (2)', 'A fresh start will put you on your way.', 'A friend asks only for your time not your money.', 'A friend is a present you give yourself.', 'A funny coincidence will make your day.', 'A gambler not only will lose what he has, but also will lose what he doesn’t have.', 'A golden egg of opportunity falls into your lap this month.', 'A good friendship is often more important than a passionate romance.', 'A good time to finish up old tasks. (2)', 'A hunch is creativity trying to tell you something.', 'A lifetime friend shall soon be made.', 'A lifetime of happiness lies ahead of you.', 'A light heart carries you through all the hard times.', 'A new outlook brightens your image and brings new friends.', 'A new perspective will come with the new year. (2)', 'A person 

In [138]:
# Convert the list to a DataFrame
df = pd.DataFrame(fortunes_text, columns=["Fortune"])

In [139]:
# View the DataFrame
df.head()

Unnamed: 0,Fortune
0,"A beautiful, smart, and loving person will be ..."
1,A dubious friend may be an enemy in camouflage.
2,A faithful friend is a strong defense.
3,A feather in the hand is better than a bird in...
4,A fresh start will put you on your way.


In [140]:
# Remove the data with (sic)
df = df[~df['Fortune'].str.contains('\(sic\)', na=False, regex=True)]

In [141]:
# Remove the data with [sic]
df = df[~df['Fortune'].str.contains('\[sic\]', na=False, regex=True)]

In [142]:
# Remove (2) & (3) in the text
df['Fortune'] = df['Fortune'].str.replace(r'\(\d+\)\s*', '', regex=True)

In [143]:
df

Unnamed: 0,Fortune
0,"A beautiful, smart, and loving person will be ..."
1,A dubious friend may be an enemy in camouflage.
2,A faithful friend is a strong defense.
3,A feather in the hand is better than a bird in...
4,A fresh start will put you on your way.
...,...
367,Your quick wits will get you out of a tough si...
368,Your reputation is your wealth.
369,Your success will astonish everyone.
370,Your talents will be recognized and suitably r...


In [144]:
# Export with UTF-8-BOM encoding (' will display correctly in Excel)
df.to_csv('Fortune_cookies.csv', 
          index=False, 
          encoding='utf-8-sig')  # The magic is in 'utf-8-sig'