# WebScraping - find_all + pandas

---
* Author:  [Yuttapong Mahasittiwat](mailto:khala1391@gmail.com)
* Technologist | Data Modeler | Data Analyst
* [YouTube](https://www.youtube.com/khala1391)
* [LinkedIn](https://www.linkedin.com/in/yuttapong-m/)
* [Tableau](https://public.tableau.com/app/profile/yuttapong.m/vizzes)
---

ref: [WS CubeTech youtube channel](https://www.youtube.com/watch?v=UabBGhnVqSo&list=PLc20sA5NNOvrsn3a78ewy2VTCXVV47NB4&index=1&t=0s)

In [None]:
import datetime
print(datetime.datetime.now())

2024-10-20 20:41:04.090947


## import library

In [1]:
from urllib.request import urlopen # option#1
import requests  # option#2

import pandas as pd
from bs4 import BeautifulSoup

In [3]:
url = "https://webscraper.io/test-sites/e-commerce/allinone/computers/tablets"

r = requests.get(url)
soup = BeautifulSoup(r.text, "lxml")

In [9]:
names = soup.find_all("a", class_="title")
print(names)

# way#1 list comprehension
product_name = [name.text for name in names]

# way#2 for loop
# product_name= []
# for i in names:
#     name = i.text
#     product_name.append(name)


product_name

[<a class="title" href="/test-sites/e-commerce/allinone/product/10" title="Lenovo IdeaTab">Lenovo IdeaTab</a>, <a class="title" href="/test-sites/e-commerce/allinone/product/15" title="IdeaTab A3500L">IdeaTab A3500L</a>, <a class="title" href="/test-sites/e-commerce/allinone/product/11" title="Acer Iconia">Acer Iconia</a>, <a class="title" href="/test-sites/e-commerce/allinone/product/18" title="Galaxy Tab 3">Galaxy Tab 3</a>, <a class="title" href="/test-sites/e-commerce/allinone/product/27" title="Iconia B1-730HD">Iconia B1-730H...</a>, <a class="title" href="/test-sites/e-commerce/allinone/product/23" title="Memo Pad HD 7">Memo Pad HD 7</a>, <a class="title" href="/test-sites/e-commerce/allinone/product/12" title="Asus MeMO Pad">Asus MeMO Pad</a>, <a class="title" href="/test-sites/e-commerce/allinone/product/13" title="Amazon Kindle">Amazon Kindle</a>, <a class="title" href="/test-sites/e-commerce/allinone/product/22" title="Galaxy Tab 3">Galaxy Tab 3</a>, <a class="title" href="/t

['Lenovo IdeaTab',
 'IdeaTab A3500L',
 'Acer Iconia',
 'Galaxy Tab 3',
 'Iconia B1-730H...',
 'Memo Pad HD 7',
 'Asus MeMO Pad',
 'Amazon Kindle',
 'Galaxy Tab 3',
 'IdeaTab A8-50',
 'MeMO Pad 7',
 'IdeaTab A3500-...',
 'IdeaTab S5000',
 'Galaxy Tab 4',
 'Galaxy Tab',
 'MeMo PAD FHD 1...',
 'Galaxy Note',
 'Galaxy Note',
 'iPad Mini Reti...',
 'Galaxy Note 10...',
 'Apple iPad Air']

In [14]:
prices = soup.find_all("h4", class_ = "price float-end card-title pull-right")
price_list = [i.text for i in prices]
price_list

print(price_list)

['$69.99', '$88.99', '$96.99', '$97.99', '$99.99', '$101.99', '$102.99', '$103.99', '$107.99', '$121.99', '$130.99', '$148.99', '$172.99', '$233.99', '$251.99', '$320.99', '$399.99', '$489.99', '$537.99', '$587.99', '$603.99']


In [16]:
descriptions = soup.find_all("p", class_="description")
description_list = [i.text for i in descriptions]
description_list

['7" screen, Android',
 'Black, 7" IPS, Quad-Core 1.2GHz, 8GB, Android 4.2',
 '7" screen, Android, 16GB',
 '7", 8GB, Wi-Fi, Android 4.2, White',
 'Black, 7", 1.6GHz Dual-Core, 8GB, Android 4.4',
 'IPS, Dual-Core 1.2GHz, 8GB, Android 4.3',
 '7" screen, Android, 8GB',
 '6" screen, wifi',
 '7", 8GB, Wi-Fi, Android 4.2, Yellow',
 'Blue, 8" IPS, Quad-Core 1.3GHz, 16GB, Android 4.2',
 'White, 7", Atom 1.2GHz, 8GB, Android 4.4',
 'Blue, 7" IPS, Quad-Core 1.3GHz, 8GB, 3G, Android 4.2',
 'Silver, 7" IPS, Quad-Core 1.2Ghz, 16GB, 3G, Android 4.2',
 'LTE (SM-T235), Quad-Core 1.2GHz, 8GB, Black',
 '16GB, White',
 'White, 10.1" IPS, 1.6GHz, 2GB, 16GB, Android 4.2',
 '10.1", 3G, Android 4.0, Garnet Red',
 '12.2", 32GB, WiFi, Android 4.4, White',
 'Wi-Fi + Cellular, 32GB, Silver',
 '10.1", 32GB, Black',
 'Wi-Fi, 64GB, Silver']

In [17]:
reviews = soup.find_all("p", class_="review-count float-end")
review_list = [i.text for i in reviews]
review_list

['7 reviews',
 '7 reviews',
 '7 reviews',
 '2 reviews',
 '1 reviews',
 '10 reviews',
 '14 reviews',
 '3 reviews',
 '14 reviews',
 '13 reviews',
 '11 reviews',
 '9 reviews',
 '8 reviews',
 '1 reviews',
 '14 reviews',
 '7 reviews',
 '12 reviews',
 '9 reviews',
 '8 reviews',
 '6 reviews',
 '7 reviews']

In [18]:
df = pd.DataFrame({"product_name": product_name,
                  "price": price_list,
                  "review":review_list})
df

Unnamed: 0,product_name,price,review
0,Lenovo IdeaTab,$69.99,7 reviews
1,IdeaTab A3500L,$88.99,7 reviews
2,Acer Iconia,$96.99,7 reviews
3,Galaxy Tab 3,$97.99,2 reviews
4,Iconia B1-730H...,$99.99,1 reviews
5,Memo Pad HD 7,$101.99,10 reviews
6,Asus MeMO Pad,$102.99,14 reviews
7,Amazon Kindle,$103.99,3 reviews
8,Galaxy Tab 3,$107.99,14 reviews
9,IdeaTab A8-50,$121.99,13 reviews


In [19]:
df.to_csv('data/product_details.csv')