# Web scraping for all Nvidia GPUs

### Import libraries

In [1]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup

### Set target website

In [2]:
# Wikipedia page that holds one table per Nvidia GPU series
url = (
    "https://en.wikipedia.org/wiki/List_of_Nvidia_graphics_processing_units"
)

### Use pd.read_html() to convert every table on the page into a DataFrame

In [3]:
# pd.read_html() returns a *list* of DataFrames (one per HTML table found),
# so keep it under a list-like name instead of `df`; `df` is reserved for the
# single table selected in the next step.
all_tables = pd.read_html(url)

# Check how many tables were found on the page
print(len(all_tables))

99


### Using "match" to find the df we want

In [4]:
# Use the GeForce 30 series as the example: its table is the one that contains
# the RTX 3090 core-config string below
match = "10496:328:112:328:82"

# Read the page again, keeping only the tables whose text matches,
# then take the first of them
matching_tables = pd.read_html(url, match=match)
df = matching_tables[0]

# Show it
df

Unnamed: 0_level_0,Model,Launch,Code name,Process,Transistors (billion),Die size (mm2),Core config[a],Bus interface,L2 Cache(MB),Clock speeds,...,Processing power (TFLOPS),Processing power (TFLOPS),Processing power (TFLOPS),Ray-tracing Performance,Ray-tracing Performance,Ray-tracing Performance,TDP (Watts),NVLink support,Release price (USD),Release price (USD)
Unnamed: 0_level_1,Model,Launch,Code name,Process,Transistors (billion),Die size (mm2),Core config[a],Bus interface,L2 Cache(MB),Base core clock (MHz),...,Double precision,Half precision,Tensor compute (FP16) (2:1 sparse),Rays/s (Billions),RTX OPS/s (Trillions),Ray Perf TFLOPS,TDP (Watts),NVLink support,MSRP,Founders Edition
0,GeForce RTX 3060[152],"February 25, 2021",GA106-300-A1,Samsung8N,13.25,300.0,3584:112:48:112:28(28) (3),PCIe 4.0 x16,3,1320,...,0.148 0.199,9.46 12.74,,,,25,170,No,,$329
1,GeForce RTX 3060[152],"September 1, 2021",GA104-150-A1[153],Samsung8N,17.4,392.5,3584:112:48:112:28(28) (3),PCIe 4.0 x16,3,1320,...,0.148 0.199,9.46 12.74,,,,25,170,No,,$329
2,GeForce RTX 3060 Ti[154],"December 2, 2020",GA104-200-A1,Samsung8N,17.4,392.5,4864:152:80:152:38(38) (6),PCIe 4.0 x16,4,1410,...,0.214 0.253,13.70 16.20,? 129.6,,,,200,No,,$399
3,GeForce RTX 3070[155],"October 29, 2020[156]",GA104-300-A1,Samsung8N,17.4,392.5,5888:184:96:184:46(46) (6),PCIe 4.0 x16,4,1500,...,0.276 0.318,17.66 20.37,141.31 162.98,,,40[157],220,No,,$499
4,GeForce RTX 3070 Ti[158],"June 10, 2021",GA104-400-A1,Samsung8N,17.4,392.5,6144:192:96:192:48(48) (6),PCIe 4.0 x16,4,1575,...,0.302 0.340,19.35 21.75,,,,,290,No,,$599
5,GeForce RTX 3080[159],"September 17, 2020",GA102-200-KD-A1,Samsung8N,28.3,628.4,8704:272:96:272:68(68) (7),PCIe 4.0 x16,5,1440,...,0.392 0.465,25.06 29.76,200.54 238.14,,,,320,No,,$699
6,GeForce RTX 3080 Ti[160],"June 3, 2021",GA102-225-A1,Samsung8N,28.3,628.4,10240:320:112:320:80(80) (7),PCIe 4.0 x16,6,1395,...,0.438 0.533,28.06 34.10,,,,,350,No,,$1199
7,GeForce RTX 3090[161],"September 24, 2020",GA102-300-A1,Samsung8N,28.3,628.4,10496:328:112:328:82(82) (7),PCIe 4.0 x16,6,1395,...,0.459 0.558,29.38 35.68,235.08 285.48,,,69[157],350,2-way NVLink,,$1499


### Check the df info

In [5]:
# DataFrame.info() prints the summary itself and returns None, so call it
# directly; wrapping it in print() appends a stray "None" to the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 29 columns):
 #   Column                                                           Non-Null Count  Dtype  
---  ------                                                           --------------  -----  
 0   (Model, Model)                                                   8 non-null      object 
 1   (Launch, Launch)                                                 8 non-null      object 
 2   (Code name, Code name)                                           8 non-null      object 
 3   (Process, Process)                                               8 non-null      object 
 4   (Transistors (billion), Transistors (billion))                   8 non-null      float64
 5   (Die size (mm2), Die size (mm2))                                 8 non-null      float64
 6   (Core config[a], Core config[a])                                 8 non-null      object 
 7   (Bus interface, Bus interface)                  

### Remove the upper header level for a cleaner display

In [6]:
# Keep only the lower header level (level 1), which holds the specific column
# names. The guard keeps this cell safe to re-run: once the header is flat
# there is no level 1 left and get_level_values(1) would raise IndexError.
if df.columns.nlevels > 1:
    df.columns = df.columns.get_level_values(1)

# Print the df
df

Unnamed: 0,Model,Launch,Code name,Process,Transistors (billion),Die size (mm2),Core config[a],Bus interface,L2 Cache(MB),Base core clock (MHz),...,Double precision,Half precision,Tensor compute (FP16) (2:1 sparse),Rays/s (Billions),RTX OPS/s (Trillions),Ray Perf TFLOPS,TDP (Watts),NVLink support,MSRP,Founders Edition
0,GeForce RTX 3060[152],"February 25, 2021",GA106-300-A1,Samsung8N,13.25,300.0,3584:112:48:112:28(28) (3),PCIe 4.0 x16,3,1320,...,0.148 0.199,9.46 12.74,,,,25,170,No,,$329
1,GeForce RTX 3060[152],"September 1, 2021",GA104-150-A1[153],Samsung8N,17.4,392.5,3584:112:48:112:28(28) (3),PCIe 4.0 x16,3,1320,...,0.148 0.199,9.46 12.74,,,,25,170,No,,$329
2,GeForce RTX 3060 Ti[154],"December 2, 2020",GA104-200-A1,Samsung8N,17.4,392.5,4864:152:80:152:38(38) (6),PCIe 4.0 x16,4,1410,...,0.214 0.253,13.70 16.20,? 129.6,,,,200,No,,$399
3,GeForce RTX 3070[155],"October 29, 2020[156]",GA104-300-A1,Samsung8N,17.4,392.5,5888:184:96:184:46(46) (6),PCIe 4.0 x16,4,1500,...,0.276 0.318,17.66 20.37,141.31 162.98,,,40[157],220,No,,$499
4,GeForce RTX 3070 Ti[158],"June 10, 2021",GA104-400-A1,Samsung8N,17.4,392.5,6144:192:96:192:48(48) (6),PCIe 4.0 x16,4,1575,...,0.302 0.340,19.35 21.75,,,,,290,No,,$599
5,GeForce RTX 3080[159],"September 17, 2020",GA102-200-KD-A1,Samsung8N,28.3,628.4,8704:272:96:272:68(68) (7),PCIe 4.0 x16,5,1440,...,0.392 0.465,25.06 29.76,200.54 238.14,,,,320,No,,$699
6,GeForce RTX 3080 Ti[160],"June 3, 2021",GA102-225-A1,Samsung8N,28.3,628.4,10240:320:112:320:80(80) (7),PCIe 4.0 x16,6,1395,...,0.438 0.533,28.06 34.10,,,,,350,No,,$1199
7,GeForce RTX 3090[161],"September 24, 2020",GA102-300-A1,Samsung8N,28.3,628.4,10496:328:112:328:82(82) (7),PCIe 4.0 x16,6,1395,...,0.459 0.558,29.38 35.68,235.08 285.48,,,69[157],350,2-way NVLink,,$1499


### Remove bracketed citation markers (e.g. [152]) from the values in all columns

In [7]:
def _strip_citations(column):
    """Return the column with bracketed numeric markers such as "[152]" removed."""
    return column.replace(to_replace=r"\[\d*?\]", value="", regex=True)


# Apply the clean-up column by column
df = df.apply(_strip_citations)

df

Unnamed: 0,Model,Launch,Code name,Process,Transistors (billion),Die size (mm2),Core config[a],Bus interface,L2 Cache(MB),Base core clock (MHz),...,Double precision,Half precision,Tensor compute (FP16) (2:1 sparse),Rays/s (Billions),RTX OPS/s (Trillions),Ray Perf TFLOPS,TDP (Watts),NVLink support,MSRP,Founders Edition
0,GeForce RTX 3060,"February 25, 2021",GA106-300-A1,Samsung8N,13.25,300.0,3584:112:48:112:28(28) (3),PCIe 4.0 x16,3,1320,...,0.148 0.199,9.46 12.74,,,,25.0,170,No,,$329
1,GeForce RTX 3060,"September 1, 2021",GA104-150-A1,Samsung8N,17.4,392.5,3584:112:48:112:28(28) (3),PCIe 4.0 x16,3,1320,...,0.148 0.199,9.46 12.74,,,,25.0,170,No,,$329
2,GeForce RTX 3060 Ti,"December 2, 2020",GA104-200-A1,Samsung8N,17.4,392.5,4864:152:80:152:38(38) (6),PCIe 4.0 x16,4,1410,...,0.214 0.253,13.70 16.20,? 129.6,,,,200,No,,$399
3,GeForce RTX 3070,"October 29, 2020",GA104-300-A1,Samsung8N,17.4,392.5,5888:184:96:184:46(46) (6),PCIe 4.0 x16,4,1500,...,0.276 0.318,17.66 20.37,141.31 162.98,,,40.0,220,No,,$499
4,GeForce RTX 3070 Ti,"June 10, 2021",GA104-400-A1,Samsung8N,17.4,392.5,6144:192:96:192:48(48) (6),PCIe 4.0 x16,4,1575,...,0.302 0.340,19.35 21.75,,,,,290,No,,$599
5,GeForce RTX 3080,"September 17, 2020",GA102-200-KD-A1,Samsung8N,28.3,628.4,8704:272:96:272:68(68) (7),PCIe 4.0 x16,5,1440,...,0.392 0.465,25.06 29.76,200.54 238.14,,,,320,No,,$699
6,GeForce RTX 3080 Ti,"June 3, 2021",GA102-225-A1,Samsung8N,28.3,628.4,10240:320:112:320:80(80) (7),PCIe 4.0 x16,6,1395,...,0.438 0.533,28.06 34.10,,,,,350,No,,$1199
7,GeForce RTX 3090,"September 24, 2020",GA102-300-A1,Samsung8N,28.3,628.4,10496:328:112:328:82(82) (7),PCIe 4.0 x16,6,1395,...,0.459 0.558,29.38 35.68,235.08 285.48,,,69.0,350,2-way NVLink,,$1499


### Convert the Launch column to datetime

In [8]:
# Parse the textual launch dates (e.g. "February 25, 2021") into datetimes
launch_dates = pd.to_datetime(df["Launch"])
df["Launch"] = launch_dates

df

Unnamed: 0,Model,Launch,Code name,Process,Transistors (billion),Die size (mm2),Core config[a],Bus interface,L2 Cache(MB),Base core clock (MHz),...,Double precision,Half precision,Tensor compute (FP16) (2:1 sparse),Rays/s (Billions),RTX OPS/s (Trillions),Ray Perf TFLOPS,TDP (Watts),NVLink support,MSRP,Founders Edition
0,GeForce RTX 3060,2021-02-25,GA106-300-A1,Samsung8N,13.25,300.0,3584:112:48:112:28(28) (3),PCIe 4.0 x16,3,1320,...,0.148 0.199,9.46 12.74,,,,25.0,170,No,,$329
1,GeForce RTX 3060,2021-09-01,GA104-150-A1,Samsung8N,17.4,392.5,3584:112:48:112:28(28) (3),PCIe 4.0 x16,3,1320,...,0.148 0.199,9.46 12.74,,,,25.0,170,No,,$329
2,GeForce RTX 3060 Ti,2020-12-02,GA104-200-A1,Samsung8N,17.4,392.5,4864:152:80:152:38(38) (6),PCIe 4.0 x16,4,1410,...,0.214 0.253,13.70 16.20,? 129.6,,,,200,No,,$399
3,GeForce RTX 3070,2020-10-29,GA104-300-A1,Samsung8N,17.4,392.5,5888:184:96:184:46(46) (6),PCIe 4.0 x16,4,1500,...,0.276 0.318,17.66 20.37,141.31 162.98,,,40.0,220,No,,$499
4,GeForce RTX 3070 Ti,2021-06-10,GA104-400-A1,Samsung8N,17.4,392.5,6144:192:96:192:48(48) (6),PCIe 4.0 x16,4,1575,...,0.302 0.340,19.35 21.75,,,,,290,No,,$599
5,GeForce RTX 3080,2020-09-17,GA102-200-KD-A1,Samsung8N,28.3,628.4,8704:272:96:272:68(68) (7),PCIe 4.0 x16,5,1440,...,0.392 0.465,25.06 29.76,200.54 238.14,,,,320,No,,$699
6,GeForce RTX 3080 Ti,2021-06-03,GA102-225-A1,Samsung8N,28.3,628.4,10240:320:112:320:80(80) (7),PCIe 4.0 x16,6,1395,...,0.438 0.533,28.06 34.10,,,,,350,No,,$1199
7,GeForce RTX 3090,2020-09-24,GA102-300-A1,Samsung8N,28.3,628.4,10496:328:112:328:82(82) (7),PCIe 4.0 x16,6,1395,...,0.459 0.558,29.38 35.68,235.08 285.48,,,69.0,350,2-way NVLink,,$1499


### Sort the models by launch date

In [9]:
# Newest launch first, then renumber the rows 0..n-1
# (drop=True discards the old index instead of keeping it as a column)
df = (
    df
    .sort_values(by="Launch", ascending=False)
    .reset_index(drop=True)
)

df

Unnamed: 0,Model,Launch,Code name,Process,Transistors (billion),Die size (mm2),Core config[a],Bus interface,L2 Cache(MB),Base core clock (MHz),...,Double precision,Half precision,Tensor compute (FP16) (2:1 sparse),Rays/s (Billions),RTX OPS/s (Trillions),Ray Perf TFLOPS,TDP (Watts),NVLink support,MSRP,Founders Edition
0,GeForce RTX 3060,2021-09-01,GA104-150-A1,Samsung8N,17.4,392.5,3584:112:48:112:28(28) (3),PCIe 4.0 x16,3,1320,...,0.148 0.199,9.46 12.74,,,,25.0,170,No,,$329
1,GeForce RTX 3070 Ti,2021-06-10,GA104-400-A1,Samsung8N,17.4,392.5,6144:192:96:192:48(48) (6),PCIe 4.0 x16,4,1575,...,0.302 0.340,19.35 21.75,,,,,290,No,,$599
2,GeForce RTX 3080 Ti,2021-06-03,GA102-225-A1,Samsung8N,28.3,628.4,10240:320:112:320:80(80) (7),PCIe 4.0 x16,6,1395,...,0.438 0.533,28.06 34.10,,,,,350,No,,$1199
3,GeForce RTX 3060,2021-02-25,GA106-300-A1,Samsung8N,13.25,300.0,3584:112:48:112:28(28) (3),PCIe 4.0 x16,3,1320,...,0.148 0.199,9.46 12.74,,,,25.0,170,No,,$329
4,GeForce RTX 3060 Ti,2020-12-02,GA104-200-A1,Samsung8N,17.4,392.5,4864:152:80:152:38(38) (6),PCIe 4.0 x16,4,1410,...,0.214 0.253,13.70 16.20,? 129.6,,,,200,No,,$399
5,GeForce RTX 3070,2020-10-29,GA104-300-A1,Samsung8N,17.4,392.5,5888:184:96:184:46(46) (6),PCIe 4.0 x16,4,1500,...,0.276 0.318,17.66 20.37,141.31 162.98,,,40.0,220,No,,$499
6,GeForce RTX 3090,2020-09-24,GA102-300-A1,Samsung8N,28.3,628.4,10496:328:112:328:82(82) (7),PCIe 4.0 x16,6,1395,...,0.459 0.558,29.38 35.68,235.08 285.48,,,69.0,350,2-way NVLink,,$1499
7,GeForce RTX 3080,2020-09-17,GA102-200-KD-A1,Samsung8N,28.3,628.4,8704:272:96:272:68(68) (7),PCIe 4.0 x16,5,1440,...,0.392 0.465,25.06 29.76,200.54 238.14,,,,320,No,,$699
