# ETL process(Web Scrapping)

### import dependencies

In [1]:
import pandas as pd
from sqlalchemy import create_engine

url = "https://www.statecancerprofiles.cancer.gov/incidencerates/index.php?stateFIPS=00&areatype=county&cancer=055&race=00&sex=2&age=001&stage=999&year=0&type=incd&sortVariableName=rate&sortOrder=default&output=0#results"

## Extract Process

### Extract data from web page using pandas

In [3]:
breast_cancer_result = pd.read_html(url)
breast_cancer_result

[                                     State  \
 0                        US (SEER+NPCR)  1   
 1             Richmond County, Virginia  6   
 2                Fallon County, Montana  6   
 3               Phelps County, Nebraska  6   
 4               Pickens County, Alabama  6   
 ...                                    ...   
 3137           Wright County, Minnesota  6   
 3138           Wyandotte County, Kansas  6   
 3139   Yakutat City and Borough, Alaska  6   
 3140  Yellow Medicine County, Minnesota  6   
 3141       Ziebach County, South Dakota  6   
 
      Met Healthy People Objective of ***?  \
 0                                     ***   
 1                                     ***   
 2                                     ***   
 3                                     ***   
 4                                     ***   
 ...                                   ...   
 3137                                  ***   
 3138                                  ***   
 3139               

<IPython.core.display.Javascript object>

In [4]:
type(breast_cancer_result)


list

<IPython.core.display.Javascript object>

## Transform Process

### view the data result

In [5]:
df = breast_cancer_result[0]
df.head()

Unnamed: 0,State,Met Healthy People Objective of ***?,"Age-Adjusted Incidence Rate†cases per 100,000(95% Confidence Interval)",CI*Rank&fork;(95% Confidence Interval),Average Annual Count,Recent Trend,Recent 5-Year Trend‡ in Incidence Rates(95% Confidence Interval)
0,US (SEER+NPCR) 1,***,"126.8 (126.6, 127.0)",,249261,rising,"0.3 (0.2, 0.5)"
1,"Richmond County, Virginia 6",***,"269.2 (201.8, 352.7)",,14,stable,"54.0 (-5.1, 149.8)"
2,"Fallon County, Montana 6",***,"246.4 (149.8, 383.5)",,4,*,*
3,"Phelps County, Nebraska 6",***,"210.4 (159.1, 273.9)",,13,stable,"0.9 (-4.4, 6.5)"
4,"Pickens County, Alabama 6",***,"207.7 (171.6, 249.5)",,26,rising,"5.5 (2.5, 8.6)"


<IPython.core.display.Javascript object>

### rename the columns

In [28]:
df.rename(
    columns={
        "Met Healthy People Objective of ***?": "objective",
        "Age-Adjusted Incidence Rate†cases per 100,000(95% Confidence Interval)": "age-adjust rate",
        "CI*Rank&fork;(95% Confidence Interval)": "CI",
        "Recent 5-Year Trend‡ in Incidence Rates(95% Confidence Interval)": "Incidence Rates",
    },
inplace=True)


<IPython.core.display.Javascript object>

In [34]:
df.columns

Index(['State', 'age-adjust rate', 'Average Annual Count', 'Recent Trend'], dtype='object')

<IPython.core.display.Javascript object>

### drop unwanted columns 

In [37]:
df.drop(axis=1,labels=['objective','CI','Incidence Rates'],inplace=True)


        






KeyError: "['objective' 'CI' 'Incidence Rates'] not found in axis"

<IPython.core.display.Javascript object>

### verify if columns are dropped

In [36]:
df.head()

Unnamed: 0,State,age-adjust rate,Average Annual Count,Recent Trend
0,US (SEER+NPCR) 1,"126.8 (126.6, 127.0)",249261,rising
1,"Richmond County, Virginia 6","269.2 (201.8, 352.7)",14,stable
2,"Fallon County, Montana 6","246.4 (149.8, 383.5)",4,*
3,"Phelps County, Nebraska 6","210.4 (159.1, 273.9)",13,stable
4,"Pickens County, Alabama 6","207.7 (171.6, 249.5)",26,rising


<IPython.core.display.Javascript object>

### create connection to database 

In [39]:
engine = create_engine("postgresql://postgres:51559565@localhost:5432/breastcancerdb")
connection = engine.connect()

<IPython.core.display.Javascript object>

## Load Process

### load data from data frame to postgres database breastcancerdb

In [40]:
df.to_sql(name="breastcancer_record", con=engine, if_exists="append", index=False)

<IPython.core.display.Javascript object>

### verify if data is loaded 

In [41]:
pd.read_sql_query('select * from public."breastcancer_record"', con=engine).head()

Unnamed: 0,State,age-adjust rate,Average Annual Count,Recent Trend
0,US (SEER+NPCR) 1,"126.8 (126.6, 127.0)",249261,rising
1,"Richmond County, Virginia 6","269.2 (201.8, 352.7)",14,stable
2,"Fallon County, Montana 6","246.4 (149.8, 383.5)",4,*
3,"Phelps County, Nebraska 6","210.4 (159.1, 273.9)",13,stable
4,"Pickens County, Alabama 6","207.7 (171.6, 249.5)",26,rising


<IPython.core.display.Javascript object>