# Import and install packages

In [4]:
#import packages
import tabula
import pandas as pd
import time
import asyncio
from playwright.async_api import async_playwright
from bs4 import BeautifulSoup as bs
import plotly.express as px
import plotly.figure_factory as ff

In [None]:
!pip install --upgrade plotly
!pip install --upgrade geopandas
!pip install --upgrade pyshp
!pip install --upgrade shapely

# Counts of Cancer per US State 2016-2020

In [2]:
#Cancer Occurence per US State

url ="https://acsjournals.onlinelibrary.wiley.com/doi/10.3322/caac.21763"


async with async_playwright() as p:
    browser = await p.chromium.launch(headless=False)
    page = await browser.new_page()
    await page.goto(url)
    time.sleep(6)
    html = await page.content()
    soup = bs(html, 'html.parser')
    #print(soup)

    #uses beautiful soup package to go through website and extract out table
    #appends data to list
    data = []
    m= soup.find(id="caac21763-tbl-0002", class_="article-table-content") 
    for d in m.find_all('div',{'class':"article-table-content-wrapper"}):
        table = m.find('table', attrs={'class':'table article-section__table'})
        table_body = table.find('tbody')
        rows = table_body.find_all('tr')
        for row in rows:
            cols = row.find_all('td')
            cols = [ele.text.strip() for ele in cols]
            data.append([ele for ele in cols if ele]) # Get rid of empty values
    


In [3]:
#turn the scraped rows into a dataframe, one column per cancer site
df = pd.DataFrame(
    data,
    columns=[
        'State',
        'All sites',
        'Female breast',
        'Colon & rectum',
        'Leukemia',
        'Lung & bronchus',
        'Melanoma of the skin',
        'Non-Hodgkin lymphoma',
        'Prostate',
        'Urinary bladder',
        'Uterine cervix',
        'Uterine corpus',
    ],
)



In [4]:
#swap the table's '—b' marker for a readable placeholder
df = df.replace('—b', '< 50')

#top 3 cancers in USA:
    #1. Breast
    #2. Prostate
    #3. Lung & Bronchus

#drop the last row, then strip the thousands separators everywhere
states = df.iloc[:-1, :].replace(',', '', regex=True)

#cast every column to nullable int except positions 0 and 10
for position, column in enumerate(states):
    if position in (0, 10):
        continue
    states[column] = states[column].astype('Int64')
        

## Population of US States 2020

In [62]:
#POPULATION OF US STATES 2020

url = "https://ballotpedia.org/United_States_census,_2020"



async with async_playwright() as p:
    browser = await p.chromium.launch(headless=False)
    page = await browser.new_page()
    await page.goto(url)
    time.sleep(6)
    html = await page.content()
    soup = bs(html, 'html.parser')
    #print(soup)

    #uses beautiful soup package to go through website and extract out table
    #appends data to list
    data = []
    
    for d in soup.find_all('div',{'class':"mw-parser-output"}):
        table = d.find('table', attrs={'class':'bptable collapsible sortable jquery-tablesorter'})
        table_body = table.find('tbody')
        rows = table_body.find_all('tr')
        for row in rows:
            cols = row.find_all('td')
            cols = [ele.text.strip() for ele in cols]
            data.append([ele for ele in cols if ele]) # Get rid of empty values

In [63]:
#build the dataframe; 'A', 'B' and 'C' are placeholder names for unused columns
popstates2020 = pd.DataFrame(data, columns=['State', '2020 Population', 'A', 'B', 'C'])
#keep only the state name and its population
popstates2020 = popstates2020.drop(columns=['A', 'B', 'C'])
#strip the thousands separators so the population can be stored as int
popstates2020['2020 Population'] = (
    popstates2020['2020 Population'].str.replace(',', '').astype(int)
)
#renumber the rows from 0 without keeping the old index as a column
popstates2020 = popstates2020.reset_index(drop=True)

## Choropleth Maps for all US States

In [49]:
#two-letter codes for the 50 states plus D.C.
#NOTE(review): 'DC' is listed before 'DE'; confirm this matches the row order
#of the dataframe these codes are paired with in the choropleth call
code = (
    "AL AK AZ AR CA CO CT DC DE FL GA HI ID IL IN IA KS "
    "KY LA ME MD MA MI MN MS MO MT NE NV NH NJ NM NY NC "
    "ND OH OK OR PA RI SC SD TN TX UT VT VA WA WV WI WY"
).split()

In [60]:
#two-letter codes for the 50 states, D.C. left out
code2 = (
    "AL AK AZ AR CA CO CT DE FL GA HI ID IL IN IA KS "
    "KY LA ME MD MA MI MN MS MO MT NE NV NH NJ NM NY NC "
    "ND OH OK OR PA RI SC SD TN TX UT VT VA WA WV WI WY"
).split()

In [52]:
#make col type int
#NOTE(review): `sortedStates` is not assigned in any cell visible in this
#export - confirm where it is defined (possibly a removed cell) before
#relying on this conversion
sortedStates['All sites']=sortedStates['All sites'].astype(int)

### 2020 Population choropleth maps

In [67]:
#choropleth of the 2020 population per state
#assumes code2 lists the states in the same order as popstates2020's rows - TODO confirm
fig = px.choropleth(
    popstates2020,
    locations=code2,
    locationmode="USA-states",
    color='2020 Population',
    color_continuous_scale="Viridis",
    scope="usa",
)
#TODO: clicking a state should drill down to that state's county-level map
fig.show()

### Cancer per US State choropleth map 

In [57]:
#choropleth of the 'All sites' cancer count per state
#assumes code lists the states in the same order as the rows of states - TODO confirm
fig = px.choropleth(
    states,
    locations=code,
    locationmode="USA-states",
    color='All sites',
    color_continuous_scale="Sunsetdark",
    scope="usa",
)
#TODO: clicking a state should drill down to that state's county-level map
fig.show()

In [13]:
#https://cancerstatisticscenter.cancer.org/#!/state/New%20York

# Annual Cancer per County in NY 

In [6]:
#parse every page of the NYS annual-cancer-cases PDF
#note: this rebinds df, which earlier held the per-state cancer dataframe
df = tabula.read_pdf("newyork/volume1.pdf", pages="all")
#entries of df used for the counties of interest (same numbers na() maps to names)
c = [3, 4, 5, 6, 7, 8, 35, 54]


In [7]:
#maps a table index to its county/borough name
def na(i):
    """Return the county or borough name associated with table index ``i``.

    Args:
        i: index of a table in the parsed PDF list (3-8, 35 or 54).

    Returns:
        The matching county/borough name as a string.

    Raises:
        ValueError: if ``i`` is not one of the known indices. The previous
            if/elif chain left ``name`` unbound in that case and failed with
            an unhelpful UnboundLocalError.
    """
    names = {
        3: 'Bronx',
        4: 'Brooklyn',
        5: 'Manhattan',
        6: 'Queens',
        7: 'Staten Island',
        8: 'Albany',
        35: 'Nassau',
        54: 'Suffolk',
    }
    try:
        return names[i]
    except KeyError:
        raise ValueError(f"no county name is mapped to table index {i!r}") from None


In [8]:
#extracts data for given borough, i
def extract(i):
    """Build a two-column table (cancer site, county value) from table ``i``.

    Reads columns 0 and 1 of ``df[i]``, promotes the fourth row to the header,
    splits the second column on spaces and keeps only the first piece, named
    after the county returned by ``na(i)``.
    """
    county_name = na(i)
    # first two columns only; rows 0-2 are discarded, row 3 holds the header
    table = df[i].iloc[:, [0, 1]][3:]
    table.columns = table.iloc[0]
    table = table[1:]
    table = table.rename(columns={table.columns[1]: county_name})
    sites = table.iloc[:, [0]]
    # the value column holds several space-separated pieces; keep piece 0
    pieces = table[county_name].str.split(' ', expand=True)
    pieces['Site of Cancer'] = sites
    pieces = pieces.drop(columns=[1, 2]).rename(columns={0: county_name})
    # move the site column to the front
    pieces.insert(0, 'Site of Cancer', pieces.pop('Site of Cancer'))
    return pieces
        
        
            

In [9]:
#extracts data for females for given borough, i
def female(i):
    """Build the female table (cancer site + split values) from table ``i``.

    Reads columns 0 and 5 of ``df[i]``, promotes the fourth row to the header
    and splits the second selected column on spaces; piece 0 is named after
    the county returned by ``na(i)`` and any further pieces are kept.
    """
    county_name = na(i)
    # columns 0 and 5 only; rows 0-2 are discarded, row 3 holds the header
    table = df[i].iloc[:, [0, 5]][3:]
    table.columns = table.iloc[0]
    table = table[1:]
    table = table.rename(columns={table.columns[1]: county_name})
    sites = table.iloc[:, [0]]
    pieces = table[county_name].str.split(' ', expand=True)
    pieces['Site of Cancer'] = sites
    pieces = pieces.rename(columns={0: county_name})
    # move the site column to the front
    pieces.insert(0, 'Site of Cancer', pieces.pop('Site of Cancer'))
    return pieces

In [10]:
#extracts data for males for given borough, i
def male(i):
    """Build the male table (cancer site + split values) from table ``i``.

    Reads columns 0 and 2 of ``df[i]``, promotes the fourth row to the header
    and splits the second selected column on spaces; piece 0 is named after
    the county returned by ``na(i)`` and any further pieces are kept.
    """
    county_name = na(i)
    # columns 0 and 2 only; rows 0-2 are discarded, row 3 holds the header
    table = df[i].iloc[:, [0, 2]][3:]
    table.columns = table.iloc[0]
    table = table[1:]
    table = table.rename(columns={table.columns[1]: county_name})
    sites = table.iloc[:, [0]]
    pieces = table[county_name].str.split(' ', expand=True)
    pieces['Site of Cancer'] = sites
    pieces = pieces.rename(columns={0: county_name})
    # move the site column to the front
    pieces.insert(0, 'Site of Cancer', pieces.pop('Site of Cancer'))
    return pieces

## M&F County extraction

In [11]:
#male and female

#one table per county, in the order Bronx ... Suffolk
b, br, m, q, s, a, n, su = (extract(page) for page in (3, 4, 5, 6, 7, 8, 35, 54))

#join the per-county tables on the shared cancer-site column
county = b
for frame in (br, m, q, s, a, n, su):
    county = pd.merge(county, frame, on=['Site of Cancer'])

#avg annual cancer cases
county = county.drop(index=[12, 13, 14, 15, 16, 17])

## Female County extraction

In [12]:
#just female

#one table per county, in the order Bronx ... Suffolk
b, br, m, q, s, a, n, su = (female(page) for page in (3, 4, 5, 6, 7, 8, 35, 54))

#join the per-county tables on the shared cancer-site column
f_county = b
for frame in (br, m, q, s, a, n, su):
    f_county = pd.merge(f_county, frame, on=['Site of Cancer'])

#avg annual cancer cases
f_county = f_county.drop(index=[16, 17])

## Male County extraction

In [13]:
#just male

#one table per county, in the order Bronx ... Suffolk
b, br, m, q, s, a, n, su = (male(page) for page in (3, 4, 5, 6, 7, 8, 35, 54))

#join the per-county tables on the shared cancer-site column
m_county = b
for frame in (br, m, q, s, a, n, su):
    m_county = pd.merge(m_county, frame, on=['Site of Cancer'])

#avg annual cancer cases
m_county = m_county.drop(index=[12, 13, 14, 15])

In [24]:
# locate male and female dfs

In [14]:
county

Unnamed: 0,Site of Cancer,Bronx,Brooklyn,Manhattan,Queens,Staten Island,Albany,Nassau,Suffolk
0,All Invasive Malignant Tumors,6325.0,12018.8,8601.8,11454.4,3041.6,1986.4,8934.0,9997.2
1,Oral cavity and pharynx,150.4,254.8,217.0,249.0,60.4,57.6,195.4,237.4
2,Esophagus,48.0,83.8,63.8,92.0,28.4,22.6,71.0,89.2
3,Stomach,144.8,301.8,142.8,324.4,47.0,28.0,150.4,146.6
4,Colorectal,521.2,1047.2,614.6,1000.2,240.6,140.0,660.4,755.2
5,Colon excluding rectum,365.4,740.8,421.4,705.2,177.4,102.2,480.8,535.4
6,Rectum & rectosigmoid,155.8,306.4,193.2,295.0,63.2,37.8,179.6,219.8
7,Liver / intrahepatic bile duct,218.2,276.4,197.0,275.4,65.2,37.8,121.4,155.6
8,Pancreas,200.8,384.6,263.2,369.2,94.2,64.0,299.4,315.2
9,Larynx,55.4,65.8,44.0,60.6,17.6,12.8,45.4,58.0


## Male and Female Populations per County

In [15]:
#uses beautiful soup package to go through website and extract out table
#appends data to list
url='https://www.health.ny.gov/statistics/cancer/registry/appendix/countypop.htm'
data=[]
async with async_playwright() as p:
    browser = await p.chromium.launch(headless=False)
    page = await browser.new_page()
    await page.goto(url)
    time.sleep(6)
    html = await page.content()
    soup = bs(html, 'html.parser')
    #print(soup)
    for d in soup.find_all('div',{'id':"content"}):
        table = d.find('table', attrs={'class':'light_table right'})
        table_body = table.find('tbody')
        rows = table_body.find_all('tr')
        for row in rows:
            cols = row.find_all('td')
            cols = [ele.text.strip() for ele in cols]
            data.append([ele for ele in cols if ele]) # Get rid of empty values


In [18]:
#keep rows 3-8 plus rows 35 and 54 of the scraped table
d1 = [*data[3:9], data[35], data[54]]

In [19]:
#list of boroughs
names=['Bronx','Brooklyn','Manhattan','Queens','Staten Island', 'Albany','Nassau','Suffolk']

In [20]:
#converts list to dataframe
pop= pd.DataFrame (d1, columns = ['Males','Female','Total Population'])

In [21]:
#assign list to column in dataframe
pop['Counties']=names

In [22]:
#strip the thousands separators and store each population column as int
for column in ('Males', 'Female', 'Total Population'):
    pop[column] = pop[column].str.replace(',', '').astype(int)

In [23]:
#sort values in column by descending
pop=pop.sort_values('Total Population', ascending=False)

### Bar Graph of Population

In [25]:

import plotly.graph_objects as go

#grouped bars per county: one trace each for males, females and the total
fig = go.Figure()
for column, label in (
    ("Males", "Male"),
    ("Female", "Female"),
    ("Total Population", "Total Population"),
):
    fig.add_trace(go.Bar(x=pop['Counties'], y=pop[column], name=label))
fig.update_layout(title='NY Counties Population 2016-2020')

In [None]:
#https://statecancerprofiles.cancer.gov/incidencerates/index.php?stateFIPS=36&areatype=county&cancer=001&race=05&sex=0&age=001&stage=999&year=0&type=incd&sortVariableName=rate&sortOrder=default&output=0#results

In [26]:
url='https://projects.newsday.com/databases/long-island/cancer-rates/#:~:text=An%20average%20of%20more%20than,during%20the%20same%20time%20period.'
data=[]
async with async_playwright() as p:
    browser = await p.chromium.launch(headless=False)
    page = await browser.new_page()
    await page.goto(url)
    time.sleep(6)
    html = await page.content()
    soup = bs(html, 'html.parser')
    for d in soup.find_all('div',{'class':"twrap"}):
        table = d.find('table', attrs={'id':'nmgDB'})
        table_body = table.find('tbody')
        rows = table_body.find_all('tr')
        for row in rows:
            cols = row.find_all('td')
            cols = [ele.text.strip() for ele in cols]
            data.append([ele for ele in cols if ele]) # Get rid of empty values

In [27]:
#converts list to dataframe
longisland= pd.DataFrame (data, columns = ['All Malignant Tumors','County','Males average annual cases', 'Case rate per 100,000 males','Female average annual cases','Case rate per 100,000 females','Male average annual deaths','Death rate per 100,000 males','Female average annual deaths','Death rate per 100,000 females'])

In [28]:
#remove the label text (plus its tab/newline padding) embedded in each scraped cell
embedded_labels = {
    'All Malignant Tumors': 'Site of cancer',
    'County': 'County',
    'Case rate per 100,000 males': 'Case rate per 100,000 males',
    'Female average annual cases': 'Female average annual cases',
    'Case rate per 100,000 females': 'Case rate per 100,000 females',
    'Male average annual deaths': 'Male average annual deaths',
    'Death rate per 100,000 males': 'Death rate per 100,000 males',
    'Female average annual deaths': 'Female average annual deaths',
    'Death rate per 100,000 females': 'Death rate per 100,000 females',
    'Males average annual cases': 'Males average annual cases',
}
for column, label in embedded_labels.items():
    longisland[column] = longisland[column].str.replace(label + '\t\t\t\t\t\t\t\n\n', '')

In [29]:
#drop the per-100,000 rate columns, keeping only the average annual counts
longisland.drop(
    columns=[
        'Case rate per 100,000 males',
        'Case rate per 100,000 females',
        'Death rate per 100,000 males',
        'Death rate per 100,000 females',
    ],
    inplace=True,
)


#store the county column as strings so .str methods can be used on it
longisland['County'] = longisland['County'].astype(str)

In [30]:
#locate rows that contain values
suffolk=longisland.loc[longisland['County'].str.contains('Suffolk')]
nassau=longisland.loc[longisland['County'].str.contains('Nassau')]

In [31]:
#Suffolk's population is approx. 146,000 greater than Nassau's, so its cancer case counts would be expected to be higher
#Prostate cancer is the only site with more cases in Nassau than in Suffolk, so we have to find out why
longisland

Unnamed: 0,All Malignant Tumors,County,Males average annual cases,Female average annual cases,Male average annual deaths,Female average annual deaths
0,All Invasive Malignant Tumors,Nassau,4332.4,4470.6,1181.4,1261.2
1,All Invasive Malignant Tumors,Suffolk,4824.4,4753.0,1388.2,1360.8
2,Brain and other central nervou...,Nassau,57.0,44.8,38.0,31.4
3,Brain and other central nervou...,Suffolk,69.2,56.0,43.4,31.6
4,Cervix uteri,Nassau,,47.6,,12.6
5,Cervix uteri,Suffolk,,56.8,,20.0
6,Colon excluding rectum,Nassau,231.4,255.0,84.0,96.4
7,Colon excluding rectum,Suffolk,264.6,256.8,99.4,91.2
8,Colorectal,Nassau,341.6,342.6,105.8,108.0
9,Colorectal,Suffolk,380.6,337.8,120.4,104.4


# Florida Extraction

In [5]:
#parse the yearly Florida case PDFs, 2016 through 2020, one variable per year
fl1, fl2, fl3, fl4, fl5 = (
    tabula.read_pdf(f"florida/Florida {year} cases.pdf", pages="all")
    for year in range(2016, 2021)
)

In [6]:
def florida_extract(fl):
    """Return the cleaned county table from a list of parsed PDF tables.

    The first row of ``fl[0]`` becomes the header, the first column is renamed
    to 'County', the header row and the row after it are dropped, and cells
    that are exactly '^' are replaced with '0'.

    Args:
        fl: list of DataFrames; only the first entry is used.

    Returns:
        A new DataFrame. ``fl[0]`` is left untouched: the work is done on a
        copy, whereas assigning the header directly used to overwrite the
        column labels of the caller's DataFrame.
    """
    table = fl[0].copy()
    table.columns = table.iloc[0]
    # skip the header row and the row that follows it
    table = table.iloc[2:]
    table = table.rename(columns={table.columns[0]: 'County'})
    return table.replace('^', '0')

In [7]:
#replace each year's raw table list with its cleaned county table
fl1, fl2, fl3, fl4, fl5 = (
    florida_extract(tables) for tables in (fl1, fl2, fl3, fl4, fl5)
)

In [8]:
fl1=fl1.rename(columns={'All Cancers':'Cancers'})

In [9]:
florida=pd.DataFrame()

In [10]:
#first column is copied as-is; every other column is cast to int and summed over the five years
for position, column in enumerate(fl1.columns):
    if position == 0:
        florida[column] = fl1[column]
        continue
    for yearly in (fl1, fl2, fl3, fl4, fl5):
        yearly[column] = yearly[column].str.replace(',', '').astype(int)
    florida[column] = fl1[column] + fl2[column] + fl3[column] + fl4[column] + fl5[column]
    

In [11]:
florida.to_csv('Florida Cancer 2016-2020.csv')

In [12]:
import plotly.figure_factory as ff

In [13]:
county_fl=florida['County'].tolist()

In [14]:
#county FIPS codes: '12' followed by every odd number from 001 to 133,
#except that 025 is absent and 086 is present
fips = [
    f"12{suffix:03d}"
    for suffix in sorted((set(range(1, 134, 2)) - {25}) | {86})
]

In [15]:
import warnings
warnings.filterwarnings("ignore")

In [16]:
import numpy as np

In [17]:
values=florida['Cancers'].tolist()

In [18]:
#six evenly spaced bin edges from the smallest to the largest value
#(the complex step 6j asks mgrid for 6 points, both endpoints included)
endpts = list(np.mgrid[min(values):max(values):6j])

In [19]:
import plotly as py
import plotly.graph_objects as go

In [20]:
florida.sort_values('Cancers', ascending=False)

Unnamed: 0,County,Cancers,Bronchus,Prostate,Breast,Colorectal,Bladder,Neck,Hodgkin,Melanoma,Ovary,Cervix
44,Miami-Dade,66624,7059,7704,9609,6214,2489,2569,3321,1266,913,759
7,Broward,47640,5170,4778,7042,3983,1878,1881,2217,2084,601,463
51,Palm Beach,45875,5636,4912,6627,3368,2461,1675,2298,2632,608,313
29,Hillsborough,38160,4649,3947,5343,3179,1508,1552,1733,1941,498,364
53,Pinellas,36670,5163,3567,5220,2746,1811,1678,1644,2231,450,246
...,...,...,...,...,...,...,...,...,...,...,...,...
30,Holmes,402,90,0,24,11,0,0,0,0,0,0
8,Calhoun,392,81,0,23,25,0,0,0,0,0,0
22,Glades,320,13,0,0,0,0,0,0,69,0,0
39,Liberty,199,24,0,0,0,0,0,0,0,0,0


In [21]:
#county-level choropleth of the 2016-2020 cancer counts in Florida
fig = ff.create_choropleth(
    fips=fips,
    values=values,
    scope=["Florida"],
    binning_endpoints=endpts,
    show_state_data=True,
    county_outline={'color': 'rgb(255,255,255)', 'width': 0.5},
    round_legend_values=True,
    title='Count of Cancer per County in Florida from 2016-2020',
)
#write the figure to a standalone HTML file
#TODO: clicking a county/state should drill down to a more detailed map
py.offline.plot(
    fig,
    filename='choropleth_california_and_surr_states_outlines',
    include_plotlyjs='https://cdn.plot.ly/plotly-1.42.3.min.js',
)

'choropleth_california_and_surr_states_outlines.html'

In [24]:
fig.write_html("Desktop/Florida.html")

In [22]:
max_florida=pd.DataFrame(columns=['County'])

In [23]:
max_florida['County']=florida['County']

In [24]:
florida_cluster=florida.copy()
del florida_cluster['Cancers']
del florida_cluster['County']

In [25]:
max_florida['Max Column']=florida_cluster.idxmax(axis=1)

In [26]:
max_florida.groupby(max_florida['Max Column']).count()

Unnamed: 0_level_0,County
Max Column,Unnamed: 1_level_1
Breast,15
Bronchus,50
Melanoma,1
Prostate,1


In [27]:
max_florida

Unnamed: 0,County,Max Column
2,Alachua,Breast
3,Baker,Bronchus
4,Bay,Bronchus
5,Bradford,Bronchus
6,Brevard,Bronchus
...,...,...
64,Union,Prostate
65,Volusia,Bronchus
66,Wakulla,Bronchus
67,Walton,Bronchus


In [None]:
#NOTE(review): this cell is marked In [None] (never executed) and looks like an
#unfinished draft: no visible cell builds 'pop' or 'country' columns, and `df`
#was last bound to the tabula.read_pdf result - confirm the intended dataframe,
#columns and title before running
max_florida.loc[df['pop'] < 2.e6, 'country'] = 'Other countries' # Represent only large countries
fig = px.pie(df, values='pop', names='country', title='Population of European continent')
fig.show()

In [28]:
max_florida.loc[max_florida['Max Column']=='Melanoma']

Unnamed: 0,County,Max Column
22,Glades,Melanoma


In [31]:
florida.loc[max_florida['Max Column']=='Prostate']

Unnamed: 0,County,Cancers,Bronchus,Prostate,Breast,Colorectal,Bladder,Neck,Hodgkin,Melanoma,Ovary,Cervix
64,Union,1124,183,189,57,122,12,104,11,0,0,0


In [33]:
florida.loc[florida['County']=='Sumter']

Unnamed: 0,County,Cancers,Bronchus,Prostate,Breast,Colorectal,Bladder,Neck,Hodgkin,Melanoma,Ovary,Cervix
61,Sumter,7112,1081,879,1107,457,418,250,264,256,84,0


In [32]:
max_florida.loc[max_florida['Max Column']=='Breast']

Unnamed: 0,County,Max Column
2,Alachua,Breast
7,Broward,Breast
12,Collier,Breast
16,Duval,Breast
29,Hillsborough,Breast
37,Leon,Breast
41,Manatee,Breast
44,Miami-Dade,Breast
49,Orange,Breast
50,Osceola,Breast


In [33]:
url='https://www.flhealthcharts.gov/ChartsDashboards/rdPage.aspx?rdReport=NonVitalIndRateOnly.TenYrsRpt&cid=300'
data=[]
async with async_playwright() as p:
    browser = await p.chromium.launch(headless=False)
    page = await browser.new_page()
    await page.goto(url)
    time.sleep(6)
    html = await page.content()
    soup = bs(html, 'html.parser')
    for d in soup.find_all('div',{'id':"rdDataTableDiv-dtTenYrsDataGrid"}):
        table = d.find('table', attrs={'class':'rdThemeDataTable table-sm table-hover table-bordered border-0'})
        table_body = table.find('tbody')
        rows = table_body.find_all('tr')
        for row in rows:
            cols = row.find_all('td')
            cols = [ele.text.strip() for ele in cols]
            data.append([ele for ele in cols if ele]) # Get rid of empty values

In [34]:
#converts list to dataframe
median_florida= pd.DataFrame (data, columns = ['County','2021','2020','2019','2018','2017','2016','2015','2014','2013','2012'])


In [35]:
#drop the years outside 2016-2020
median_florida.drop(columns=['2021', '2015', '2014', '2013', '2012'], inplace=True)

In [36]:
c=median_florida['County']

In [37]:
del median_florida['County']

In [38]:
median_florida = median_florida.apply(pd.to_numeric, errors='coerce')

In [39]:
avg=median_florida.mean(axis=1)

In [40]:
median_florida['Avg Age']=avg
median_florida['County']=c
median_florida=median_florida.set_index('County')

In [41]:
median_florida=median_florida[1:]

In [42]:
median_florida.sort_values('Avg Age', ascending=False)

Unnamed: 0_level_0,2020,2019,2018,2017,2016,Avg Age
County,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Sumter,68.0,67.4,67.0,66.4,66.0,66.96
Charlotte,59.5,59.1,58.6,58.1,57.7,58.60
Citrus,56.7,56.5,56.2,55.9,55.7,56.20
Sarasota,56.6,56.1,55.5,55.1,54.5,55.56
Highlands,54.1,53.5,53.4,53.1,52.6,53.34
...,...,...,...,...,...,...
Hardee,35.4,35.4,35.1,34.9,34.3,35.02
Orange,35.3,35.1,34.9,34.7,34.5,34.90
Hendry,35.0,34.0,33.9,33.7,33.8,34.08
Alachua,31.7,31.6,31.3,31.2,31.0,31.36


In [43]:
max_florida.loc[max_florida['Max Column']=='Breast']

Unnamed: 0,County,Max Column
2,Alachua,Breast
7,Broward,Breast
12,Collier,Breast
16,Duval,Breast
29,Hillsborough,Breast
37,Leon,Breast
41,Manatee,Breast
44,Miami-Dade,Breast
49,Orange,Breast
50,Osceola,Breast


In [44]:
florida.loc[florida['County']=='Sumter']

Unnamed: 0,County,Cancers,Bronchus,Prostate,Breast,Colorectal,Bladder,Neck,Hodgkin,Melanoma,Ovary,Cervix
61,Sumter,7112,1081,879,1107,457,418,250,264,256,84,0
