# Import and install packages

In [1]:
#import packages
import tabula
import pandas as pd
import time
import asyncio
from playwright.async_api import async_playwright
from bs4 import BeautifulSoup as bs
import plotly.express as px
import plotly.figure_factory as ff
import plotly as py
import plotly.graph_objects as go
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [None]:
!pip install --upgrade plotly
!pip install --upgrade geopandas
!pip install --upgrade pyshp
!pip install --upgrade shapely

# Counts of Cancer per US State 2016-2020

In [2]:
#Cancer Occurence per US State

url ="https://acsjournals.onlinelibrary.wiley.com/doi/10.3322/caac.21763"


async with async_playwright() as p:
    browser = await p.chromium.launch(headless=False)
    page = await browser.new_page()
    await page.goto(url)
    time.sleep(6)
    html = await page.content()
    soup = bs(html, 'html.parser')
    #print(soup)

    #uses beautiful soup package to go through website and extract out table
    #appends data to list
    data = []
    m= soup.find(id="caac21763-tbl-0002", class_="article-table-content") 
    for d in m.find_all('div',{'class':"article-table-content-wrapper"}):
        table = m.find('table', attrs={'class':'table article-section__table'})
        table_body = table.find('tbody')
        rows = table_body.find_all('tr')
        for row in rows:
            cols = row.find_all('td')
            cols = [ele.text.strip() for ele in cols]
            data.append([ele for ele in cols if ele]) # Get rid of empty values
    


In [11]:
#converts list to dataframe
df = pd.DataFrame (data, columns = ['State', 'All sites', 'Female breast', 'Colon & rectum', 'Leukemia', 'Lung & bronchus', 'Melanoma of the skin', 'Non-Hodgkin lymphoma'
                                   ,'Prostate', 'Urinary bladder', 'Uterine cervix', 'Uterine corpus'])



In [12]:
#replaces value
df=df.replace('—b', '< 50')

#top 3 cancers in USA:
    #1. Breast
    #2. Prostate
    #3. Lung & Bronchus

#gets rid of last row
states = df.iloc[:-1 , :]
#replace char
states= states.replace(',','', regex=True)

#makes columns type int
c=0
for i in states:
    if(c!=0 and c!=10):
        states[i]=states[i].astype('Int64')
    c+=1
        

In [None]:
df1=states.copy()
del df1['State']
del df1['All sites']

In [None]:
dfsum = df1.sum().to_frame().reset_index()

In [None]:
states

In [None]:
states['All sites'].sum()

In [None]:
dfsum

In [57]:
df.to_excel('unitedstates/Cancer in USA 2016-2020.xlsx')

## Population of US States 2020

In [3]:
#POPULATION OF US STATES 2020

url = "https://ballotpedia.org/United_States_census,_2020"



async with async_playwright() as p:
    browser = await p.chromium.launch(headless=True)
    page = await browser.new_page()
    await page.goto(url)
    time.sleep(6)
    html = await page.content()
    soup = bs(html, 'html.parser')
    #print(soup)

    #uses beautiful soup package to go through website and extract out table
    #appends data to list
    data = []
    
    for d in soup.find_all('div',{'class':"mw-parser-output"}):
        table = d.find('table', attrs={'class':'bptable collapsible sortable jquery-tablesorter'})
        table_body = table.find('tbody')
        rows = table_body.find_all('tr')
        for row in rows:
            cols = row.find_all('td')
            cols = [ele.text.strip() for ele in cols]
            data.append([ele for ele in cols if ele]) # Get rid of empty values

In [4]:
#converts list to dataframe
popstates2020 = pd.DataFrame (data, columns = ['State', 'Population','A','B','C'])
#delete columns
del popstates2020['A']
del popstates2020['B']
del popstates2020['C']
#replace char and convert the column of type int
popstates2020['Population'] = popstates2020['Population'].str.replace(',', '').astype(int)
#resets index and deletes column
popstates2020=popstates2020.reset_index()
del popstates2020['index']

## Choropleth Maps for all US States

In [5]:
#US codes w/ D.C
code=['AL', 'AK','AZ', 'AR', 'CA' ,'CO', 'CT','DC', 'DE', 'FL', 'GA', 'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 
     'KY', 'LA', 'ME', 'MD', 'MA', 'MI', 'MN', 'MS', 'MO','MT', 'NE', 'NV', 'NH', 'NJ', 'NM', 'NY', 'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY']

In [6]:
#US codes w/o D.C
code2=['AL', 'AK','AZ', 'AR', 'CA' ,'CO', 'CT','DE', 'FL', 'GA', 'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 
     'KY', 'LA', 'ME', 'MD', 'MA', 'MI', 'MN', 'MS', 'MO','MT', 'NE', 'NV', 'NH', 'NJ', 'NM', 'NY', 'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY']

### 2020 Population choropleth maps

In [15]:
#heatmap for 2020 population per state
fig = px.choropleth(popstates2020, locations=code2,
                    locationmode="USA-states", color='Population',color_continuous_scale="Viridis", scope="usa", title='United States Population')
#on click of state goes to heatmap for that state and its county
#fig.update_layout(width=1000, height=600)
fig.write_html('unitedstates/United States Population.html')
fig.show()

In [17]:
fig.write_image("unitedstates/United States Population.png")

### Cancer per US State choropleth map 

In [13]:
#heatmap for number of cancer per state
fig = px.choropleth(states, locations=code,
                    locationmode="USA-states", color='All sites',color_continuous_scale="Viridis", scope="usa", title='Cancer per State from 2016-2020')
#on click of state goes to heatmap for that state and its county
fig.show()
fig.write_html('unitedstates/Cancer per State from 2016-2020.html')

In [14]:
fig.write_image("unitedstates/Cancer per State from 2016-2020.png")

In [13]:
#https://cancerstatisticscenter.cancer.org/#!/state/New%20York

# Annual Cancer per County in NY 

In [6]:
#read in pdf of annual cancer cases per county in nys
df=tabula.read_pdf("newyork/volume1.pdf", pages="all")  
#pages of pdfs



In [8]:
# extracts data for given borough
def extract(i,name):
    nys=df[i].iloc[:, [0, 1]]
    nys=nys[3:]
    nys.columns = nys.iloc[0]
    nys = nys[1:]
    nys.rename(columns={nys.columns[1]: name},inplace=True)
    site=nys.iloc[:, [0]]
    nys=nys[name].str.split(' ', expand=True)
    nys['Site of Cancer']=site
    del nys[1]
    del nys[2]
    nys=nys.rename(columns={0:name})
    first_column = nys.pop('Site of Cancer')
    nys.insert(0, 'Site of Cancer', first_column)
    return nys
        
        
            

In [9]:
#list of county names
county_name=['Bronx','Brooklyn','Manhattan','Queens','Staten Island','Albany','Allegany','Broome','Cattaraugus','Cayuga','Chautauqua','Chemung','Chenago','Clinton','Columbia','Cortland','Delaware','Dutchess','Erie','Essex','Franklin','Fulton','Genesee','Greene','Hamilton','Herkimer','Jefferson','Lewis','Livingston','Madison','Monroe','Montgomery','Nassau','Niagra','Oneida','Onondaga','Ontario','Orange','Orleans','Oswego','Otsego','Putnam','Rensselaer','Rockland','St. Lawrence','Saratoga','Schenectady','Schoharie','Schuyler','Seneca','Steuben','Suffolk','Sullivan','Tioga','Tompkins','Ulster','Warren','Washington','Wayne','Westchester','Wyoming','Yates']
#merge dfs together
i=3
j=0
while(i<65):
    c_county=extract(i,county_name[j])
    if(j>0):
        county=pd.merge(county, c_county, on=['Site of Cancer'])
    else:
        county=c_county
    i+=1
    j+=1

In [12]:
#Tranposes df (changes rows to columns)
county_NY=county.T
county_NY.columns = county_NY.iloc[0]
county_NY = county_NY[1:]

#Reset index and rename columns
county_NY=county_NY.reset_index()
county_NY=county_NY.rename(columns={'index':'County'})

In [13]:
#converts data to float
county_NY=county_NY.set_index('County')
for i in county_NY.columns:
    county_NY[i] = county_NY[i].str.replace(',', '')
    county_NY[i]=county_NY[i].astype(float)
    
county_NY=county_NY.reset_index()

In [30]:
county_NY.to_excel("newyork/New York Cancer Incidences from 2016-2020.xlsx")

In [7]:
#POPULATION DATA

#uses beautiful soup package to go through website and extract out table
#appends data to list
url='https://www.health.ny.gov/statistics/cancer/registry/appendix/countypop.htm'
data=[]
async with async_playwright() as p:
    browser = await p.chromium.launch(headless=False)
    page = await browser.new_page()
    await page.goto(url)
    time.sleep(6)
    html = await page.content()
    soup = bs(html, 'html.parser')
    #print(soup)
    for d in soup.find_all('div',{'id':"content"}):
        table = d.find('table', attrs={'class':'light_table right'})
        table_body = table.find('tbody')
        rows = table_body.find_all('tr')
        for row in rows:
            cols = row.find_all('td')
            cols = [ele.text.strip() for ele in cols]
            data.append([ele for ele in cols if ele]) # Get rid of empty values


In [8]:
#slice list
d1=data[3:]
#converts list to dataframe
pop= pd.DataFrame (d1, columns = ['Males','Female','Total Population'])
county_name=['Bronx','Brooklyn','Manhattan','Queens','Staten Island','Albany','Allegany','Broome','Cattaraugus','Cayuga','Chautauqua','Chemung','Chenago','Clinton','Columbia','Cortland','Delaware','Dutchess','Erie','Essex','Franklin','Fulton','Genesee','Greene','Hamilton','Herkimer','Jefferson','Lewis','Livingston','Madison','Monroe','Montgomery','Nassau','Niagra','Oneida','Onondaga','Ontario','Orange','Orleans','Oswego','Otsego','Putnam','Rensselaer','Rockland','St. Lawrence','Saratoga','Schenectady','Schoharie','Schuyler','Seneca','Steuben','Suffolk','Sullivan','Tioga','Tompkins','Ulster','Warren','Washington','Wayne','Westchester','Wyoming','Yates']
#assign list to column in dataframe
pop['Counties']=county_name

In [9]:
#replace columns chars w another char and convert to type int
pop['Males'] = pop['Males'].str.replace(',', '').astype(int)
pop['Female'] = pop['Female'].str.replace(',', '').astype(int)
pop['Total Population'] = pop['Total Population'].str.replace(',', '').astype(int)
#pop['Males']=pop['Males'].astype(float)

#sort values in column by descending
pop=pop.sort_values('Total Population', ascending=False)
#sort values in column by ascending
pop=pop.sort_values('Counties')

# Choropleth Maps

## Cancer per County NY

In [10]:
#Fips of NY counties
NY_fips=['36001','36003','36005','36047','36007','36009','36011','36013','36015','36017','36019','36021','36023','36025','36027','36029','36031','36033',
        '36035','36037','36039','36041','36043','36045','36049','36051','36053','36061','36055','36057','36059','36063','36065','36067',
        '36069','36071','36073','36075','36077','36079','36081','36083','36087','36091','36093','36095','36097','36099','36089','36085','36101',
        '36103','36105','36107','36109','36111','36113','36115','36117','36119','36121','36123']

In [11]:
#heatmap for 2016-2020 cancer per county in NY


county_NY=county_NY.sort_values('County')
#grabs county names
county_val=county_NY['County'].tolist()
values=county_NY['All Invasive Malignant Tumors'].tolist()
endpts = list(np.mgrid[min(values):max(values):6j])
fig =ff.create_choropleth(fips=NY_fips,values=values,scope=["New York"],binning_endpoints=endpts, show_state_data=True,county_outline={'color': 'rgb(255,255,255)', 'width': 0.5},round_legend_values=True,title='Count of Cancer per County in New York from 2016-2020')
#on click of state goes to heatmap for that state and its county
py.offline.plot(fig,
                filename='choropleth_newyork',
                include_plotlyjs='https://cdn.plot.ly/plotly-1.42.3.min.js')

'choropleth_newyork.html'

## Population per County NY

In [12]:
values=pop['Total Population'].tolist()
endpts = list(np.mgrid[min(values):max(values):6j])
#heatmap for 2016-2020 population per county in NY
fig =ff.create_choropleth(fips=NY_fips,values=values,scope=["New York"],binning_endpoints=endpts, show_state_data=True,county_outline={'color': 'rgb(255,255,255)', 'width': 0.5},round_legend_values=True,title='Count of Pop per County in New York from 2016-2020')
#on click of state goes to heatmap for that state and its county
py.offline.plot(fig,
                filename='choropleth_pop_newyork',
                include_plotlyjs='https://cdn.plot.ly/plotly-1.42.3.min.js')

'choropleth_pop_newyork.html'

# Most common cancer for all 62 counties : Lung and Bronchus

In [14]:
max_county_ny=county_NY['County']
county_NY_cluster=county_NY.copy()
del county_NY_cluster['County']
del county_NY_cluster['All Invasive Malignant Tumors']

In [25]:
county_NY_cluster

Site of Cancer,Oral cavity and pharynx,Esophagus,Colorectal,Colon excluding rectum,Rectum & rectosigmoid,Liver / intrahepatic bile duct,Pancreas,Lung and bronchus,Melanoma of the skin,Female breast,...,Ovary,Prostate,Urinary bladder (incl. in situ),Kidney and renal pelvis,Brain and other central nervous system,Thyroid,Hodgkin lymphoma,Non-Hodgkin lymphomas,Myeloma,Leukemias
0,150.4,48.0,521.2,365.4,155.8,218.2,200.8,644.4,58.4,,...,,,193.4,226.8,75.4,203.8,41.2,277.2,171.4,178.0
1,254.8,83.8,1047.2,740.8,306.4,276.4,384.6,1209.0,230.2,,...,,,427.8,404.0,164.4,543.0,82.0,511.4,248.6,351.2
2,217.0,63.8,614.6,421.4,193.2,197.0,263.2,882.4,340.2,,...,,,345.0,238.0,117.2,326.4,51.0,424.8,155.2,274.4
3,249.0,92.0,1000.2,705.2,295.0,275.4,369.2,1190.4,198.4,,...,,,431.6,393.8,132.2,492.4,60.8,492.6,216.0,341.4
4,60.4,28.4,240.6,177.4,63.2,65.2,94.2,384.8,84.2,,...,,,144.4,126.0,36.4,164.4,16.0,138.4,55.0,98.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57,12.8,6.4,32.2,21.6,10.6,8.6,14.4,69.4,17.8,,...,,,22.0,15.4,6.6,12.4,0.6,18.2,9.8,13.4
58,16.4,6.6,52.0,35.0,17.0,9.2,17.6,96.0,34.0,,...,,,32.0,28.4,7.2,11.2,2.2,30.0,9.0,21.2
59,117.8,43.6,409.4,291.2,118.2,100.0,193.2,583.8,222.2,,...,,,270.4,188.8,71.8,198.8,31.8,277.8,111.2,225.2
60,6.6,3.2,21.6,12.8,8.8,4.2,9.6,35.8,16.6,,...,,,14.4,7.8,2.4,6.2,1.2,13.2,3.6,10.8


In [15]:
#grabs the most prevalent cancer in all the counties
max_county_ny['Area Site']=county_NY_cluster.idxmax(axis=1)
max_ny_num=max_county_ny.groupby(max_county_ny['Area Site']).count()

In [26]:
max_county_ny

0                                                        Bronx
1                                                     Brooklyn
2                                                    Manhattan
3                                                       Queens
4                                                Staten Island
                                   ...                        
58                                                       Wayne
59                                                 Westchester
60                                                     Wyoming
61                                                       Yates
Area Site    0     Lung and bronchus
1     Lung and bronchu...
Name: County, Length: 63, dtype: object

In [40]:
maxx=county_NY['County'].to_list()
maxx2=county_NY_cluster.idxmax(axis=1).tolist()

In [41]:
max_case=pd.DataFrame()
max_case['County']=maxx
max_case['Cancer Site']=maxx2

In [42]:
max_case.to_excel('Max check.xlsx')

In [31]:
max_ny_num

Lung and bronchus    62
Name: County, dtype: int64

# County with the most type of each cancer

In [27]:
#makes each column float values

county=county.set_index('Site of Cancer')

for i in county.columns:
    county[i] = county[i].str.replace(',', '')
    county[i]=county[i].astype(float)

county=county.reset_index()

In [28]:
#drops indexes
county=county.drop([10,11,12,13,14])
county=county.set_index('Site of Cancer')

In [29]:
#grabs top county that has the most accounts of a certain type of cancer

def top(c):
    # Define the number of top values to return
    n = 1
    i=[]
    t=[]
    # Iterate over each row of the DataFrame
    for index, row in c.iterrows():
        # Get the top-n highest values in the row
        top_n = row.nlargest(n)
        i.append(index)
        t.append(list(top_n.index))


        # Print the names of the top-n highest-value columns
        #print(f'Top column in row {index}: {list(top_n.index)}')
        #print()
    return i,t

    

In [30]:
county

Unnamed: 0_level_0,Bronx,Brooklyn,Manhattan,Queens,Staten Island,Albany,Allegany,Broome,Cattaraugus,Cayuga,...,Sullivan,Tioga,Tompkins,Ulster,Warren,Washington,Wayne,Westchester,Wyoming,Yates
Site of Cancer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
All Invasive Malignant Tumors,6325.0,12018.8,8601.8,11454.4,3041.6,1986.4,299.0,1311.8,527.4,554.6,...,464.8,347.2,494.8,1168.8,515.2,443.2,649.8,5767.4,285.6,180.0
Oral cavity and pharynx,150.4,254.8,217.0,249.0,60.4,57.6,8.0,35.4,16.8,11.4,...,15.0,8.6,12.2,39.6,14.8,12.8,16.4,117.8,6.6,5.8
Esophagus,48.0,83.8,63.8,92.0,28.4,22.6,3.0,15.6,6.2,7.8,...,5.2,4.4,7.6,17.2,8.4,6.4,6.6,43.6,3.2,3.0
Colorectal,521.2,1047.2,614.6,1000.2,240.6,140.0,26.0,92.6,38.8,37.4,...,40.2,29.6,26.6,98.0,32.4,32.2,52.0,409.4,21.6,9.2
Colon excluding rectum,365.4,740.8,421.4,705.2,177.4,102.2,18.6,63.6,26.6,25.8,...,27.4,20.2,19.4,66.0,23.8,21.6,35.0,291.2,12.8,5.6
Rectum & rectosigmoid,155.8,306.4,193.2,295.0,63.2,37.8,7.4,29.0,12.2,11.6,...,12.8,9.4,7.2,32.0,8.6,10.6,17.0,118.2,8.8,3.6
Liver / intrahepatic bile duct,218.2,276.4,197.0,275.4,65.2,37.8,3.6,24.0,6.2,5.8,...,10.0,5.4,7.2,26.6,8.0,8.6,9.2,100.0,4.2,1.8
Pancreas,200.8,384.6,263.2,369.2,94.2,64.0,9.0,45.8,14.2,13.4,...,17.4,7.0,15.6,34.6,12.6,14.4,17.6,193.2,9.6,6.0
Lung and bronchus,644.4,1209.0,882.4,1190.4,384.8,277.6,45.6,170.2,78.2,91.2,...,71.0,44.2,59.2,166.8,76.4,69.4,96.0,583.8,35.8,25.0
Melanoma of the skin,58.4,230.2,340.2,198.4,84.2,87.0,16.6,63.2,28.8,23.4,...,16.6,19.2,26.4,42.4,28.0,17.8,34.0,222.2,16.6,9.8


In [18]:
#call function from above and put it in df

top_cancer_county=pd.DataFrame(columns=['Site','County'])
top_cancer_county['Site'],top_cancer_county['Site of Cancer']=top(county)
del top_cancer_county['County']
top_cancer_county=top_cancer_county.rename(columns={'Site of Cancer':'County'})

#converts list to string
top_cancer_county['County'] = [','.join(map(str, l)) for l in top_cancer_county['County']]
top_cancer_county=top_cancer_county[1:]

In [19]:
top_cancer_county

Unnamed: 0,Site,County
1,Oral cavity and pharynx,Brooklyn
2,Esophagus,Queens
3,Colorectal,Brooklyn
4,Colon excluding rectum,Brooklyn
5,Rectum & rectosigmoid,Brooklyn
6,Liver / intrahepatic bile duct,Brooklyn
7,Pancreas,Brooklyn
8,Lung and bronchus,Suffolk
9,Melanoma of the skin,Suffolk
10,Urinary bladder (incl. in situ),Suffolk


In [20]:
#group by to get counts per county
top_county_per_site=top_cancer_county.groupby(top_cancer_county['County']).count()
print(top_county_per_site)

          Site
County        
Brooklyn    12
Queens       1
Suffolk      4


In [21]:
top_cancer_county.loc[top_cancer_county['County']=='Suffolk']

Unnamed: 0,Site,County
8,Lung and bronchus,Suffolk
9,Melanoma of the skin,Suffolk
10,Urinary bladder (incl. in situ),Suffolk
17,Leukemias,Suffolk


# Florida Extraction

In [22]:
fl1=tabula.read_pdf("florida/Florida 2016 cases.pdf", pages="all")  
fl2=tabula.read_pdf("florida/Florida 2017 cases.pdf", pages="all")  
fl3=tabula.read_pdf("florida/Florida 2018 cases.pdf", pages="all")  
fl4=tabula.read_pdf("florida/Florida 2019 cases.pdf", pages="all")  
fl5=tabula.read_pdf("florida/Florida 2020 cases.pdf", pages="all")  

In [23]:
#grabs data from pdf and put in df
def florida_extract(fl):
    florida=fl[0]
    florida.columns = florida.iloc[0]
    florida = florida[1:]
    florida.rename(columns={florida.columns[0]: 'County'},inplace=True)
    florida=florida[1:]
    florida=florida.replace('^','0')
    #florida['Cancers'] = florida['Cancers'].str.replace(',', '').astype(int)
    return florida

In [24]:
#merges dfs
fl1=florida_extract(fl1)
fl2=florida_extract(fl2)
fl3=florida_extract(fl3)
fl4=florida_extract(fl4)
fl5=florida_extract(fl5)

fl1=fl1.rename(columns={'All Cancers':'Cancers'})

florida=pd.DataFrame()

k=0
for i in fl1.columns:
    if(k!=0):
        fl1[i] = fl1[i].str.replace(',', '').astype(int)
        fl2[i] = fl2[i].str.replace(',', '').astype(int)
        fl3[i] = fl3[i].str.replace(',', '').astype(int)
        fl4[i] = fl4[i].str.replace(',', '').astype(int)
        fl5[i] = fl5[i].str.replace(',', '').astype(int)
        florida[i]=fl1[i]+fl2[i]+fl3[i]+fl4[i]+fl5[i]
    else:
        florida[i]=fl1[i]
    k+=1
    

In [32]:
pop_fl=pd.read_excel('florida/Florida Population 2016-2020.xlsx')

CO-EST2019-ANNRES-12


In [34]:
new_header = pop_fl.iloc[0] #grab the first row for the header
pop_fl = pop_fl[1:] #take the data less the header row
pop_fl.columns = new_header #set the header row as the df header

In [35]:
pop_fl

Unnamed: 0,Florida,2016,2017,2018,2019,Total
1,".Alachua County, Florida",263959,266309,268851,269043,1068162
2,".Baker County, Florida",27884,28254,28353,29210,113701
3,".Bay County, Florida",183634,184736,186240,174705,729315
4,".Bradford County, Florida",26740,27142,27752,28201,109835
5,".Brevard County, Florida",576874,587769,595203,601942,2361788
...,...,...,...,...,...,...
63,".Union County, Florida",15238,15448,15328,15237,61251
64,".Volusia County, Florida",528453,537868,546101,553284,2165706
65,".Wakulla County, Florida",31882,32050,32413,33739,130084
66,".Walton County, Florida",65421,68021,70732,74071,278245


In [25]:
florida.to_csv('Florida Cancer 2016-2020.csv')

# Choropleth Map of Cancer per Florida County

In [39]:
fips=['12001', '12003', '12005', '12007', '12009', '12011', '12013', '12015', '12017', '12019', '12021', '12023','12027', '12029', '12031','12033','12035','12037','12039','12041','12043','12045','12047','12049','12051','12053','12055',
      '12057','12059','12061','12063','12065','12067','12069','12071','12073','12075','12077','12079','12081','12083',
      '12085','12086','12087','12089','12091','12093','12095','12097','12099','12101','12103','12105','12107','12109','12111',
      '12113','12115','12117','12119','12121','12123','12125','12127','12129','12131','12133']

In [27]:
county_fl=florida['County'].tolist()
values=florida['Cancers'].tolist()
endpts = list(np.mgrid[min(values):max(values):6j])
fig =ff.create_choropleth(fips=fips,values=values,scope=["Florida"],binning_endpoints=endpts, show_state_data=True,county_outline={'color': 'rgb(255,255,255)', 'width': 0.5},round_legend_values=True,title='Count of Cancer per County in Florida from 2016-2020')
#on click of state goes to heatmap for that state and its county
py.offline.plot(fig,
                filename='choropleth_florida',
                include_plotlyjs='https://cdn.plot.ly/plotly-1.42.3.min.js')

'choropleth_florida.html'

In [40]:
county_fl=pop_fl['Florida'].tolist()
values=pop_fl['Total'].tolist()
endpts = list(np.mgrid[min(values):max(values):6j])
fig =ff.create_choropleth(fips=fips,values=values,scope=["Florida"],binning_endpoints=endpts, show_state_data=True,county_outline={'color': 'rgb(255,255,255)', 'width': 0.5},round_legend_values=True,title='Population per County in Florida from 2016-2020')
#on click of state goes to heatmap for that state and its county
py.offline.plot(fig,
                filename='choropleth_floridapop',
                include_plotlyjs='https://cdn.plot.ly/plotly-1.42.3.min.js')

'choropleth_floridapop.html'

In [42]:
fig.write_image('florida/Population Florida.png')

In [19]:
cancer_fl=florida.sort_values('Cancers', ascending=False)
cancer_fl.to_excel("florida/Cancer per County.xlsx")
cancer_fl.to_html("florida/Cancer per County.html")


# Showing how many counties cancers are most prevalent in

In [32]:
#create df and fill column with existing data from another df
max_florida=pd.DataFrame(columns=['County'])
max_florida['County']=florida['County']

In [35]:
#creates copy of df
florida_cluster=florida.copy()
del florida_cluster['County']
del florida_cluster['Cancers']

#grabs each cancer site and counts how many counties it's the most prevalent
max_florida['Area Site']=florida_cluster.idxmax(axis=1)
max_florida_num=max_florida.groupby(max_florida['Area Site']).count()

In [40]:
max_florida_num.sort_values('County', ascending=False)

Unnamed: 0_level_0,County
Area Site,Unnamed: 1_level_1
Bronchus,50
Breast,15
Melanoma,1
Prostate,1


In [41]:
max_florida

Unnamed: 0,County,Area Site
2,Alachua,Breast
3,Baker,Bronchus
4,Bay,Bronchus
5,Bradford,Bronchus
6,Brevard,Bronchus
...,...,...
64,Union,Prostate
65,Volusia,Bronchus
66,Wakulla,Bronchus
67,Walton,Bronchus
