In [1]:
import re
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup
from lxml import html

# --- Load and clean the Wikipedia table of US cities by population ---------
url = "https://en.wikipedia.org/wiki/List_of_United_States_cities_by_population"

# Position of the population table among all <table> elements on the page.
# NOTE(review): this index depends on the page layout at scrape time; if the
# page is reorganised a different table will be picked up — confirm on re-run.
POPULATION_TABLE_INDEX = 4

# Bound the wait and fail loudly on an HTTP error instead of parsing an
# error page as if it were the article.
res = requests.get(url, timeout=30)
res.raise_for_status()

soup = BeautifulSoup(res.content, "lxml")
table = soup.find_all("table")[POPULATION_TABLE_INDEX]

# Positional column labels -> descriptive names.
# NOTE(review): assumes read_html yields integer column labels with the
# header text in row 0 (which is why row 0 is dropped below) — verify
# against the installed pandas version.
column_names = {
    0: "2018 rank",
    1: "City",
    2: "State",
    3: "2018 estimate",
    4: "2010 Census",
    5: "Change",
    6: "2016 land area (imperial)",
    7: "2016 land area (metric)",
    8: "2016 population density (imperial)",
    9: "2016 population density (metric)",
    10: "Location",
}

# Whole-cell substitutions for city names; presumably chosen so the names
# match the city-data.com URLs that are built from them later in this cell.
city_aliases = {
    "New York City": "New York",
    "Washington, D.C.": "Washington",
    "Nashville": "Nashville Davidson",
}

df = (
    # read_html expects a file-like object; passing literal HTML text is
    # deprecated in recent pandas, so wrap the markup in StringIO.
    pd.read_html(StringIO(str(table)))[0]
    .rename(columns=column_names)
    .drop(index=0)
    .reset_index(drop=True)
    # Strip bracketed footnote markers such as "[d]" from every cell.
    .replace(r"(\[.*?\])", "", regex=True)
    # Aliases are applied after the footnote strip so that cells which
    # carried a marker still match exactly.
    .replace(city_aliases)
)

# --- Scrape grocery-store counts and obesity rates from city-data.com ------
data_url = "http://www.city-data.com/city/"
N_CITIES = 5  # how many rows from the top of df to look up
REQUEST_TIMEOUT = 30  # seconds to wait for each page
NOT_FOUND_MARKER = "[<h2>Error 404</h2>]"  # repr of the <h2> list on the site's 404 page

invalid = []  # cities whose page could not be found
health_records = []  # one dict per city; turned into a DataFrame once, after the loop

# Columns 1 and 2 of df are the city and state; slicing (rather than indexing
# row by row) also copes with df having fewer than N_CITIES rows.
for city, state in df.iloc[:N_CITIES, [1, 2]].itertuples(index=False, name=None):
    city_url = data_url + str(city).replace(" ", "-") + "-" + str(state).replace(" ", "-") + ".html"

    # Fetch the page once and reuse the response for both parsers.
    city_data = requests.get(city_url, timeout=REQUEST_TIMEOUT)
    city_soup = BeautifulSoup(city_data.content, "lxml")

    if city_data.status_code == 404 or str(city_soup.find_all("h2")) == NOT_FOUND_MARKER:
        print("Invalid url")
        invalid.append(city)
        continue

    print(f"The data of {city}, {state} has been found.")
    tree = html.fromstring(city_data.text)
    try:
        grocery = tree.xpath('//section[@class="food-environment"]//div//b//text()')[1].replace(",", "")
        grocery_num = re.search('(\\d+)', grocery).group(1)
        obesity_rate = tree.xpath('//section[@class="food-environment"]//div[7]//table//text()')[2]
    except (IndexError, AttributeError):
        # IndexError: an xpath query returned too few nodes.
        # AttributeError: no digits were found, so re.search returned None.
        # Keep the city in the results with missing values so that it is
        # not lost when the records are later joined on "City".
        print(f"Could not parse the health data of {city}, {state}.")
        health_records.append({"City": city, "grocery number": None, "obesity rate": None})
    else:
        health_records.append({"City": city, "grocery number": grocery_num, "obesity rate": obesity_rate})
        print("done")

# Explicit columns keep the frame usable even when no city was scraped.
healthdata = pd.DataFrame(health_records, columns=["City", "grocery number", "obesity rate"])

            
# Join the scraped health figures onto the population table by city name.
# The inner join keeps only cities present in both frames; the result is
# then indexed by its "2018 rank" column.
df = (
    df.merge(healthdata, on="City", how="inner")
    .set_index("2018 rank")
)

# Show the first rows of the combined table as the cell output.
df.head()

The data of New York, New York has been found.
done
The data of Los Angeles, California has been found.
done
The data of Chicago, Illinois has been found.
done
The data of Houston, Texas has been found.
done
The data of Phoenix, Arizona has been found.
done


Unnamed: 0_level_0,City,State,2018 estimate,2010 Census,Change,2016 land area (imperial),2016 land area (metric),2016 population density (imperial),2016 population density (metric),Location,grocery number,obesity rate
2018 rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,New York,New York,8398748,8175133,+2.74%,301.5 sq mi,780.9 km2,"28,317/sq mi","10,933/km2",40°39′49″N 73°56′19″W﻿ / ﻿40.6635°N 73.9387°W,1161,15.4%
2,Los Angeles,California,3990456,3792621,+5.22%,468.7 sq mi,"1,213.9 km2","8,484/sq mi","3,276/km2",34°01′10″N 118°24′39″W﻿ / ﻿34.0194°N 118.4108°W,2084,20.4%
3,Chicago,Illinois,2705994,2695598,+0.39%,227.3 sq mi,588.7 km2,"11,900/sq mi","4,600/km2",41°50′15″N 87°40′54″W﻿ / ﻿41.8376°N 87.6818°W,1460,23.5%
4,Houston,Texas,2325502,2100263,+10.72%,637.5 sq mi,"1,651.1 km2","3,613/sq mi","1,395/km2",29°47′12″N 95°23′27″W﻿ / ﻿29.7866°N 95.3909°W,737,26.0%
5,Phoenix,Arizona,1660272,1445632,+14.85%,517.6 sq mi,"1,340.6 km2","3,120/sq mi","1,200/km2",33°34′20″N 112°05′24″W﻿ / ﻿33.5722°N 112.0901°W,501,22.5%
