### Web Scraping the Median Income Dataset

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# URL of the webpage
url = 'https://www.laalmanac.com/employment/em12c.php'

# Send a GET request to the webpage
response = requests.get(url)

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(response.text, 'html.parser')


In [3]:
soup

<!DOCTYPE html>

<html lang="en-US">
<head>
<!-- Global Site Tag (gtag.js) - Google Analytics -->
<script async="" src="https://www.googletagmanager.com/gtag/js?id=GA_TRACKING_ID"></script>
<script>
        window.dataLayer = window.dataLayer || [];
        function gtag(){dataLayer.push(arguments);}
        gtag('js', new Date());

        gtag('config', 'UA-31515633-1');
      </script>
<meta charset="utf-8"/>
<meta content="width=device-width, initial-scale=1.0" name="viewport">
<title>Median Household Income By Zip Code in Los Angeles County, California</title>
<meta content="Almanac facts, information and trivia about Los Angeles County, its people, cities and communities." name="description"/>
<meta content="Los Angeles, Los Angeles County, income, households, almanac, Agoura Hills, Alhambra, Arcadia, Artesia, Avalon, Azusa, Baldwin Park, Bell, Bellflower, Bell Gardens, Beverly Hills, Bradbury, Burbank, Calabasas, Carson, Cerritos, Claremont, Commerce, Compton, Covina, Cudahy, Cu

In [4]:
table = soup.find('table')

In [5]:
table

<table>
<col class="column-10"/>
<col class="column-73"/>
<col class="column-17"/>
<tr>
<td class="text-left">90001</td>
<td class="text-left"><a href="https://www.laalmanac.com/LA/index.php">Los Angeles</a> (South Los Angeles), Florence-Graham</td>
<td>$57,698</td>
</tr>
<tr>
<td class="text-left">90002</td>
<td class="text-left"><a href="https://www.laalmanac.com/LA/index.php">Los Angeles</a> (Southeast Los Angeles, Watts)</td>
<td>$54,221</td>
</tr>
<tr>
<td class="text-left">90003</td>
<td class="text-left"><a href="https://www.laalmanac.com/LA/index.php">Los Angeles</a> (South Los Angeles, Southeast Los Angeles)</td>
<td>$51,275</td>
</tr>
<tr>
<td class="text-left">90004</td>
<td class="text-left"><a href="https://www.laalmanac.com/LA/index.php">Los Angeles</a> (Hancock Park, Rampart Village, Virgil Village, Wilshire Center, Windsor Square)</td>
<td>$58,420</td>
</tr>
<tr>
<td class="text-left">90005</td>
<td class="text-left"><a href="https://www.laalmanac.com/LA/index.php">Los 

In [6]:
rows = table.find_all('tr')

In [7]:
rows

[<tr>
 <td class="text-left">90001</td>
 <td class="text-left"><a href="https://www.laalmanac.com/LA/index.php">Los Angeles</a> (South Los Angeles), Florence-Graham</td>
 <td>$57,698</td>
 </tr>,
 <tr>
 <td class="text-left">90002</td>
 <td class="text-left"><a href="https://www.laalmanac.com/LA/index.php">Los Angeles</a> (Southeast Los Angeles, Watts)</td>
 <td>$54,221</td>
 </tr>,
 <tr>
 <td class="text-left">90003</td>
 <td class="text-left"><a href="https://www.laalmanac.com/LA/index.php">Los Angeles</a> (South Los Angeles, Southeast Los Angeles)</td>
 <td>$51,275</td>
 </tr>,
 <tr>
 <td class="text-left">90004</td>
 <td class="text-left"><a href="https://www.laalmanac.com/LA/index.php">Los Angeles</a> (Hancock Park, Rampart Village, Virgil Village, Wilshire Center, Windsor Square)</td>
 <td>$58,420</td>
 </tr>,
 <tr>
 <td class="text-left">90005</td>
 <td class="text-left"><a href="https://www.laalmanac.com/LA/index.php">Los Angeles</a> (Hancock Park, Koreatown, Wilshire Center, W

In [8]:
headers = ['Zip Code', 'Community', '2022 Estimated Median Income']

# Extract rows from the table
rows = []
for row in table.find_all('tr')[0:]:  # Skip the header row
    cols = row.find_all('td')
    cols = [ele.get_text(strip=True) for ele in cols]
    rows.append(cols)

In [9]:
df = pd.DataFrame(rows, columns=headers)

In [10]:
df

Unnamed: 0,Zip Code,Community,2022 Estimated Median Income
0,90001,"Los Angeles(South Los Angeles), Florence-Graham","$57,698"
1,90002,"Los Angeles(Southeast Los Angeles, Watts)","$54,221"
2,90003,"Los Angeles(South Los Angeles, Southeast Los A...","$51,275"
3,90004,"Los Angeles(Hancock Park, Rampart Village, Vir...","$58,420"
4,90005,"Los Angeles(Hancock Park, Koreatown, Wilshire ...","$49,226"
...,...,...,...
278,93551,"City Ranch,Leona Valley,Palmdale","$107,133"
279,93552,Palmdale,"$85,890"
280,93553,"Juniper Hills, Pearblossom","$71,587"
281,93563,"Pearblossom, Valyermo","No 2022 estimate ($210,600 in 2021)†"


In [11]:
df.to_csv("LA_2022_median_salary.csv",index=False)

### Web Scraping for Population by Race Dataset

In [12]:
# URL of the webpage
url = "https://www.laalmanac.com/population/po24la_zip.php"

# Send a GET request to the webpage
response = requests.get(url)

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(response.text, 'html.parser')

In [13]:
table = soup.find('table')

In [14]:
table

<table>
<col class="column-10"/>
<col class="column-10"/>
<col class="column-10"/>
<col class="column-10"/>
<col class="column-10"/>
<col class="column-10"/>
<col class="column-10"/>
<col class="column-10"/>
<col class="column-10"/>
<col class="column-10"/>
<thead>
<tr>
<th class="text-left" rowspan="2">Zip Code</th>
<th rowspan="2">Total Population, All Races</th>
<th class="text-left" colspan="6">Population of One Race Alone <sup>†</sup></th>
<th rowspan="2">Two or More Races</th>
<th rowspan="2">Hispanic or Latino</th>
</tr>
<tr>
<th>American Indian &amp; Alaska Native</th>
<th>Asian</th>
<th>Black or African American</th>
<th>Native Hawaiian &amp; Other Pacific Islander</th>
<th>White (Not Hispanic or Latino)</th>
<th>Some Other Race</th>
</tr>
</thead>
<tbody>
<tr>
<td class="text-left"><a href="https://www.laalmanac.com/LA/index.php"><b>City of L.A.</b></a></td>
<td>3,881,041</td>
<td>40,695<br>
1.05%</br></td>
<td>458,721<br>
11.82%</br></td>
<td>332,173<br/>
8.56%</td>
<td>5,70

In [15]:
races = [
    "American Indian & Alaska Native", "Asian", "Black or African American",
    "Native Hawaiian & Other Pacific Islander", "White (Not Hispanic or Latino)",
    "Some Other Race", "Two or More Races", "Hispanic or Latino"
]

# Initialize a list to store the extracted data
data = []

# Find the table body
tbody = soup.find('tbody')

# Iterate over each row in the table body
for row in tbody.find_all('tr'):
    columns = row.find_all('td')
    zip_code = columns[0].text.strip()  # Extract Zip Code
    total_population = columns[1].text.strip()  # Extract Total Population

    # Extract race-wise population and percentages
    race_data = {}
    for i, race in enumerate(races, start=2):
        race_info = columns[i].text.split("<br/>")
        population = race_info[0].strip()
        percentage = race_info[1].strip() if len(race_info) > 1 else None
        race_data[race] = {"Population": population, "Percentage": percentage}

    # Append the extracted data to the list
    data.append({"Zip Code": zip_code, "Total Population": total_population, **race_data})

# Convert the list to a pandas DataFrame for easier manipulation
df = pd.DataFrame(data)

In [16]:
df

Unnamed: 0,Zip Code,Total Population,American Indian & Alaska Native,Asian,Black or African American,Native Hawaiian & Other Pacific Islander,White (Not Hispanic or Latino),Some Other Race,Two or More Races,Hispanic or Latino
0,City of L.A.,3881041,"{'Population': '40,695 1.05%', 'Percentage': ...","{'Population': '458,721 11.82%', 'Percentage'...","{'Population': '332,173 8.56%', 'Percentage':...","{'Population': '5,709 0.15%', 'Percentage': N...","{'Population': '1,069,141 27.55%', 'Percentag...","{'Population': '952,377 24.54%', 'Percentage'...","{'Population': '493,491 12.72%', 'Percentage'...","{'Population': '1,828,315 47.11%', 'Percentag..."
1,90001,57652,"{'Population': '1,226 2.13%', 'Percentage': N...","{'Population': '348 0.60%', 'Percentage': None}","{'Population': '4,295 7.45%', 'Percentage': N...","{'Population': '13 0.02%', 'Percentage': None}","{'Population': '373 0.65%', 'Percentage': None}","{'Population': '23,449 40.67%', 'Percentage':...","{'Population': '12,037 20.88%', 'Percentage':...","{'Population': '52,642 91.31%', 'Percentage':..."
2,90002,53108,"{'Population': '1,974 3.72%', 'Percentage': N...","{'Population': '611 1.15%', 'Percentage': None}","{'Population': '8,355 15.73%', 'Percentage': ...","{'Population': '53 0.10%', 'Percentage': None}","{'Population': '200 0.38%', 'Percentage': None}","{'Population': '20,939 39.43%', 'Percentage':...","{'Population': '8,870 16.70%', 'Percentage': ...","{'Population': '43,766 82.41%', 'Percentage':..."
3,90003,75024,"{'Population': '598 0.80%', 'Percentage': None}","{'Population': '350 0.47%', 'Percentage': None}","{'Population': '12,352 16.46%', 'Percentage':...","{'Population': '0 0.00%', 'Percentage': None}","{'Population': '354 0.47%', 'Percentage': None}","{'Population': '35,695 47.58%', 'Percentage':...","{'Population': '13,140 17.51%', 'Percentage':...","{'Population': '61,179 81.55%', 'Percentage':..."
4,90004,58833,"{'Population': '741 1.26%', 'Percentage': None}","{'Population': '14,584 24.79%', 'Percentage':...","{'Population': '2,619 4.45%', 'Percentage': N...","{'Population': '68 0.12%', 'Percentage': None}","{'Population': '12,108 20.58%', 'Percentage':...","{'Population': '18,307 31.12%', 'Percentage':...","{'Population': '5,495 9.34%', 'Percentage': N...","{'Population': '27,410 46.59%', 'Percentage':..."


The above scraped data is not in the best condition to understand or perform further EDA tasks. Therefore we will modify our scraping function to obtain a more legible data set.

In [17]:

# Step 1: Fetch the webpage content
url = "https://www.laalmanac.com/population/po24la_zip.php"
response = requests.get(url)
html_content = response.text

# Step 2: Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(html_content, 'html.parser')

# Step 3: Define the races of interest
races = [
    "American Indian & Alaska Native", "Asian", "Black or African American",
    "Native Hawaiian & Other Pacific Islander", "White (Not Hispanic or Latino)",
    "Some Other Race", "Two or More Races", "Hispanic or Latino"
]

# Step 4: Initialize a list to store the extracted data
data = []

# Step 5: Find all table bodies in the HTML
tbodies = soup.find_all('tbody')

# Step 6: Iterate over each tbody in the list
for tbody in tbodies:
    # Step 7: Iterate over each row in the current tbody
    for row in tbody.find_all('tr'):
        columns = row.find_all('td')
        if len(columns) == 0:
            continue  # Skip any empty rows

        zip_code = columns[0].text.strip()  # Extract Zip Code
        total_population = columns[1].text.strip()  # Extract Total Population

        # Extract race-wise population and percentages
        race_data = {}
        for i, race in enumerate(races, start=2):
            race_info = columns[i].get_text(separator=" ").split()
            population = race_info[0].strip()
            percentage = race_info[1].strip() if len(race_info) > 1 else None
            race_data[race] =  percentage

        # Append the extracted data to the list
        data.append({"Zip Code": zip_code, "Total Population": total_population, **race_data})

# Step 8: Convert the list to a pandas DataFrame for easier manipulation
df = pd.DataFrame(data)



In [18]:
df

Unnamed: 0,Zip Code,Total Population,American Indian & Alaska Native,Asian,Black or African American,Native Hawaiian & Other Pacific Islander,White (Not Hispanic or Latino),Some Other Race,Two or More Races,Hispanic or Latino
0,City of L.A.,3881041,1.05%,11.82%,8.56%,0.15%,27.55%,24.54%,12.72%,47.11%
1,90001,57652,2.13%,0.60%,7.45%,0.02%,0.65%,40.67%,20.88%,91.31%
2,90002,53108,3.72%,1.15%,15.73%,0.10%,0.38%,39.43%,16.70%,82.41%
3,90003,75024,0.80%,0.47%,16.46%,0.00%,0.47%,47.58%,17.51%,81.55%
4,90004,58833,1.26%,24.79%,4.45%,0.12%,20.58%,31.12%,9.34%,46.59%
...,...,...,...,...,...,...,...,...,...,...
111,91602,19980,0.42%,9.60%,7.17%,0.45%,59.40%,7.15%,11.94%,18.56%
112,91604,32073,0.29%,8.62%,4.26%,0.00%,69.01%,2.94%,11.05%,11.90%
113,91605,51654,1.46%,8.47%,2.03%,0.04%,27.24%,27.84%,16.24%,60.52%
114,91606,43552,1.29%,3.95%,5.01%,0.00%,33.76%,23.71%,14.95%,53.65%


In [19]:
columns_to_clean = df.columns[2:]  # Skip 'Zip Code' and 'Total Population'
df[columns_to_clean] = df[columns_to_clean].apply(lambda x: x.str.replace('%', '', regex=False))

# Convert the cleaned columns to numeric types (optional, if needed)
df[columns_to_clean] = df[columns_to_clean].apply(pd.to_numeric)

In [20]:
df

Unnamed: 0,Zip Code,Total Population,American Indian & Alaska Native,Asian,Black or African American,Native Hawaiian & Other Pacific Islander,White (Not Hispanic or Latino),Some Other Race,Two or More Races,Hispanic or Latino
0,City of L.A.,3881041,1.05,11.82,8.56,0.15,27.55,24.54,12.72,47.11
1,90001,57652,2.13,0.60,7.45,0.02,0.65,40.67,20.88,91.31
2,90002,53108,3.72,1.15,15.73,0.10,0.38,39.43,16.70,82.41
3,90003,75024,0.80,0.47,16.46,0.00,0.47,47.58,17.51,81.55
4,90004,58833,1.26,24.79,4.45,0.12,20.58,31.12,9.34,46.59
...,...,...,...,...,...,...,...,...,...,...
111,91602,19980,0.42,9.60,7.17,0.45,59.40,7.15,11.94,18.56
112,91604,32073,0.29,8.62,4.26,0.00,69.01,2.94,11.05,11.90
113,91605,51654,1.46,8.47,2.03,0.04,27.24,27.84,16.24,60.52
114,91606,43552,1.29,3.95,5.01,0.00,33.76,23.71,14.95,53.65


In [21]:
df.to_csv("LA_race_population.csv",index=False)