# Initial Visualization of data plus data preparation

## Dataset

### Dataset link: https://data.census.gov/table/ACSST1Y2024.S0502?q=immigration

In [2]:
import pandas as pd
import numpy as np
import altair as alt

immigration_df = pd.read_csv("data/immigration.csv",
    encoding="utf-8",
    thousands=",",
    na_values=["(X)"])

immigration_df.head(10)

  from pandas.core import (


Unnamed: 0,Label (Grouping),United States!!Total!!Estimate,United States!!Total!!Margin of Error,United States!!Foreign-born; Entered 2010 or later!!Estimate,United States!!Foreign-born; Entered 2010 or later!!Margin of Error,United States!!Foreign-born; Entered 2000 to 2009!!Estimate,United States!!Foreign-born; Entered 2000 to 2009!!Margin of Error,United States!!Foreign-born; Entered before 2000!!Estimate,United States!!Foreign-born; Entered before 2000!!Margin of Error
0,Foreign-born population,50234858,"±162,664",18950166,"±141,306",10604525,"±72,856",20680167,"±82,679"
1,CITIZENSHIP,,,,,,,,
2,Naturalized citizen,51.4%,±0.2,22.6%,±0.3,55.3%,±0.4,75.9%,±0.2
3,Not a citizen,48.6%,±0.2,77.4%,±0.3,44.7%,±0.4,24.1%,±0.2
4,WORLD REGION OF BIRTH OF FOREIGN-BORN,,,,,,,,
5,Foreign-born population excluding popu...,50234841,"±162,666",18950166,"±141,306",10604525,"±72,856",20680150,"±82,677"
6,Europe,9.7%,±0.1,7.5%,±0.2,8.2%,±0.2,12.4%,±0.1
7,Asia,30.8%,±0.1,32.5%,±0.3,29.7%,±0.3,29.8%,±0.2
8,Africa,5.9%,±0.1,8.3%,±0.2,6.6%,±0.2,3.4%,±0.1
9,Oceania,0.6%,±0.1,0.7%,±0.1,0.5%,±0.1,0.6%,±0.1


## Data Transformations

In [13]:
race_df = immigration_df[immigration_df["Label (Grouping)"].isin(['\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0Europe', '\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0Asia', '\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0Africa', '\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0Oceania', '\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0Latin America', '\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0Northern America'])]
race_df = race_df[["Label (Grouping)", "United States!!Total!!Estimate"]]
race_df["United States!!Total!!Estimate"] = race_df["United States!!Total!!Estimate"].apply(lambda x: float(x.replace("%", "")))
race_df

Unnamed: 0,Label (Grouping),United States!!Total!!Estimate
6,Europe,9.7
7,Asia,30.8
8,Africa,5.9
9,Oceania,0.6
10,Latin America,51.4
11,Northern America,1.7


In [14]:
gender = immigration_df[immigration_df["Label (Grouping)"].isin(['\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0Male',
                                                                 '\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0Female'
                                                                 ])]
gender = gender[["Label (Grouping)",
             "United States!!Total!!Estimate",
             "United States!!Foreign-born; Entered 2010 or later!!Estimate",
             "United States!!Foreign-born; Entered 2000 to 2009!!Estimate",
             "United States!!Foreign-born; Entered before 2000!!Estimate"]]
gender.columns = (gender.columns
                .str.replace("!!Estimate", "", regex=False)
                .str.replace("!!", " ", regex=False)
                .str.replace("(Grouping)", "", regex=False)
                .str.replace("United States Foreign-born;", "", regex=False)
                .str.strip())
gender["United States Total"] = gender["United States Total"].apply(lambda x: float(x.replace("%", "")))
gender

Unnamed: 0,Label,United States Total,Entered 2010 or later,Entered 2000 to 2009,Entered before 2000
13,Male,49.0,49.9%,48.8%,48.3%
14,Female,51.0,50.1%,51.2%,51.7%


In [15]:
age = immigration_df[immigration_df["Label (Grouping)"].isin(['\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0Under 5 years',
                                                                         '\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0to 17 years',
                                                                         '\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa018 to 24 years',
                                                                         '\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa025 to 44 years',
                                                                         '\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa045 to 54 years',
                                                                         '\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa055 to 64 years',
                                                                         '\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa065 to 74 years',
                                                                         '\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa075 to 84 years',
                                                                         '\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa085 years and over',
                                                                         ])]
age = age[["Label (Grouping)",
             "United States!!Total!!Estimate",
             "United States!!Foreign-born; Entered 2010 or later!!Estimate",
             "United States!!Foreign-born; Entered 2000 to 2009!!Estimate",
             "United States!!Foreign-born; Entered before 2000!!Estimate"]]
age.columns = (age.columns
                .str.replace("!!Estimate", "", regex=False)
                .str.replace("!!", " ", regex=False)
                .str.replace("(Grouping)", "", regex=False)
                .str.replace("United States Foreign-born;", "", regex=False)
                .str.strip())
age["United States Total"] = age["United States Total"].apply(lambda x: float(x.replace("%", "")))

age

Unnamed: 0,Label,United States Total,Entered 2010 or later,Entered 2000 to 2009,Entered before 2000
15,Under 5 years,0.8,2.1%,,
17,18 to 24 years,5.8,11.7%,6.6%,0.0%
18,25 to 44 years,34.6,49.4%,45.9%,15.2%
19,45 to 54 years,19.4,11.9%,26.3%,22.6%
20,55 to 64 years,15.8,5.9%,12.5%,26.6%
21,65 to 74 years,10.5,3.2%,5.1%,20.1%
22,75 to 84 years,5.6,1.4%,2.0%,11.4%
23,85 years and over,2.0,0.4%,0.6%,4.1%


In [16]:
race = immigration_df[immigration_df["Label (Grouping)"].isin(['\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0One race',
                                                               '\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0White',
                                                               '\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0Black or African American',
                                                               '\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0American Indian or Alaska Native',
                                                               '\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0Asian',
                                                               '\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0Native Hawaiian and Other Pacific Islander',
                                                               '\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0Some other race',
                                                               '\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0Two or more races',
                                                               '\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0Hispanic or Latino origin (of any race)',
                                                               '\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0White alone, not Hispanic or Latino'
                                                               ])]
race = race[["Label (Grouping)",
             "United States!!Total!!Estimate",
             "United States!!Foreign-born; Entered 2010 or later!!Estimate",
             "United States!!Foreign-born; Entered 2000 to 2009!!Estimate",
             "United States!!Foreign-born; Entered before 2000!!Estimate"]]
race.columns = (race.columns
                .str.replace("!!Estimate", "", regex=False)
                .str.replace("!!", " ", regex=False)
                .str.replace("(Grouping)", "", regex=False)
                .str.replace("United States Foreign-born;", "", regex=False)
                .str.strip())
race["United States Total"] = race["United States Total"].apply(lambda x: float(x.replace("%", "")))

race

Unnamed: 0,Label,United States Total,Entered 2010 or later,Entered 2000 to 2009,Entered before 2000
26,One race,77.8,77.6%,77.7%,78.1%
27,White,19.6,17.3%,17.7%,22.6%
28,Black or African American,9.4,11.5%,9.4%,7.4%
30,Asian,26.9,27.9%,26.5%,26.3%
31,Native Hawaiian and Other Pacific ...,0.3,0.3%,0.3%,0.3%
33,Two or more races,22.2,22.4%,22.3%,21.9%
34,Hispanic or Latino origin (of any race),45.1,43.0%,48.2%,45.5%
35,"White alone, not Hispanic or Latino",15.5,13.8%,13.2%,18.2%


## Visualization

In [17]:
age_chart = alt.Chart(
    age,
    title="Percentage of Total U.S. Migrants by Age Group"
).mark_bar().encode(
    x= alt.X("Label:N", title="Age Group"),
    y= alt.Y("United States Total:Q", title="Percent of total (%)"),
    color = alt.Color("Label:N")
)

race_chart = alt.Chart(
    race,
    title="Percentage of Total U.S. Migrants by Race"
).mark_bar().encode(
    x= alt.X("Label:N", title="Race"),
    y= alt.Y("United States Total:Q", title="Percent of total (%)"),
    color = alt.Color("Label:N")
)

age_chart | race_chart