In [3]:
import pandas as pd 
import numpy as np
 
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.io as pio

import sys
import os
import dotenv
from utils import load_env_vars

In [4]:
# Locate <parent of the working directory>/src and make it importable.
current_dir = os.getcwd()
project_dir = os.path.abspath(os.path.join(current_dir, os.pardir))
src_path = os.path.join(project_dir, 'src')
# Guard the append so re-running this cell does not stack duplicate entries on sys.path.
if src_path not in sys.path:
    sys.path.append(src_path)

# NOTE(review): `utils` is imported in the first cell, i.e. before src_path is added
# to sys.path here. If utils.py lives under src/, that import fails on a fresh kernel
# and this path setup needs to run first - confirm where utils.py is located.
load_env_vars()

# Importing the Cleaned Dataset

In [5]:
# The CSV location is read from the 'processed_us_immigration_csv' environment
# variable (presumably populated by load_env_vars() - confirm against utils).
us_immigration_final_csv = os.getenv('processed_us_immigration_csv')
if us_immigration_final_csv is None:
    # os.getenv returns None for an unset variable; fail here with an actionable
    # message instead of letting pd.read_csv(None) raise an opaque error.
    raise ValueError(
        "Environment variable 'processed_us_immigration_csv' is not set; "
        "cannot locate the processed US immigration CSV."
    )
df = pd.read_csv(us_immigration_final_csv)

In [6]:
# Preview the first five rows to sanity-check the loaded columns and values.
df.head()

Unnamed: 0,year,lawful_permanent_resident_obt,refugee_arrivals,noncitizen_apprehensions,noncitizen_removals,noncitizen_returns,president,party,term
0,1980,524295,207116,910361,18013,719211,Jimmy Carter,Democratic,First
1,1981,595014,159252,975780,17379,823875,Ronald Reagan,Republican,First
2,1982,533624,98096,970246,15216,812572,Ronald Reagan,Republican,First
3,1983,550052,61218,1251357,19211,931600,Ronald Reagan,Republican,First
4,1984,541811,70393,1246981,18696,909833,Ronald Reagan,Republican,First


# Basic Exploration

In [6]:
# Print column dtypes and non-null counts to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41 entries, 0 to 40
Data columns (total 9 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   year                           41 non-null     int64 
 1   lawful_permanent_resident_obt  41 non-null     int64 
 2   refugee_arrivals               41 non-null     int64 
 3   noncitizen_apprehensions       41 non-null     int64 
 4   noncitizen_removals            41 non-null     int64 
 5   noncitizen_returns             41 non-null     int64 
 6   president                      41 non-null     object
 7   party                          41 non-null     object
 8   term                           41 non-null     object
dtypes: int64(6), object(3)
memory usage: 3.0+ KB


In [7]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

Unnamed: 0,year,lawful_permanent_resident_obt,refugee_arrivals,noncitizen_apprehensions,noncitizen_removals,noncitizen_returns
count,41.0,41.0,41.0,41.0,41.0,41.0
mean,2000.0,918537.0,75223.658537,1119217.0,182164.804878,862708.8
std,11.979149,281844.8,36474.413578,330125.4,146325.292532,477608.5
min,1980.0,524295.0,11840.0,596560.0,15216.0,100454.0
25%,1990.0,653206.0,56384.0,889212.0,33189.0,471798.0
50%,2000.0,973445.0,69920.0,1094719.0,183114.0,931600.0
75%,2010.0,1062040.0,85285.0,1291065.0,324303.0,1105829.0
max,2020.0,1826595.0,207116.0,1814729.0,432201.0,1675876.0


In [8]:
# Use the 'notebook_connected' renderer by default for every plotly figure.
# `pio` (plotly.io) is already imported in the notebook's import cell, so the
# duplicate import that used to sit here has been dropped.
pio.renderers.default = 'notebook_connected'

### Violin Plots

In [30]:
# Violin plot with an embedded box plot; every yearly observation is drawn as a
# point so individual years can be inspected on hover.
fig = px.violin(
    df,
    y='lawful_permanent_resident_obt',
    points='all',
    box=True,
    # Pass a list: older plotly releases only accept a list/dict of column names
    # here and reject a bare string.
    hover_data=['year'],
    title='Distribution of Lawful Permanent Resident Obtainees',
)
fig.show(renderer='notebook_connected')

In [31]:
# Violin plot with an embedded box plot; every yearly observation is drawn as a
# point so individual years can be inspected on hover.
fig = px.violin(
    df,
    y='refugee_arrivals',
    points='all',
    box=True,
    # Pass a list: older plotly releases only accept a list/dict of column names
    # here and reject a bare string.
    hover_data=['year'],
    # Title spelling corrected ("Arivals" -> "Arrivals").
    title='Distribution of Refugee Arrivals',
)
fig.show(renderer='notebook_connected')

In [32]:
# Violin plot with an embedded box plot; every yearly observation is drawn as a
# point so individual years can be inspected on hover.
fig = px.violin(
    df,
    y='noncitizen_apprehensions',
    points='all',
    box=True,
    # Pass a list: older plotly releases only accept a list/dict of column names
    # here and reject a bare string.
    hover_data=['year'],
    title='Distribution of Noncitizen Apprehensions',
)
fig.show(renderer='notebook_connected')

In [35]:
# Violin plot with an embedded box plot; every yearly observation is drawn as a
# point so individual years can be inspected on hover.
fig = px.violin(
    df,
    y='noncitizen_removals',
    points='all',
    box=True,
    # Pass a list: older plotly releases only accept a list/dict of column names
    # here and reject a bare string.
    hover_data=['year'],
    title='Distribution of Noncitizen Removals',
)
fig.show(renderer='notebook_connected')

In [36]:
# Violin plot with an embedded box plot; every yearly observation is drawn as a
# point so individual years can be inspected on hover.
fig = px.violin(
    df,
    y='noncitizen_returns',
    points='all',
    box=True,
    # Pass a list: older plotly releases only accept a list/dict of column names
    # here and reject a bare string.
    hover_data=['year'],
    title='Distribution of Noncitizen Returns',
)
fig.show(renderer='notebook_connected')

### Line Plots

In [None]:
# Yearly time series of lawful permanent resident obtainees.
fig = px.line(
    data_frame=df,
    x='year',
    y='lawful_permanent_resident_obt',
    title='Lawful Permanent Resident Obtainees by Year',
)
fig.show(renderer='notebook_connected')

There is a clear peak in the years 1989-1991. At this time, the Cold War had just ended, and in 1991 the Persian Gulf War began. These major historical events likely contributed to this peak in lawful permanent resident obtainees.

In [39]:
# Yearly time series of refugee arrivals.
fig = px.line(
    data_frame=df,
    x='year',
    y='refugee_arrivals',
    title='US Refugee Arrivals By Year',
)
fig.show(renderer='notebook_connected')

From 1983 onwards, refugee arrivals have stayed relatively constant

In [40]:
# Yearly time series of noncitizen apprehensions.
fig = px.line(
    data_frame=df,
    x='year',
    y='noncitizen_apprehensions',
    title='US Noncitizen Apprehensions By Year',
)
fig.show(renderer='notebook_connected')

Noncitizen apprehensions have had their ups and downs, but have been in a steady decline for the past 20 or so years.

In [41]:
# Yearly time series of noncitizen removals.
fig = px.line(
    data_frame=df,
    x='year',
    y='noncitizen_removals',
    title='US Noncitizen Removals By Year',
)
fig.show(renderer='notebook_connected')

Noncitizen removals stay stable until around 1995, then increase dramatically.

In [42]:
# Yearly time series of noncitizen returns.
fig = px.line(
    data_frame=df,
    x='year',
    y='noncitizen_returns',
    title='US Noncitizen Returns By Year',
)
fig.show(renderer='notebook_connected')

Noncitizen returns increase fairly steadily up until 2000, then drop dramatically

In [17]:
# (column, legend label) pairs, listed in the order they fill the 3x2 grid
# left-to-right, top-to-bottom; the sixth cell stays empty.
yearly_series = [
    ('lawful_permanent_resident_obt', 'Lawful Permanent Resident Obtainees'),
    ('refugee_arrivals', 'Refugee Arrivals'),
    ('noncitizen_apprehensions', 'Noncitizen Apprehensions'),
    ('noncitizen_removals', 'Noncitizen Removals'),
    ('noncitizen_returns', 'Noncitizen Returns'),
]

fig = make_subplots(rows=3, cols=2)
for position, (column, label) in enumerate(yearly_series):
    # divmod maps 0..4 onto zero-based (row, col) of a two-column grid.
    grid_row, grid_col = divmod(position, 2)
    fig.add_trace(
        go.Scatter(x=df['year'], y=df[column], name=label),
        row=grid_row + 1,
        col=grid_col + 1,
    )

# Replace the default hover text with "year / <series name>" on every trace.
fig.for_each_trace(
    lambda trace: trace.update(
        hovertemplate=f'year=%{{x}}<br>{trace.name}=%{{y}}<extra></extra>'
    )
)

fig.update_layout(title_text='Immigration Statistics By Year', height=700)
fig.show(renderer='notebook_connected')

In [115]:
# Overlay the three enforcement series on one set of axes for direct comparison.
enforcement_series = [
    ('noncitizen_apprehensions', 'Noncitizen Apprehensions'),
    ('noncitizen_removals', 'Noncitizen Removals'),
    ('noncitizen_returns', 'Noncitizen Returns'),
]

fig = go.Figure()
for column, label in enforcement_series:
    fig.add_trace(go.Scatter(x=df['year'], y=df[column], name=label))

# Replace the default hover text with "Year / <series name>" on every trace.
fig.for_each_trace(
    lambda trace: trace.update(
        hovertemplate=f'Year=%{{x}}<br>{trace.name}=%{{y}}<extra></extra>'
    )
)
fig.update_layout(
    title='Noncitizen Apprehensions, Removals, and Returns Over Time',
    xaxis_title='Year',
    yaxis_title='Quantity',
)
fig.show(renderer='notebook_connected')

Noncitizen apprehensions and returns track each other closely until about 2008, when they diverge. This is near the peak of the rise in noncitizen removals.

### Including Categorical Data

#### Grouping By Party

In [61]:
# Show the first rows again as a reminder of the categorical columns
# (president, party, term) used for grouping below.
df.head()

Unnamed: 0,year,lawful_permanent_resident_obt,refugee_arrivals,noncitizen_apprehensions,noncitizen_removals,noncitizen_returns,president,party,term
0,1980,524295,207116,910361,18013,719211,Jimmy Carter,Democratic,First
1,1981,595014,159252,975780,17379,823875,Ronald Reagan,Republican,First
2,1982,533624,98096,970246,15216,812572,Ronald Reagan,Republican,First
3,1983,550052,61218,1251357,19211,931600,Ronald Reagan,Republican,First
4,1984,541811,70393,1246981,18696,909833,Ronald Reagan,Republican,First


In [92]:
# Columns at positions 1-5 are the five immigration metrics; average them per party.
target_cols = df.columns[1:6].tolist()
df_aggregated_mean = df.groupby('party', as_index=False)[target_cols].mean()

# (column, legend label) pairs, listed in the order they fill the 3x2 grid
# left-to-right, top-to-bottom; the sixth cell stays empty.
party_panels = [
    ('lawful_permanent_resident_obt', 'Lawful Permanent Resident Obtainees'),
    ('refugee_arrivals', 'Refugee Arrivals'),
    ('noncitizen_apprehensions', 'Noncitizen Apprehensions'),
    ('noncitizen_removals', 'Noncitizen Removals'),
    ('noncitizen_returns', 'Noncitizen Returns'),
]

fig = make_subplots(rows=3, cols=2)
for position, (column, label) in enumerate(party_panels):
    # divmod maps 0..4 onto zero-based (row, col) of a two-column grid.
    grid_row, grid_col = divmod(position, 2)
    fig.add_trace(
        go.Bar(x=df_aggregated_mean['party'], y=df_aggregated_mean[column], name=label),
        row=grid_row + 1,
        col=grid_col + 1,
    )

# Replace the default hover text with "Party / Average <series name>" on every bar trace.
fig.for_each_trace(
    lambda trace: trace.update(
        hovertemplate=f'Party=%{{x}}<br>Average {trace.name}=%{{y}}<extra></extra>'
    )
)

fig.update_layout(title_text='Average Immigration Statistics Across Political Parties')
fig.show(renderer='notebook_connected')

Refugee Arrivals and Noncitizen Removals are the only categories where there is a significant difference between Democrats and Republicans.

#### Grouping By President

In [7]:
# target_cols holds the five immigration metric columns (positions 1-5 of df).
target_cols = df.columns[1:6].tolist()

# Mean of each metric per president; party is kept as a grouping key so it
# remains available as a column in the result.
presidents_avg = (
    df.groupby(['president', 'party'], as_index=False)[target_cols]
      .mean()
)
presidents_avg

Unnamed: 0,president,party,lawful_permanent_resident_obt,refugee_arrivals,noncitizen_apprehensions,noncitizen_removals,noncitizen_returns
0,Barack Obama,Democratic,1063590.125,69656.375,754004.875,382746.625,273305.0
1,Bill Clinton,Democratic,785061.0,88008.75,1526405.375,108705.75,1427657.375
2,Donald Trump,Republican,990726.25,29463.0,783067.25,299224.0,149748.0
3,George H.W. Bush,Republican,1356521.0,114518.25,1145134.5,35331.5,1005089.25
4,George W. Bush,Republican,1040951.25,47498.5,1157789.375,251567.375,1039538.875
5,Jimmy Carter,Democratic,524295.0,207116.0,910361.0,18013.0,719211.0
6,Ronald Reagan,Republican,578739.0,82477.5,1219893.25,21045.5,1013561.125


The above table shows the average immigration quantities for each president in the time period being studied. Note that Jimmy Carter only has one record in the original dataset while the other presidents have at least 4

In [8]:
# Rank the presidents within each metric column; ascending=False makes
# rank 1 the president with the largest average for that metric.
metric_ranks = presidents_avg[target_cols].rank(ascending=False)

# Work on a copy so the table of averages itself is left untouched.
presidents_avg_ranks = presidents_avg.copy()
presidents_avg_ranks[target_cols] = metric_ranks
# Row-wise mean of the per-metric ranks.
presidents_avg_ranks['avg_rank'] = metric_ranks.mean(axis=1)
presidents_avg_ranks

Unnamed: 0,president,party,lawful_permanent_resident_obt,refugee_arrivals,noncitizen_apprehensions,noncitizen_removals,noncitizen_returns,avg_rank
0,Barack Obama,Democratic,2.0,5.0,7.0,1.0,6.0,4.2
1,Bill Clinton,Democratic,5.0,3.0,1.0,4.0,1.0,2.8
2,Donald Trump,Republican,4.0,7.0,6.0,2.0,7.0,5.2
3,George H.W. Bush,Republican,1.0,2.0,4.0,5.0,4.0,3.2
4,George W. Bush,Republican,3.0,6.0,3.0,3.0,2.0,3.4
5,Jimmy Carter,Democratic,7.0,1.0,5.0,7.0,5.0,5.0
6,Ronald Reagan,Republican,6.0,4.0,2.0,6.0,3.0,4.2


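This table ranks the presidents on each of the averages shown in the previous table, from greatest (rank 1) to least (rank 7). Bill Clinton has the numerically lowest average rank at 2.8, meaning his averages tend to be among the largest, while Donald Trump has the numerically highest average rank at 5.2, meaning his averages tend to be among the smallest.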

In [116]:
# One bar per president (coloured by party), ordered from largest to smallest total.
fig = (
    px.histogram(
        presidents_avg,
        x='president',
        y='lawful_permanent_resident_obt',
        color='party',
        title='Average Lawful Permanent Resident Obtainees by President',
    )
    .update_layout(xaxis_title='President', yaxis_title='Quantity')
    .update_xaxes(categoryorder='total descending')
)
fig.show(renderer='notebook_connected')

In [117]:
# One bar per president (coloured by party), ordered from largest to smallest total.
fig = (
    px.histogram(
        presidents_avg,
        x='president',
        y='refugee_arrivals',
        color='party',
        title='Average Refugee Arrivals by President',
    )
    .update_layout(xaxis_title='President', yaxis_title='Quantity')
    .update_xaxes(categoryorder='total descending')
)
fig.show(renderer='notebook_connected')

In [126]:
# One bar per president (coloured by party), ordered from largest to smallest total.
fig = (
    px.histogram(
        presidents_avg,
        x='president',
        y='noncitizen_apprehensions',
        color='party',
        title='Average Noncitizen Apprehensions by President',
    )
    .update_layout(xaxis_title='President', yaxis_title='Quantity')
    .update_xaxes(categoryorder='total descending')
)
fig.show(renderer='notebook_connected')

In [127]:
# One bar per president (coloured by party), ordered from largest to smallest total.
fig = (
    px.histogram(
        presidents_avg,
        x='president',
        y='noncitizen_removals',
        color='party',
        title='Average Noncitizen Removals by President',
    )
    .update_layout(xaxis_title='President', yaxis_title='Quantity')
    .update_xaxes(categoryorder='total descending')
)
fig.show(renderer='notebook_connected')

Noncitizen removals seems to be the most divisive topic among these presidents, with the top 3 having much higher numbers than the bottom 4

In [128]:
# One bar per president (coloured by party), ordered from largest to smallest total.
fig = (
    px.histogram(
        presidents_avg,
        x='president',
        y='noncitizen_returns',
        color='party',
        title='Average Noncitizen Returns by President',
    )
    .update_layout(xaxis_title='President', yaxis_title='Quantity')
    .update_xaxes(categoryorder='total descending')
)
fig.show(renderer='notebook_connected')

Some findings from the bar charts:
* There is no clear divide in the numbers between the 2 parties. They're pretty evenly distributed in the rankings
* Noncitizen apprehensions and noncitizen returns have the same presidents making up the top 4, with almost identical numbers. We can observe that noncitizen apprehensions and returns are closely correlated
* Noncitizen removals is the most divisive topic, and seems to depend more on the person than on the party. The top 3 have much higher numbers than the bottom 4, and both groups contain Democrats as well as Republicans (top 3: one Democrat and two Republicans; bottom 4: two of each)