In [2]:
import matplotlib.pyplot as plt
import pandas as pd 
import seaborn as sns 
import numpy as np 
import scipy.stats 
import plotly.express as px
import sys
import os
import dotenv

In [3]:
# Make the project's `src` directory importable. It is resolved relative to the
# notebook's working directory as <cwd>/../src.
current_dir = os.getcwd()
project_dir = os.path.abspath(os.path.join(current_dir, os.pardir))
src_path = os.path.join(project_dir, 'src')
# Guard the append so that re-running this cell does not pile up duplicate entries.
if src_path not in sys.path:
    sys.path.append(src_path)

# This import must come after sys.path is extended above.
# NOTE(review): `utils` presumably lives in src/ and load_env_vars() presumably
# populates os.environ (e.g. from a .env file) — confirm against src/utils.
from utils import load_env_vars
load_env_vars()

# Importing the Dataset

In [69]:
# The CSV location is read from the `us_immigration_csv` environment variable
# (presumably set by load_env_vars() — confirm).
us_immigration_csv = os.getenv('us_immigration_csv')
# Fail early with an actionable message: os.getenv returns None for a missing
# variable, and pd.read_csv(None) raises an error that never names the variable.
if not us_immigration_csv:
    raise RuntimeError(
        "Environment variable 'us_immigration_csv' is not set; "
        "it must hold the path to the US immigration CSV file."
    )
df = pd.read_csv(us_immigration_csv)

# Initial Observations & Cleaning

In [70]:
# Preview the first five rows of the loaded frame
df.head(n=5)

Unnamed: 0,Year,Immigrants Obtaining Lawful Permanent Resident Status,Refugee Arrivals,Noncitizen Apprehensions,Noncitizen Removals,Noncitizen Returns
0,1980,524295,207116,910361,18013,719211
1,1981,595014,159252,975780,17379,823875
2,1982,533624,98096,970246,15216,812572
3,1983,550052,61218,1251357,19211,931600
4,1984,541811,70393,1246981,18696,909833


In [None]:
# Swap the verbose source headers for short snake_case column names.
renamed_columns = {
    'Immigrants Obtaining Lawful Permanent Resident Status': 'lawful_permanent_resident_obt',
    'Refugee Arrivals': 'refugee_arrivals',
    'Noncitizen Apprehensions': 'noncitizen_apprehensions',
    'Noncitizen Removals': 'noncitizen_removals',
    'Noncitizen Returns': 'noncitizen_returns',
}
# The frame is modified in place, so `df` keeps referring to the same object.
df.rename(columns=renamed_columns, inplace=True)

Let's check the data type of each column.

In [72]:
# Show the dtype pandas assigned to each column, to spot the ones that still need conversion.
df.dtypes

Year                              int64
lawful_permanent_resident_obt    object
refugee_arrivals                 object
noncitizen_apprehensions         object
noncitizen_removals              object
noncitizen_returns               object
dtype: object

Every column except `Year` was read as `object` (strings), so these columns need to be converted to integers.

In [None]:
# Columns holding counts that must become integers; `Year` is not listed and is left untouched.
count_columns = [
    'lawful_permanent_resident_obt',
    'refugee_arrivals',
    'noncitizen_apprehensions',
    'noncitizen_removals',
    'noncitizen_returns',
]
# Remove commas (presumably thousands separators — confirm against the raw CSV)
# and cast each count column to a fixed-width integer.
# - Working column by column keeps the regex off columns that do not need it.
# - 'int64' instead of 'int' gives the same dtype on every platform.
# - replace(..., regex=True) leaves already-numeric data unchanged, so the cell
#   can be re-run after the conversion has happened.
for column in count_columns:
    df[column] = df[column].replace(',', '', regex=True).astype('int64')
df.head()

Unnamed: 0,Year,lawful_permanent_resident_obt,refugee_arrivals,noncitizen_apprehensions,noncitizen_removals,noncitizen_returns
0,1980,524295,207116,910361,18013,719211
1,1981,595014,159252,975780,17379,823875
2,1982,533624,98096,970246,15216,812572
3,1983,550052,61218,1251357,19211,931600
4,1984,541811,70393,1246981,18696,909833


Let's check whether any column contains null values.

In [55]:
# Count the missing entries in each column
df.isna().sum()

Year                             0
lawful_permanent_resident_obt    0
refugee_arrivals                 0
noncitizen_apprehensions         0
noncitizen_removals              0
noncitizen_returns               0
dtype: int64

# EDA

In [87]:
from plotly.subplots import make_subplots

In [76]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,Year,lawful_permanent_resident_obt,refugee_arrivals,noncitizen_apprehensions,noncitizen_removals,noncitizen_returns
count,42.0,42.0,42.0,42.0,42.0,42.0
mean,2000.5,914286.1,73705.333333,1136983.0,179951.142857,846411.6
std,12.267844,279746.2,37346.453158,345804.6,145240.084267,483426.6
min,1980.0,524295.0,11454.0,596560.0,15216.0,100454.0
25%,1990.25,665790.0,54399.5,894499.2,33498.5,359484.0
50%,2000.5,965664.0,69914.5,1132329.0,178963.5,921695.0
75%,2010.75,1061369.0,85211.0,1318212.0,323072.75,1103602.0
max,2021.0,1826595.0,207116.0,1865379.0,432201.0,1675876.0


In [91]:
# Yearly trend of people obtaining lawful permanent resident status
lawful_permanent_resident_fig = px.line(
    data_frame=df,
    x='Year',
    y='lawful_permanent_resident_obt',
    title='US Lawful Permanent Resident Obtainees By Year',
)
lawful_permanent_resident_fig.show()

In [85]:
# Yearly trend of refugee arrivals
refugee_arrivals_fig = px.line(
    data_frame=df,
    x='Year',
    y='refugee_arrivals',
    title='US Refugee Arrivals By Year',
)
refugee_arrivals_fig.show()

In [93]:
# Yearly trend of noncitizen apprehensions.
# The title spells "Noncitizen" as one word, matching the column name and the
# titles of the removals and returns charts.
noncitizen_apprehensions_fig = px.line(
    df,
    x='Year',
    y='noncitizen_apprehensions',
    title='US Noncitizen Apprehensions By Year',
)
noncitizen_apprehensions_fig.show()

In [86]:
# Yearly trend of noncitizen removals
noncitizen_removals_fig = px.line(
    data_frame=df,
    x='Year',
    y='noncitizen_removals',
    title='US Noncitizen Removals By Year',
)
noncitizen_removals_fig.show()

In [94]:
# Yearly trend of noncitizen returns
noncitizen_returns_fig = px.line(
    data_frame=df,
    x='Year',
    y='noncitizen_returns',
    title='US Noncitizen Returns By Year',
)
noncitizen_returns_fig.show()