In [2]:
import matplotlib.pyplot as plt
import pandas as pd 
import seaborn as sns 
import numpy as np 
import scipy.stats 
import plotly.express as px
import sys
import os
import dotenv

In [3]:
# Make the project's ``src`` directory importable. It is resolved relative to
# the notebook's working directory as <cwd>/../src.
current_dir = os.getcwd()
project_dir = os.path.abspath(os.path.join(current_dir, os.pardir))
src_path = os.path.join(project_dir, 'src')
# Guard the append so re-running this cell does not stack duplicate entries
# on sys.path.
if src_path not in sys.path:
    sys.path.append(src_path)

# Imported here rather than in the top import cell because ``utils`` is
# expected to live under ``src_path``, which was only just added to sys.path.
from utils import load_env_vars
# Presumably loads the project's environment variables so that later cells can
# read them via os.getenv -- TODO confirm against the utils module.
load_env_vars()

# Importing the Dataset

In [33]:
# The CSV location is read from the ``us_immigration_csv`` environment variable.
us_immigration_csv = os.getenv('us_immigration_csv')
# os.getenv returns None when the variable is unset; fail with an actionable
# message instead of letting pd.read_csv reject ``None`` with an opaque error.
if us_immigration_csv is None:
    raise ValueError(
        "Environment variable 'us_immigration_csv' is not set; "
        "it must hold the path to the US immigration CSV file."
    )
df = pd.read_csv(us_immigration_csv)

# Initial Observations & Cleaning

In [34]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,Year,Immigrants Obtaining Lawful Permanent Resident Status,Refugee Arrivals,Noncitizen Apprehensions,Noncitizen Removals,Noncitizen Returns
0,1980,524295,207116,910361,18013,719211
1,1981,595014,159252,975780,17379,823875
2,1982,533624,98096,970246,15216,812572
3,1983,550052,61218,1251357,19211,931600
4,1984,541811,70393,1246981,18696,909833


In [35]:
# Rename the verbose source headers to short snake_case identifiers.
# The result is reassigned instead of using ``inplace=True`` so the cell reads
# as an explicit "old frame -> new frame" step. Headers that are not present
# are ignored by ``rename``, so the cell is safe to re-run.
df = df.rename(
    columns={
        "Immigrants Obtaining Lawful Permanent Resident Status": "lawful_permanent_resident_obt",
        "Refugee Arrivals": "refugee_arrivals",
        "Noncitizen Apprehensions": "noncitizen_apprehensions",
        "Noncitizen Removals": "noncitizen_removals",
        "Noncitizen Returns": "noncitizen_returns",
    }
)

Let's check the data type of each column.

In [36]:
# Show the dtype of every column in the frame.
df.dtypes

Year                              int64
lawful_permanent_resident_obt    object
refugee_arrivals                 object
noncitizen_apprehensions         object
noncitizen_removals              object
noncitizen_returns               object
dtype: object

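Every column except `Year` is stored as `object` (string) dtype, so these columns need to be converted to integers. Commas are stripped from the values first so that the cast succeeds.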

In [37]:
# Columns holding counts that must become integers.
count_cols = [
    "lawful_permanent_resident_obt",
    "refugee_arrivals",
    "noncitizen_apprehensions",
    "noncitizen_removals",
    "noncitizen_returns",
]

# Strip commas from every string cell (regex replace leaves numeric columns
# untouched), then cast all count columns in a single ``astype`` call.
# "int64" is spelled out because the bare "int" alias is platform-dependent on
# older NumPy versions (32-bit on Windows).
df = df.replace(",", "", regex=True).astype({col: "int64" for col in count_cols})
df.head()

Unnamed: 0,Year,lawful_permanent_resident_obt,refugee_arrivals,noncitizen_apprehensions,noncitizen_removals,noncitizen_returns
0,1980,524295,207116,910361,18013,719211
1,1981,595014,159252,975780,17379,823875
2,1982,533624,98096,970246,15216,812572
3,1983,550052,61218,1251357,19211,931600
4,1984,541811,70393,1246981,18696,909833


In [38]:
# Every column except the first one gets thousands separators.
cols_add_commas = df.columns[1:].tolist()
# NOTE(review): the formatter returns strings, so these columns stop being
# numeric after this cell -- confirm that no later cell needs them as integers.
formatted_counts = df[cols_add_commas].map("{:,}".format)
df[cols_add_commas] = formatted_counts