# Scraping All immigration court decisions

https://trac.syr.edu/phptools/immigration/closure/

In [16]:
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By
import os

# webdriver_path =  "/Users/joaburkh/Downloads/geckodriver"
# webdriver_path = '/Users/joaburkh/Downloads/chromedriver-mac-x64/chromedriver'

ops = Options()
ops.add_argument('--headless')
browser = webdriver.Firefox(options=ops)
# browser.headless = True
browser.get('https://trac.syr.edu/phptools/immigration/closure/')
browser.implicitly_wait(15)

In [17]:
# Get top-level data list buttons
axis_buttons = browser.find_elements(By.CLASS_NAME, 'truncate')
axis_button_text = [e.text for e in axis_buttons]
for b in axis_buttons:
    print(b.text)

Immigration Court State
Custody
Represented


In [18]:
# Open first axis option list and get list entries
axis_buttons[0].click()

# Set the first axis to 'Fiscal Year of Decision'
axis_value = 'Fiscal Year Completed'
axis1_options = [e for e in browser.find_elements(By.CLASS_NAME, "truncate") if e.text not in axis_button_text]
# axis1_options = browser.find_elements(By.ID, 'headlessui-listbox-options-2')
axis1_options_text = [e.text for e in axis1_options]
axis1_options[axis1_options_text.index(axis_value)].click()

# Re-calculate axis button text
axis_button_text = [e.text for e in axis_buttons]

In [19]:
# Open second axis option list and get list entries
axis_buttons[1].click()

# Set the second axis to 'Nationality'
axis_value = 'Nationality'
axis2_options = [e for e in browser.find_elements(By.CLASS_NAME, "truncate") if e.text not in axis_button_text]
axis2_options_text = [e.text for e in axis2_options]
axis2_options[axis1_options_text.index(axis_value)].click()

# Re-calculate axis button text
axis_button_text = [e.text for e in axis_buttons]

In [20]:
# Open third axis option list and get list entries
axis_buttons[2].click()

# Set the second axis to 'Nationality'
axis_value = 'Outcome (detailed)'
axis2_options = [e for e in browser.find_elements(By.CLASS_NAME, "truncate") if e.text not in axis_button_text]
axis2_options_text = [e.text for e in axis2_options]
axis2_options[axis1_options_text.index(axis_value)].click()

# Re-calculate axis button text
axis_button_text = [e.text for e in axis_buttons]

In [21]:
# Get list of tables
tables = browser.find_elements(By.CLASS_NAME, 'table-fixed')

In [38]:
import time
# Get list of entries along first axis (to be clicked on one by one)
table1_rows = tables[0].find_elements(By.CLASS_NAME, 'flex-row')
table1_rows = [row for row in table1_rows if row.text != '' and row.text.find('All') == -1 and not any(t in row.text for t in ['2023', '2022', '2019', '2006', '2024', '2005', '2020', '2009'])]

# Initialize data dict
data = {}

# Iterate over each year
for t1_row in table1_rows:
    # Calculate year and initialize data dictionary entry
    year = int(t1_row.text.rsplit(' ', 1)[0])
    data[year] = {}

    # Click on the row
    t1_row.click()

    # Recalculate Tables
    tables = browser.find_elements(By.CLASS_NAME, 'table-fixed')

    # Get list of entries along second axis (to be clicked on one by one)
    table2_rows = tables[1].find_elements(By.CLASS_NAME, 'flex-row')
    table2_rows = [row for row in table2_rows if row.text != '' and row.text.find('All') == -1]
    
    # Iterate over each nationality for current year
    for t2_row in table2_rows:
        # Calculate Nationality
        nationality = t2_row.text.rsplit(' ', 1)[0]

        # Click on the row
        t2_row.click()

        # Recalculate Tables
        tables = browser.find_elements(By.CLASS_NAME, 'table-fixed')

        # Get a list of entries along third axis (to be copied as-is)
        time.sleep(0.2)
        table3_rows = tables[2].find_elements(By.CLASS_NAME, 'flex-row')
        table3_rows = [row for row in table3_rows if row.text != '' and row.text.find('All') == -1]
        table3_text = [row.text.replace(',', '').rsplit(' ', 1) for row in table3_rows]
        table3_text = {row[0]: int(row[1]) for row in table3_text}

        # Add to data dictionary
        data[year][nationality] = table3_text

In [50]:
# Export to HDF
import pandas as pd 
from pathlib import Path

data_df = pd.concat({k: pd.DataFrame(v).T for k, v in data.items()}, axis=0) # https://stackoverflow.com/a/54300940
data_df.to_hdf(Path.cwd() / 'TRAC_AllImmCourtDecisions_Raw.hdf', key='AllImmCourtDecisions_1998to2024')

In [51]:
data_df = pd.read_hdf('TRAC_AsylumFilings_Raw.hdf', key='AsylumFilings_2001to2023')
# data_df.rename_axis(('Year', 'Nationality'), inplace=True)                        # rename indices for clarity
data_df

Unnamed: 0,Unnamed: 1,Pending,Grant Relief,Removal Order,Other Closure,Terminate Proceedings,Voluntary Departure,Pros. Discretion
2022,Venezuela,34852.0,1088.0,399.0,319.0,67.0,23.0,
2022,Honduras,24163.0,395.0,2084.0,1189.0,243.0,70.0,13.0
2022,Cuba,25745.0,147.0,174.0,358.0,150.0,,1.0
2022,Guatemala,19222.0,434.0,1815.0,1729.0,269.0,70.0,28.0
2022,Ecuador,13052.0,802.0,3147.0,624.0,64.0,66.0,28.0
...,...,...,...,...,...,...,...,...
2010,Swaziland,,,,,,,1.0
2010,Falkland Islands,,,1.0,,,,
2010,Slovenia,,,1.0,,,,
2010,Bahrain,,,,,1.0,,


In [52]:
browser.close()

# Pre-Processing Data

In [65]:
import pandas as pd

# Import Data
data_df = pd.read_hdf('TRAC_AllImmCourtDecisions_Raw.hdf')

# Set index name
data_df.index.rename(names=['Year', 'Nationality'], level=[0, 1], inplace=True)
data_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Removal Order,Voluntary Departure,Grant Relief,Terminate Proceedings,Other Closure,No NTA Filed,Credible Fear - not found,Other Case Type Completed,Reasonable Fear - not found,Withholding O...f not granted,...,Reasonable Fear - found,Credible Fear - other,Asylum Only - other,Reasonable Fear - other,Asylum Only -...f not granted,Asylum Only - relief granted,NACARA - relief granted,NACARA - other,NACARA - relief not granted,Pros. Discretion
Year,Nationality,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2008,Mexico,81851.0,11588.0,5069.0,3106.0,1585.0,389.0,36.0,23.0,20.0,12.0,...,2.0,1.0,1.0,1.0,,,,,,
2008,El Salvador,12342.0,2207.0,2140.0,1147.0,2999.0,138.0,167.0,,27.0,32.0,...,3.0,,,1.0,2.0,,,,,
2008,Guatemala,14375.0,2549.0,1455.0,906.0,503.0,186.0,83.0,,22.0,15.0,...,1.0,,1.0,,1.0,,,,,
2008,Honduras,10431.0,1288.0,430.0,405.0,345.0,55.0,78.0,,22.0,18.0,...,3.0,1.0,1.0,,,,,,,
2008,China,2643.0,170.0,3736.0,468.0,325.0,45.0,58.0,2.0,1.0,1.0,...,2.0,,6.0,1.0,23.0,50.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021,Aruba,1.0,,,,,,,,,,...,,,,,,,,,,
2021,Vanuatu,,,,,,1.0,,,,,...,,,,,,,,,,
2021,Lesotho,,,,1.0,,,,,,,...,,,,,,,,,,
2021,Cyprus,,,,1.0,,,,,,,...,,,,,,,,,,


Make sure that every year has same country rows and every country has the same decision columns

In [66]:
# Rectify missing country rows
unique_index1 = data_df.index.unique(0)
unique_index2 = data_df.index.unique(1)

new_index = pd.MultiIndex.from_product([unique_index1, unique_index2])

data_df = data_df.reindex(new_index, axis='index')


with pd.option_context('display.max_rows', 250):
    display(data_df.loc[(2015)]['Removal Order'])

Nationality
Mexico                           27314.0
El Salvador                      10907.0
Guatemala                        13710.0
Honduras                         15819.0
China                             1527.0
Haiti                              546.0
Colombia                           419.0
Cuba                               786.0
Dominican Republic                1058.0
Brazil                             291.0
Jamaica                            551.0
Ecuador                           1106.0
Nicaragua                          434.0
India                              587.0
Philippines                        233.0
Peru                               317.0
Venezuela                          132.0
Pakistan                           138.0
Indonesia                           52.0
Canada                             196.0
Nigeria                            213.0
Russia                             124.0
Former Countries                    69.0
South Korea                        112.0
Viet

In [67]:
# Change all NaN values to 0
data_df = data_df.fillna(value=0.0)

# Re-order DF by year (ascending) and country (ascending)
data_df = data_df.sort_index()
with pd.option_context('display.max_rows', 250):
    display(data_df)


Unnamed: 0_level_0,Unnamed: 1_level_0,Removal Order,Voluntary Departure,Grant Relief,Terminate Proceedings,Other Closure,No NTA Filed,Credible Fear - not found,Other Case Type Completed,Reasonable Fear - not found,Withholding O...f not granted,...,Reasonable Fear - found,Credible Fear - other,Asylum Only - other,Reasonable Fear - other,Asylum Only -...f not granted,Asylum Only - relief granted,NACARA - relief granted,NACARA - other,NACARA - relief not granted,Pros. Discretion
Year,Nationality,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1998,Afghanistan,72.0,25.0,121.0,23.0,47.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1998,Albania,252.0,124.0,233.0,64.0,28.0,27.0,2.0,7.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0
1998,Algeria,51.0,31.0,51.0,25.0,6.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,3.0,0.0,4.0,8.0,0.0,0.0,0.0,0.0
1998,Andorra,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1998,Angola,9.0,3.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021,Withheld by EOIR,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2021,Yemen,21.0,2.0,26.0,21.0,1.0,1.0,5.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2021,Zaire,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2021,Zambia,3.0,1.0,3.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Merge Duplicate Country Names (identified manually)

In [70]:
# Remaining duplicates
dups = [
    ['Byelorussia (Belarus)', 'Belarus'],
    ['Holland', 'Netherlands'],
    ['Be Removed Fr...United States', 'Be Removed From the United States'],
    ['British India...ean Territory', 'British Indian Ocean Territory'],
    ['Federated Sta...of Micronesia', 'Federated States of Micronesia'],
    ['St. Vincent a...he Grenadines', 'St. Vincent and the Grenadines'],
    ['Stateless - A...ame A Country', 'Stateless - Alien Unable To Name A Country'],
    ['The Republic ...shall Islands', 'The Republic of the Marshall Islands']
]
for year in data_df.index.unique(0):
    for dup, orig in dups:
        if (dup in data_df.loc[year].index.unique(0)):
            data_df.rename(index={dup: orig}, inplace=True)   
            data_df = data_df.groupby(level=[0,1]).sum()                              # Sum the now-identical rows

Assign Regional Names to Countries (identified manually)

In [71]:
filename_win = "C:\\Users\\Joseph\\UMich_MSE_PhD\\UMich_SI649_InformationVisualization\\Project_StaticViz\\possible_nationalities.csv"
filename_mac = '/Users/joaburkh/Library/CloudStorage/GoogleDrive-joaburkh@umich.edu/Other computers/My Laptop/UMich_MSE_PhD/UMich_SI649_InformationVisualization/Project_StaticViz/possible_nationalities.csv'

region_df = pd.read_csv(filename_win, usecols=[0,1,2])
region_df.set_index('Nationality', inplace=True)

data_df = data_df.join(region_df)

In [73]:
data_df.set_index('Region', append=True, inplace=True)
data_df.set_index('Subregion', append=True, inplace=True)
data_df = data_df.reorder_levels(['Year', 'Region', 'Subregion', 'Nationality'])
data_df.sort_index(inplace=True)

In [74]:
with pd.option_context('display.max_rows', 250):
    with pd.option_context('display.expand_frame_repr', True):
        with pd.option_context('display.width', 1000):
            display(data_df.loc[(2021)][['Removal Order', 'Grant Relief']])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Removal Order,Grant Relief
Region,Subregion,Nationality,Unnamed: 3_level_1,Unnamed: 4_level_1
Africa,Central Africa,Burundi,7.0,11.0
Africa,Central Africa,Central African Republic,1.0,3.0
Africa,Central Africa,Congo,21.0,32.0
Africa,Central Africa,Democratic Republic of Congo,89.0,40.0
Africa,Central Africa,Rwanda,17.0,28.0
Africa,Central Africa,South Sudan,12.0,6.0
Africa,Central Africa,Zaire,0.0,0.0
Africa,Eastern Africa,Comoro Islands,0.0,0.0
Africa,Eastern Africa,Djibouti,0.0,5.0
Africa,Eastern Africa,Eritrea,28.0,120.0


Add a total column to make plotting easier

In [75]:
data_df['Total'] = data_df.sum(axis=1)

In [76]:
data_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Removal Order,Voluntary Departure,Grant Relief,Terminate Proceedings,Other Closure,No NTA Filed,Credible Fear - not found,Other Case Type Completed,Reasonable Fear - not found,Withholding O...f not granted,...,Credible Fear - other,Asylum Only - other,Reasonable Fear - other,Asylum Only -...f not granted,Asylum Only - relief granted,NACARA - relief granted,NACARA - other,NACARA - relief not granted,Pros. Discretion,Total
Year,Region,Subregion,Nationality,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
1998,Africa,Central Africa,Burundi,98.0,6.0,8.0,3.0,10.0,3.0,0.0,0.0,0.0,0.0,...,0.0,3.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,133.0
1998,Africa,Central Africa,Central African Republic,7.0,3.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12.0
1998,Africa,Central Africa,Congo,41.0,6.0,19.0,6.0,2.0,3.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,78.0
1998,Africa,Central Africa,Democratic Republic of Congo,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1998,Africa,Central Africa,Rwanda,96.0,11.0,16.0,2.0,5.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0,134.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021,South America,Northern South America,Venezuela,360.0,44.0,760.0,406.0,166.0,1333.0,398.0,1.0,17.0,7.0,...,0.0,8.0,0.0,3.0,1.0,0.0,0.0,0.0,15.0,4028.0
2021,South America,Western South America,Peru,162.0,33.0,120.0,174.0,32.0,54.0,59.0,0.0,7.0,5.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,5.0,677.0
2021,Stateless,Stateless,No Nationality,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
2021,Stateless,Stateless,Stateless - Alien Unable To Name A Country,6.0,1.0,14.0,7.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,32.0


Convert all floats to int (cannot have fractions of people)

In [77]:
float_cols = data_df.select_dtypes(include=['float64']) # This will select float columns only
for col in float_cols.columns.values:
    data_df[col] = data_df[col].astype('int64')

data_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Removal Order,Voluntary Departure,Grant Relief,Terminate Proceedings,Other Closure,No NTA Filed,Credible Fear - not found,Other Case Type Completed,Reasonable Fear - not found,Withholding O...f not granted,...,Credible Fear - other,Asylum Only - other,Reasonable Fear - other,Asylum Only -...f not granted,Asylum Only - relief granted,NACARA - relief granted,NACARA - other,NACARA - relief not granted,Pros. Discretion,Total
Year,Region,Subregion,Nationality,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
1998,Africa,Central Africa,Burundi,98,6,8,3,10,3,0,0,0,0,...,0,3,0,2,0,0,0,0,0,133
1998,Africa,Central Africa,Central African Republic,7,3,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,12
1998,Africa,Central Africa,Congo,41,6,19,6,2,3,0,0,0,0,...,0,1,0,0,0,0,0,0,0,78
1998,Africa,Central Africa,Democratic Republic of Congo,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1998,Africa,Central Africa,Rwanda,96,11,16,2,5,0,0,0,0,0,...,0,0,0,1,2,0,0,0,0,134
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021,South America,Northern South America,Venezuela,360,44,760,406,166,1333,398,1,17,7,...,0,8,0,3,1,0,0,0,15,4028
2021,South America,Western South America,Peru,162,33,120,174,32,54,59,0,7,5,...,0,1,0,1,0,0,0,0,5,677
2021,Stateless,Stateless,No Nationality,0,0,0,4,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4
2021,Stateless,Stateless,Stateless - Alien Unable To Name A Country,6,1,14,7,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,32


Save pre-processed dataframe to disk

In [78]:
data_df.to_hdf('TRAC_AllImmCourtDecisions_Cleaned.hdf', key='AllImmCourtDecisions_1998to2021')