In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os
import folium
import gmaps

In [None]:
# Stacy's code starts here

## Naturalization data table cleaning

In [None]:
# Load naturalization table
# The workbook is read from ./Resources relative to the working directory.
nat_file = 'fy2018_naturalization.xlsx'
filepath = os.path.join('.', 'Resources', nat_file)

# header=5: the row at zero-based position 5 of the sheet supplies the column names.
naturalization = pd.read_excel(filepath, header=5)

In [None]:
# Remove the trailing note rows (index labels 112 through 117) that follow the data
naturalization = naturalization.drop(index=range(112, 118))
naturalization.tail()

In [None]:
# Give the terse headers descriptive names, then discard the columns
# that are not needed downstream.
naturalization = (
    naturalization
    .rename(columns={
        'filed': 'Petitions filed',
        'Total': 'Naturalized, total',
        'denied': 'Petitions denied',
    })
    .drop(columns=['Civilian', 'Military 2', 'Not reported'])
)

In [None]:
# Find funny/footnoted years and fix them
def _clean_year(value):
    """Return ``value`` as an int year when it carries extra footnote characters.

    Values whose text is four characters or shorter are returned unchanged.
    Longer values are stripped of surrounding whitespace and only the first
    four characters (the year itself) are parsed, so a footnote marker that
    directly follows the digits is not absorbed into the number.
    """
    raw = str(value)
    if len(raw) <= 4:
        return value
    return int(raw.strip()[:4])


# Vectorised replacement for the row-by-row loop
naturalization['Year'] = naturalization['Year'].map(_clean_year)

# Set year as index
naturalization.set_index(keys=['Year'], inplace=True)

In [None]:
# Cast every column of the table to integer in a single call
naturalization = naturalization.astype('int')

In [None]:
# Preview the first rows of the cleaned naturalization table
naturalization.head()

In [None]:
# # Clean up weird column spacing issues
# for c in nat_country.columns:
#     nat_country.rename(columns={
#     c: str(c).strip()
# }, inplace=True)
    
# nat_country.columns

In [None]:
# Test if samples from certain years have normal distribution



## Asylum seeker demographic data 2018 cleaning

In [None]:
# Load asylum seeker age/gender/etc table (2018)
# The workbook is read from ./Resources relative to the working directory.
asy_2018_file = 'fy2018_table18d_asylum_age_etc.xlsx'
filepath = os.path.join('.', 'Resources', asy_2018_file)

# header=4: the row at zero-based position 4 of the sheet supplies the column names.
asylum_2018 = pd.read_excel(filepath, header=4)

In [None]:
# Inspect the last ten rows to locate the note rows that trail the data
asylum_2018.tail(10)

In [None]:
# Remove the trailing note rows (index labels 34 through 37)
asylum_2018 = asylum_2018.drop(index=range(34, 38))
asylum_2018.tail()

In [None]:
# Sex breakdown: rows at positions 1-3, first two columns of the 2018 table.
# The label column becomes the "Sex" index and "Total" is renamed to
# reflect dataset scope.
asylum_2018_sex = (
    asylum_2018.iloc[1:4, 0:2]
    .rename(columns={"Characteristic": "Sex", "Total": "Asylum 2018"})
    .set_index(keys='Sex')
)
asylum_2018_sex

In [None]:
# Broad age groups: rows at positions 22-26, first two columns
asylum_2018_broad_age = asylum_2018.iloc[22:27, 0:2]

# Drop the row labelled 22, index by age group, and rename "Total"
# to reflect dataset scope.
asylum_2018_broad_age_total = (
    asylum_2018_broad_age
    .drop(index=[22])
    .rename(columns={"Characteristic": "Age", "Total": "Asylum 2018"})
    .set_index(keys='Age')
)
asylum_2018_broad_age_total

In [None]:
# Marital status: rows at positions 28-33, first two columns.
# Index by status and rename "Total" to reflect dataset scope.
asylum_2018_marital = (
    asylum_2018.iloc[28:34, 0:2]
    .rename(columns={"Characteristic": "Marital Status", "Total": "Asylum 2018"})
    .set_index(keys='Marital Status')
)
asylum_2018_marital

## Asylum seeker demographic data 2009 cleaning

In [None]:
# Cleaning asylum demographic data from 2009
# The workbook is read from ./Resources relative to the working directory.
asy_2009_file = 'fy_2009_table15d_asylum_age_etc.xls'
filepath = os.path.join('.', 'Resources', asy_2009_file)

# header=5: the row at zero-based position 5 of the sheet supplies the column names.
asylum_2009 = pd.read_excel(filepath, header=5)

In [None]:
# Preview the first ten rows of the 2009 asylum table
asylum_2009.head(10)

In [None]:
# Remove the trailing note rows (index labels 37 through 40)
asylum_2009 = asylum_2009.drop(index=range(37, 41))
asylum_2009.tail()

In [None]:
# Sex breakdown: rows at positions 1-3, first two columns of the 2009 table.
# The label column becomes the "Sex" index and "Total" is renamed to
# reflect dataset scope.
asylum_2009_sex = (
    asylum_2009.iloc[1:4, 0:2]
    .rename(columns={"Characteristic": "Sex", "Total": "Asylum 2009"})
    .set_index(keys='Sex')
)
asylum_2009_sex

In [None]:
# Broad age groups: rows at positions 25-28, first two columns of the 2009 table.
# Built as one chain so every step works on a fresh frame; the original assigned
# a column on an iloc slice, which triggers SettingWithCopyWarning.
asylum_2009_broad_age = (
    asylum_2009.iloc[25:29, 0:2]
    # Name the label column and rename "Total" to reflect dataset scope
    .rename(columns={"Characteristic": "Age", "Total": "Asylum 2009"})
    # Strip surrounding whitespace from the age labels before indexing on them
    .assign(Age=lambda frame: frame["Age"].str.strip())
    .set_index(keys='Age')
)

asylum_2009_broad_age

In [None]:
# Marital status for 2009: rows at positions 31-36, first two columns.
# Index by status and rename "Total" to reflect dataset scope.
asylum_2009_marital = (
    asylum_2009.iloc[31:37, 0:2]
    .rename(columns={"Characteristic": "Marital Status", "Total": "Asylum 2009"})
    .set_index(keys='Marital Status')
)

asylum_2009_marital

In [None]:
# # Clean up inconsistencies between df indices
# asylum_2009_broad_age.rename({'Under 16': '< 16', 
#                                           'Age 16 to 20': '16 - 20', 
#                                           'Age 21 and over': '21+'}, axis='index', inplace=True)

# asylum_2018_broad_age_total.rename({'Under 16 years': '< 16', 
#                                           '16 to 20 years': '16 - 20', 
#                                           '21 years and over': '21+'}, axis='index', inplace=True)

In [None]:
# # Merge LPR and asylum seeker datasets

# # Join asylum datasets together
# lpr_asylum_sex = asylum_2009_sex.merge(asylum_2018_sex, how="inner", left_index=True, right_index=True)
# lpr_asylum_age = asylum_2009_broad_age.merge(asylum_2018_broad_age_total, how="inner", left_index=True, right_index=True)
# lpr_asylum_marital = asylum_2009_marital.merge(asylum_2018_marital, how="inner", left_index=True, right_index=True)

In [None]:
# NOTE: `lpr_asylum_sex` is only created in the "Merge LPR and asylum seeker
# datasets" section further down (the merge cell directly above is commented
# out), so displaying it here raised NameError on Restart & Run All.
# The merged table is displayed after that merge instead.

In [None]:
# Stacy's code ends here

In [None]:
# Kana's code starts here

# State data table cleaning

In [7]:
# Read bystate csv data
bystate = pd.read_csv("Resources/Immigrants Data By State.csv")

# Drop rows that contain missing values
bystate = bystate.dropna()

# Drop the last remaining row ("others")
bystate = bystate.drop(bystate.tail(1).index)

# Change the yearly count columns (positions 1-19) to integer.
# astype returns a new frame, so the result has to be assigned back;
# the original call discarded it and left the dtypes unchanged.
bystate = bystate.astype({column: int for column in bystate.columns[1:20]})

# Show the dataframe
bystate.head()

Unnamed: 0,State,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018
0,Alabama,1894,2246,2562,1689,2247,4200,4277,3393,3877,3891,3740,4063,3873,3848,3685,3928,4736,3801,3737
1,Alaska,1364,1389,1557,1188,1261,1524,1554,1617,1534,1608,1703,1799,1612,1460,1505,1572,1726,1547,1375
2,Arizona,11935,16197,17588,10955,19507,18986,21529,17528,20638,20997,18243,20333,18434,16097,16908,17997,20694,19344,18335
3,Arkansas,1594,2561,2531,1903,2288,2698,2924,2722,2997,2942,2684,2874,2795,2900,2793,2814,3158,3071,3000
4,California,216447,281469,289422,175579,253858,232014,264667,228941,238444,227876,208446,210591,196622,191806,198379,209568,223141,214243,200897


In [8]:
# Full name -> postal abbreviation for the states, DC and the listed territories.
# Insertion order matters: the lookup table built from this mapping keeps it.
us_state_abbrev = {
    'Alabama': 'AL', 'Alaska': 'AK', 'Arizona': 'AZ',
    'Arkansas': 'AR', 'California': 'CA', 'Colorado': 'CO',
    'Connecticut': 'CT', 'Delaware': 'DE', 'District of Columbia': 'DC',
    'Florida': 'FL', 'Georgia': 'GA', 'Hawaii': 'HI',
    'Idaho': 'ID', 'Illinois': 'IL', 'Indiana': 'IN',
    'Iowa': 'IA', 'Kansas': 'KS', 'Kentucky': 'KY',
    'Louisiana': 'LA', 'Maine': 'ME', 'Maryland': 'MD',
    'Massachusetts': 'MA', 'Michigan': 'MI', 'Minnesota': 'MN',
    'Mississippi': 'MS', 'Missouri': 'MO', 'Montana': 'MT',
    'Nebraska': 'NE', 'Nevada': 'NV', 'New Hampshire': 'NH',
    'New Jersey': 'NJ', 'New Mexico': 'NM', 'New York': 'NY',
    'North Carolina': 'NC', 'North Dakota': 'ND', 'Northern Mariana Islands': 'MP',
    'Ohio': 'OH', 'Oklahoma': 'OK', 'Oregon': 'OR',
    'Palau': 'PW', 'Pennsylvania': 'PA', 'Puerto Rico': 'PR',
    'Rhode Island': 'RI', 'South Carolina': 'SC', 'South Dakota': 'SD',
    'Tennessee': 'TN', 'Texas': 'TX', 'Utah': 'UT',
    'Vermont': 'VT', 'Virgin Islands': 'VI', 'Virginia': 'VA',
    'Washington': 'WA', 'West Virginia': 'WV', 'Wisconsin': 'WI',
    'Wyoming': 'WY',
}

# Inverse mapping: postal abbreviation -> full name
abbrev_us_state = {code: name for name, code in us_state_abbrev.items()}

In [9]:
# Flatten the abbreviation -> name mapping into a list of (abbrev, name) tuples
dict_list = list(abbrev_us_state.items())

# Print the list
#print (dict_list)

In [10]:
# Build the abbreviation lookup table straight from the (abbrev, name) pairs
state_abbrev = pd.DataFrame(dict_list, columns=["Abbrev", "State"])

# Show the dataframe
state_abbrev.head()

Unnamed: 0,Abbrev,State
0,AL,Alabama
1,AK,Alaska
2,AZ,Arizona
3,AR,Arkansas
4,CA,California


In [19]:
# Attach each state's abbreviation by joining on the full state name, then
# relabel so "State" holds the abbreviation and "State Name" the full name.
state_df = (
    pd.merge(bystate, state_abbrev, on="State")
    .rename(columns={"State": "State Name", "Abbrev": "State"})
)

# Show the dataframe
state_df.head()

Unnamed: 0,State Name,2000,2001,2002,2003,2004,2005,2006,2007,2008,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,State
0,Alabama,1894,2246,2562,1689,2247,4200,4277,3393,3877,...,3740,4063,3873,3848,3685,3928,4736,3801,3737,AL
1,Alaska,1364,1389,1557,1188,1261,1524,1554,1617,1534,...,1703,1799,1612,1460,1505,1572,1726,1547,1375,AK
2,Arizona,11935,16197,17588,10955,19507,18986,21529,17528,20638,...,18243,20333,18434,16097,16908,17997,20694,19344,18335,AZ
3,Arkansas,1594,2561,2531,1903,2288,2698,2924,2722,2997,...,2684,2874,2795,2900,2793,2814,3158,3071,3000,AR
4,California,216447,281469,289422,175579,253858,232014,264667,228941,238444,...,208446,210591,196622,191806,198379,209568,223141,214243,200897,CA


In [24]:
# Create a data frame for 2000 and 2018 data
output_state = state_df[["State","2000","2018"]]

# Output cleaned data to csv.
# The path previously used a comma instead of a separator
# ("Output_files,Immigrants By State.csv"), which wrote the file into the
# working directory instead of the Output_files folder used by the other exports.
output_state.to_csv("Output_files/Immigrants By State.csv", index = False)

# US Population vs Immigrants Data Cleanup

In [3]:
# Read the MPI Data Hub workbook (immigrant count and share of U.S. population).
# `index=False` was removed: it is not a read_excel parameter and raises
# TypeError on current pandas.
us_immigrants = pd.read_excel("Resources/MPI-Data-Hub_Immigrants_N-Percent-US-Population_2017_0.xlsx")

# Data Cleaning: drop rows 0-5 above the header row and rows 31-33
# (presumably title/notes text -- verify against the workbook)
us_immigrants = us_immigrants.drop([0,1,2,3,4,5,31,32,33])

# Make the first remaining row the header, then drop that row
us_immigrants = us_immigrants.rename(columns=us_immigrants.iloc[0]).drop(us_immigrants.index[0])

# Change the year, count and percentage columns to numeric values
for column in (
    "Year",
    "Number of Immigrants",
    "Immigrants as a Percentage of the U.S. Population (%)",
):
    us_immigrants[column] = pd.to_numeric(us_immigrants[column])

# Retrieve data only from 1950 onward
us_immigrants = us_immigrants.loc[us_immigrants["Year"] >= 1950]

# Output cleaned data csv
us_immigrants.to_csv("Output_files/Immigrants Population.csv", index=False)

# Display the data
us_immigrants

Unnamed: 0,Year,Number of Immigrants,Immigrants as a Percentage of the U.S. Population (%)
17,1950,10347400,6.9
18,1960,9738100,5.4
19,1970,9619300,4.7
20,1980,14079900,6.2
21,1990,19767300,7.9
22,2000,31107900,11.1
23,2010,39955900,12.9
24,2011,40377900,13.0
25,2012,40824700,13.0
26,2013,41348100,13.1


In [27]:
# Load the population file used to extract US totals
world_population = pd.read_csv("Resources/WPP2019_TotalPopulationBySex.csv")

# Keep the United States rows, restricted to the three columns needed
us_population = world_population.loc[
    world_population["Location"] == "United States of America",
    ["Location", "Time", "PopTotal"],
]

# Keep years up to and including 2018
us_population = us_population[us_population["Time"] <= 2018]
us_population

Unnamed: 0,Location,Time,PopTotal
264221,United States of America,1950,158804.397
264222,United States of America,1951,160872.264
264223,United States of America,1952,163266.026
264224,United States of America,1953,165909.996
264225,United States of America,1954,168736.390
...,...,...,...
264285,United States of America,2014,318673.422
264286,United States of America,2015,320878.312
264287,United States of America,2016,323015.992
264288,United States of America,2017,325084.758


In [28]:
# Scale PopTotal by 1000 to get head counts (vectorised instead of a Python loop)
population_list = (us_population["PopTotal"] * 1000).tolist()

# Add the scaled values as the "Population" column.
# assign() returns a new frame, which avoids the SettingWithCopyWarning raised
# when a column is set on the filtered slice held in us_population.
us_population = us_population.assign(Population=population_list)

# Show the dataframe
us_population

Unnamed: 0,Location,Time,PopTotal,Population
264221,United States of America,1950,158804.397,158804397.0
264222,United States of America,1951,160872.264,160872264.0
264223,United States of America,1952,163266.026,163266026.0
264224,United States of America,1953,165909.996,165909996.0
264225,United States of America,1954,168736.390,168736390.0
...,...,...,...,...
264285,United States of America,2014,318673.422,318673422.0
264286,United States of America,2015,320878.312,320878312.0
264287,United States of America,2016,323015.992,323015992.0
264288,United States of America,2017,325084.758,325084758.0


In [34]:
# Years to keep so the population rows line up with the us_immigrants data
years_to_match = [1950, 1960, 1970, 1980, 1990, 2000,
                  2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017]
selected_us_population = us_population.loc[us_population["Time"].isin(years_to_match)]

# Rename columns so they can be joined with and told apart from the immigrant data
selected_us_population = selected_us_population.rename(
    columns={"Time": "Year", "Population": "US Population"}
)

# Join on year and keep only the columns to compare
population_comparison = pd.merge(us_immigrants, selected_us_population, on="Year")[
    ["Year", "Number of Immigrants", "US Population"]
]

# Output cleaned data to csv file
population_comparison.to_csv("Output_files/US Population vs Immigrants.csv", index=False)


In [None]:
# Kana's code ends here

In [None]:
# Satish start

In [None]:
# Load the permanent-resident counts per year
immigration_df = pd.read_csv("Resources/Permanent_Resident_Years.csv")

# Remove thousands separators so the counts can be parsed as numbers later
immigration_df['Number'] = immigration_df['Number'].map(lambda text: text.replace(',', ''))

In [None]:
# Data cleaning: convert the year and count columns to numeric values
immigration_df.Year=pd.to_numeric(immigration_df.Year)
immigration_df.Number=pd.to_numeric(immigration_df.Number)
# Plot the yearly counts as a line chart
immigration_plt=immigration_df.plot(kind="line", x="Year", y="Number", grid=True, figsize=(15,10),legend=False,title="Number of Lawful Permanent Resident Status Vs. Years")
# Find the maximum count and the year(s) in which it occurs
max_arrow_y=immigration_df['Number'].max()
max_arrow_x=immigration_df.loc[immigration_df['Number']==max_arrow_y,"Year"].reset_index(drop=True)
# Label the maximum on the graph; the f-string renders a (year, count) tuple
# and only the first matching year is used
plt.annotate(
    f"maximum {max_arrow_x[0],max_arrow_y}", 
    xy=(max_arrow_x[0], max_arrow_y))
# Find the minimum count and the year(s) in which it occurs
min_arrow_y=immigration_df['Number'].min()
min_arrow_x=immigration_df.loc[immigration_df['Number']==min_arrow_y,"Year"].reset_index(drop=True)
# Label the minimum on the graph (first matching year only)
plt.annotate(
    f"Minimum {min_arrow_x[0],min_arrow_y}", 
    xy=(min_arrow_x[0], min_arrow_y))
# Label the axes and render the figure
plt.ylabel("Number of Lawful Permanent Resident Status")
plt.xlabel("Timepoint in Years")
plt.tight_layout()
plt.show()

In [None]:
# Read the three country-of-birth CSV files (loaded in the order 2018, 1999, 2009)
Country_Data_2018=pd.read_csv("Resources/Country_Data_2018.csv")
Country_Data_1999=pd.read_csv("Resources/Country_Data_1999.csv")
Country_Data_2009=pd.read_csv("Resources/Country_Data_2009.csv")

In [None]:
# Data cleaning: remove the extra last column of the 1999 table.
# NOTE: this rebinds the same name, so re-running the cell without reloading
# the CSV drops one more column each time.
Country_Data_1999=Country_Data_1999.iloc[:, :-1]

In [None]:
# Remove every row that contains a missing value; results go to new *_df names
# so the loaded frames stay untouched
Country_Data_2018_df=Country_Data_2018.dropna()
Country_Data_1999_df=Country_Data_1999.dropna()
Country_Data_2009_df=Country_Data_2009.dropna()

In [None]:
# Extract the first row of each table so it can be used as the column header
new_header_2018 = Country_Data_2018_df.iloc[0]
new_header_1999 = Country_Data_1999_df.iloc[0]
new_header_2009 = Country_Data_2009_df.iloc[0]

In [None]:
# Promote the extracted first row to column names, then drop that row from the data.
# NOTE: not idempotent -- re-running this cell drops one more data row from each table.
Country_Data_2018_df.columns=new_header_2018
Country_Data_2018_df=Country_Data_2018_df[1:]

Country_Data_1999_df.columns=new_header_1999
Country_Data_1999_df=Country_Data_1999_df[1:]


Country_Data_2009_df.columns=new_header_2009
Country_Data_2009_df=Country_Data_2009_df[1:]

In [None]:
# Countries grouped as "Central America" for this analysis
Central_America_Data = [
    'Mexico', 'Guatemala', 'Honduras', 'Nicaragua',
    'El Salvador', 'Costa Rica', 'Panama', 'Belize',
]

# Keep only the rows for those countries, then strip thousands separators
# from every (string) column
in_central_america = Country_Data_2018_df['Region and country of birth'].isin(Central_America_Data)
Latin_Data_df = Country_Data_2018_df[in_central_america].apply(
    lambda column: column.str.replace(',', '')
)

In [None]:
# Total the selected countries for each year of interest -> [year label, sum] rows
Latin_Data_summ = [
    [year_label, pd.to_numeric(Latin_Data_df[year_label]).sum()]
    for year_label in ('2014', '2015', '2017', '2018')
]
Latin_Data_summ_df = pd.DataFrame(Latin_Data_summ, columns=['Year', 'Count'])

In [None]:
# Bar graph of the Central America totals per selected year
Latin_Data_summ_df.plot.bar(x='Year', y='Count', rot=0,legend=False)
# The pyplot calls below act on the axes created by the plot call above
plt.axis('tight')
plt.title("Number of Immigrants Vs Year")
plt.ylabel("Total Number of Immigrants from Central America")
plt.xlabel("Year")
plt.tight_layout()
plt.show()

In [None]:
# Countries selected for this comparison
Islam_Country_Data = [
    'Afghanistan', 'Iran', 'Yemen', 'Jordan', 'Saudi Arabia',
    'Sudan', 'Pakistan', 'Syria', 'Oman',
]

# Rows for those countries from the table ending in 1999, with thousands
# separators stripped from every (string) column
in_selection_1999 = Country_Data_1999_df['Region and country of birth'].isin(Islam_Country_Data)
Islam_Data_1999_df = Country_Data_1999_df[in_selection_1999].apply(
    lambda column: column.str.replace(',', '')
)

In [None]:
# Same country selection from the 2000 to 2009 table, with thousands
# separators stripped from every (string) column
in_selection_2009 = Country_Data_2009_df['Region and country of birth'].isin(Islam_Country_Data)
Islam_Data_2009_df = Country_Data_2009_df[in_selection_2009].apply(
    lambda column: column.str.replace(',', '')
)

In [None]:
# Merge the two tables (outer join on their shared columns)
Merge_Islam_Country = pd.merge(Islam_Data_1999_df, Islam_Data_2009_df, how='outer')

# Total the selected countries for each year of interest -> [year label, sum] rows.
# The last label was previously the typo '20006'; it now matches the '2006'
# column that is actually summed.
Islam_Country_summ = [
    [year_label, pd.to_numeric(Merge_Islam_Country[year_label]).sum()]
    for year_label in ('1999', '2000', '2005', '2006')
]
Islam_Country_summ = pd.DataFrame(Islam_Country_summ, columns=['Year', 'Count'])

In [None]:
# Plot the summed counts per selected year as a bar graph
Islam_Country_summ.plot.bar(x='Year', y='Count', rot=0,legend=False)
plt.axis('tight')
plt.title("Number of Immigrants Vs Year")
# NOTE(review): the selection list includes Afghanistan, Iran and Pakistan,
# so the "Arab Countries" wording below may need revisiting -- confirm intent.
plt.ylabel("Total Number of Immigrants from Arab Countries")
plt.xlabel("Year")
plt.tight_layout()
plt.show()

In [None]:
# Satish end

In [None]:
# Umar's code starts here
# Read in the Excel file and view the headers.
# header=4: the row at zero-based position 4 of the sheet supplies the column names.
Lawful_df = pd.read_excel("./Resources/fy2018_Lawful.xlsx", header=4)
Lawful_df.head()

In [None]:
# Select the rows at positions 19-22 (all columns) as the broad age group section
Broad_age = Lawful_df.iloc[19:23,:]
Broad_age

In [None]:
# Keep only the first two columns (the ones renamed from "Characteristic" and "Total" below).
# NOTE: despite its name, `Sex` holds the age-group slice here; the name is
# reassigned to a single row of Lawful_df in a later cell.
Broad_age_df = pd.DataFrame(Broad_age)
Sex = Broad_age_df.iloc[:, 0:2]
Sex

In [None]:
# Rename the columns for the age table, then index it by age group
Cleaned = Sex.rename(columns={"Characteristic": "Age", "Total": "Lawful 2018"})
Index_age = Cleaned.set_index("Age")
Index_age

In [None]:
# Take the row at position 19 without its first column; `Sex` becomes a Series
# whose index is the remaining column names.
Sex = Lawful_df.iloc[19,1:]
# Displayed only -- the frame is not stored; a later cell rebuilds it from `Sex`.
Sex.to_frame(name="LPR 2018")

In [None]:
# Select the rows at positions 24-29 (all columns) as the marital status section
Marital_status = Lawful_df.iloc[24:30,:]
Marital_status

In [None]:
# Keep only the first two columns of the marital status section
New_marital_df = pd.DataFrame(Marital_status)
Specific = New_marital_df.iloc[:, 0:2]
Specific

In [None]:
# Rename the columns of the marital status table and preview it
Renamed = Specific.rename(columns={"Characteristic": "Marital Status", "Total": "Lawful Permanent Resident 2018"})
Renamed.head()

In [None]:
# Marital status rows (positions 24-29), first two columns
Status = Lawful_df.iloc[24:30, 0:2]

# Rename the columns and index by marital status for clarity
Name18 = (
    Status
    .rename(columns={"Characteristic": "Marital Status", "Total": "Lawful 2018"})
    .set_index("Marital Status")
)
Name18

In [None]:
# Read in the 2009 Excel file and view the headers.
# header=4: the row at zero-based position 4 of the sheet supplies the column names.
Lawful09_df = pd.read_excel("./Resources/fy2009_Lawful.xls", header=4)
Lawful09_df.head()

In [None]:
# Select the rows at positions 20-23 (all columns) as the 2009 broad age group section
Broad09_age = Lawful09_df.iloc[20:24,:]
Broad09_age

In [None]:
# Keep only the first two columns.
# NOTE: despite its name, `Sex09` holds the age-group slice here; the name is
# reassigned to a single row of Lawful09_df in a later cell.
Broad09_age_df = pd.DataFrame(Broad09_age)
Sex09 = Broad09_age_df.iloc[:, 0:2]
Sex09

In [None]:
# Rename the columns for the 2009 age table, then index it by age group
Age09 = Sex09.rename(columns={"Characteristic": "Age", "Total": "Lawful Permanent Resident 2009"})
New_index09 = Age09.set_index("Age")
New_index09

In [None]:
# Take the row at position 20 without its first column, then turn that Series
# into a one-column frame named "LPR 2009" (indexed by the remaining column names).
Sex09 = Lawful09_df.iloc[20,1:]
Sex09 = Sex09.to_frame(name="LPR 2009")

In [None]:
# Display the one-column "LPR 2009" frame
Sex09

In [None]:
# Select the rows at positions 26-31 (all columns) as the 2009 marital status section
Marital09_status = Lawful09_df.iloc[26:32,:]
Marital09_status

In [None]:
# Keep only the first two columns of the 2009 marital status section
New09_marital_df = pd.DataFrame(Marital09_status)
Specific09 = New09_marital_df.iloc[:, 0:2]
Specific09

In [None]:
# Rename the columns of the 2009 marital status table and preview it
Renamed09 = Specific09.rename(columns={"Characteristic": "Marital Status", "Total": "Lawful Permanent Resident 2009"})
Renamed09.head()

In [None]:
# 2009 marital status rows (positions 26-31), first two columns
Status09 = Lawful09_df.iloc[26:32, 0:2]

# Rename the columns and index by marital status for clarity
Name09 = (
    Status09
    .rename(columns={"Characteristic": "Marital Status", "Total": "Lawful 2009"})
    .set_index("Marital Status")
)
Name09

#Umar Code ends here

### Merge LPR and asylum seeker datasets

In [None]:
# Stacy code for merging datasets starts here

In [None]:
# Bring the age-group index labels of both asylum tables onto one vocabulary
# so the index-based merges line up.
asylum_2009_broad_age = asylum_2009_broad_age.rename(index={
    'Under 16': '< 16',
    'Age 16 to 20': '16 - 20',
    'Age 21 and over': '21+',
})

asylum_2018_broad_age_total = asylum_2018_broad_age_total.rename(index={
    'Under 16 years': '< 16',
    '16 to 20 years': '16 - 20',
    '21 years and over': '21+',
})

# `Sex` is the single-row Series taken from Lawful_df; convert it to a one-column frame
sex_18 = Sex.to_frame(name="Lawful 2018")

In [None]:
# Merge LPR and asylum seeker datasets

# Pair the 2009 and 2018 asylum tables on their shared index labels
lpr_asylum_sex = pd.merge(asylum_2009_sex, asylum_2018_sex,
                          how="inner", left_index=True, right_index=True)
lpr_asylum_age = pd.merge(asylum_2009_broad_age, asylum_2018_broad_age_total,
                          how="inner", left_index=True, right_index=True)
lpr_asylum_marital = pd.merge(asylum_2009_marital, asylum_2018_marital,
                              how="inner", left_index=True, right_index=True)

# Pair the 2009 and 2018 LPR tables the same way
lpr_age = pd.merge(New_index09, Index_age,
                   how="inner", left_index=True, right_index=True)
lpr_sex = pd.merge(Sex09, sex_18,
                   how="inner", left_index=True, right_index=True)
lpr_marital = pd.merge(Name09, Name18,
                       how="inner", left_index=True, right_index=True)

In [None]:
# Clean up inconsistencies on index and column names

# Strip stray whitespace from every age label (one call instead of a per-label loop)
lpr_age.rename(index=lambda label: str(label).strip(), inplace=True)

# Use the same age-group vocabulary as the asylum tables
lpr_age.rename({'Under 16 years': '< 16',
               '16 to 20 years': '16 - 20',
               '21 years and over': '21+'}, axis='index', inplace=True)

# Shorten the 2009 column name. The original call passed the mapping without
# `columns=` (so it targeted the index) and discarded the result, leaving the
# column name unchanged.
lpr_age.rename(columns={'Lawful Permanent Resident 2009': 'Lawful 2009'}, inplace=True)

lpr_sex.rename(columns={'LPR 2009': 'Lawful 2009'}, inplace=True)

In [None]:
# Add the LPR columns to the asylum tables by joining on the shared index labels.
# NOTE: each result is bound back to its input name, so re-running this cell
# merges the LPR columns in a second time; re-run the preceding merge cell first.
lpr_asylum_sex = lpr_asylum_sex.merge(lpr_sex, how="inner", left_index=True, right_index=True)
lpr_asylum_age = lpr_asylum_age.merge(lpr_age, how="inner", left_index=True, right_index=True)
lpr_asylum_marital = lpr_asylum_marital.merge(lpr_marital, how="inner", left_index=True, right_index=True)

In [None]:
# Display the combined asylum + LPR table by sex
lpr_asylum_sex

In [None]:
# Export cleaned data to Outputs folder: one CSV per merged table (index included)
for filename, merged_table in (
    ('la_sex.csv', lpr_asylum_sex),
    ('la_age.csv', lpr_asylum_age),
    ('la_marital.csv', lpr_asylum_marital),
):
    path = os.path.join('.', 'Output_files', filename)
    merged_table.to_csv(path)

In [None]:
# Stacy code for merging datasets ends here