# Cleaning The Missing Migrants Files

## A:  Import Cleaning Packages

In [1]:
# Install packages if not already installed
#!pip install pandas
#!pip install numpy
#!pip install geopy

In [2]:
# Packages for Cleaning the Dataset
import pandas as pd
import numpy as np
pd.set_option('display.max_rows', 1000)
pd.options.display.float_format = '{:,.12f}'.format
%matplotlib inline

In [3]:
# Packages to find countries from coordinates
from geopy.geocoders import Nominatim

## B: Import Dataset 
Online Excel and CSV files available

In [4]:
# Import the raw dataset from IOM
# https://missingmigrants.iom.int/downloads 
#MM = pd.read_excel('https://missingmigrants.iom.int/sites/g/files/tmzbdl601/files/report-migrant-incident/Missing_Migrants_Global_Figures_allData.xlsx', index_col = False)
MM = pd.read_csv('https://missingmigrants.iom.int/sites/g/files/tmzbdl601/files/report-migrant-incident/Missing_Migrants_Global_Figures_allData.csv?154546', index_col = False, encoding = 'unicode_escape')

## C:  Columns to New IOM Column Format

In [5]:
# View DFs Original Column List
MM.columns

Index(['ï»¿"Main ID"', 'Incident ID', 'Incident Type', 'Region of Incident',
       'Incident Date', 'Incident Year', 'Month', 'Number of Dead',
       'Minimum Estimated Number of Missing',
       'Total Number of Dead and Missing', 'Number of Survivors',
       'Number of Females', 'Number of Males', 'Number of Children',
       'Country of Origin', 'Region of Origin', 'Cause of Death',
       'Country of Incident', 'Migration Route', 'Location of Incident',
       'Coordinates', 'UNSD Geographical Grouping', 'Information Source',
       'URL', 'Source Quality'],
      dtype='object')

In [6]:
pd.set_option('display.max_columns', None)
MM.head()

Unnamed: 0,"ï»¿""Main ID""",Incident ID,Incident Type,Region of Incident,Incident Date,Incident Year,Month,Number of Dead,Minimum Estimated Number of Missing,Total Number of Dead and Missing,Number of Survivors,Number of Females,Number of Males,Number of Children,Country of Origin,Region of Origin,Cause of Death,Country of Incident,Migration Route,Location of Incident,Coordinates,UNSD Geographical Grouping,Information Source,URL,Source Quality
0,2014.MMP00001,2014.MMP00001,Incident,North America,2014-01-06,2014,January,1.0,0.0,1,,,1.0,,Guatemala,Central America,Mixed or unknown,United States of America,US-Mexico border crossing,Pima Country Office of the Medical Examiner ju...,"31.650259, -110.366453",Northern America,Pima County Office of the Medical Examiner (PC...,http://humaneborders.info/,5
1,2014.MMP00002,2014.MMP00002,Incident,North America,2014-01-12,2014,January,1.0,0.0,1,,1.0,,,Unknown,Latin America / Caribbean (P),Mixed or unknown,United States of America,US-Mexico border crossing,Pima Country Office of the Medical Examiner ju...,"31.59713, -111.73756",Northern America,Pima County Office of the Medical Examiner (PC...,,5
2,2014.MMP00003,2014.MMP00003,Incident,North America,2014-01-14,2014,January,1.0,0.0,1,,,1.0,,Unknown,Latin America / Caribbean (P),Mixed or unknown,United States of America,US-Mexico border crossing,Pima Country Office of the Medical Examiner ju...,"31.94026, -113.01125",Northern America,Pima County Office of the Medical Examiner (PC...,,5
3,2014.MMP00004,2014.MMP00004,Incident,North America,2014-01-16,2014,January,1.0,0.0,1,,,1.0,,Mexico,Central America,Violence,United States of America,US-Mexico border crossing,"near Douglas, Arizona, USA","31.506777, -109.315632",Northern America,"Ministry of Foreign Affairs Mexico, Pima Count...",http://bit.ly/1qfIw00,5
4,2014.MMP00005,2014.MMP00005,Incident,Europe,2014-01-16,2014,January,1.0,0.0,1,2.0,,1.0,,Sudan,Northern Africa,Harsh environmental conditions / lack of adequ...,Russian Federation,,Border between Russia and Estonia,"59.1551, 28",Northern Europe,EUBusiness (Agence France-Presse),http://bit.ly/1rTFTjR,1


In [7]:
# Drop Useless Column
MM = MM.drop('ï»¿"Main ID"', axis = 1)

In [8]:
print(MM.columns)

Index(['Incident ID', 'Incident Type', 'Region of Incident', 'Incident Date',
       'Incident Year', 'Month', 'Number of Dead',
       'Minimum Estimated Number of Missing',
       'Total Number of Dead and Missing', 'Number of Survivors',
       'Number of Females', 'Number of Males', 'Number of Children',
       'Country of Origin', 'Region of Origin', 'Cause of Death',
       'Country of Incident', 'Migration Route', 'Location of Incident',
       'Coordinates', 'UNSD Geographical Grouping', 'Information Source',
       'URL', 'Source Quality'],
      dtype='object')


In [9]:
# List New Column Names
columns = ["Incident_ID", "Incident_Type", "Region", "Reported_Date", "Reported_Year", 
              "Reported_Month", "Number_Dead", "Minimum_Missing", "Total_Dead_and_Missing", "Survivors", 
              "Females", "Males", "Children", "Country_of_Origin", "Region_of_Origin", "Cause_of_Death",
              "Country_of_Incident", "Migration_Route", "Location_Description", "Coordinates", "UNSD_Geographical_Grouping", 
           "Information_Source", "URL", "Source_Quality"]

In [10]:
# Ensure that your new columns are matching the DF
index = 0
for i in columns:
    print(f'{index + 1} - {columns[index]} ::: {MM.columns[index]}')
    index = index + 1

1 - Incident_ID ::: Incident ID
2 - Incident_Type ::: Incident Type
3 - Region ::: Region of Incident
4 - Reported_Date ::: Incident Date
5 - Reported_Year ::: Incident Year
6 - Reported_Month ::: Month
7 - Number_Dead ::: Number of Dead
8 - Minimum_Missing ::: Minimum Estimated Number of Missing
9 - Total_Dead_and_Missing ::: Total Number of Dead and Missing
10 - Survivors ::: Number of Survivors
11 - Females ::: Number of Females
12 - Males ::: Number of Males
13 - Children ::: Number of Children
14 - Country_of_Origin ::: Country of Origin
15 - Region_of_Origin ::: Region of Origin
16 - Cause_of_Death ::: Cause of Death
17 - Country_of_Incident ::: Country of Incident
18 - Migration_Route ::: Migration Route
19 - Location_Description ::: Location of Incident
20 - Coordinates ::: Coordinates
21 - UNSD_Geographical_Grouping ::: UNSD Geographical Grouping
22 - Information_Source ::: Information Source
23 - URL ::: URL
24 - Source_Quality ::: Source Quality


In [11]:
# Set new column names
MM.columns = columns

In [12]:
print(MM.head())

     Incident_ID Incident_Type         Region Reported_Date  Reported_Year  \
0  2014.MMP00001      Incident  North America    2014-01-06           2014   
1  2014.MMP00002      Incident  North America    2014-01-12           2014   
2  2014.MMP00003      Incident  North America    2014-01-14           2014   
3  2014.MMP00004      Incident  North America    2014-01-16           2014   
4  2014.MMP00005      Incident         Europe    2014-01-16           2014   

  Reported_Month    Number_Dead  Minimum_Missing  Total_Dead_and_Missing  \
0        January 1.000000000000   0.000000000000                       1   
1        January 1.000000000000   0.000000000000                       1   
2        January 1.000000000000   0.000000000000                       1   
3        January 1.000000000000   0.000000000000                       1   
4        January 1.000000000000   0.000000000000                       1   

       Survivors        Females          Males  Children Country_of_Origin

## D: Cleaning Cause_of_Death and Removing all NANS

In [13]:
MM['Country_of_Origin'].unique()

array(['Guatemala', 'Unknown', 'Mexico', 'Sudan', 'Afghanistan',
       'Iran (Islamic Republic of)', 'Haiti', 'Myanmar', 'Cameroon',
       'Somalia', 'Bangladesh', 'Ethiopia', 'Albania', 'Cambodia',
       'Ecuador', 'El Salvador', 'Syrian Arab Republic', 'Congo',
       'Zimbabwe', 'Eritrea,Ethiopia,Sudan,Unknown', 'Honduras,Mexico',
       'China', 'Eritrea,Ethiopia,Somalia', 'Eritrea', 'Niger',
       'Ethiopia,Somalia', 'Central African Republic', 'Indonesia',
       'Morocco,Syrian Arab Republic,Unknown',
       'Cameroon,Somalia,Syrian Arab Republic',
       'Eritrea,Syrian Arab Republic,Unknown',
       'Egypt,Syrian Arab Republic',
       "CÃ´te d'Ivoire,Eritrea,Guinea,Libya,Mali,Palestinian Territories,Somalia,Sudan,Syrian Arab Republic",
       'Cuba', 'Dominican Republic,Haiti',
       'Egypt,Palestinian Territories,Sudan,Syrian Arab Republic',
       'Egypt,State of Palestine,Syrian Arab Republic', 'Honduras',
       'Dominican Republic', 'Peru', 'State of Palestine,Sudan

In [14]:
type(MM['Country_of_Origin'][1])

str

In [15]:
new_coo = []
index = 0
for i in MM['Country_of_Origin']:
    if type(MM['Country_of_Origin'][index]) != str:
        MM['Country_of_Origin'][index] = str(MM['Country_of_Origin'][index])
    else:
        pass
    index = index + 1
    

        

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  MM['Country_of_Origin'][index] = str(MM['Country_of_Origin'][index])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-c

In [16]:
for i in new_coo:
    print(type())


In [17]:
print(new_coo)

[]


In [18]:
new_coo2 = []
index = 0
for i in MM['Country_of_Origin']:
    if type(MM.loc[index, 'Country_of_Origin']) != str:
        MM.loc[index, 'Country_of_Origin'] = str(MM.loc[index,'Country_of_Origin'])
    else:
        pass
    index = index + 1
    

In [19]:
for i in new_coo2:
    print(type())

In [20]:
print(new_coo)

[]


In [21]:
index = 0
infrequent_countries = ['nan', 'Viet Nam', 'Liberia', 'Burundi', 'Lesotho', 'Mauritania', 'Malawi', 'Uzbekistan', 'Nepal', 'Madagascar',
                        'Mozambique', 'Lebanon', 'Costa Rica', 'Malaysia', 'Kenya', 'Central African Republic', 'Kyrgyzstan', 'Jamaica',
                        'Uruguay', 'Israel', 'Eswatini', 'Paraguay', 'Albania', 'Guyana', 'Republic of Korea', "Lao People's Democratic Republic",
                        'Oman', 'South Sudan', 'Burkina Faso', 'Bahamas', 'Papua New Guinea', 'Belize', 'Georgia', 'Togo', 'Russian Federation']
for i in MM['Country_of_Origin']:
    if "Unknown" in MM['Country_of_Origin'][index]:
        MM['Country_of_Origin'][index] = "Unknown"
    elif "," in MM['Country_of_Origin'][index]:
        MM['Country_of_Origin'][index] = "Multiple Countries"
    elif "Mixed" in MM['Country_of_Origin'][index]:
        MM['Country_of_Origin'][index] = "Multiple Countries"
    elif "multiple" in MM['Country_of_Origin'][index]:
        MM['Country_of_Origin'][index] = "Multiple Countries"
    elif MM['Country_of_Origin'][index] in infrequent_countries:
        MM["Country_of_Origin"][index] = "Infrequent Countries"
    else:
        pass
    index = index + 1

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  MM['Country_of_Origin'][index] = "Unknown"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  MM['Country_of_Origi

In [22]:
MM["Country_of_Origin"].value_counts()
  

Country_of_Origin
Unknown                                  9124
Afghanistan                              4429
Mexico                                   1565
Guatemala                                 550
Ethiopia                                  478
Honduras                                  402
Venezuela (Bolivarian Republic of)        348
Syrian Arab Republic                      347
Multiple Countries                        286
El Salvador                               194
Morocco                                   186
Algeria                                   175
Myanmar                                   170
Cuba                                      161
Haiti                                     147
Sudan                                     122
Ecuador                                   114
Nicaragua                                  83
Infrequent Countries                       82
Egypt                                      73
Dominican Republic                         70
Colombia        

In [23]:
# Create a list to show all the possible causes of death
death_list = []
index = 0
for i in MM["Cause_of_Death"]:
    if not MM["Cause_of_Death"][index] in death_list:
        death_list.append(MM["Cause_of_Death"][index])
    else:
        pass
    index = index + 1

print(death_list)

['Mixed or unknown', 'Violence', 'Harsh environmental conditions / lack of adequate shelter, food, water', 'Drowning', 'Vehicle accident / death linked to hazardous transport', 'Sickness / lack of access to adequate healthcare', 'Accidental death', 'Drowning,Mixed or unknown', 'Drowning,Harsh environmental conditions / lack of adequate shelter, food, water', 'Drowning,Vehicle accident / death linked to hazardous transport', 'Harsh environmental conditions / lack of adequate shelter, food, water,Sickness / lack of access to adequate healthcare', 'Harsh environmental conditions / lack of adequate shelter, food, water,Mixed or unknown', 'Drowning,Violence', 'Mixed or unknown,Vehicle accident / death linked to hazardous transport,Violence', 'Drowning,Sickness / lack of access to adequate healthcare', 'Accidental death,Drowning', 'Mixed or unknown,Sickness / lack of access to adequate healthcare', 'Mixed or unknown,Vehicle accident / death linked to hazardous transport']


In [24]:
# View Unbucketed Value Counts
MM['Cause_of_Death'].value_counts()

Cause_of_Death
Drowning                                                                                                                   4665
Mixed or unknown                                                                                                           4317
Vehicle accident / death linked to hazardous transport                                                                     3122
Sickness / lack of access to adequate healthcare                                                                           2590
Harsh environmental conditions / lack of adequate shelter, food, water                                                     2487
Violence                                                                                                                   2010
Accidental death                                                                                                            904
Drowning,Harsh environmental conditions / lack of adequate shelter, food, water          

In [25]:
# For Items with Few Incidents:
#Bucketing Causes of Death

index = 0
for i in MM['Cause_of_Death']:
    if "lack of adequate shelter, food, water" in MM["Cause_of_Death"][index]:
        MM['Cause_of_Death'][index] = "Lack of Shelter, Food, or Water"
    index = index + 1
    
index = 0
for i in MM['Cause_of_Death']:
    if "Mixed or unknown" in MM["Cause_of_Death"][index]:
        MM['Cause_of_Death'][index] = "Mixed or unknown"
    index = index + 1
    
index = 0
for i in MM['Cause_of_Death']:
    if "Drowning" in MM["Cause_of_Death"][index]:
        MM['Cause_of_Death'][index] = "Drowning"
    index = index + 1
    
index = 0
for i in MM['Cause_of_Death']:
    if MM['Cause_of_Death'][index] == 'Harsh environmental conditions / lack of adequate shelter, food, water':
        MM['Cause_of_Death'][index] = "Lack of Shelter, Food, or Water"
    index = index + 1
    
index = 0
for i in MM['Cause_of_Death']:
    if MM['Cause_of_Death'][index] == 'Harsh environmental conditions / lack of adequate shelter, food, water':
        MM['Cause_of_Death'][index] = "Lack of Shelter, Food, or Water"
    index = index + 1

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [26]:
# Bucketed Value Counts
MM['Cause_of_Death'].value_counts()

Cause_of_Death
Drowning                                                  4673
Mixed or unknown                                          4327
Vehicle accident / death linked to hazardous transport    3122
Sickness / lack of access to adequate healthcare          2590
Lack of Shelter, Food, or Water                           2506
Violence                                                  2010
Accidental death                                           904
Name: count, dtype: int64

In [27]:
print(MM.head())

     Incident_ID Incident_Type         Region Reported_Date  Reported_Year  \
0  2014.MMP00001      Incident  North America    2014-01-06           2014   
1  2014.MMP00002      Incident  North America    2014-01-12           2014   
2  2014.MMP00003      Incident  North America    2014-01-14           2014   
3  2014.MMP00004      Incident  North America    2014-01-16           2014   
4  2014.MMP00005      Incident         Europe    2014-01-16           2014   

  Reported_Month    Number_Dead  Minimum_Missing  Total_Dead_and_Missing  \
0        January 1.000000000000   0.000000000000                       1   
1        January 1.000000000000   0.000000000000                       1   
2        January 1.000000000000   0.000000000000                       1   
3        January 1.000000000000   0.000000000000                       1   
4        January 1.000000000000   0.000000000000                       1   

       Survivors        Females          Males  Children Country_of_Origin

In [28]:
# B. Replacing missing values 

#Replacing nan in "Cause_of_Death" with "unknown"
MM['Cause_of_Death'] = MM['Cause_of_Death'].replace(np.nan, "Unknown")

#Replacing nan in "Migration_Route" with "Not specified"
MM['Migration_Route'] = MM['Migration_Route'].replace(np.nan, "Not Specified")

#Replacing nan in "Region" with "Not specified"
MM['Region'] = MM['Region'].replace(np.nan, "Not Specified")

#Replacing the NA figures in numerical columns with 0 
MM.iloc[:, 6:13] = MM.iloc[:, 6:13].fillna(int(0))

In [29]:
print(MM.head())

     Incident_ID Incident_Type         Region Reported_Date  Reported_Year  \
0  2014.MMP00001      Incident  North America    2014-01-06           2014   
1  2014.MMP00002      Incident  North America    2014-01-12           2014   
2  2014.MMP00003      Incident  North America    2014-01-14           2014   
3  2014.MMP00004      Incident  North America    2014-01-16           2014   
4  2014.MMP00005      Incident         Europe    2014-01-16           2014   

  Reported_Month    Number_Dead  Minimum_Missing  Total_Dead_and_Missing  \
0        January 1.000000000000   0.000000000000                       1   
1        January 1.000000000000   0.000000000000                       1   
2        January 1.000000000000   0.000000000000                       1   
3        January 1.000000000000   0.000000000000                       1   
4        January 1.000000000000   0.000000000000                       1   

       Survivors        Females          Males       Children  \
0 0.00000

In [30]:
# Get Dummies to One-Hot-Encode the Causes of Death
MM = pd.get_dummies(MM, columns = ['Cause_of_Death'], prefix = "COD", prefix_sep = '_')

In [31]:
len(MM)

20132

## E: Getting Lattitude and Longitude Data

In [32]:
'''Coordinates from IOM come in a string that includes text and other characters
It also comes as longitude - latitude, so they will be flipped in the next portion
Erasing the String Data within the Coordinates Column
Convert Coordinates into a list for future processing
'''
#
index = 0
for i in MM['Coordinates']:
    #MM['Coordinates'][index][0] = MM['Coordinates'][index][0].strip(',')
    MM['Coordinates'][index] = str(MM['Coordinates'][index])
    #MM['Coordinates'][index] = MM["Coordinates"][index].strip('POINT (').strip(')')
    MM['Coordinates'][index] = MM["Coordinates"][index].strip(',')
    #MM['Coordinates'][index][0] = MM['Coordinates'][index][0].strip(',')
    MM['Coordinates'][index] = MM['Coordinates'][index].split(' ')
    index = index + 1


IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [33]:
index = 0
for i in MM['Coordinates']:
    MM['Coordinates'][index][0] = MM['Coordinates'][index][0].strip(',')
    index=index+1
    
print(MM['Coordinates'])

0           [31.650259, -110.366453]
1             [31.59713, -111.73756]
2             [31.94026, -113.01125]
3           [31.506777, -109.315632]
4                      [59.1551, 28]
                    ...             
20127     [55.80967786, 27.04253439]
20128     [35.51646293, 11.07677513]
20129    [35.87939725, -5.334128056]
20130     [35.8956652, -5.309203848]
20131     [35.87415129, -5.32748803]
Name: Coordinates, Length: 20132, dtype: object


In [34]:
# Separating the coordinates of the incident into latitude and longitude, part 1:

index = 0
latlist = []
longlist = []
for items in range(0,len(MM)):
    if len(MM['Coordinates'][index]) != 2:
        MM['Coordinates'][index] = [0,0]
        #print(MM["Coordinates"][index])
        latlist.append(float(MM['Coordinates'][index][0]))
        longlist.append(float(MM['Coordinates'][index][1]))
    else:
        latlist.append(float(MM['Coordinates'][index][0]))
        longlist.append(float(MM['Coordinates'][index][1]))
    index = index + 1
        

In [35]:
# Separating the coordinates of the incident into latitude and longitude, part 2:
# Assigning the lists to columns

MM["Latitude"] = latlist
MM["Longitude"] = longlist
MM.columns

Index(['Incident_ID', 'Incident_Type', 'Region', 'Reported_Date',
       'Reported_Year', 'Reported_Month', 'Number_Dead', 'Minimum_Missing',
       'Total_Dead_and_Missing', 'Survivors', 'Females', 'Males', 'Children',
       'Country_of_Origin', 'Region_of_Origin', 'Country_of_Incident',
       'Migration_Route', 'Location_Description', 'Coordinates',
       'UNSD_Geographical_Grouping', 'Information_Source', 'URL',
       'Source_Quality', 'COD_Accidental death', 'COD_Drowning',
       'COD_Lack of Shelter, Food, or Water', 'COD_Mixed or unknown',
       'COD_Sickness / lack of access to adequate healthcare',
       'COD_Vehicle accident / death linked to hazardous transport',
       'COD_Violence', 'Latitude', 'Longitude'],
      dtype='object')

In [36]:
# Ensure that lat-long and coordinates match
MM[["Latitude", "Longitude", "Coordinates"]]

Unnamed: 0,Latitude,Longitude,Coordinates
0,31.650259000000,-110.366453000000,"[31.650259, -110.366453]"
1,31.597130000000,-111.737560000000,"[31.59713, -111.73756]"
2,31.940260000000,-113.011250000000,"[31.94026, -113.01125]"
3,31.506777000000,-109.315632000000,"[31.506777, -109.315632]"
4,59.155100000000,28.000000000000,"[59.1551, 28]"
...,...,...,...
20127,55.809677860000,27.042534390000,"[55.80967786, 27.04253439]"
20128,35.516462930000,11.076775130000,"[35.51646293, 11.07677513]"
20129,35.879397250000,-5.334128056000,"[35.87939725, -5.334128056]"
20130,35.895665200000,-5.309203848000,"[35.8956652, -5.309203848]"


In [37]:
# Coercing into float
MM['Total_Dead_and_Missing'] = MM["Total_Dead_and_Missing"].astype(float)

In [38]:
# E. Creating Log_Dead to Normalize Size of Bubbles on Maps part 1:
# Creating two empty lists to be filled later. 
deadlist = []
logdeadlist = []

# E. Creating Log_Dead to Normalize Size of Bubbles on Maps part 3:
# Creating a list of the relevant values
for i, row in MM.iterrows():
    deadlist.append(MM.at[i, "Total_Dead_and_Missing"])

# E. Creating Log_Dead to Normalize Size of Bubbles on Maps part 4:
# Logging the elements of the list
for dead in deadlist:
    logdeadlist.append(dead**(1/3)*3)
MM["Log_Dead"] = logdeadlist

In [39]:
#F. Cleaning the URL to get just one 

MM['URL'] = MM['URL'].replace(np.nan, "Not Given")

URL1 = []

for url in MM["URL"]:
    url = url.split(",")
    URL1.append(url[0])

#Assigning the value to a column
MM["URL1"] = URL1

In [40]:
#G creating variables for unknown sex and age
MM = MM.assign(Unknown_Sex = MM.Total_Dead_and_Missing -  MM.Females -  MM.Males, Unknown_Age_Status = MM.Total_Dead_and_Missing -  MM.Children)

In [41]:
print(MM.head())

     Incident_ID Incident_Type         Region Reported_Date  Reported_Year  \
0  2014.MMP00001      Incident  North America    2014-01-06           2014   
1  2014.MMP00002      Incident  North America    2014-01-12           2014   
2  2014.MMP00003      Incident  North America    2014-01-14           2014   
3  2014.MMP00004      Incident  North America    2014-01-16           2014   
4  2014.MMP00005      Incident         Europe    2014-01-16           2014   

  Reported_Month    Number_Dead  Minimum_Missing  Total_Dead_and_Missing  \
0        January 1.000000000000   0.000000000000          1.000000000000   
1        January 1.000000000000   0.000000000000          1.000000000000   
2        January 1.000000000000   0.000000000000          1.000000000000   
3        January 1.000000000000   0.000000000000          1.000000000000   
4        January 1.000000000000   0.000000000000          1.000000000000   

       Survivors        Females          Males       Children  \
0 0.00000

## F: Reverse Geocoding to Create Country Column

## Warning: 
**This process will take about two hours to run through** 

In [42]:
#H. Finding the country of the incident according to the coordinates part 1:
#Defining a function that finds the country name
'''
from geopy.geocoders import Nominatim
geolocator = Nominatim(user_agent="MissingMigrants")
def country_name(coord):
    location = geolocator.reverse(coord, exactly_one=True, language = "en")
    address = location.raw['address']
    city = address.get('city', '')
    state = address.get('state', '')
    country = address.get('country', '')
    return country
    '''

'\nfrom geopy.geocoders import Nominatim\ngeolocator = Nominatim(user_agent="MissingMigrants")\ndef country_name(coord):\n    location = geolocator.reverse(coord, exactly_one=True, language = "en")\n    address = location.raw[\'address\']\n    city = address.get(\'city\', \'\')\n    state = address.get(\'state\', \'\')\n    country = address.get(\'country\', \'\')\n    return country\n    '

In [None]:
#Applying the function to each of the elements in "Coordinates"
from geopy.geocoders import Nominatim
geolocator = Nominatim(user_agent="MissingMigrants")
def country_name(coord):
    location = geolocator.reverse(coord, exactly_one=True, language = "en")
    address = location.raw['address']
    city = address.get('city', '')
    state = address.get('state', '')
    country = address.get('country', '')
    return country

countries =[]
index = 0
for i in MM["Coordinates"]:
    try:
        countries.append(country_name([MM['Latitude'][index], MM['Longitude'][index]]))
    except:
        countries.append("International Waters")
        
    index = index + 1
    

In [None]:
#H. Finding the country of the incident according to the coordinates part 2:
#Applying the function to each of the elements in "Coordinates"
#Storing country name in a list.
'''
countries =[]
index = 0
for i in MM["Coordinates"]:
    try:
        countries.append(country_name([MM['Latitude'][index], MM['Longitude'][index]]))
    except:
        countries.append("International Waters")
        
    index = index + 1
'''    

In [None]:
#H. Finding the country of the incident according to the coordinates part 3:
#Looking at how many coordinates were paired with countries. It should be close to 8000.
print(len(countries))

In [None]:
#H. Finding the country of the incident according to the coordinates part 4:
#Assigning the values to a new column

MM["Country"] = countries
MM['Country'] = MM['Country'].replace(np.nan, "International Waters")

In [None]:
# Second Check
# The function sometimes returned 0.0 as an error
# So checking to see if it is a float

index = 0
for i in MM["Country"]:
    if type(MM['Country'][index]) == float:
        MM['Country'][index] = "Not Found"
    else:
        pass
    index = index + 1


In [None]:
index = -1
for i in MM['Country']:
    index = index + 1
    if MM['Country'][index] == "":
        MM['Country'][index] = "Not Found"
    else:
        pass

In [None]:
index = -1
for i in MM['Country']:
    index = index + 1
    MM['Country'][index] = str(MM['Country'][index])
    print(MM['Country'][index])

United States
United States
United States
Spain
United States
Mexico
United States
Mexico
Mexico
Mexico
Mexico
Mauritania
Italy
Libya
United States
Libya
Lebanon
Spain
Mexico
Mexico
Italy
Trinidad and Tobago
United States
United States
United States
United States
United States
Morocco
International Waters
Lebanon
Spain
Turkey
United States
Spain
Indonesia
United States
Indonesia
United States
United States
Gibraltar
Indonesia
Ecuador
United States
The Bahamas
Mexico
United States
United States
Colombia
Mexico
Mexico
Mexico
Turkey
International Waters
United States
International Waters
Mexico
United States
United States
United States
United States
United States
Algeria
Algeria
International Waters
Algeria
Turkey
Italy
United States
United States
United States
United States
United States
Libya
Turkey
Libya
Libya
Algeria
Turkey
Malaysia
United States
Somalia
United States
United States
United States
United States
United States
United States
Libya
United States
United States
Libya
North Ko

Libya
United States
International Waters
Spain
Algeria
Colombia
Panama
Libya
Libya
Algeria
United States
United States
United States
Mexico
Libya
Tunisia
Morocco
Algeria
Mexico
United States
Mexico
Mexico
Mexico
Mexico
Turkey
Libya
Libya
Libya
Libya
Libya
Libya
Libya
Libya
Libya
United States
Mexico
United States
Mexico
Mexico
Colombia
Libya
Turkey
Libya
Mexico
International Waters
Libya
Libya
Tunisia
Spain
International Waters
United States
United States
Algeria
Morocco
Morocco
Libya
Libya
Libya
Libya
Libya
Libya
Turkey
Turkey
United States
Colombia
Colombia
United States
Mexico
France
United States
United States
United States
Colombia
Bolivia
United States
Tunisia
Tunisia
Tunisia
International Waters
Spain
Spain
Libya
Libya
Morocco
United States
United States
Panama
United States
Mexico
Mexico
Libya
Libya
Bosnia and Herzegovina
Mexico
Morocco
Mexico
Mexico
Mexico
Mexico
Mexico
Spain
Tunisia
Mexico
United States
Bangladesh
Spain
Libya
International Waters
Libya
Libya
Panama
Chile
Chil

Morocco
France
United States
Mexico
Panama
Mexico
Chile
Mexico
Colombia
Italy
Serbia
Belarus
Poland
Spain
International Waters
International Waters
Poland
Turkey
Iran
Somalia
Poland
Libya
Belarus
Djibouti
Morocco
International Waters
International Waters
International Waters
France
Spain
Spain
United States
United States
Colombia
Colombia
Mexico
United States
United States
United States
Chile
Colombia
Mexico
Serbia
Morocco
International Waters
International Waters
United States
Serbia
Belarus
Belarus
Turks and Caicos Islands
Thailand
Italy
Spain
Spain
Spain
International Waters
Turkey
Dominican Republic
Mexico
United States
United States
Tunisia
Tunisia
Poland
Bolivia
Germany
Germany
Slovenia
United States
United States
United States
United States
United States
United States
United States
United States
United States
United States
United States
United States
United States
United States
United States
United States
United States
United States
United States
United States
United States
Unit

International Waters
Morocco
Afghanistan
Spain
Ukraine
Libya
Libya
United States
United States
Costa Rica
Mexico
United States
United States
Mexico
Algeria
United States
United States
Mexico
Colombia
Colombia
Chile
Chile
International Waters
United States
United States
United States
International Waters
Libya
Libya
Spain
Slovenia
Spain
Italy
Slovenia
Greece
Bulgaria
Romania
Morocco
Mexico
Dominican Republic
United States
United States
United States
Mexico
Tunisia
Tunisia
Libya
Libya
Morocco
Turkey
Spain
United States
International Waters
Mexico
Dominican Republic
Guatemala
Mexico
United States
Mexico
United States
United States
Algeria
Algeria
Algeria
Algeria
Algeria
Algeria
Algeria
Mali
Algeria
Libya
Myanmar
Croatia
Greece
Dominican Republic
Mexico
United States
United States
Mexico
United States
United States
United States
United States
United States
Mexico
United States
Mexico
Mexico
Mexico
Mexico
United States
Malaysia
Malaysia
Lebanon
Tunisia
Libya
International Waters
Turkey
Inte

International Waters
Libya
International Waters
Turkey
Malawi
Mexico
Mexico
Morocco
Algeria
Malawi
Tunisia
Libya
Morocco
Italy
Italy
International Waters
Algeria
Turkey
Turkey
Turkey
Turkey
Panama
United States
United States
Mexico
United States
Mexico
Honduras
Costa Rica
Chile
United States
United States
Cuba
Morocco
Morocco
Spain
Yemen
Germany
Algeria
Libya
Greece
Greece
Libya
Libya
Spain
Djibouti
Djibouti
Djibouti
Somalia
Myanmar
International Waters
Jordan
Mexico
United States
Panama
Panama
Croatia
Tunisia
Italy
Italy
Tunisia
Greece
Greece
Libya
Algeria
Morocco
International Waters
United States
United States
United States
United States
The Bahamas
Mexico
Mexico
Mexico
Nicaragua
United States
Mexico
Mexico
Mexico
United States
Mexico
United States
United States
Cuba
Mexico
United States
Cuba
International Waters
Greece
Italy
Belarus
Libya
Belarus
International Waters
Mexico
Spain
Mexico
United States
United States
United States
International Waters
Mexico
Chile
Mexico
Mexico
United

Afghanistan
Afghanistan
Afghanistan
Afghanistan
Afghanistan
Afghanistan
Afghanistan
Afghanistan
Afghanistan
Afghanistan
Afghanistan
Afghanistan
Afghanistan
Afghanistan
Afghanistan
Afghanistan
Afghanistan
Afghanistan
Afghanistan
Afghanistan
Afghanistan
Afghanistan
Afghanistan
Afghanistan
Afghanistan
Afghanistan
Afghanistan
Afghanistan
Afghanistan
Afghanistan
Afghanistan
Afghanistan
Afghanistan
Afghanistan
Afghanistan
Afghanistan
Afghanistan
Afghanistan
Afghanistan
Afghanistan
Afghanistan
Afghanistan
Afghanistan
Afghanistan
Afghanistan
Afghanistan
Afghanistan
Afghanistan
Afghanistan
Afghanistan
Afghanistan
Afghanistan
Afghanistan
Afghanistan
Afghanistan
Afghanistan
Afghanistan
Afghanistan
Afghanistan
Afghanistan
Afghanistan
Afghanistan
Afghanistan
Afghanistan
International Waters
Algeria
Morocco
United States
Spain
Algeria
Spain
Algeria
Algeria
Spain
Algeria
Algeria
Morocco
Algeria
Algeria
Morocco
Algeria
Algeria
Morocco
International Waters
Algeria
Spain
International Waters
Algeria
Alg

Mexico
United States
United States
United States
United States
United States
United States
United States
United States
United States
United States
United States
United States
Lebanon
Libya
Libya
Libya
Libya
Libya
Libya
Libya
Spain
United States
United States
Mexico
Canada
Mexico
United States
Mexico
Costa Rica
Turkey
Bulgaria
Morocco
Morocco
Senegal
Italy
Tunisia
Italy
Italy
International Waters
United States
Mexico
Colombia
Mexico
Mexico
Mexico
Mexico
Guyana
Mexico
Mexico
United States
Tunisia
Libya
Spain
International Waters
Morocco
Algeria
Algeria
Spain
International Waters
Hungary
International Waters
International Waters
Mexico
Mexico
Mexico
United States
Mexico
The Bahamas
Mexico
Dominican Republic
France
Myanmar
France
Mexico
United States
Mexico
United States
United States
United States
Dominican Republic
Mexico
Mexico
International Waters
International Waters
Italy
Somalia
Somalia
Somalia
Somalia
Somalia
Somalia
Djibouti
Djibouti
Djibouti
Djibouti
Djibouti
Eritrea
Internationa

In [None]:
#Check to see if value counts make any sense
MM['Country'].value_counts()

In [None]:
# Fixing the Datatype of Certain Columns and creating columns for Age Status Breakdown
for i in range (0,len(MM['Country'])):
    MM['Country'][i] = str(MM['Country'][i])

In [None]:
new_col = ['Incident_ID', 'Incident_Type', 'Region', 'Reported_Date',
       'Reported_Year', 'Reported_Month', 'Number_Dead', 'Minimum_Missing',
       'Total_Dead_and_Missing', 'Survivors', 'Females', 'Males', 'Children', "Country of Origin",
       'Region of Origin', 'Country of Incident', 'Migration Route',
       'Location Description', 'Coordinates', 'UNSD_Geographical_Grouping',
       'Info Source', 'URL', 'Source Quality', 'Other Accidents',
       'Drowning', 'Lack of Shelter, Food, or Water',
       'Mixed or unknown',
       'Sickness',
       'Transportation Accident',
       'Violence', 'Latitude', 'Longitude', 'Log_Dead', 'URL1',
       'Unknown_Sex', 'Unknown_Age_Status', 'Country']

In [None]:
MM.columns = new_col

In [None]:
for i in range (0,len(MM['Total_Dead_and_Missing'])):
    if type(MM['Total_Dead_and_Missing'][i]) == str:
        if ',' in MM['Total_Dead_and_Missing'][i]:
            MM['Total_Dead_and_Missing'][i] = MM['Total_Dead_and_Missing'][i].replace(",", "")
        else:
            pass
    else:
        pass
    
    MM['Total_Dead_and_Missing'][i] = float(MM['Total_Dead_and_Missing'][i])

confirmed_adults = []
index = 0
for i in MM['Children']:
    MM['Males'][index] = float(MM['Males'][index])
    MM['Females'][index] = float(MM['Females'][index])
    MM['Children'][index] = float(MM['Children'][index])
    MM['Unknown_Age_Status'][index] = float(MM['Unknown_Age_Status'][index])
    #digit = MM['Males'][index] + MM['Females'][index] - MM['Children'][index] - MM['Unknown_Age_Status'][index]
    #confirmed_adults.append[digit]
    index = index + 1
    
confirmed_adults = []
index = 0
for i in MM['Children']:
    if MM["Unknown_Age_Status"][index] != 0:
        digit = MM['Males'][index] + MM['Females'][index] - MM['Children'][index]
        confirmed_adults.append(digit)
    else:
        confirmed_adults.append(0)
    index = index + 1

index = 0
Children_Sum = []
for i in MM['Children']:
    Children_Sum.append(float(MM['Children'][index]))
    index = index + 1
    
index = 0
Total_Minus_Unknown = []
for i in MM['Children']:
    Total_Minus_Unknown.append(float(MM['Total_Dead_and_Missing'][index]) - float((MM['Unknown_Age_Status'][index])))
    index = index + 1

In [None]:
print(sum(Children_Sum))
print(sum(Total_Minus_Unknown))
print(sum(confirmed_adults))
print(sum(MM['Unknown_Age_Status']))

In [None]:
MM['Confirmed_Adults'] = confirmed_adults

In [None]:
#Format the date to datetime
import datetime
# Date parser to pass to read_csv
d = lambda x: pd.datetime.strptime(x, '%d-%b-%y')
dt_list = []
index = 0
month_key = {"January": "01",
            "February": "02",
            "March" : "03",
            "April" : "04",
            "May": "05",
            "June": "06",
            "July": "07",
            "August": "08",
            "September": "09",
            "October": "10",
            "November": "11",
            "December": "12"}

for i in MM['Reported_Year']:
    dt_list.append(str(MM['Reported_Year'][index])+ '-' + month_key[str(MM["Reported_Month"][index])]+ '-' + "01")
    index = index + 1
    
MM['Date'] = pd.to_datetime(dt_list)

In [None]:
#Find your current working directory
#import os
#os.getcwd()

In [None]:
MM.to_csv(r'/Users/liamcomerweaver/Documents/Coding/Python Scripts/CSVs/MM_Dummies_OCT25.csv', index=False)
