In [1]:
# Religious Migration Analysis 1990-2020
# Author: Kristofer O'Garro
# Date: 11/21/2024
# Data Source: Pew Research Center (https://www.pewresearch.org/dataset/dataset-religious-composition-of-the-worlds-migrants-1990-2020/)

"""
This analysis explores global religious migration patterns from 1990-2020, 
examining how migration flows have influenced religious demographics across different 
geographical scales.

Dataset Structure:
- 26712 rows
- 9 columns
- Hierarchical data (Country -> Regional -> Global levels)
- Time period: 1990-2020 (5-year intervals)
"""

'\nThis analysis explores global religious migration patterns from 1990-2020, \nexamining how migration flows have influenced religious demographics across different \ngeographical scales.\n\nDataset Structure:\n- 26712 rows\n- 9 columns\n- Hierarchical data (Country -> Regional -> Global levels)\n- Time period: 1990-2020 (5-year intervals)\n'

In [2]:
import pandas as pd
import numpy as np
import matplotlib as plt

In [3]:
migration = pd.read_csv('data/Incoming_and_Outgoing_Migrant_Counts.csv', thousands=',')
migration

Unnamed: 0,Direction,Year,Country,Religion,Count,Percent,Region,level,countrycode
0,Incoming,1990,Global Total,All,152970000,100.0,Global,3,9999
1,Incoming,1990,Global Total,Buddhist,4590000,3.0,Global,3,9999
2,Incoming,1990,Global Total,Christian,72710000,47.5,Global,3,9999
3,Incoming,1990,Global Total,Hindu,9130000,6.0,Global,3,9999
4,Incoming,1990,Global Total,Jew,2340000,1.5,Global,3,9999
...,...,...,...,...,...,...,...,...,...
26707,Outgoing,2020,Zambia,Hindu,"< 10,000",2.1,Sub-Saharan Africa,1,894
26708,Outgoing,2020,Zambia,Jew,"< 10,000",0.1,Sub-Saharan Africa,1,894
26709,Outgoing,2020,Zambia,Muslim,20000,10.4,Sub-Saharan Africa,1,894
26710,Outgoing,2020,Zambia,Other,"< 10,000",3.7,Sub-Saharan Africa,1,894


## Cleaning the data

In [5]:
# Initial data exploration
def explore_data_quality(df):
    """
    Perform initial data quality checks.
    
    Parameters:
    -----------
    df : pd.DataFrame
        Input dataframe to analyze
        
    Returns:
    --------
    dict
        Dictionary containing data quality metrics
    """
    quality_report = {
        'missing_values': df.isnull().sum(),
        'unique_values': {col: df[col].nunique() for col in df.columns},
        'data_types': df.dtypes
    }
    return quality_report

In [6]:
explore_data_quality(migration)

{'missing_values': Direction      0
 Year           0
 Country        0
 Religion       0
 Count          0
 Percent        0
 Region         0
 level          0
 countrycode    0
 dtype: int64,
 'unique_values': {'Direction': 2,
  'Year': 7,
  'Country': 239,
  'Religion': 8,
  'Count': 834,
  'Percent': 955,
  'Region': 7,
  'level': 3,
  'countrycode': 239},
 'data_types': Direction       object
 Year             int64
 Country         object
 Religion        object
 Count           object
 Percent        float64
 Region          object
 level            int64
 countrycode      int64
 dtype: object}

In [7]:
#Changing the following column type to 'category' for memory optimization: Direction, Religion, Region, Level 
migration = migration.astype({
    'Direction': 'category',
    'Religion': 'category',
    'Region': 'category',
    'level': 'category',
})

migration.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26712 entries, 0 to 26711
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   Direction    26712 non-null  category
 1   Year         26712 non-null  int64   
 2   Country      26712 non-null  object  
 3   Religion     26712 non-null  category
 4   Count        26712 non-null  object  
 5   Percent      26712 non-null  float64 
 6   Region       26712 non-null  category
 7   level        26712 non-null  category
 8   countrycode  26712 non-null  int64   
dtypes: category(4), float64(1), int64(2), object(2)
memory usage: 1.1+ MB


In [8]:
#Standardized column names to lowercase
migration.columns = migration.columns.str.lower()

In [9]:
migration['count']

0        152,970,000
1          4,590,000
2         72,710,000
3          9,130,000
4          2,340,000
            ...     
26707       < 10,000
26708       < 10,000
26709         20,000
26710       < 10,000
26711         10,000
Name: count, Length: 26712, dtype: object

In [10]:
# Replacing values of '< 10,000' with 5,000 for numerical calculations. The number 5000 was chosen as a midpoint since we
# don't have the actual numbers
migration['count'] = migration['count'].replace('< 10,000', '5000')
migration['count']

0        152,970,000
1          4,590,000
2         72,710,000
3          9,130,000
4          2,340,000
            ...     
26707           5000
26708           5000
26709         20,000
26710           5000
26711         10,000
Name: count, Length: 26712, dtype: object

In [11]:
migration.dtypes

direction      category
year              int64
country          object
religion       category
count            object
percent         float64
region         category
level          category
countrycode       int64
dtype: object

In [39]:
migration['count'] = migration['count'].str.replace(',', '')
migration['count'] = pd.to_numeric(migration['count'])

In [41]:
migration.dtypes

direction      category
year              int64
country          object
religion       category
count             int64
percent         float64
region         category
level          category
countrycode       int64
dtype: object

## Global Analysis

In [43]:
migration.head()

Unnamed: 0,direction,year,country,religion,count,percent,region,level,countrycode
0,Incoming,1990,Global Total,All,152970000,100.0,Global,3,9999
1,Incoming,1990,Global Total,Buddhist,4590000,3.0,Global,3,9999
2,Incoming,1990,Global Total,Christian,72710000,47.5,Global,3,9999
3,Incoming,1990,Global Total,Hindu,9130000,6.0,Global,3,9999
4,Incoming,1990,Global Total,Jew,2340000,1.5,Global,3,9999


In [49]:
global_data = migration[migration['level'] == 3]
global_data['count']

0        152970000
1          4590000
2         72710000
3          9130000
4          2340000
           ...    
24803     13480000
24804      2980000
24805     80420000
24806      5510000
24807     36400000
Name: count, Length: 112, dtype: int64

In [53]:
global_religion = global_data.groupby(['year', 'religion'], observed=True)['count'].sum()
global_religion

year  religion    
1990  All             305940000
      Buddhist          9180000
      Christian       145420000
      Hindu            18260000
      Jew               4680000
      Muslim           79760000
      Other             5060000
      Unaffiliated     43580000
1995  All             322540000
      Buddhist         10500000
      Christian       157620000
      Hindu            18260000
      Jew               4940000
      Muslim           78780000
      Other             5500000
      Unaffiliated     46940000
2000  All             346440000
      Buddhist         12100000
      Christian       169560000
      Hindu            18660000
      Jew               5100000
      Muslim           84260000
      Other             5860000
      Unaffiliated     50900000
2005  All             382860000
      Buddhist         14840000
      Christian       188120000
      Hindu            19820000
      Jew               5280000
      Muslim           92240000
      Other          

In [63]:
pivoted_data = global_religion.reset_index().pivot(index='religion', columns='year', values='count')
pivoted_data

year,1990,1995,2000,2005,2010,2015,2020
religion,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
All,305940000,322540000,346440000,382860000,441920000,495860000,561139968
Buddhist,9180000,10500000,12100000,14840000,18180000,20020000,21800000
Christian,145420000,157620000,169560000,188120000,212000000,230320000,261760000
Hindu,18260000,18260000,18660000,19820000,22600000,25220000,26960000
Jew,4680000,4940000,5100000,5280000,5540000,5740000,5960000
Muslim,79760000,78780000,84260000,92240000,114540000,139460000,160840000
Other,5060000,5500000,5860000,6860000,8040000,9380000,11020000
Unaffiliated,43580000,46940000,50900000,55700000,61020000,65720000,72800000
