In [None]:
# Religious Migration Analysis 1990-2020
# Author: Kristofer O'Garro
# Date: 11/21/2024
# Data Source: Pew Research Center (https://www.pewresearch.org/dataset/dataset-religious-composition-of-the-worlds-migrants-1990-2020/)

"""
This analysis explores global religious migration patterns from 1990-2020, 
examining how migration flows have influenced religious demographics across different 
geographical scales.

Dataset Structure:
- 26712 rows
- 9 columns
- Hierarchical data (Country -> Regional -> Global levels)
- Time period: 1990-2020 (5-year intervals)
"""

In [1]:
import pandas as pd
import numpy as np
# `plt` must name the pyplot submodule: the top-level `matplotlib` package does
# not expose the plotting functions (plt.subplots, plt.plot, plt.show, ...).
import matplotlib.pyplot as plt

In [3]:
# Read the migrant-count table from a path relative to the working directory.
migration = pd.read_csv(
    filepath_or_buffer='data/Incoming_and_Outgoing_Migrant_Counts.csv',
)
# Show the first five rows as the cell output.
migration.head(n=5)

Unnamed: 0,Direction,Year,Country,Religion,Count,Percent,Region,level,countrycode
0,Incoming,1990,Global Total,All,152970000,100.0,Global,3,9999
1,Incoming,1990,Global Total,Buddhist,4590000,3.0,Global,3,9999
2,Incoming,1990,Global Total,Christian,72710000,47.5,Global,3,9999
3,Incoming,1990,Global Total,Hindu,9130000,6.0,Global,3,9999
4,Incoming,1990,Global Total,Jew,2340000,1.5,Global,3,9999


In [5]:
# Initial data exploration
def explore_data_quality(df):
    """
    Summarize basic data-quality metrics for a dataframe.

    Parameters:
    -----------
    df : pd.DataFrame
        Frame to inspect; it is only read, never modified.

    Returns:
    --------
    dict
        Three entries, in this order:
        - 'missing_values': Series of null counts per column
        - 'unique_values': dict mapping each column name to its number of
          distinct non-null values
        - 'data_types': Series of column dtypes
    """
    # Null count per column.
    null_counts = df.isna().sum()

    # Distinct (non-null) value count per column, keyed by column name.
    distinct_counts = {}
    for column in df.columns:
        distinct_counts[column] = df[column].nunique()

    return {
        'missing_values': null_counts,
        'unique_values': distinct_counts,
        'data_types': df.dtypes,
    }

In [7]:
# Build the quality report for the loaded migration frame; as the cell's last
# expression, the returned dict is what the notebook displays.
explore_data_quality(migration)

{'missing_values': Direction      0
 Year           0
 Country        0
 Religion       0
 Count          0
 Percent        0
 Region         0
 level          0
 countrycode    0
 dtype: int64,
 'unique_values': {'Direction': 2,
  'Year': 7,
  'Country': 239,
  'Religion': 8,
  'Count': 834,
  'Percent': 955,
  'Region': 7,
  'level': 3,
  'countrycode': 239},
 'data_types': Direction       object
 Year             int64
 Country         object
 Religion        object
 Count           object
 Percent        float64
 Region          object
 level            int64
 countrycode      int64
 dtype: object}

In [9]:
# Cast the low-cardinality columns (Direction, Religion, Region, level) to the
# 'category' dtype to reduce memory use. Keys are spelled exactly as the columns
# are named at this point in the notebook.
migration = migration.astype(
    dict.fromkeys(('Direction', 'Religion', 'Region', 'level'), 'category')
)

# Print the resulting dtypes and memory footprint.
migration.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26712 entries, 0 to 26711
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   Direction    26712 non-null  category
 1   Year         26712 non-null  int64   
 2   Country      26712 non-null  object  
 3   Religion     26712 non-null  category
 4   Count        26712 non-null  object  
 5   Percent      26712 non-null  float64 
 6   Region       26712 non-null  category
 7   level        26712 non-null  category
 8   countrycode  26712 non-null  int64   
dtypes: category(4), float64(1), int64(2), object(2)
memory usage: 1.1+ MB


In [11]:
# Standardize column names to lowercase (e.g. 'Direction' -> 'direction') by
# reassigning the frame's column index in place; code that runs after this
# cell must refer to the lowercase names.
migration.columns = migration.columns.str.lower()

In [None]:
# TODO: temporary test cell -- remove before sharing the notebook.