# Data preparation

This notebook documents the steps I used to prepare the data.
The original data is stored in zip files in .sav format. Here I remove the unwanted columns and save the cleaned data, split by year, as pickled dataframes.

#### Imports

In [None]:
# imports
import pandas as pd
import pickle
import pandas as pd
import numpy as np
from tqdm import tqdm

import warnings
warnings.filterwarnings("ignore")
%load_ext autoreload
%autoreload 2

import dataprep

# Column metadata, loaded once and passed to dataprep.remove_unwanted and
# dataprep.impute_missing_by_type in the cells below.
# NOTE(review): the schema of this pickle is not visible here — confirm in dataprep.
# `source` is reused (overwritten) as the input path variable in later cells.
source = "meta/columns"
df_meta = pd.read_pickle(source)

In [None]:
# Source files to process. Each number selects one multi-year source file:
#   1 -> 2008-2012
#   2 -> 2013-2017
#   3 -> 2018-2022
# Restrict the list (e.g. to a single number) to process only some of the files.

nums = [1, 2, 3]

#### Reading the files

In [None]:
# Clean every raw source file and save the result under 'clean_per_year'.
for num in tqdm(nums):
    print(f"########## Processing file {num} ##########")
    # Read the raw file together with its metadata
    df, meta = dataprep.read_file(num)

    # Data preparation

    # Regions
    df = dataprep.new_region_column(df)

    # Rename the features using the file's metadata
    df = dataprep.rename_with_codes(df, meta)

    # Remove unwanted columns (selection driven by df_meta)
    df = dataprep.remove_unwanted(df, df_meta)

    # Remove questions which are not asked in all countries
    df = dataprep.remove_notallcountry(df)

    # Drop the Community Attachment Index when it is present. errors='ignore'
    # only tolerates a missing column; any other failure is now raised instead
    # of being silently swallowed by a bare `except`.
    df = df.drop(columns=['INDEX_CA: Community Attachment Index'], errors='ignore')

    # Save files to pickles
    dataprep.save_files(df, num, 'clean_per_year', '')

#### Further preparation: Migration aspiration

In [None]:
# Cleaned input pickle for each source-file number.
# NOTE(review): presumably these are the files written by
# dataprep.save_files(..., 'clean_per_year', '') — confirm in dataprep.
aspiration_sources = {
    1: "gwp_data/clean_per_year/clean_data_from8to12_",
    2: "gwp_data/clean_per_year/clean_data_from13to17_",
    3: "gwp_data/clean_per_year/clean_data_from18to22_",
}

# Build the migration-aspiration dataset for every source file and save it
# twice: once with and once without the two region columns.
for num in tqdm(nums):
    print(f"########## Processing file {num} ##########")
    # Reading files. An unknown `num` raises KeyError here instead of silently
    # re-reading whatever path `source` held from a previous iteration/cell.
    source = aspiration_sources[num]
    df_aspiration = pd.read_pickle(source)

    # Keep only the first occurrence of each duplicated column name
    duplicated_columns = df_aspiration.columns.duplicated()
    df_aspiration = df_aspiration.loc[:, ~duplicated_columns]

    # Remove lines where the answer is not yes or no
    df_aspiration = dataprep.remove_aspiration_DK(df_aspiration)

    # Impute values by type
    df_aspiration = dataprep.impute_missing_by_type(df_aspiration, df_meta)

    print(f"Shape: {df_aspiration.shape}")
    # Report which columns still have missing values after imputation;
    # rows containing them are removed by the dropna below.
    columns_with_missing_values = df_aspiration.columns[df_aspiration.isnull().any()]
    print(f"Columns with missing values: {list(columns_with_missing_values)}")

    print(f"Shape before dropna: {df_aspiration.shape}")
    df_aspiration = df_aspiration.dropna()

    print(f"Shape before sampling: {df_aspiration.shape}")
    df_aspiration = dataprep.sampling(df_aspiration)

    # Renumber the rows 0..n-1 and keep the index name 'index', without going
    # through a temporary column or depending on a specific data column.
    df_aspiration = df_aspiration.reset_index(drop=True).rename_axis('index')

    print(f"Shape of the dataframe: {df_aspiration.shape}")

    # Save files
    dataprep.save_files(df_aspiration, num, 'prepared_aspiration', '')

    df_aspiration_without_regions = df_aspiration.drop(
        columns=["REG2_GLOBAL: Region 2 Global", "REG_GLOBAL: Global Region"]
    )

    # Save files without regions
    dataprep.save_files(df_aspiration_without_regions, num, 'prepared_aspiration', 'woregions')

#### Further preparation: Migration destination

In [None]:
# Cleaned input pickle for each source-file number.
# NOTE(review): presumably these are the files written by
# dataprep.save_files(..., 'clean_per_year', '') — confirm in dataprep.
destination_sources = {
    1: "gwp_data/clean_per_year/clean_data_from8to12_",
    2: "gwp_data/clean_per_year/clean_data_from13to17_",
    3: "gwp_data/clean_per_year/clean_data_from18to22_",
}

# Build the migration-destination dataset for every source file.
for num in tqdm(nums):
    print(f"########## Processing file {num} ##########")
    # Reading files. An unknown `num` raises KeyError here instead of silently
    # re-reading whatever path `source` held from a previous iteration/cell.
    source = destination_sources[num]
    df_destination = pd.read_pickle(source)

    # Keep only the first occurrence of each duplicated column name
    duplicated_columns = df_destination.columns.duplicated()
    df_destination = df_destination.loc[:, ~duplicated_columns]

    # Drop the region columns. Assigning the result (rather than dropping in
    # place) avoids mutating a slice of the frame that was just read.
    region_cols = ['REG2_GLOBAL: Region 2 Global', 'REG_GLOBAL: Global Region']
    df_destination = df_destination.drop(columns=region_cols)

    # Remove rows where destination is missing
    df_destination = df_destination[df_destination['WP3120: Country Would Move To'].notnull()]

    # Impute values by type
    df_destination = dataprep.impute_missing_by_type(df_destination, df_meta)

    # Some columns are not imputed: drop the rows that still have no age
    df_destination = df_destination[df_destination['WP1220: Age'].notna()]

    dataprep.save_files(df_destination, num, 'prepared_destination', '')

#### Merge files

In [None]:
# Merge the per-year files (2008-2022) into one dataframe per dataset.
# The yearly frames are collected in lists and concatenated once at the end:
# concatenating inside the loop re-copies all accumulated rows on every
# iteration and starts from an empty DataFrame, which can influence the
# resulting dtypes.
asp_frames = []
asp_woregions_frames = []
dest_frames = []

for year in range(2008, 2023):
    asp_frames.append(pd.read_pickle(f"gwp_data/prepared_aspiration/clean_data_{year}_"))
    asp_woregions_frames.append(pd.read_pickle(f"gwp_data/prepared_aspiration/clean_data_{year}_woregions"))
    dest_frames.append(pd.read_pickle(f"gwp_data/prepared_destination/clean_data_{year}_"))

# ignore_index=True renumbers the rows 0..n-1 across all years
df_asp = pd.concat(asp_frames, ignore_index=True)
df_asp_woregions = pd.concat(asp_woregions_frames, ignore_index=True)
df_dest = pd.concat(dest_frames, ignore_index=True)

df_asp.to_pickle('gwp_data/prepared_aspiration/full_aspiration')
df_asp_woregions.to_pickle('gwp_data/prepared_aspiration/full_aspiration_woregions')
df_dest.to_pickle('gwp_data/prepared_destination/full_destination')