In [None]:
# install necessary packages 

!pip install wbdata
import wbdata # IF NECESSARY
import cufflinks as cf
cf.go_offline()
import pandas as pd
import numpy as np
from scipy import stats
import plotly.express as px

# DEFINE FUNCTIONS

def migration_organizing(data1):
    """Add migration-magnitude and per-capita columns to an indicator frame.

    The frame is modified in place and the same object is returned.

    Args:
        data1: DataFrame that, once its index is reset, exposes ``country``
            and ``date`` columns, and that has ``Net Migration`` and
            ``Total Population`` columns.

    Returns:
        ``data1`` itself, indexed by ``country``, with ``date`` as ``int``,
        ``Net Migration`` replaced by its absolute value, and the new columns
        ``Migration per Capita`` and ``Migration Rate (%)``.
    """
    # Years arrive as strings; expose them as a column and make them ints.
    data1.reset_index(inplace=True)
    data1['date'] = data1['date'].astype(int)

    # Index the table by country instead of year.
    data1.set_index(['country'], inplace=True)

    # NOTE: every row is kept. Earlier drafts built a negative-only subset
    # (and a post-2013 one) but never used them, so inflows and outflows are
    # both retained here and only the magnitude of the flow survives.
    data1['Net Migration'] = data1['Net Migration'].abs()

    # Magnitude of net migration relative to population, then as a percentage.
    data1['Migration per Capita'] = data1['Net Migration'] / data1['Total Population']
    data1['Migration Rate (%)'] = data1['Migration per Capita'] * 100

    return data1



def setup_long(dataframe):
    """Average migration figures per index label and rank the averages.

    This replaces the hand-written per-country loop with
    ``groupby(...).mean()``, which returns an ordinary DataFrame, so normal
    table operations keep working on the result.

    Args:
        dataframe: DataFrame whose index identifies the group (one label per
            country in this notebook) and which has ``Net Migration`` and
            ``Migration Rate (%)`` columns.

    Returns:
        A DataFrame with one row per distinct index label, in order of first
        appearance, holding the mean ``Net Migration``, the mean
        ``Migration Rate (%)`` and a ``Percentile Rank`` of that mean rate.
        The input's index name is kept. Returns ``None`` when ``dataframe``
        has no rows.
    """
    if dataframe.index.empty:
        # Nothing to summarise; callers have always received None here.
        return None

    # sort=False keeps the labels in the order they first appear.
    results = (
        dataframe[['Net Migration', 'Migration Rate (%)']]
        .groupby(level=0, sort=False)
        .mean()
    )

    # A group with no usable data averages to NaN. nan_policy='omit' keeps
    # such a group from turning every other group's percentile into NaN.
    rates = results['Migration Rate (%)']
    results['Percentile Rank'] = rates.apply(
        lambda rate: stats.percentileofscore(rates, rate, nan_policy='omit'))

    return results
    
    
    
def _age_code(bound):
    """Spell one age bound the way the population indicator codes do."""
    # The codes use two-character bounds ("0004", "0509", "80UP"), so plain
    # ints are zero-padded; strings such as "04" or "UP" pass through as-is.
    if isinstance(bound, int):
        return "{:02d}".format(bound)
    return str(bound)


def population(year, sex, age_low, age_high, country_code):
    """Return the population of one sex and age bracket for a country-year.

    This is the population-lookup function listed in the project's [A]
    deliverables.

    Args:
        year: Year to look up; matched as a substring of the date index
            labels of the downloaded frame.
        sex: ``"Male"`` or ``"Female"``.
        age_low: Lower age bound of the bracket (e.g. ``0``, ``"00"``, ``80``).
        age_high: Upper age bound of the bracket (e.g. ``4``, ``"04"``, ``"UP"``).
        country_code: Country code passed to ``wbdata.get_dataframe``.

    Returns:
        The first matching population value, as an ``int``.

    Raises:
        ValueError: If ``sex`` is neither ``"Male"`` nor ``"Female"``.
    """
    sex_suffix = {"Male": "MA", "Female": "FE"}
    if sex not in sex_suffix:
        # Previously this fell through and failed later with an unrelated
        # UnboundLocalError.
        raise ValueError('sex must be "Male" or "Female", got {!r}'.format(sex))

    # Rebuild the indicator code from the bracket bounds and the sex suffix.
    indicator = ("SP.POP." + _age_code(age_low) + _age_code(age_high)
                 + "." + sex_suffix[sex])
    column_names = {indicator: sex}

    # Download the series for the requested country only.
    pop_stats = wbdata.get_dataframe(column_names, country=country_code)

    # Keep the rows whose index label contains the requested year.
    pop_stats = pop_stats.filter(like=str(year), axis=0)

    # The column is named after `sex` (see column_names above).
    return int(pop_stats[sex].iloc[0])



def population_dataframe(year, country_code, pop_indicators):
    """Download the given indicators for one country and keep one year.

    ``pop_indicators`` must already be a wbdata indicator dictionary
    (indicator code -> column label). Rows are kept when their index label
    contains ``str(year)``.
    """
    full_history = wbdata.get_dataframe(pop_indicators, country=country_code)
    year_token = str(year)
    return full_history.filter(like=year_token, axis=0)

In [None]:
# World Bank indicator code -> column label used throughout the notebook.
# "SM.POP.NETM" used to be listed twice; the duplicate entry is removed.
indicators_new = {
    "SP.POP.TOTL": "Total Population",
    "SM.POP.NETM": "Net Migration",
    "AG.PRD.FOOD.XD": "Food Production Index",
    "EN.POP.DNST": "Population per sq km",
    "SN.ITK.DEFC.ZS": "Prevalence of Undernourishment (% of Population)",
    "EG.ELC.COAL.ZS": "Electricity production from coal sources",
    "EN.H2O.BDYS.ZS": "Proportion of bodies of water with good ambient water quality",
    "ER.H2O.FWTL.ZS": "Annual freshwater withdrawals, total (% of internal resources)",
    "SH.H2O.SMDW.ZS": "People using safely managed drinking water services (% of population)",
    "SH.STA.SMSS.ZS": "People using safely managed sanitation services (% of population)",
    "SH.MED.BEDS.ZS": "Hospital beds (per 1,000 people)",
    "SE.XPD.TOTL.GB.ZS": "Government Expenditure on Education (% of Total Expenditure)",
    "PV.EST": "Political Stability and Absence of Violence/Terrorism (estimate)",
    "IC.LGL.CRED.XQ": "Strength of Legal Rights Index (0-12 Scale)",
    "EG.ELC.ACCS.ZS": "Access to electricity (% of population)",
    "GE.EST": "Government Effectiveness Estimate",
    "CC.EST": "Control of Corruption: Estimate",
    "SI.DST.FRST.20": "Income Share Held by Lowest 20%",
    "RL.EST": "Rule of Law: Estimate",
    "EN.LND.LTMP.DC": "Land Surface Temperature",
    "EN.ATM.PM25.MC.M3": "PM2.5 air pollution, mean annual exposure (micrograms per cubic meter)",
    "EN.CLC.CDDY.XD": "Cooling Degree Days",
    "EN.CLC.HEAT.XD": "Heat Index 35",
}

# The emission indicators below are kept for reference only: according to the
# original note they "do not work" when requested.
#"EN.ATM.CO2E.PCCO2": "emissions (metric tons per capita)",
#"EN.ATM.METH.PC": "Methane emissions (kt of CO2 equivalent per capita)",
#"EN.ATM.NOXE.PC": "Nitrous oxide emissions (metric tons of CO2 equivalent per capita)",


# Download every indicator in `indicators_new` with country='all'.
# NOTE(review): the rest of the notebook assumes the result carries
# 'country' and 'date' index levels — confirm for the installed wbdata.
indicators_reduced = wbdata.get_dataframe(indicators_new, country='all')

In [None]:
import cufflinks as cf
cf.go_offline()

In [None]:
# Derive the migration columns, then rank every country-year by migration rate
# (NaN rates are left out of the reference distribution).
percap = migration_organizing(indicators_reduced)
percap['Percentile Rank'] = percap['Migration Rate (%)'].apply(
    lambda rate: stats.percentileofscore(
        percap['Migration Rate (%)'], rate, nan_policy='omit'
    )
)

# Observations strictly after the year 2000.
post_2000 = percap.loc[percap['date'] > 2000]

# Same table with the 'Kuwait' rows removed.
no_CUW = percap.drop('Kuwait', axis=0)

In [None]:
# Indicator columns singled out for closer inspection.
columns = [
    'Prevalence of Undernourishment (% of Population)',
    'People using safely managed drinking water services (% of population)',
    'People using safely managed sanitation services (% of population)',
    'Hospital beds (per 1,000 people)',
    'Strength of Legal Rights Index (0-12 Scale)',
    'Political Stability and Absence of Violence/Terrorism (estimate)',
    'Access to electricity (% of population)',
    'Control of Corruption: Estimate',
    'Income Share Held by Lowest 20%',
    'Land Surface Temperature',
    'PM2.5 air pollution, mean annual exposure (micrograms per cubic meter)',
    'Cooling Degree Days',
]

# Display the post-2000 table.
post_2000

In [None]:
# Scatter the rule-of-law estimate against migration rate; hovering a marker
# shows the country name.
percap.iplot(
    kind='scatter',
    mode='markers',
    symbol='circle-dot',
    x="Migration Rate (%)",
    y="Rule of Law: Estimate",
    text=percap.reset_index('country')['country'].values.tolist(),
    xTitle="Migration Rate (%)",
    yTitle="Rule of Law: Estimate",
    title="Rule of Law",
    xrange=[0, 30],
)

In [None]:
# Hand-picked high-migration countries ('Kuwait' is deliberately left out).
high_mig_countries = [
    "Curacao",
    "Marshall Islands",
    "Qatar",
    "Syrian Arab Republic",
    "American Samoa",
    "Lebanon",
    "South Sudan",
    # "Kuwait",
    "St. Martin (French part)",
    "Venezuela, RB",
    "Oman",
    "Tonga",
    "Saudi Arabia",
    "Kosovo",
    "Northern Mariana Islands",
    "Moldova",
    "Central African Republic",
    "Bahrain",
    "Guyana",
    "Samoa",
]

In [None]:
def scatter_all(frame):
    """Draw one scatter per column of ``frame`` against migration rate.

    Each column is used in turn as the x axis, with ``Migration Rate (%)`` on
    the y axis (range 0-4). Markers are grouped by ``date`` and carry the
    country name as hover text, so ``frame`` needs a ``country`` index level.
    """
    for col in frame.columns:
        label = str(col)
        frame.iplot(
            kind='scatter',
            mode='markers',
            symbol='circle-dot',
            x=label,
            y="Migration Rate (%)",
            text=frame.reset_index('country')['country'].values.tolist(),
            xTitle=label,
            yTitle="Migration Rate (%)",
            categories='date',
            yrange=[0, 4],
            title=label + ' versus Migration Rate (%)',
        )

In [None]:
# Turn the current index into an ordinary column, modifying post_2000 in place.
# NOTE(review): presumably the index is still 'country' at this point; running
# this cell a second time resets the index again — confirm the intended order.
post_2000.reset_index(inplace=True)
# Show the resulting column labels.
post_2000.columns

In [None]:
import plotly.io as pio

# Migration rate over time, one line per country, for the whole table.
# Plotly Express looks up x / y / color among the columns only, so 'country'
# is turned into a column on a temporary copy if it is still the index.
fig_mig_world = px.line(
    percap if 'country' in percap.columns else percap.reset_index(),
    x='date', y='Migration Rate (%)', color='country', labels={'date': 'Year'},
    title='Migration Rates Over Time [WORLD]', width=1200, height=600)
pio.write_image(fig_mig_world, 'world_migovertime.png', scale=4)
fig_mig_world



In [None]:
# Same line chart as the world plot, drawn from the table with the outlier
# removed. Plotly Express looks up x / y / color among the columns only, so
# 'country' is turned into a column on a temporary copy if it is still the
# index.
mig_world_no_CUW = px.line(
    no_CUW if 'country' in no_CUW.columns else no_CUW.reset_index(),
    x='date', y='Migration Rate (%)', color='country', labels={'date': 'Year'},
    title='Migration Rates Over Time [Drop Outlier]', width=1200, height=600)

pio.write_image(mig_world_no_CUW, 'world_migovertime_dropout.png', scale=4)
mig_world_no_CUW

In [None]:
# Keep only the rows labelled with one of the hand-picked countries.
# NOTE(review): this label lookup presumably needs percap to be indexed by
# country, and fails if a listed name is missing from the index — confirm.
highmig = percap.loc[high_mig_countries]
#highmig.reset_index(inplace=True)

In [None]:
# Yearly mean of every numeric column across the selected countries.
# numeric_only=True keeps this working when a text column is present.
highmig_means = highmig.groupby(['date']).mean(numeric_only=True)

# Attach each year's mean rate to the rows of that year. Assigning the
# date-indexed means directly would align on the frame's own index instead of
# on 'date' and leave the column entirely NaN.
highmig['mean mig'] = highmig.groupby(['date'])['Migration Rate (%)'].transform('mean')

In [None]:
# Migration rate over time for the selected high-migration countries.
# Plotly Express looks up x / y / color among the columns only, so 'country'
# is turned into a column on a temporary copy if it is still the index.
fig_mig = px.line(
    highmig if 'country' in highmig.columns else highmig.reset_index(),
    x='date', y='Migration Rate (%)', color='country', labels={'date': 'Year'},
    title='Top 10% Migration Countries, Rate Over Time', width=1200, height=600)

pio.write_image(fig_mig, 'migovertime.png', scale=4)
fig_mig

In [None]:
# Yearly average migration rate of the selected countries.
# After a groupby on 'date' the years live in the index; Plotly Express only
# looks up x among the columns, so expose 'date' as a column when needed.
fig_avgmig = px.line(
    highmig_means if 'date' in highmig_means.columns else highmig_means.reset_index(),
    x='date', y='Migration Rate (%)', labels={'date': 'Year'},
    title='Average Migration Rate for Top 10%', width=1000, height=600)

pio.write_image(fig_avgmig, 'avgmig_overtime.png', scale=4)
fig_avgmig

In [None]:
column = 'People using safely managed sanitation services (% of population)'
fig = px.scatter(post_2000, x=column, y="Migration Rate (%)", color="date",
                 title="Migration Rate versus " + column, range_y=[0, 4])

# Series.mean() skips NaN by itself. The previous astype(int) cast truncated
# every observation before averaging and so shifted the marker line.
fig.add_vline(x=post_2000[column].mean(), line_dash="dot", line_color="red", line_width=2,
              annotation_text="World Average")

pio.write_image(fig, str(column)+'.png', scale=4)

In [None]:
column1 = 'Prevalence of Undernourishment (% of Population)'
fig = px.scatter(post_2000, x=column1, y="Migration Rate (%)",
                 color="date", title=column1, range_y=[0, 4])
# Series.mean() skips NaN by itself. The previous astype(int) cast truncated
# every observation before averaging and so shifted the marker line.
fig.add_vline(x=post_2000[column1].mean(), line_dash="dot", line_color="red", line_width=2,
              annotation_text="World Average")
pio.write_image(fig, str(column1)+'.png', scale=4)

In [None]:
# Bind the new figure to `fig`: without the assignment, write_image exported
# whichever figure an earlier cell had left in `fig`.
fig = px.scatter(post_2000, x='Food Production Index', y="Migration Rate (%)", color="date",
                 title='Food Production Index', range_y=[0, 4], range_x=[0, 300])
pio.write_image(fig, 'FoodProdInd.png', scale=4)

In [None]:
# Migration rate against the legal-rights index, coloured by year, saved as PNG.
column3 = 'Strength of Legal Rights Index (0-12 Scale)'

fig = px.scatter(
    post_2000,
    x=column3,
    y="Migration Rate (%)",
    color="date",
    title=column3,
    range_y=[0, 4],
)
pio.write_image(fig, str(column3) + '.png', scale=4)

In [None]:
column4 = 'Hospital beds (per 1,000 people)'

fig = px.scatter(post_2000, x=column4, y="Migration Rate (%)", color="date",
                 title=column4, range_y=[0, 4.5])

# Series.mean() skips NaN by itself. The previous astype(int) cast truncated
# every observation before averaging and so shifted the marker line.
fig.add_vline(x=post_2000[column4].mean(), line_dash="dot", line_color="red", line_width=2,
              annotation_text="World Average")
pio.write_image(fig, str(column4)+'.png', scale=4)

In [None]:
column5 = 'Political Stability and Absence of Violence/Terrorism (estimate)'

fig = px.scatter(post_2000, x=column5, y="Migration Rate (%)", color="date",
                 title='Political Stability vs. Migration Rates', range_y=[0, 4.5])

# Series.mean() skips NaN by itself. The previous astype(int) cast truncated
# every observation toward zero before averaging, which distorts the mean of
# an estimate that takes fractional and negative values.
fig.add_vline(x=post_2000[column5].mean(), line_dash="dot", line_color="red", line_width=3,
              annotation_text="World Average")

# The column label contains '/', so a fixed file name is used here.
pio.write_image(fig, 'pol_stab.png', scale=4)

In [None]:
column6 = 'Land Surface Temperature'

# Migration rate against land surface temperature, coloured by year.
# The axis label previously misspelt the unit as "Celcius".
fig = px.scatter(post_2000, x=column6, y="Migration Rate (%)", color="date",
                 title='Migration Rate vs. Land Surface Temperature', range_y=[0, 4.5], labels={
                     "Land Surface Temperature": "Land Surface Temperature [Degrees Celsius]"})
pio.write_image(fig, str(column6)+'.png', scale=4)

In [None]:
import plotly.io as pio

# Migration rate against the income share of the lowest 20%, coloured by year.
# No average marker is drawn on this chart.
column7 = 'Income Share Held by Lowest 20%'

fig = px.scatter(
    post_2000,
    x=column7,
    y="Migration Rate (%)",
    color="date",
    title='Migration Rate vs. Income Share Held by Lowest 20%',
    range_y=[0, 4.5],
    labels={
        "Income Share Held by Lowest 20%": "Income Share Held by Lowest 20 Percent [% of GDP]",
    },
)

pio.write_image(fig, str(column7) + '.png', scale=4)

In [None]:
column8 = 'PM2.5 air pollution, mean annual exposure (micrograms per cubic meter)'

fig = px.scatter(post_2000, x=column8, y="Migration Rate (%)", color="date",
                 title='Migration Rate vs. PM2.5 air pollution', range_y=[0, 4.5])

# Series.mean() skips NaN by itself. The previous astype(int) cast truncated
# every observation before averaging and so shifted the marker line.
fig.add_vline(x=post_2000[column8].mean(), line_dash="solid", line_color="red", line_width=2,
              annotation_text="World Average")

pio.write_image(fig, str(column8)+'.png', scale=4)

In [None]:
fig_dgdays = px.scatter(post_2000, x='Cooling Degree Days', y="Migration Rate (%)", color="date",
                 title='Migration Rate vs. Cooling Degree Days', range_y=[0, 4.5])

# Compute the marker position from the data instead of pasting in a number
# from an earlier run, so the line and the printed value always agree.
# Series.mean() skips NaN by itself; no integer cast is applied.
cooling_mean = post_2000['Cooling Degree Days'].mean()
fig_dgdays.add_vline(x=cooling_mean, line_dash="solid", line_color="red", line_width=2,
                     annotation_text="World Average")
print('World Average = ' + str(cooling_mean))
pio.write_image(fig_dgdays, 'coolingdegdays.png', scale=4)

In [None]:
# Food production index against migration rate for the post-2000 rows.
# Hover text must come from the frame being plotted so that every marker gets
# the country of its own row. reset_index() exposes 'country' as a column
# when it is still the index and leaves an existing 'country' column alone.
post_2000.iplot(kind='scatter', mode='markers', symbol='circle-dot',
         x='Food Production Index', y="Migration Rate (%)",
         text=post_2000.reset_index()['country'].values.tolist(),
         yTitle="Migration Rate (%)", xTitle='Food Production Index',
         title='Migration Rate (%) versus Food Production Index', xrange=[0, 350], yrange=[0, 4])

#### Indicators that show something:
It could be good to do a histogram or two.

- Food Production Index:
    - centered on the lower end, very much clustered 
- Prevalence of Undernourishment
- For the water access ones: 
    - should grab a dataframe with maybe upper 50th percentile of migration and then do these plots 
- Hospital beds per 1000 people
- Strength of legal rights index 
- Political stability and absence of violence/terrorism (skewed to negative side) 
- access to electricity 
- control of corruption

- LAND SURFACE TEMPERATURE!!!!
- PM2.5 AIR POLLUTION!!!!
- COOLING DEGREE DAYS!!!!
- heat index
- 

To do with a dataframe restricted to (maybe) the upper 50th percentile of migration:
- the water access ones
- Strength of legal rights index
- access to electricity 

In [None]:
# Post-2000 rows whose percentile rank is strictly above 50.
upper50_2000 = post_2000.loc[post_2000['Percentile Rank'] > 50]

In [None]:
column_hos = 'Hospital beds (per 1,000 people)'

fig = px.scatter(upper50_2000, x=column_hos, y="Migration Rate (%)", color="date",
                 title=column_hos, range_y=[0, 4.5])

# Series.mean() skips NaN by itself. The previous astype(int) cast truncated
# every observation before averaging and so shifted the marker line.
# NOTE(review): the mean is taken over upper50_2000 only, yet the marker is
# labelled "World Average" — confirm which of the two is intended.
fig.add_vline(x=upper50_2000[column_hos].mean(), line_dash="solid", line_color="red", line_width=2,
              annotation_text="World Average")

In [None]:
# Labels of the water-related indicator columns, kept as a quick reference.
# Wrapped in parentheses: the indented continuation lines used to make this
# cell fail with an IndentationError.
(
    'Annual freshwater withdrawals, total (% of internal resources)',
    'People using safely managed drinking water services (% of population)',
    'People using safely managed sanitation services (% of population)',
)

In [None]:
# Column labels of the three water-related indicators.
column_water, column_drink, column_sanit = (
    'Annual freshwater withdrawals, total (% of internal resources)',
    'People using safely managed drinking water services (% of population)',
    'People using safely managed sanitation services (% of population)',
)

In [None]:
# Drinking-water access against migration rate for the upper-50 subset.
fig2 = px.scatter(
    upper50_2000,
    x=column_drink,
    y="Migration Rate (%)",
    color="date",
    size_max=5,
    title='Migration Rate vs. ' + column_drink,
    range_y=[0, 4.5],
)

# Sanitation access against migration rate for the same subset.
fig3 = px.scatter(
    upper50_2000,
    x=column_sanit,
    y="Migration Rate (%)",
    color="date",
    title='Migration Rate vs. ' + column_sanit,
    range_y=[0, 4.5],
)

In [None]:
# Display the drinking-water scatter built in the previous cell.
fig2

In [None]:
# Display the sanitation scatter built two cells earlier.
fig3

In [None]:
def hist_all_2000(frame):
    """Draw a 40-bin histogram for every column of ``frame``.

    Each column is plotted on its own, with the x range pinned to that
    column's minimum and maximum.
    """
    for name in frame.columns:
        series = frame[name]
        label = str(name)
        series.iplot(
            kind='histogram',
            y='Migration Rate(%)',
            x=label,
            bins=40,
            yTitle="Migration Rate (%)",
            xTitle=label,
            xrange=(series.min(), series.max()),
            title=label + ' versus Migration Rate (%)',
        )
        
        
        
#df['values'].iplot(kind='histogram', bins=5, title='Histogram of Values')

In [None]:
# Summary statistics of the migration-rate column.
# NOTE(review): post_2000_avgd is not defined anywhere in the visible cells;
# presumably it is a per-country average of post_2000 — confirm and add the
# cell that builds it.
post_2000_avgd['Migration Rate (%)'].describe()

In [None]:
# Histogram of every column of post_2000_avgd.
# NOTE(review): post_2000_avgd is not defined anywhere in the visible cells.
hist_all_2000(post_2000_avgd)


In [None]:
# Histogram of every column of upper50.
# NOTE(review): upper50 is not defined anywhere in the visible cells (only
# upper50_2000 is) — confirm which frame is meant.
hist_all_2000(upper50)