# Libraries

In [93]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import re

# Data

In [140]:
# Load the raw inequality dataset; the relative path means the CSV must sit in
# the notebook's working directory.
df = pd.read_csv("inequality data.csv")

## Tidying Up

Drop the unnecessary index column and normalise the column names so the data is easier to work with.

In [141]:
# Drop the unnecessary 'Unnamed: 0' column.
# errors='ignore' keeps this cell idempotent: re-running it after the column is
# already gone no longer raises a KeyError. Reassigning instead of using
# inplace=True keeps the lineage of `df` explicit.
df = df.drop(columns='Unnamed: 0', errors='ignore')

In [142]:
# Inspect the raw column names before renaming them
df.columns

Index(['Country Name', 'Country Code', 'Year', 'Continent',
       'World Regions (UN SDG Definition)',
       'Regime Type (RoW Measure Definition)',
       'Population, total - SP.POP.TOTL',
       'Access to electricity (% of population) - EG.ELC.ACCS.ZS',
       'GDP (current US$) - NY.GDP.MKTP.CD',
       'GDP per capita (current US$) - NY.GDP.PCAP.CD',
       'Renewable energy consumption (% of total final energy consumption) - EG.FEC.RNEW.ZS',
       'Renewable electricity output (% of total electricity output) - EG.ELC.RNEW.ZS',
       'Women Business and the Law Index Score (scale 1-100) - SG.LAW.INDX',
       'Proportion of seats held by women in national parliaments (%) - SG.GEN.PARL.ZS',
       'Income Classification (World Bank Definition)',
       'Individuals using the Internet (% of population) - IT.NET.USER.ZS',
       'power_soec', 'power_f', 'suffrage_f', 'dmr_f',
       'Gini index (World Bank estimate) - SI.POV.GINI'],
      dtype='object')

In [144]:
# Strip the World Bank indicator codes appended to the column names
# (e.g. "EG.ELC.ACCS.ZS", "SI.POV.GINI"). The four-part pattern runs first so
# the three-part pattern cannot leave a dangling ".ZS"-style suffix behind.
indicator_code_patterns = (
    r'[A-Z]{2}\.[A-Z]{3}\.[A-Z]{4}\.[A-Z]+',
    r'[A-Z]{2}\.[A-Z]{3}\.[A-Z]{4}',
)
cleaned_columns = df.columns
for code_pattern in indicator_code_patterns:
    cleaned_columns = cleaned_columns.str.replace(code_pattern, '', regex=True)

# Everything below is a plain-text substitution. regex=False is passed
# explicitly because characters such as '(', ')', '$' and '.' are regex
# metacharacters; relying on the default triggers a FutureWarning on older
# pandas and makes the result depend on the installed version.
for unwanted_char in ('(', ')', '%', '$'):
    cleaned_columns = cleaned_columns.str.replace(unwanted_char, '', regex=False)

cleaned_columns = cleaned_columns.str.lower()

# Order matters: remove the " - " separator, collapse double spaces, then turn
# the remaining spaces and dots into underscores, and finally drop commas.
for old_text, new_text in ((' - ', ''), ('  ', '_'), (' ', '_'), ('.', '_'), (',', '')):
    cleaned_columns = cleaned_columns.str.replace(old_text, new_text, regex=False)

df.columns = cleaned_columns

  df.columns = df.columns.str.replace('(','').str.replace(')','').str.replace('%', '').str.replace('$', '').str.lower().str.replace(' - ', '').str.replace('  ', '_').str.replace(' ', '_').str.replace('.','_').str.replace(',','')


In [145]:
# Check columns: confirm the renaming above took effect
df.columns

Index(['country_name', 'country_code', 'year', 'continent',
       'world_regions_un_sdg_definition', 'regime_type_row_measure_definition',
       'population_total', 'access_to_electricity_of_population',
       'gdp_current_us', 'gdp_per_capita_current_us',
       'renewable_energy_consumption_of_total_final_energy_consumption',
       'renewable_electricity_output_of_total_electricity_output',
       'women_business_and_the_law_index_score_scale_1-100',
       'proportion_of_seats_held_by_women_in_national_parliaments_',
       'income_classification_world_bank_definition',
       'individuals_using_the_internet_of_population', 'power_soec', 'power_f',
       'suffrage_f', 'dmr_f', 'gini_index_world_bank_estimate'],
      dtype='object')

## Actual Cleaning

Mainly filling in null values.

In [146]:
# Count the missing values in every column of df
df.isna().sum()

country_name                                                         0
country_code                                                         0
year                                                                 0
continent                                                            2
world_regions_un_sdg_definition                                      2
regime_type_row_measure_definition                                 211
population_total                                                   181
access_to_electricity_of_population                                 94
gdp_current_us                                                      41
gdp_per_capita_current_us                                           41
renewable_energy_consumption_of_total_final_energy_consumption       0
renewable_electricity_output_of_total_electricity_output           516
women_business_and_the_law_index_score_scale_1-100                  76
proportion_of_seats_held_by_women_in_national_parliaments_         225
income

In [147]:
# Show the rows whose continent is missing
df.loc[df['continent'].isna()]

Unnamed: 0,country_name,country_code,year,continent,world_regions_un_sdg_definition,regime_type_row_measure_definition,population_total,access_to_electricity_of_population,gdp_current_us,gdp_per_capita_current_us,...,renewable_electricity_output_of_total_electricity_output,women_business_and_the_law_index_score_scale_1-100,proportion_of_seats_held_by_women_in_national_parliaments_,income_classification_world_bank_definition,individuals_using_the_internet_of_population,power_soec,power_f,suffrage_f,dmr_f,gini_index_world_bank_estimate
2945,Timor-Leste,TLS,2000,,,,,,367087900.0,415.085949,...,0.0,40.625,,,,-0.2,-0.009,0.0,1.49,
2946,Timor-Leste,TLS,2001,,,,,25.6,477457500.0,529.7937,...,0.0,40.625,,,,0.314,0.362,0.0,1.49,


In [148]:
# List the distinct continent labels (including NaN, if any remain)
df['continent'].unique()

array(['North America', 'Africa', 'Europe', 'Asia', 'South America',
       'Oceania', nan], dtype=object)

In [149]:
# Fill nulls in continent. The only rows with a missing continent are the two
# Timor-Leste records shown above, so 'Asia' is the right value.
# Assign the result back to the column instead of calling fillna(inplace=True)
# on `df.continent`: the in-place form is chained assignment, which pandas
# deprecates and which does not modify `df` when Copy-on-Write is enabled.
df['continent'] = df['continent'].fillna('Asia')

In [150]:
# Show the rows whose UN SDG world region is missing
df.loc[df['world_regions_un_sdg_definition'].isna()]

Unnamed: 0,country_name,country_code,year,continent,world_regions_un_sdg_definition,regime_type_row_measure_definition,population_total,access_to_electricity_of_population,gdp_current_us,gdp_per_capita_current_us,...,renewable_electricity_output_of_total_electricity_output,women_business_and_the_law_index_score_scale_1-100,proportion_of_seats_held_by_women_in_national_parliaments_,income_classification_world_bank_definition,individuals_using_the_internet_of_population,power_soec,power_f,suffrage_f,dmr_f,gini_index_world_bank_estimate
2945,Timor-Leste,TLS,2000,Asia,,,,,367087900.0,415.085949,...,0.0,40.625,,,,-0.2,-0.009,0.0,1.49,
2946,Timor-Leste,TLS,2001,Asia,,,,25.6,477457500.0,529.7937,...,0.0,40.625,,,,0.314,0.362,0.0,1.49,


In [151]:
# Look up the region recorded for Timor-Leste in the other years
df.loc[df['country_name'] == 'Timor-Leste', 'world_regions_un_sdg_definition']

2945                               NaN
2946                               NaN
2947    Eastern and South-Eastern Asia
2948    Eastern and South-Eastern Asia
2949    Eastern and South-Eastern Asia
2950    Eastern and South-Eastern Asia
2951    Eastern and South-Eastern Asia
2952    Eastern and South-Eastern Asia
2953    Eastern and South-Eastern Asia
2954    Eastern and South-Eastern Asia
2955    Eastern and South-Eastern Asia
2956    Eastern and South-Eastern Asia
2957    Eastern and South-Eastern Asia
2958    Eastern and South-Eastern Asia
2959    Eastern and South-Eastern Asia
2960    Eastern and South-Eastern Asia
2961    Eastern and South-Eastern Asia
2962    Eastern and South-Eastern Asia
2963    Eastern and South-Eastern Asia
Name: world_regions_un_sdg_definition, dtype: object

In [152]:
# Fill nulls in world region. Only the two Timor-Leste rows are missing, and
# every other Timor-Leste row uses 'Eastern and South-Eastern Asia'.
# Assign back to the column rather than using fillna(inplace=True) on an
# attribute-accessed Series: that is chained assignment, which pandas
# deprecates and which leaves `df` untouched when Copy-on-Write is enabled.
df['world_regions_un_sdg_definition'] = (
    df['world_regions_un_sdg_definition'].fillna('Eastern and South-Eastern Asia')
)

In [153]:
# List the countries that have at least one row with a missing regime type
df.loc[df['regime_type_row_measure_definition'].isna(), 'country_name'].unique()

array(['Aruba', 'Antigua and Barbuda', 'Bahamas, The', 'Belize',
       'Brunei Darussalam', 'Dominica', 'St. Lucia', 'Macao SAR, China',
       'West Bank and Gaza', 'Timor-Leste', 'Tonga',
       'St. Vincent and the Grenadines'], dtype=object)

In [154]:
# List the distinct regime type labels used in this column
df['regime_type_row_measure_definition'].unique()

array([nan, 'Closed Autocracy', 'Electoral Autocracy',
       'Electoral Democracy', 'Liberal Democracy'], dtype=object)

In [155]:
# Get index list for Electoral Democracy countries where regime is null
index = df.index

electoral_democracy_countries = [
    'Antigua and Barbuda',
    'Bahamas, The',
    'Belize',
    'Dominica',
    'St. Lucia',
    'St. Vincent and the Grenadines',
]

# `&` binds more tightly than `|`, so a chain of `(a) | (b) | ... | (f) & (null)`
# applies the null check to the last country only. Using isin() for the country
# test and wrapping the whole expression makes the null check apply to every
# listed country, as the comment above intends.
condition = (
    df['country_name'].isin(electoral_democracy_countries)
    & df['regime_type_row_measure_definition'].isnull()
)

elecdem_regime = index[condition]

In [156]:
# Fill the missing regime values on the selected rows with 'Electoral Democracy'.
# fillna only touches nulls, so any already-populated rows in the selection
# keep their existing regime type.
df.loc[elecdem_regime, 'regime_type_row_measure_definition'] = (
    df.loc[elecdem_regime, 'regime_type_row_measure_definition']
    .fillna('Electoral Democracy')
)

In [165]:
# Show the country code of every row whose total population is missing
df.loc[df['population_total'].isna(), ['country_code', 'population_total']]

Unnamed: 0,country_code,population_total
0,ABW,
19,AGO,
38,ALB,
57,ARE,
76,ARG,
...,...,...
3192,VNM,
3211,VUT,
3230,ZAF,
3249,ZMB,
