# Dataset Preprocessing

This notebook preprocesses the `world_development_indicators` data crawl and creates the CSV (`social_indicators.csv`) and SQLite (`economic_indicators.db`) inputs for the Pandas task.


In [None]:
import pandas as pd
import sqlite3


In [None]:
# Relative path of the raw World Development Indicators export.
RAW_CSV_PATH = 'world_development_indicators/world_development_indicators_data.csv'

# Load the crawl unchanged and preview the first five rows to check its layout.
df = pd.read_csv(filepath_or_buffer=RAW_CSV_PATH)
df.head()


In [None]:
# Reshape the crawl from one row per (country, indicator) pair into one row per
# country with one column per indicator, keeping only the 2020 values.
YEAR_COLUMN = '2020 [YR2020]'

# Raw column / indicator labels -> snake_case names used by the exported tables.
COLUMN_NAMES = {
    'Country Name': 'country_name',
    'GDP (current US$)': 'gdp_us_dollar',
    'Population, total': 'population_total',
    'Urban population': 'population_urban',
    'Government expenditure on education, total (% of GDP)': 'government_expenditure_on_education_perc_of_gdp',
    'Labor force, total': 'labor_force_total',
    'Life expectancy at birth, total (years)': 'life_expectancy_at_birth',
    'Exports of goods and services (% of GDP)': 'exports_perc_of_gdp',
    'Imports of goods and services (% of GDP)': 'imports_perc_of_gdp',
}

# pivot() only reads the columns named in index/columns/values, so the code
# columns do not need to be dropped first (and the loaded frame is not mutated
# in place). A scalar `values` keeps the resulting columns single-level.
df = pd.pivot(df, index='Country Name', columns='Series Name',
              values=YEAR_COLUMN)

# Use plain string labels and discard the leftover 'Series Name' axis name.
df.columns = [str(label) for label in df.columns]

# Turn the country index back into a regular column, then apply the short names.
# Note: rename() silently skips any mapping key that is not present.
df = df.reset_index().rename(columns=COLUMN_NAMES)
df.head()


In [None]:
# Columns that make up the "social" table (exported to CSV further down).
SOCIAL_COLUMNS = [
    'country_name',
    'government_expenditure_on_education_perc_of_gdp',
    'life_expectancy_at_birth',
    'population_total',
    'population_urban',
]

# Columns that make up the "economics" table (exported to SQLite further down).
ECONOMICS_COLUMNS = [
    'country_name',
    'exports_perc_of_gdp',
    'gdp_us_dollar',
    'imports_perc_of_gdp',
    'labor_force_total',
]

# Both tables keep country_name so they can be matched up again later.
# Selecting with a list raises KeyError if any expected column is missing.
df_social = df[SOCIAL_COLUMNS]
df_economics = df[ECONOMICS_COLUMNS]


In [None]:
# Export the social indicators. The frame only carries a positional index
# (it was reset before the split), so it is left out of the file; writing it
# would add an unnamed first column and disagree with the SQLite export,
# which also omits the index.
df_social.to_csv('social_indicators.csv', index=False)


In [None]:
# Export the economic indicators to the 'economic_statistics' table, replacing
# any table of that name left over from an earlier run.
con = sqlite3.connect('economic_indicators.db')
try:
    rows_written = df_economics.to_sql(name='economic_statistics', con=con,
                                       if_exists='replace', index=False)
    # Make sure the write is persisted before the connection goes away.
    con.commit()
finally:
    # Close explicitly so the database file handle is released even if the
    # write fails; a sqlite3 connection is not closed automatically.
    con.close()

# Show what to_sql returned (the row count on recent pandas versions).
rows_written
