# Etapa 2: Extração e transformação para obter a massa de dados com os indicadores relevantes, no período de 2010 a 2016, a partir do dataset "EdStatsData.csv".

## 1. Importando libs

In [1]:
!pip3 install pandas
!pip3 install numpy



In [2]:
import pandas as pd
import numpy as np
import zipfile

## 2. Carregando o dataframe com os dados de todos os países e de toda a faixa temporal.

In [3]:
# Load 'EdStatsData.csv' straight from the zip archive into `df_todos_paises`.
caminho_zip = '../data/Edstats_csv.zip'
nome_arquivo_dados = 'EdStatsData.csv'

# Stays empty when the member is missing, which triggers the error below.
df_todos_paises = pd.DataFrame()

# The context managers close the archive and the member stream even if
# `read_csv` raises.
with zipfile.ZipFile(caminho_zip) as zf:
    if nome_arquivo_dados in zf.namelist():
        print("Abrindo o arquivo", nome_arquivo_dados)
        with zf.open(nome_arquivo_dados) as arquivo_csv:
            df_todos_paises = pd.read_csv(arquivo_csv)

if df_todos_paises.empty:
    # Raise instead of calling `exit(...)`: `exit` is an interactive-shell
    # helper, so in a notebook it does not surface the message as a cell error.
    raise RuntimeError("Erro ao ler o arquivo 'EdStatsData.csv' que fica dentro do 'Edstats_csv.zip' na pasta 'data'.")

print("Arquivo 'EdStatsData.csv' carregado com sucesso.")

Abrindo o arquivo EdStatsData.csv
Arquivo 'EdStatsData.csv' carregado com sucesso.


## 3. Organizando os nomes dos países definidos em "lista_suja_paises.csv".

In [4]:
# Raw country list; the input file is ';'-separated.
df_lista_suja_paises = pd.read_csv('../data/lista_suja_paises.csv', sep=';')

# Remove repeated rows and renumber the remaining ones from 0.
df_lista_paises = (
    df_lista_suja_paises
    .drop_duplicates()
    .reset_index(drop=True)
)

# Persist the de-duplicated list (without the index column).
df_lista_paises.to_csv('../data/lista_paises_aids.csv', index=False)

df_lista_paises.head()

Unnamed: 0,Country Name
0,Afghanistan
1,Albania
2,Algeria
3,American Samoa
4,Andorra


## 4. Extraindo e transformando os dados dos países definidos para gerar "massa_bruta_pais_por_indicadores_aids.csv".

### Retira a última coluna do dataset "EdStatsData.csv" para manter apenas os dados úteis.

In [5]:
# Drop the trailing junk column of the dataset, keeping only useful columns.
df_todos_paises_lista_colunas = df_todos_paises.columns.tolist()

ultima_coluna_lixo_massa = df_todos_paises_lista_colunas[-1]

# Only drop when the last label is not a year. Because this cell rebinds
# `df_todos_paises`, an unconditional drop would remove a real year column
# (e.g. '2100') every time the cell is re-run.
# NOTE(review): assumes the junk column's label is not purely numeric (e.g. the
# unnamed column pandas creates for a trailing delimiter) — confirm against the CSV.
if not str(ultima_coluna_lixo_massa).strip().isdigit():
    df_todos_paises = df_todos_paises.drop(columns=[ultima_coluna_lixo_massa])

df_todos_paises.head()

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1970,1971,1972,1973,1974,1975,...,2055,2060,2065,2070,2075,2080,2085,2090,2095,2100
0,Arab World,ARB,"Adjusted net enrolment rate, lower secondary, ...",UIS.NERA.2,,,,,,,...,,,,,,,,,,
1,Arab World,ARB,"Adjusted net enrolment rate, lower secondary, ...",UIS.NERA.2.F,,,,,,,...,,,,,,,,,,
2,Arab World,ARB,"Adjusted net enrolment rate, lower secondary, ...",UIS.NERA.2.GPI,,,,,,,...,,,,,,,,,,
3,Arab World,ARB,"Adjusted net enrolment rate, lower secondary, ...",UIS.NERA.2.M,,,,,,,...,,,,,,,,,,
4,Arab World,ARB,"Adjusted net enrolment rate, primary, both sex...",SE.PRM.TENR,54.822121,54.894138,56.209438,57.267109,57.991138,59.36554,...,,,,,,,,,,


### Seleciona apenas os registros dos países presentes em "lista_paises_aids.csv".

In [6]:
# True for every row whose country name appears in the curated country list.
pais_na_lista = df_todos_paises['Country Name'].isin(df_lista_paises['Country Name'])

# Keep the matching rows and renumber them from 0.
df_paises_filtrados = df_todos_paises[pais_na_lista].reset_index(drop=True)

df_paises_filtrados.head()

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1970,1971,1972,1973,1974,1975,...,2055,2060,2065,2070,2075,2080,2085,2090,2095,2100
0,Afghanistan,AFG,"Adjusted net enrolment rate, lower secondary, ...",UIS.NERA.2,,,,,7.05911,,...,,,,,,,,,,
1,Afghanistan,AFG,"Adjusted net enrolment rate, lower secondary, ...",UIS.NERA.2.F,,,,,2.53138,,...,,,,,,,,,,
2,Afghanistan,AFG,"Adjusted net enrolment rate, lower secondary, ...",UIS.NERA.2.GPI,,,,,0.22154,,...,,,,,,,,,,
3,Afghanistan,AFG,"Adjusted net enrolment rate, lower secondary, ...",UIS.NERA.2.M,,,,,11.42652,,...,,,,,,,,,,
4,Afghanistan,AFG,"Adjusted net enrolment rate, primary, both sex...",SE.PRM.TENR,,,,,,,...,,,,,,,,,,


### Escolhe os registros dos indicadores relevantes definidos em "codigos_indicadores_relevantes_aids.csv".

In [7]:
df_lista_indicadores = pd.read_csv('../data/codigos_indicadores_relevantes_aids.csv')

# True for every row whose indicator code is listed in the file loaded above.
codigos_relevantes = df_lista_indicadores['Indicator Code']
indicador_relevante = df_paises_filtrados['Indicator Code'].isin(codigos_relevantes)

# Keep the matching rows and renumber them from 0.
df_paises_filtrados = df_paises_filtrados[indicador_relevante].reset_index(drop=True)

df_paises_filtrados.head()

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1970,1971,1972,1973,1974,1975,...,2055,2060,2065,2070,2075,2080,2085,2090,2095,2100
0,Afghanistan,AFG,"Adjusted net enrolment rate, primary, both sex...",SE.PRM.TENR,,,,,,,...,,,,,,,,,,
1,Afghanistan,AFG,GDP at market prices (current US$),NY.GDP.MKTP.CD,1748887000.0,1831109000.0,1595555000.0,1733333000.0,2155555000.0,2366667000.0,...,,,,,,,,,,
2,Afghanistan,AFG,Government expenditure on education as % of GD...,SE.XPD.TOTL.GD.ZS,,1.16036,1.11718,1.42788,,1.30332,...,,,,,,,,,,
3,Afghanistan,AFG,"Illiterate population, 25-64 years, both sexes...",UIS.ILLPOP.AG25T64,,,,,,,...,,,,,,,,,,
4,Afghanistan,AFG,Internet users (per 100 people),IT.NET.USER.P2,,,,,,,...,,,,,,,,,,


### Recupera os anos a serem utilizados para selecionar a faixa temporal da massa final.

In [8]:
df_lista_anos = pd.read_csv('../data/lista_anos_aids.csv')

# DataFrame.min()/max() return one value per column, i.e. a Series; the
# bounds are kept in that form and printed as-is.
menor_ano, maior_ano = df_lista_anos.min(), df_lista_anos.max()

for limite in (menor_ano, maior_ano):
    print(limite)

Year    2010
dtype: int64
Year    2016
dtype: int64


### Aplica as restrições selecionadas anteriormente no dataset original e gera um conjunto de dados derivado para ser exportado em "massa_bruta_pais_por_indicadores_aids.csv".

In [9]:
# Restrict the filtered data to the selected year range, drop the descriptive
# columns, fill gaps, round, and export the result to CSV.
# The cell is written to be safe to re-run: nothing is deleted in place and
# already-removed columns are ignored.

# `menor_ano` / `maior_ano` are one-element Series labelled 'Year'; take the
# value by position with `.iloc` (plain `[0]` on a label-indexed Series is a
# deprecated positional lookup).
primeiro_ano = int(menor_ano.iloc[0])
ultimo_ano = int(maior_ano.iloc[0])

# Interpret every column label as a number; non-numeric labels
# ('Country Name', 'Indicator Code', ...) become NaN and are never treated as
# years. Deciding by label (not by float dtype) also catches year columns that
# happen to have another dtype.
anos_das_colunas = pd.to_numeric(df_paises_filtrados.columns.to_series(), errors='coerce')

colunas_fora_da_faixa = anos_das_colunas[
    (anos_das_colunas < primeiro_ano) | (anos_das_colunas > ultimo_ano)
].index.tolist()
colunas_na_faixa = anos_das_colunas[
    anos_das_colunas.between(primeiro_ano, ultimo_ano)
].index.tolist()

df_paises_filtrados = (
    df_paises_filtrados
    # errors='ignore' keeps a re-run from failing on already-dropped columns.
    .drop(columns=colunas_fora_da_faixa + ['Country Code', 'Indicator Name'], errors='ignore')
    .reset_index(drop=True)
    # NOTE(review): missing values become 0, so "no data" and a true zero are
    # indistinguishable in the exported file — confirm this is intended.
    .fillna(0)
    # Round every retained year column (derived from the bounds instead of a
    # hard-coded 2010-2016 list) to two decimals.
    .round({coluna: 2 for coluna in colunas_na_faixa})
)

df_paises_filtrados.to_csv('../data/massa_bruta_pais_por_indicadores_aids.csv', index=False)

df_paises_filtrados.head()

Unnamed: 0,Country Name,Indicator Code,2010,2011,2012,2013,2014,2015,2016
0,Afghanistan,SE.PRM.TENR,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Afghanistan,NY.GDP.MKTP.CD,15936800000.0,17930240000.0,20536540000.0,20046330000.0,20050190000.0,19215560000.0,19469020000.0
2,Afghanistan,SE.XPD.TOTL.GD.ZS,3.46,3.44,2.53,3.48,3.78,3.32,0.0
3,Afghanistan,UIS.ILLPOP.AG25T64,0.0,6844552.0,0.0,0.0,0.0,0.0,0.0
4,Afghanistan,IT.NET.USER.P2,4.0,5.0,5.45,5.9,7.0,8.26,10.6
