In [None]:
# dask package installation and import of the libraries that will be used in the project
!python -m pip install "dask[dataframe]"

import shutil

import dask.dataframe as dd
import numpy as np
import pandas as pd
from sqlalchemy import create_engine, text

In [2]:
# RAIS columns kept when loading the raw files
cols = [
    'CBO Ocupação 2002',   # profession code
    'Faixa Etária',        # age group
    'Município',           # city
    'Vl Remun Média Nom',  # salary
    'Sexo Trabalhador',    # gender
]

In [3]:
# read every file matching RAIS_VINC* into one dask dataframe, loading only the
# selected columns; the two code columns are forced to 'object' so they stay text
text_columns = {'CBO Ocupação 2002': 'object', 'Faixa Etária': 'object'}
df = dd.read_csv(
    'rais_raw_data/RAIS_VINC*',
    sep=";",
    encoding="ISO-8859-1",
    usecols=cols,
    low_memory=False,
    dtype=text_columns,
)

In [4]:
# keep only the rows whose CBO occupation code starts with '2', which reduces the
# dataset; na=False makes a missing code evaluate to False (row dropped) instead
# of yielding a NaN mask entry, which cannot be used for boolean filtering
df = df[df['CBO Ocupação 2002'].str.startswith('2', na=False)]

In [None]:
# materialise the (filtered) dask dataframe as an in-memory pandas dataframe;
# from here on `df` is a pandas object
df = df.compute()

In [None]:
# (rows, columns) of the pandas dataframe obtained from the filtered RAIS data
df.shape

In [None]:
# preview the first rows of the filtered RAIS dataframe
df.head()

In [None]:
# loading project supplementary data into pandas dataframes
# cbo.csv is joined later on 'CBO Ocupação 2002'; the key is read as text so it has
# the same type as the RAIS column (loaded as 'object') and keeps any leading zeros
cbo = pd.read_csv('supplementary_data/cbo.csv', dtype={'CBO Ocupação 2002': str})
# states.csv is joined later on 'Município'
state = pd.read_csv('supplementary_data/states.csv')

In [None]:
# preview the first rows of the profession lookup table loaded from cbo.csv
cbo.head()

In [None]:
# preview the first rows of the table loaded from states.csv
state.head()

In [None]:
# attach the profession names to each record through the CBO code;
# the left join keeps records whose code has no match in cbo
data = df.merge(cbo, how='left', on='CBO Ocupação 2002')

In [None]:
# preview of the dataframe after joining the profession names
data.head()

In [None]:
# discard the records that did not receive a profession name in the join
has_profession = data['Profession'].notna()
data = data.loc[has_profession]

In [None]:
# (rows, columns) after removing the records without a profession name
data.shape

In [None]:
# attach the state information to each record by joining on the municipality
# (left join, so records without a match in `state` are kept)
data = data.merge(state, how='left', on='Município')

In [None]:
# drop the CBO code, the municipality and the profession-name columns
unused_columns = ['CBO Ocupação 2002', 'Município', "Profession"]
data = data.drop(columns=unused_columns)

In [None]:
# preview of the dataframe after dropping the code, municipality and profession columns
data.head()

In [None]:
# the salary column holds text with a decimal comma; replace the comma with a
# decimal point (literal, non-regex replacement) so the values can be parsed
salary_col = 'Vl Remun Média Nom'
data[salary_col] = data[salary_col].str.replace(',', '.', regex=False)

In [None]:
# convert the salary column from text ("object") to a numeric dtype;
# pd.to_numeric raises if a value cannot be parsed
salary_col = "Vl Remun Média Nom"
data[salary_col] = pd.to_numeric(data[salary_col])

In [None]:
# check the column dtypes after converting the salary column to numeric
data.dtypes

In [None]:
# keep only the records with a strictly positive salary
# (zero, negative and missing salaries all fail this comparison and are removed)
positive_salary = data['Vl Remun Média Nom'].gt(0)
data = data[positive_salary]

In [None]:
# (rows, columns) after removing the records without a positive salary
data.shape

In [None]:
# replace the age-group codes with their human-readable labels;
# codes that are not in the mapping become NaN
age_group_labels = {
    '01': '10 to 14 years',
    '02': '15 to 17 years',
    '03': '18 to 24 years',
    '04': '25 to 29 years',
    '05': '30 to 39 years',
    '06': '40 to 49 years',
    '07': '50 to 64 years',
    '08': '65+ years',
}
data['Faixa Etária'] = data['Faixa Etária'].map(age_group_labels)

In [None]:
# replace the numeric worker-sex codes with readable labels;
# codes that are not in the mapping become NaN
gender_labels = {
    1: 'Male',
    2: 'Female',
    -1: 'uninformed',
}
data['Sexo Trabalhador'] = data['Sexo Trabalhador'].map(gender_labels)

In [None]:
# rename the columns to short English names
english_names = {
    'Faixa Etária': "age",
    'Vl Remun Média Nom': 'salary',
    'Sexo Trabalhador': 'gender',
    'State': 'state',
}
data = data.rename(columns=english_names)

In [None]:
# remove the records in the two youngest age groups, which were probably filled
# in incorrectly; a single boolean mask is used instead of dropping by index
# label, so the result does not depend on the index labels being unique
# (records whose age is NaN are kept, as before)
excluded_age_groups = ['15 to 17 years', '10 to 14 years']
data = data[~data['age'].isin(excluded_age_groups)]

In [None]:
# renumber the rows from 0 after the filtering steps, discarding the old index
data = data.reset_index(drop=True)

In [None]:
# preview of the cleaned dataframe after the index reset
data.head()

In [None]:
from sqlalchemy import create_engine

In [None]:
# create the SQLAlchemy engine for the SQLite file database.db and open a
# connection to it; echo=True makes SQLAlchemy log the statements it executes
database_url = 'sqlite:///database.db'
engine = create_engine(database_url, echo=True)
sqlite_connection = engine.connect()

In [None]:
# name of the database table that will receive the dataframe (salarys)
sqlite_table = "salarys"

In [None]:
# write the dataframe to the database table, storing the dataframe index in an
# "id" column; if_exists='fail' raises when the table already exists, so
# re-running this cell against an existing database.db stops here
data.to_sql(
    name=sqlite_table,
    con=sqlite_connection,
    if_exists='fail',
    index=True,
    index_label="id",
)

In [None]:
# check the first 5 rows of the "salarys" table; Engine.execute() with a raw
# string was removed in SQLAlchemy 2.0, so the query is wrapped in text() and
# run on the connection that was used to write the table
sqlite_connection.execute(text("SELECT * FROM salarys limit 5")).fetchall()

In [None]:
# list the columns of the "salarys" table; PRAGMA table_info returns one row per
# table column, with the column name in position 1 of each row.
# Engine.execute() with a raw string was removed in SQLAlchemy 2.0, so the
# statement is wrapped in text() and run on the open connection
records = sqlite_connection.execute(text("PRAGMA table_info(salarys)")).fetchall()
print(records)
for row in records:
    print("Columns: ", row[1])

In [None]:
# count the entries stored in the "salarys" table; Engine.execute() with a raw
# string was removed in SQLAlchemy 2.0, so the query is wrapped in text() and
# run on the open connection
sqlite_connection.execute(text("SELECT count(id) FROM salarys")).fetchall()