In [14]:
#importações necessárias

import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


%matplotlib inline

import statsmodels.api as sm
import matplotlib.pyplot as plt

from mpl_toolkits.mplot3d import Axes3D

# Para ter melhor print
from IPython.display import display

In [15]:
dados = pd.read_csv("pnad23_filtrado.csv")
dados.head(5)

Unnamed: 0,Estado,Sexo,Idade,Cor_ou_Raça,Escolaridade,Ocupação,Faixa_de_rendimento,Rendimento_Bruto
0,Rondônia,Homem,30,Parda,Regular do ensino médio ou do 2º grau,Conta Própria,[3SM]+1 a [5SM],5000.0
1,Rondônia,Mulher,50,Parda,Superior - graduação,Empregado do setor público (inclusive empresas...,[2SM]+1 a [3SM],3500.0
2,Rondônia,Mulher,54,Branca,Superior - graduação,Empregado do setor público (inclusive empresas...,[2SM]+1 a [3SM],3700.0
3,Rondônia,Homem,36,Parda,Regular do ensino fundamental ou do 1º grau,Conta Própria,[2SM]+1 a [3SM],3300.0
4,Rondônia,Homem,33,Branca,Regular do ensino médio ou do 2º grau,Conta Própria,[2SM]+1 a [3SM],3500.0


## O projeto
Com o objetivo de prever o salário mais provável de uma pessoa, baseando-se nas suas características socioeconômicas, este projeto utiliza os microdados da PNAD Contínua.
Para construir um modelo preditivo, foi feito a escolha da nossa Target, sendo esta o rendimento bruto, e a seleção das variáveis explicativas , sendo estas escolaridade, idade, gênero, localização geográfica, ocupação, cor/raça e faixa de rendimento. 
A análise exploratória dos dados foi feita no arquivo Jupyter [analise_exploratória.iypnb](analise_exploratoria.ipynb)

Para atingir o objetivo foi necessário aplicar dois modelos preditivos, cujas precisões e resultados foram comparados a fim de decidir qual o modelo mais adequado para o porblema.




## Separando a base de treino e teste 

In [16]:
dados.isnull().sum()

Estado                 0
Sexo                   0
Idade                  0
Cor_ou_Raça            0
Escolaridade           0
Ocupação               0
Faixa_de_rendimento    0
Rendimento_Bruto       0
dtype: int64

In [17]:
dados.dtypes

Estado                  object
Sexo                    object
Idade                    int64
Cor_ou_Raça             object
Escolaridade            object
Ocupação                object
Faixa_de_rendimento     object
Rendimento_Bruto       float64
dtype: object

In [18]:
x_colunas = dados[['Estado', 'Sexo', 'Idade', 'Cor_ou_Raça', 'Escolaridade', 'Ocupação','Faixa_de_rendimento']]

y_colunas = dados[['Rendimento_Bruto']]


### Para separar as bases em treino e teste é necessário aplicar o get_dummies(), para as variáveis categoricas poderem ser utilizadas no modelo

In [19]:
x_dummies = pd.get_dummies(x_colunas)
x_dummies = x_dummies.astype(int)
x_dummies.dtypes

Idade                                   int32
Estado_Acre                             int32
Estado_Alagoas                          int32
Estado_Amapá                            int32
Estado_Amazonas                         int32
                                        ...  
Faixa_de_rendimento_[1SM]+1 a [2SM]     int32
Faixa_de_rendimento_[20SM]+1 ou mais    int32
Faixa_de_rendimento_[2SM]+1 a [3SM]     int32
Faixa_de_rendimento_[3SM]+1 a [5SM]     int32
Faixa_de_rendimento_[5SM]+1 a [10SM]    int32
Length: 65, dtype: object

In [20]:
x_train, x_test, y_train, y_test = train_test_split(x_dummies ,y_colunas , test_size=0.33, random_state= 42 )

In [21]:
x_train

Unnamed: 0,Idade,Estado_Acre,Estado_Alagoas,Estado_Amapá,Estado_Amazonas,Estado_Bahia,Estado_Ceará,Estado_Distrito Federal,Estado_Espirito Santo,Estado_Goiás,...,"Ocupação_Militar do exército, da marinha, da aeronáutica, da polícia militar ou do corpo de bombeiros militar",Ocupação_Trabalhador doméstico,"Faixa_de_rendimento_1 a [0,5SM]","Faixa_de_rendimento_[0,5SM]+1 a [1SM]",Faixa_de_rendimento_[10SM]+1 a [20SM],Faixa_de_rendimento_[1SM]+1 a [2SM],Faixa_de_rendimento_[20SM]+1 ou mais,Faixa_de_rendimento_[2SM]+1 a [3SM],Faixa_de_rendimento_[3SM]+1 a [5SM],Faixa_de_rendimento_[5SM]+1 a [10SM]
9255,20,0,0,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
83381,45,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
103097,48,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
139202,22,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
103004,53,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119879,31,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
103694,42,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
131932,32,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
146867,51,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


### Aplicando o modelo Random Forest

In [22]:
random_model = RandomForestRegressor(n_estimators=100, random_state = 42 )

### Ensinando o modelo, para que consiga encontrar os padrões

In [23]:
random_model.fit(x_train, y_train)

  return fit_method(estimator, *args, **kwargs)


### Nas celulas abaixo utilizamos as métricas MAE(Erro médio absoluto), MSE(Erro médio quadrático) e $R^2$(Coeficiente de determinação que mede quanto da variância dos modelos reais é explicada pelo modelo)

In [24]:
y_prev_train = random_model.predict(x_train)
mae = mean_absolute_error(y_train, y_prev_train)
mse = mean_squared_error(y_train, y_prev_train)
r2 = r2_score(y_train, y_prev_train)
mae, mse, r2

(210.69812439007617, 463390.5781934575, 0.971475393335519)

O MAE indica que o modelo errou, em média, 210.69 unidades nas previsões. Um erro baixo, indicando uma boa performance nos dados de treino.

O valor do MSE é 46390.57. Esse número é maior devido à penalização de erros maiores.

O valor de $R^2$ foi de 0.97, indicando que o modelo está ajustado para a base de treino, mas o bom desempenho não garante que seja bom com os dados de teste.

In [25]:
y_prev = random_model.predict(x_test)
mse = mean_squared_error(y_test, y_prev)
r2 = r2_score(y_test, y_prev)
mae = mean_absolute_error(y_test, y_prev)
mae, mse, r2 

(434.50358136448807, 3465863.87181802, 0.7714871476759115)

O MAE indica que o modelo errou, em média, 434.50 unidades nas previsões. Valor maior do que nos dados de treino. Indicando dificuldade em generalizar.

O valor do MSE é 3465863.87. Mostrando um aumento considerável nos erros grandes.

O valor de $R^2$ foi de 0.77, mostrando que ainda explica boa parte da variância nos dados de teste, mas com desempenho inferior do que com os dados de treino.

O modelo se saiu muito bem com a base de treino, porém seu desempenho caiu quando foi  utilizada a base de teste, sugerindo um pouco de overfitting.
Overfitting é justamente quando um modelo se sai muito bem com os dados de treino, porém seu desempenho cai com os dados de treino.

___
<div id='regressao'></div>

## Modelo de regressão


Nessa etapa será aplicado à base de dados o mesmo modelo de regressão linear visto na <strong>AULA 28</strong>

In [26]:
# Função utilitária para fazer a regressão 
# com constante adicionada

def regress(Y,X):
    '''
    Y: coluna do DataFrame utilizada como variável resposta (TARGET)
    X: coluna(s) do DataFrame utilizadas como variável(is) explicativas (FEATURES)
    '''
    X_cp = sm.add_constant(X)
    model = sm.OLS(Y,X_cp)
    results = model.fit()
    
    return results

A seguir serão removidas as colunas da base de dados `x_train` que possuirem valor-p menor do que o nível de significância ($\alpha$) estabelecido pelo grupo como 5%

In [27]:
regressao = regress(y_train, x_train)
regressao.summary()

0,1,2,3
Dep. Variable:,Rendimento_Bruto,R-squared:,0.808
Model:,OLS,Adj. R-squared:,0.808
Method:,Least Squares,F-statistic:,8389.0
Date:,"Tue, 19 Nov 2024",Prob (F-statistic):,0.0
Time:,15:05:30,Log-Likelihood:,-1044000.0
No. Observations:,117371,AIC:,2088000.0
Df Residuals:,117311,BIC:,2089000.0
Df Model:,59,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4807.0563,60.660,79.247,0.000,4688.165,4925.948
Idade,1.2426,0.469,2.650,0.008,0.323,2.162
Estado_Acre,221.7339,41.000,5.408,0.000,141.375,302.093
Estado_Alagoas,161.2207,28.457,5.665,0.000,105.446,216.995
Estado_Amapá,240.6047,52.879,4.550,0.000,136.964,344.246
Estado_Amazonas,224.6591,31.548,7.121,0.000,162.826,286.493
Estado_Bahia,153.2930,25.625,5.982,0.000,103.069,203.517
Estado_Ceará,146.0079,25.990,5.618,0.000,95.069,196.947
Estado_Distrito Federal,195.7662,34.449,5.683,0.000,128.247,263.285

0,1,2,3
Omnibus:,396870.444,Durbin-Watson:,1.99
Prob(Omnibus):,0.0,Jarque-Bera (JB):,171471164964.83
Skew:,61.373,Prob(JB):,0.0
Kurtosis:,5923.078,Cond. No.,3.17e+16


A seguir serão criados novos dataframes nos quais, a cada uso da função `regress()` será removida a coluna que possuir maior valor na coluna `P>|t|`.<br>
Nesse caso, será a coluna: `Escolaridade_Creche (disponível apenas no questionário anual de educação)`

In [28]:
x_train2 = x_train.drop(columns=["Escolaridade_Creche (disponível apenas no questionário anual de educação)"])

O processo será repetido diversas vezes até que todos os valores da coluna sejam menores do que 5%

In [29]:
regressao2 = regress(y_train, x_train2)
regressao2.summary()

0,1,2,3
Dep. Variable:,Rendimento_Bruto,R-squared:,0.808
Model:,OLS,Adj. R-squared:,0.808
Method:,Least Squares,F-statistic:,8389.0
Date:,"Tue, 19 Nov 2024",Prob (F-statistic):,0.0
Time:,15:05:31,Log-Likelihood:,-1044000.0
No. Observations:,117371,AIC:,2088000.0
Df Residuals:,117311,BIC:,2089000.0
Df Model:,59,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4975.1208,445.698,11.163,0.000,4101.560,5848.682
Idade,1.2426,0.469,2.650,0.008,0.323,2.162
Estado_Acre,227.9585,43.980,5.183,0.000,141.758,314.159
Estado_Alagoas,167.4453,32.848,5.098,0.000,103.064,231.826
Estado_Amapá,246.8293,55.361,4.459,0.000,138.322,355.336
Estado_Amazonas,230.8837,35.557,6.493,0.000,161.192,300.576
Estado_Bahia,159.5176,30.438,5.241,0.000,99.859,219.177
Estado_Ceará,152.2325,30.649,4.967,0.000,92.161,212.304
Estado_Distrito Federal,201.9908,38.145,5.295,0.000,127.227,276.755

0,1,2,3
Omnibus:,396870.444,Durbin-Watson:,1.99
Prob(Omnibus):,0.0,Jarque-Bera (JB):,171471164964.83
Skew:,61.373,Prob(JB):,0.0
Kurtosis:,5923.078,Cond. No.,1.19e+16


In [30]:
x_train3 = x_train2.drop(columns=['Escolaridade_Superior - graduação'])
regressao3 = regress(y_train, x_train3)
regressao3.summary()

0,1,2,3
Dep. Variable:,Rendimento_Bruto,R-squared:,0.808
Model:,OLS,Adj. R-squared:,0.808
Method:,Least Squares,F-statistic:,8533.0
Date:,"Tue, 19 Nov 2024",Prob (F-statistic):,0.0
Time:,15:05:32,Log-Likelihood:,-1044000.0
No. Observations:,117371,AIC:,2088000.0
Df Residuals:,117312,BIC:,2089000.0
Df Model:,58,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4991.5914,54.472,91.636,0.000,4884.827,5098.356
Idade,1.2425,0.469,2.649,0.008,0.323,2.162
Estado_Acre,228.5515,40.995,5.575,0.000,148.202,308.901
Estado_Alagoas,168.0573,28.438,5.910,0.000,112.318,223.796
Estado_Amapá,247.4420,52.859,4.681,0.000,143.838,351.046
Estado_Amazonas,231.4962,31.524,7.344,0.000,169.710,293.282
Estado_Bahia,160.1303,25.607,6.253,0.000,109.942,210.319
Estado_Ceará,152.8382,25.975,5.884,0.000,101.927,203.749
Estado_Distrito Federal,202.6021,34.431,5.884,0.000,135.118,270.087

0,1,2,3
Omnibus:,396870.43,Durbin-Watson:,1.99
Prob(Omnibus):,0.0,Jarque-Bera (JB):,171471114283.209
Skew:,61.373,Prob(JB):,0.0
Kurtosis:,5923.077,Cond. No.,2.04e+17


In [31]:
x_train4 = x_train3.drop(columns=['Escolaridade_Antigo científico, clássico, etc. (médio 2º ciclo)'])
regressao4 = regress(y_train, x_train4)
regressao4.summary()

0,1,2,3
Dep. Variable:,Rendimento_Bruto,R-squared:,0.808
Model:,OLS,Adj. R-squared:,0.808
Method:,Least Squares,F-statistic:,8683.0
Date:,"Tue, 19 Nov 2024",Prob (F-statistic):,0.0
Time:,15:05:32,Log-Likelihood:,-1044000.0
No. Observations:,117371,AIC:,2088000.0
Df Residuals:,117313,BIC:,2089000.0
Df Model:,57,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4991.4857,54.469,91.640,0.000,4884.728,5098.243
Idade,1.2543,0.464,2.702,0.007,0.345,2.164
Estado_Acre,228.5358,40.995,5.575,0.000,148.187,308.884
Estado_Alagoas,168.0793,28.438,5.910,0.000,112.341,223.817
Estado_Amapá,247.3501,52.857,4.680,0.000,143.752,350.948
Estado_Amazonas,231.4609,31.523,7.343,0.000,169.676,293.245
Estado_Bahia,160.1229,25.607,6.253,0.000,109.934,210.311
Estado_Ceará,152.8351,25.975,5.884,0.000,101.924,203.746
Estado_Distrito Federal,202.6480,34.430,5.886,0.000,135.166,270.130

0,1,2,3
Omnibus:,396871.957,Durbin-Watson:,1.99
Prob(Omnibus):,0.0,Jarque-Bera (JB):,171474245912.279
Skew:,61.374,Prob(JB):,0.0
Kurtosis:,5923.131,Cond. No.,1.22e+17


In [32]:
x_train5 = x_train4.drop(columns=['Escolaridade_Pré-escola'])
regressao5 = regress(y_train, x_train5)
regressao5.summary()

0,1,2,3
Dep. Variable:,Rendimento_Bruto,R-squared:,0.808
Model:,OLS,Adj. R-squared:,0.808
Method:,Least Squares,F-statistic:,8838.0
Date:,"Tue, 19 Nov 2024",Prob (F-statistic):,0.0
Time:,15:05:33,Log-Likelihood:,-1044000.0
No. Observations:,117371,AIC:,2088000.0
Df Residuals:,117314,BIC:,2089000.0
Df Model:,56,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4991.4146,54.468,91.639,0.000,4884.657,5098.172
Idade,1.2345,0.464,2.663,0.008,0.326,2.143
Estado_Acre,228.6609,40.994,5.578,0.000,148.313,309.009
Estado_Alagoas,168.1846,28.438,5.914,0.000,112.447,223.922
Estado_Amapá,247.4498,52.856,4.682,0.000,143.852,351.048
Estado_Amazonas,231.5924,31.523,7.347,0.000,169.809,293.376
Estado_Bahia,159.8057,25.604,6.241,0.000,109.622,209.989
Estado_Ceará,152.9491,25.975,5.888,0.000,102.039,203.859
Estado_Distrito Federal,202.7131,34.430,5.888,0.000,135.231,270.195

0,1,2,3
Omnibus:,396868.973,Durbin-Watson:,1.99
Prob(Omnibus):,0.0,Jarque-Bera (JB):,171463951805.49
Skew:,61.373,Prob(JB):,0.0
Kurtosis:,5922.954,Cond. No.,1.25e+17


In [33]:
x_train6 = x_train5.drop(columns=['Escolaridade_Mestrado'])
regressao6 = regress(y_train, x_train6)
regressao6.summary()

0,1,2,3
Dep. Variable:,Rendimento_Bruto,R-squared:,0.808
Model:,OLS,Adj. R-squared:,0.808
Method:,Least Squares,F-statistic:,8999.0
Date:,"Tue, 19 Nov 2024",Prob (F-statistic):,0.0
Time:,15:05:34,Log-Likelihood:,-1044000.0
No. Observations:,117371,AIC:,2088000.0
Df Residuals:,117315,BIC:,2089000.0
Df Model:,55,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4993.3221,54.449,91.706,0.000,4886.602,5100.042
Idade,1.2377,0.464,2.669,0.008,0.329,2.146
Estado_Acre,228.3624,40.994,5.571,0.000,148.015,308.710
Estado_Alagoas,168.4507,28.437,5.924,0.000,112.714,224.187
Estado_Amapá,247.1351,52.856,4.676,0.000,143.538,350.732
Estado_Amazonas,231.8520,31.522,7.355,0.000,170.069,293.635
Estado_Bahia,160.1856,25.603,6.257,0.000,110.005,210.366
Estado_Ceará,153.3313,25.973,5.903,0.000,102.424,204.239
Estado_Distrito Federal,203.5570,34.424,5.913,0.000,136.086,271.028

0,1,2,3
Omnibus:,396852.795,Durbin-Watson:,1.99
Prob(Omnibus):,0.0,Jarque-Bera (JB):,171405715435.619
Skew:,61.365,Prob(JB):,0.0
Kurtosis:,5921.948,Cond. No.,8.53e+16


In [34]:
x_train7 = x_train6.drop(columns=['Cor_ou_Raça_Ignorado'])
regressao7 = regress(y_train, x_train7)
regressao7.summary()

0,1,2,3
Dep. Variable:,Rendimento_Bruto,R-squared:,0.808
Model:,OLS,Adj. R-squared:,0.808
Method:,Least Squares,F-statistic:,8999.0
Date:,"Tue, 19 Nov 2024",Prob (F-statistic):,0.0
Time:,15:05:34,Log-Likelihood:,-1044000.0
No. Observations:,117371,AIC:,2088000.0
Df Residuals:,117315,BIC:,2089000.0
Df Model:,55,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5418.1345,341.650,15.859,0.000,4748.505,6087.764
Idade,1.2377,0.464,2.669,0.008,0.329,2.146
Estado_Acre,244.0962,42.869,5.694,0.000,160.073,328.119
Estado_Alagoas,184.1845,31.072,5.928,0.000,123.284,245.084
Estado_Amapá,262.8689,54.326,4.839,0.000,156.390,369.348
Estado_Amazonas,247.5858,33.937,7.295,0.000,181.070,314.102
Estado_Bahia,175.9194,28.504,6.172,0.000,120.052,231.786
Estado_Ceará,169.0651,28.834,5.863,0.000,112.551,225.579
Estado_Distrito Federal,219.2907,36.432,6.019,0.000,147.884,290.698

0,1,2,3
Omnibus:,396852.795,Durbin-Watson:,1.99
Prob(Omnibus):,0.0,Jarque-Bera (JB):,171405715435.619
Skew:,61.365,Prob(JB):,0.0
Kurtosis:,5921.948,Cond. No.,1.39e+16


In [35]:
x_train8 = x_train7.drop(columns=['Cor_ou_Raça_Amarela'])
regressao8 = regress(y_train, x_train8)
regressao8.summary()

0,1,2,3
Dep. Variable:,Rendimento_Bruto,R-squared:,0.808
Model:,OLS,Adj. R-squared:,0.808
Method:,Least Squares,F-statistic:,9165.0
Date:,"Tue, 19 Nov 2024",Prob (F-statistic):,0.0
Time:,15:05:35,Log-Likelihood:,-1044000.0
No. Observations:,117371,AIC:,2088000.0
Df Residuals:,117316,BIC:,2089000.0
Df Model:,54,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5400.7010,37.002,145.958,0.000,5328.178,5473.224
Idade,1.2377,0.464,2.670,0.008,0.329,2.146
Estado_Acre,243.4459,40.954,5.944,0.000,163.176,323.716
Estado_Alagoas,183.5366,28.392,6.464,0.000,127.888,239.185
Estado_Amapá,262.2196,52.833,4.963,0.000,158.667,365.772
Estado_Amazonas,246.9380,31.503,7.838,0.000,185.192,308.684
Estado_Bahia,175.2719,25.560,6.857,0.000,125.174,225.370
Estado_Ceará,168.4182,25.934,6.494,0.000,117.589,219.248
Estado_Distrito Federal,218.6780,34.421,6.353,0.000,151.214,286.142

0,1,2,3
Omnibus:,396852.814,Durbin-Watson:,1.99
Prob(Omnibus):,0.0,Jarque-Bera (JB):,171405790535.335
Skew:,61.365,Prob(JB):,0.0
Kurtosis:,5921.949,Cond. No.,2.85e+17


In [36]:
x_train9 = x_train8.drop(columns=['Cor_ou_Raça_Indigena'])
regressao9 = regress(y_train, x_train9)
regressao9.summary()

0,1,2,3
Dep. Variable:,Rendimento_Bruto,R-squared:,0.808
Model:,OLS,Adj. R-squared:,0.808
Method:,Least Squares,F-statistic:,9338.0
Date:,"Tue, 19 Nov 2024",Prob (F-statistic):,0.0
Time:,15:05:35,Log-Likelihood:,-1044000.0
No. Observations:,117371,AIC:,2088000.0
Df Residuals:,117317,BIC:,2089000.0
Df Model:,53,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5417.0692,29.912,181.097,0.000,5358.441,5475.697
Idade,1.2364,0.464,2.667,0.008,0.328,2.145
Estado_Acre,243.8527,40.951,5.955,0.000,163.590,324.115
Estado_Alagoas,184.0742,28.383,6.485,0.000,128.444,239.705
Estado_Amapá,262.7496,52.828,4.974,0.000,159.207,366.292
Estado_Amazonas,248.7770,31.408,7.921,0.000,187.218,310.336
Estado_Bahia,175.8651,25.548,6.884,0.000,125.791,225.939
Estado_Ceará,169.0966,25.918,6.524,0.000,118.298,219.895
Estado_Distrito Federal,219.1920,34.414,6.369,0.000,151.741,286.643

0,1,2,3
Omnibus:,396853.084,Durbin-Watson:,1.99
Prob(Omnibus):,0.0,Jarque-Bera (JB):,171406084120.055
Skew:,61.366,Prob(JB):,0.0
Kurtosis:,5921.954,Cond. No.,2.94e+17


In [37]:
x_train10 = x_train9.drop(columns=['Escolaridade_Alfabetização de jovens e adultos'])
regressao9 = regress(y_train, x_train10)
regressao9.summary()

0,1,2,3
Dep. Variable:,Rendimento_Bruto,R-squared:,0.808
Model:,OLS,Adj. R-squared:,0.808
Method:,Least Squares,F-statistic:,9518.0
Date:,"Tue, 19 Nov 2024",Prob (F-statistic):,0.0
Time:,15:05:36,Log-Likelihood:,-1044000.0
No. Observations:,117371,AIC:,2088000.0
Df Residuals:,117318,BIC:,2089000.0
Df Model:,52,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5416.8709,29.912,181.091,0.000,5358.243,5475.499
Idade,1.1925,0.463,2.577,0.010,0.286,2.099
Estado_Acre,243.7344,40.951,5.952,0.000,163.472,323.997
Estado_Alagoas,184.0403,28.383,6.484,0.000,128.410,239.671
Estado_Amapá,263.4539,52.827,4.987,0.000,159.915,366.993
Estado_Amazonas,249.1572,31.407,7.933,0.000,187.599,310.715
Estado_Bahia,175.6123,25.548,6.874,0.000,125.539,225.686
Estado_Ceará,169.0555,25.918,6.523,0.000,118.257,219.854
Estado_Distrito Federal,219.4603,34.414,6.377,0.000,152.010,286.911

0,1,2,3
Omnibus:,396845.991,Durbin-Watson:,1.99
Prob(Omnibus):,0.0,Jarque-Bera (JB):,171381792703.639
Skew:,61.362,Prob(JB):,0.0
Kurtosis:,5921.535,Cond. No.,2.95e+17


In [38]:
x_train11 = x_train10.drop(columns=['Cor_ou_Raça_Parda'])
regressao10 = regress(y_train, x_train11)
regressao10.summary()

0,1,2,3
Dep. Variable:,Rendimento_Bruto,R-squared:,0.808
Model:,OLS,Adj. R-squared:,0.808
Method:,Least Squares,F-statistic:,9704.0
Date:,"Tue, 19 Nov 2024",Prob (F-statistic):,0.0
Time:,15:05:37,Log-Likelihood:,-1044000.0
No. Observations:,117371,AIC:,2088000.0
Df Residuals:,117319,BIC:,2089000.0
Df Model:,51,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5463.8472,15.196,359.567,0.000,5434.064,5493.630
Idade,1.1818,0.463,2.554,0.011,0.275,2.089
Estado_Acre,245.4136,40.941,5.994,0.000,165.170,325.657
Estado_Alagoas,185.9626,28.364,6.556,0.000,130.370,241.555
Estado_Amapá,265.6601,52.813,5.030,0.000,162.147,369.173
Estado_Amazonas,249.4182,31.407,7.941,0.000,187.861,310.976
Estado_Bahia,177.4579,25.528,6.952,0.000,127.424,227.492
Estado_Ceará,171.2651,25.890,6.615,0.000,120.521,222.009
Estado_Distrito Federal,221.5255,34.395,6.441,0.000,154.111,288.940

0,1,2,3
Omnibus:,396847.125,Durbin-Watson:,1.99
Prob(Omnibus):,0.0,Jarque-Bera (JB):,171382653390.804
Skew:,61.363,Prob(JB):,0.0
Kurtosis:,5921.55,Cond. No.,2.78e+17


In [39]:
x_train12 = x_train11.drop(columns=['Cor_ou_Raça_Preta'])
regressao12 = regress(y_train, x_train12)
regressao12.summary()

0,1,2,3
Dep. Variable:,Rendimento_Bruto,R-squared:,0.808
Model:,OLS,Adj. R-squared:,0.808
Method:,Least Squares,F-statistic:,9898.0
Date:,"Tue, 19 Nov 2024",Prob (F-statistic):,0.0
Time:,15:05:37,Log-Likelihood:,-1044000.0
No. Observations:,117371,AIC:,2088000.0
Df Residuals:,117320,BIC:,2089000.0
Df Model:,50,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5465.7180,15.122,361.440,0.000,5436.079,5495.357
Idade,1.1898,0.463,2.572,0.010,0.283,2.097
Estado_Acre,244.2887,40.931,5.968,0.000,164.065,324.513
Estado_Alagoas,185.3557,28.360,6.536,0.000,129.771,240.941
Estado_Amapá,265.4202,52.813,5.026,0.000,161.908,368.933
Estado_Amazonas,247.4172,31.367,7.888,0.000,185.939,308.895
Estado_Bahia,179.8237,25.458,7.064,0.000,129.926,229.721
Estado_Ceará,170.2628,25.878,6.580,0.000,119.543,220.983
Estado_Distrito Federal,222.4260,34.388,6.468,0.000,155.026,289.826

0,1,2,3
Omnibus:,396850.374,Durbin-Watson:,1.99
Prob(Omnibus):,0.0,Jarque-Bera (JB):,171389338795.005
Skew:,61.364,Prob(JB):,0.0
Kurtosis:,5921.665,Cond. No.,2.95e+17


### Removendo a coluna de Raça

Após o processo de remoção das colunas, percebe-se que a coluna `Cor_ou_Raça` possui apenas um único valor (no caso, aqueles de raça branca). Tendo isso em mente, o grupo optou por remover a coluna `Cor_ou_Raça` por completo da base de dados.

In [40]:
x_train13 = x_train12.drop(columns=['Cor_ou_Raça_Branca'])
regressao12 = regress(y_train, x_train13)
regressao12.summary()

0,1,2,3
Dep. Variable:,Rendimento_Bruto,R-squared:,0.808
Model:,OLS,Adj. R-squared:,0.808
Method:,Least Squares,F-statistic:,10100.0
Date:,"Tue, 19 Nov 2024",Prob (F-statistic):,0.0
Time:,15:05:38,Log-Likelihood:,-1044000.0
No. Observations:,117371,AIC:,2088000.0
Df Residuals:,117321,BIC:,2089000.0
Df Model:,49,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5474.8072,14.913,367.106,0.000,5445.577,5504.037
Idade,1.2746,0.462,2.758,0.006,0.369,2.180
Estado_Acre,237.8496,40.895,5.816,0.000,157.697,318.002
Estado_Alagoas,183.3072,28.356,6.465,0.000,127.730,238.884
Estado_Amapá,257.0930,52.766,4.872,0.000,153.673,360.513
Estado_Amazonas,240.5786,31.311,7.683,0.000,179.209,301.949
Estado_Bahia,174.6336,25.419,6.870,0.000,124.813,224.455
Estado_Ceará,168.3028,25.873,6.505,0.000,117.591,219.014
Estado_Distrito Federal,221.5186,34.389,6.442,0.000,154.117,288.920

0,1,2,3
Omnibus:,396848.586,Durbin-Watson:,1.99
Prob(Omnibus):,0.0,Jarque-Bera (JB):,171387673311.465
Skew:,61.364,Prob(JB):,0.0
Kurtosis:,5921.636,Cond. No.,2.92e+17


### Atualizando a base de dados de teste

Após a remoção das colunas da base de dados de treino, será feito o mesmo processo, porém com a base de dados de teste

In [76]:
def maior_valor_p(model):
    valores_p = model.pvalues.drop('const')
    return valores_p.idxmax(), valores_p.max()

removidos = []

# Loop para tirar os valores maiores que 0.05
while True:
    modelo = regress(y_test, x_test)
    coluna, valor = maior_valor_p(modelo)
    removidos.append(coluna)
    if valor < 0.05:
        break
    x_test = x_test.drop(columns=[coluna])

# Modelo final
modelo_final = regress(y_test, x_test)
modelo_final.summary()

0,1,2,3
Dep. Variable:,Rendimento_Bruto,R-squared:,0.849
Model:,OLS,Adj. R-squared:,0.848
Method:,Least Squares,F-statistic:,12450.0
Date:,"Tue, 19 Nov 2024",Prob (F-statistic):,0.0
Time:,15:19:42,Log-Likelihood:,-505410.0
No. Observations:,57810,AIC:,1011000.0
Df Residuals:,57783,BIC:,1011000.0
Df Model:,26,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5450.7973,17.968,303.360,0.000,5415.580,5486.015
Idade,3.3262,0.569,5.845,0.000,2.211,4.441
Estado_Goiás,161.3284,33.259,4.851,0.000,96.141,226.516
Estado_Minas Gerais,72.9930,22.900,3.188,0.001,28.110,117.876
Estado_Rio Grande do Sul,61.3905,26.720,2.298,0.022,9.019,113.762
Estado_Santa Catarina,50.6423,24.297,2.084,0.037,3.021,98.264
Sexo_Homem,2762.3411,11.363,243.094,0.000,2740.069,2784.613
Sexo_Mulher,2688.4562,11.448,234.848,0.000,2666.019,2710.894
"Escolaridade_Antigo científico, clássico, etc. (médio 2º ciclo)",-226.5219,106.776,-2.121,0.034,-435.803,-17.240

0,1,2,3
Omnibus:,173073.728,Durbin-Watson:,1.994
Prob(Omnibus):,0.0,Jarque-Bera (JB):,28511902883.598
Skew:,43.366,Prob(JB):,0.0
Kurtosis:,3442.372,Cond. No.,1.05e+17


In [71]:
print(removidos)

['Cor_ou_Raça_Ignorado', 'Escolaridade_Creche (disponível apenas no questionário anual de educação)', 'Escolaridade_Especialização de nível superior', 'Escolaridade_Doutorado', 'Cor_ou_Raça_Amarela', 'Cor_ou_Raça_Preta', 'Cor_ou_Raça_Indigena', 'Cor_ou_Raça_Parda', 'Escolaridade_Mestrado', 'Cor_ou_Raça_Branca', 'Escolaridade_Superior - graduação', 'Escolaridade_Pré-escola', 'Escolaridade_Alfabetização de jovens e adultos', 'Estado_Roraima', 'Estado_Distrito Federal', 'Estado_Piaui', 'Estado_Pará', 'Estado_Sergipe', 'Estado_Ceará', 'Estado_Rio Grande do Norte', 'Estado_Rondônia', 'Estado_Tocantins', 'Estado_Maranhão', 'Estado_Pernambuco', 'Estado_Bahia', 'Estado_Mato Grosso do Sul', 'Estado_Alagoas', 'Estado_Amazonas', 'Estado_Acre', 'Estado_Espirito Santo', 'Estado_Amapá', 'Estado_São Paulo', 'Estado_Rio de Janeiro', 'Estado_Paraná', 'Estado_Mato Grosso', 'Estado_Paraíba', 'Estado_Santa Catarina']
