In [2]:
#importações necessárias

import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


%matplotlib inline

import statsmodels.api as sm
import matplotlib.pyplot as plt

from mpl_toolkits.mplot3d import Axes3D

# Para ter melhor print
from IPython.display import display

In [3]:
# Load the filtered PNAD 2023 extract and preview its first five rows.
dados = pd.read_csv(filepath_or_buffer="pnad23_filtrado.csv")
dados.head()

Unnamed: 0,Estado,Sexo,Idade,Cor_ou_Raça,Escolaridade,Ocupação,Faixa_de_rendimento,Rendimento_Bruto
0,Rondônia,Homem,30,Parda,Regular do ensino médio ou do 2º grau,Conta Própria,[3SM]+1 a [5SM],5000.0
1,Rondônia,Mulher,50,Parda,Superior - graduação,Empregado do setor público (inclusive empresas...,[2SM]+1 a [3SM],3500.0
2,Rondônia,Mulher,54,Branca,Superior - graduação,Empregado do setor público (inclusive empresas...,[2SM]+1 a [3SM],3700.0
3,Rondônia,Homem,36,Parda,Regular do ensino fundamental ou do 1º grau,Conta Própria,[2SM]+1 a [3SM],3300.0
4,Rondônia,Homem,33,Branca,Regular do ensino médio ou do 2º grau,Conta Própria,[2SM]+1 a [3SM],3500.0


## O projeto
Com o objetivo de prever o salário mais provável de uma pessoa, baseando-se nas suas características socioeconômicas, este projeto utiliza os microdados da PNAD Contínua.
Para construir um modelo preditivo, foi feita a escolha da nossa *target*, sendo esta o rendimento bruto, e a seleção das variáveis explicativas, sendo estas escolaridade, idade, gênero, localização geográfica, ocupação, cor/raça e faixa de rendimento.
A análise exploratória dos dados foi feita no arquivo Jupyter [analise_exploratoria.ipynb](analise_exploratoria.ipynb)

Para atingir o objetivo foi necessário aplicar dois modelos preditivos, cujas precisões e resultados foram comparados a fim de decidir qual o modelo mais adequado para o problema.




## Separando a base de treino e teste 

In [4]:
# Count missing values per column (`isna` is the canonical spelling of `isnull`).
dados.isna().sum()

Estado                 0
Sexo                   0
Idade                  0
Cor_ou_Raça            0
Escolaridade           0
Ocupação               0
Faixa_de_rendimento    0
Rendimento_Bruto       0
dtype: int64

In [5]:
# Inspect each column's dtype to see which features are non-numeric (object) before encoding.
dados.dtypes

Estado                  object
Sexo                    object
Idade                    int64
Cor_ou_Raça             object
Escolaridade            object
Ocupação                object
Faixa_de_rendimento     object
Rendimento_Bruto       float64
dtype: object

In [6]:
# Explanatory variables (features) used by both models.
# NOTE(review): `Faixa_de_rendimento` looks like a binned version of the target
# `Rendimento_Bruto` — confirm that using it as a feature is not target leakage.
x_colunas = dados.loc[:, [
    'Estado',
    'Sexo',
    'Idade',
    'Cor_ou_Raça',
    'Escolaridade',
    'Ocupação',
    'Faixa_de_rendimento',
]]

# Target, kept as a single-column DataFrame.
y_colunas = dados.loc[:, ['Rendimento_Bruto']]


### Para separar as bases em treino e teste é necessário aplicar o `get_dummies()`, para que as variáveis categóricas possam ser utilizadas no modelo

In [7]:
# One-hot encode the categorical features and cast every column to integer,
# so the indicator columns are plain 0/1 values; then list the resulting dtypes.
x_dummies = pd.get_dummies(x_colunas).astype(int)
x_dummies.dtypes

Idade                                   int32
Estado_Acre                             int32
Estado_Alagoas                          int32
Estado_Amapá                            int32
Estado_Amazonas                         int32
                                        ...  
Faixa_de_rendimento_[1SM]+1 a [2SM]     int32
Faixa_de_rendimento_[20SM]+1 ou mais    int32
Faixa_de_rendimento_[2SM]+1 a [3SM]     int32
Faixa_de_rendimento_[3SM]+1 a [5SM]     int32
Faixa_de_rendimento_[5SM]+1 a [10SM]    int32
Length: 65, dtype: object

In [13]:
# Hold out 33% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x_dummies,
    y_colunas,
    test_size=0.33,
    random_state=42,
)

In [15]:
# Display the training feature matrix (pandas truncates the rendering of large frames).
x_train

Unnamed: 0,Idade,Estado_Acre,Estado_Alagoas,Estado_Amapá,Estado_Amazonas,Estado_Bahia,Estado_Ceará,Estado_Distrito Federal,Estado_Espirito Santo,Estado_Goiás,...,"Ocupação_Militar do exército, da marinha, da aeronáutica, da polícia militar ou do corpo de bombeiros militar",Ocupação_Trabalhador doméstico,"Faixa_de_rendimento_1 a [0,5SM]","Faixa_de_rendimento_[0,5SM]+1 a [1SM]",Faixa_de_rendimento_[10SM]+1 a [20SM],Faixa_de_rendimento_[1SM]+1 a [2SM],Faixa_de_rendimento_[20SM]+1 ou mais,Faixa_de_rendimento_[2SM]+1 a [3SM],Faixa_de_rendimento_[3SM]+1 a [5SM],Faixa_de_rendimento_[5SM]+1 a [10SM]
9255,20,0,0,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
83381,45,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
103097,48,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
139202,22,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
103004,53,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119879,31,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
103694,42,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
131932,32,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
146867,51,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


### Aplicando o modelo Random Forest

In [10]:
# Random forest with 100 trees and a fixed seed for reproducibility.
# n_jobs=-1 builds the trees in parallel on all available cores; with a fixed
# random_state the fitted forest is the same, it just trains faster.
random_model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)

### Ensinando o modelo, para que consiga encontrar os padrões

In [11]:
# Train the forest on the training split.
# scikit-learn expects a 1-D target of shape (n_samples,); `y_train` is a single-column
# DataFrame, which triggers a DataConversionWarning, so flatten it before fitting.
random_model.fit(x_train, y_train.to_numpy().ravel())

  return fit_method(estimator, *args, **kwargs)


KeyboardInterrupt: 

### Nas células abaixo utilizamos as métricas MAE (erro médio absoluto), MSE (erro médio quadrático) e $R^2$ (coeficiente de determinação, que mede quanto da variância dos valores reais é explicada pelo modelo)

In [None]:
# Score the forest on the data it was trained on (MAE, MSE, R^2, in that order).
y_prev_train = random_model.predict(x_train)
mae, mse, r2 = (
    metric(y_train, y_prev_train)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)
mae, mse, r2

(210.69812439007617, 463390.5781934575, 0.971475393335519)

O MAE indica que o modelo errou, em média, 210.69 unidades nas previsões. Um erro baixo, indicando uma boa performance nos dados de treino.

O valor do MSE é 463390.58. Esse número é maior devido à penalização de erros maiores.

O valor de $R^2$ foi de 0.97, indicando que o modelo está ajustado para a base de treino, mas o bom desempenho não garante que seja bom com os dados de teste.

In [None]:
# Score the forest on the held-out test split (MAE, MSE, R^2, in that order).
y_prev = random_model.predict(x_test)
mae, mse, r2 = (
    metric(y_test, y_prev)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)
mae, mse, r2

(434.50358136448807, 3465863.87181802, 0.7714871476759115)

O MAE indica que o modelo errou, em média, 434.50 unidades nas previsões. Valor maior do que nos dados de treino. Indicando dificuldade em generalizar.

O valor do MSE é 3465863.87. Mostrando um aumento considerável nos erros grandes.

O valor de $R^2$ foi de 0.77, mostrando que ainda explica boa parte da variância nos dados de teste, mas com desempenho inferior do que com os dados de treino.

O modelo se saiu muito bem com a base de treino, porém seu desempenho caiu quando foi utilizada a base de teste, sugerindo um pouco de overfitting.
Overfitting é justamente quando um modelo se sai muito bem com os dados de treino, porém seu desempenho cai com os dados de teste.

___
<div id='regressao'></div>

## Modelo de regressão


Nessa etapa será aplicado à base de dados o mesmo modelo de regressão linear visto na <strong>AULA 28</strong>

In [19]:
# Utility function that runs the regression
# with an added constant (intercept) term

def regress(Y,X):
    """Fit an ordinary least squares model of Y on X plus a constant.

    Y: DataFrame column used as the response variable (TARGET)
    X: DataFrame column(s) used as the explanatory variable(s) (FEATURES)

    Returns the fitted statsmodels results object.
    """
    return sm.OLS(Y, sm.add_constant(X)).fit()

A seguir serão removidas as colunas da base de dados `x_train` que possuírem valor-p maior do que o nível de significância ($\alpha$) estabelecido pelo grupo como 5%.

In [20]:
# Baseline OLS on the full one-hot encoded training matrix; show its summary table.
regressao = regress(Y=y_train, X=x_train)
regressao.summary()

0,1,2,3
Dep. Variable:,Rendimento_Bruto,R-squared:,0.808
Model:,OLS,Adj. R-squared:,0.808
Method:,Least Squares,F-statistic:,8389.0
Date:,"Tue, 19 Nov 2024",Prob (F-statistic):,0.0
Time:,14:10:19,Log-Likelihood:,-1044000.0
No. Observations:,117371,AIC:,2088000.0
Df Residuals:,117311,BIC:,2089000.0
Df Model:,59,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4807.0563,60.660,79.247,0.000,4688.165,4925.948
Idade,1.2426,0.469,2.650,0.008,0.323,2.162
Estado_Acre,221.7339,41.000,5.408,0.000,141.375,302.093
Estado_Alagoas,161.2207,28.457,5.665,0.000,105.446,216.995
Estado_Amapá,240.6047,52.879,4.550,0.000,136.964,344.246
Estado_Amazonas,224.6591,31.548,7.121,0.000,162.826,286.493
Estado_Bahia,153.2930,25.625,5.982,0.000,103.069,203.517
Estado_Ceará,146.0079,25.990,5.618,0.000,95.069,196.947
Estado_Distrito Federal,195.7662,34.449,5.683,0.000,128.247,263.285

0,1,2,3
Omnibus:,396870.444,Durbin-Watson:,1.99
Prob(Omnibus):,0.0,Jarque-Bera (JB):,171471164964.83
Skew:,61.373,Prob(JB):,0.0
Kurtosis:,5923.078,Cond. No.,1.82e+16


A seguir serão criados novos dataframes nos quais, a cada uso da função `regress()` será removida a coluna que possuir maior valor na coluna `P>|t|`.<br>
Nesse caso, será a coluna: `Escolaridade_Creche (disponível apenas no questionário anual de educação)`

In [21]:
# First elimination step: remove the "Creche" schooling dummy from the training features.
x_train2 = x_train.drop(
    columns="Escolaridade_Creche (disponível apenas no questionário anual de educação)"
)

O processo será repetido diversas vezes até que todos os valores da coluna sejam menores do que 5%

In [22]:
# Refit the OLS on the reduced feature set and show the new summary.
regressao2 = regress(Y=y_train, X=x_train2)
regressao2.summary()

0,1,2,3
Dep. Variable:,Rendimento_Bruto,R-squared:,0.808
Model:,OLS,Adj. R-squared:,0.808
Method:,Least Squares,F-statistic:,8389.0
Date:,"Tue, 19 Nov 2024",Prob (F-statistic):,0.0
Time:,14:10:20,Log-Likelihood:,-1044000.0
No. Observations:,117371,AIC:,2088000.0
Df Residuals:,117311,BIC:,2089000.0
Df Model:,59,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4975.1208,445.698,11.163,0.000,4101.560,5848.682
Idade,1.2426,0.469,2.650,0.008,0.323,2.162
Estado_Acre,227.9585,43.980,5.183,0.000,141.758,314.159
Estado_Alagoas,167.4453,32.848,5.098,0.000,103.064,231.826
Estado_Amapá,246.8293,55.361,4.459,0.000,138.322,355.336
Estado_Amazonas,230.8837,35.557,6.493,0.000,161.192,300.576
Estado_Bahia,159.5176,30.438,5.241,0.000,99.859,219.177
Estado_Ceará,152.2325,30.649,4.967,0.000,92.161,212.304
Estado_Distrito Federal,201.9908,38.145,5.295,0.000,127.227,276.755

0,1,2,3
Omnibus:,396870.444,Durbin-Watson:,1.99
Prob(Omnibus):,0.0,Jarque-Bera (JB):,171471164964.83
Skew:,61.373,Prob(JB):,0.0
Kurtosis:,5923.078,Cond. No.,1.19e+16


In [23]:
# Elimination step: remove the undergraduate-degree schooling dummy and refit.
x_train3 = x_train2.drop(columns='Escolaridade_Superior - graduação')
regressao3 = regress(Y=y_train, X=x_train3)
regressao3.summary()

0,1,2,3
Dep. Variable:,Rendimento_Bruto,R-squared:,0.808
Model:,OLS,Adj. R-squared:,0.808
Method:,Least Squares,F-statistic:,8533.0
Date:,"Tue, 19 Nov 2024",Prob (F-statistic):,0.0
Time:,14:10:21,Log-Likelihood:,-1044000.0
No. Observations:,117371,AIC:,2088000.0
Df Residuals:,117312,BIC:,2089000.0
Df Model:,58,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4991.5914,54.472,91.636,0.000,4884.827,5098.356
Idade,1.2425,0.469,2.649,0.008,0.323,2.162
Estado_Acre,228.5515,40.995,5.575,0.000,148.202,308.901
Estado_Alagoas,168.0573,28.438,5.910,0.000,112.318,223.796
Estado_Amapá,247.4420,52.859,4.681,0.000,143.838,351.046
Estado_Amazonas,231.4962,31.524,7.344,0.000,169.710,293.282
Estado_Bahia,160.1303,25.607,6.253,0.000,109.942,210.319
Estado_Ceará,152.8382,25.975,5.884,0.000,101.927,203.749
Estado_Distrito Federal,202.6021,34.431,5.884,0.000,135.118,270.087

0,1,2,3
Omnibus:,396870.43,Durbin-Watson:,1.99
Prob(Omnibus):,0.0,Jarque-Bera (JB):,171471114283.209
Skew:,61.373,Prob(JB):,0.0
Kurtosis:,5923.077,Cond. No.,1.71e+16


In [24]:
# Elimination step: remove the "Antigo científico, clássico" schooling dummy and refit.
x_train4 = x_train3.drop(
    columns='Escolaridade_Antigo científico, clássico, etc. (médio 2º ciclo)'
)
regressao4 = regress(Y=y_train, X=x_train4)
regressao4.summary()

0,1,2,3
Dep. Variable:,Rendimento_Bruto,R-squared:,0.808
Model:,OLS,Adj. R-squared:,0.808
Method:,Least Squares,F-statistic:,8683.0
Date:,"Tue, 19 Nov 2024",Prob (F-statistic):,0.0
Time:,14:10:22,Log-Likelihood:,-1044000.0
No. Observations:,117371,AIC:,2088000.0
Df Residuals:,117313,BIC:,2089000.0
Df Model:,57,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4991.4857,54.469,91.640,0.000,4884.728,5098.243
Idade,1.2543,0.464,2.702,0.007,0.345,2.164
Estado_Acre,228.5358,40.995,5.575,0.000,148.187,308.884
Estado_Alagoas,168.0793,28.438,5.910,0.000,112.341,223.817
Estado_Amapá,247.3501,52.857,4.680,0.000,143.752,350.948
Estado_Amazonas,231.4609,31.523,7.343,0.000,169.676,293.245
Estado_Bahia,160.1229,25.607,6.253,0.000,109.934,210.311
Estado_Ceará,152.8351,25.975,5.884,0.000,101.924,203.746
Estado_Distrito Federal,202.6480,34.430,5.886,0.000,135.166,270.130

0,1,2,3
Omnibus:,396871.957,Durbin-Watson:,1.99
Prob(Omnibus):,0.0,Jarque-Bera (JB):,171474245912.279
Skew:,61.374,Prob(JB):,0.0
Kurtosis:,5923.131,Cond. No.,1.49e+16


In [25]:
# Elimination step: remove the pre-school schooling dummy and refit.
x_train5 = x_train4.drop(columns='Escolaridade_Pré-escola')
regressao5 = regress(Y=y_train, X=x_train5)
regressao5.summary()

0,1,2,3
Dep. Variable:,Rendimento_Bruto,R-squared:,0.808
Model:,OLS,Adj. R-squared:,0.808
Method:,Least Squares,F-statistic:,8838.0
Date:,"Tue, 19 Nov 2024",Prob (F-statistic):,0.0
Time:,14:10:22,Log-Likelihood:,-1044000.0
No. Observations:,117371,AIC:,2088000.0
Df Residuals:,117314,BIC:,2089000.0
Df Model:,56,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4991.4146,54.468,91.639,0.000,4884.657,5098.172
Idade,1.2345,0.464,2.663,0.008,0.326,2.143
Estado_Acre,228.6609,40.994,5.578,0.000,148.313,309.009
Estado_Alagoas,168.1846,28.438,5.914,0.000,112.447,223.922
Estado_Amapá,247.4498,52.856,4.682,0.000,143.852,351.048
Estado_Amazonas,231.5924,31.523,7.347,0.000,169.809,293.376
Estado_Bahia,159.8057,25.604,6.241,0.000,109.622,209.989
Estado_Ceará,152.9491,25.975,5.888,0.000,102.039,203.859
Estado_Distrito Federal,202.7131,34.430,5.888,0.000,135.231,270.195

0,1,2,3
Omnibus:,396868.973,Durbin-Watson:,1.99
Prob(Omnibus):,0.0,Jarque-Bera (JB):,171463951805.49
Skew:,61.373,Prob(JB):,0.0
Kurtosis:,5922.954,Cond. No.,1.63e+16


In [26]:
# Elimination step: remove the master's-degree schooling dummy and refit.
x_train6 = x_train5.drop(columns='Escolaridade_Mestrado')
regressao6 = regress(Y=y_train, X=x_train6)
regressao6.summary()

0,1,2,3
Dep. Variable:,Rendimento_Bruto,R-squared:,0.808
Model:,OLS,Adj. R-squared:,0.808
Method:,Least Squares,F-statistic:,8999.0
Date:,"Tue, 19 Nov 2024",Prob (F-statistic):,0.0
Time:,14:10:23,Log-Likelihood:,-1044000.0
No. Observations:,117371,AIC:,2088000.0
Df Residuals:,117315,BIC:,2089000.0
Df Model:,55,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4993.3221,54.449,91.706,0.000,4886.602,5100.042
Idade,1.2377,0.464,2.669,0.008,0.329,2.146
Estado_Acre,228.3624,40.994,5.571,0.000,148.015,308.710
Estado_Alagoas,168.4507,28.437,5.924,0.000,112.714,224.187
Estado_Amapá,247.1351,52.856,4.676,0.000,143.538,350.732
Estado_Amazonas,231.8520,31.522,7.355,0.000,170.069,293.635
Estado_Bahia,160.1856,25.603,6.257,0.000,110.005,210.366
Estado_Ceará,153.3313,25.973,5.903,0.000,102.424,204.239
Estado_Distrito Federal,203.5570,34.424,5.913,0.000,136.086,271.028

0,1,2,3
Omnibus:,396852.795,Durbin-Watson:,1.99
Prob(Omnibus):,0.0,Jarque-Bera (JB):,171405715435.619
Skew:,61.365,Prob(JB):,0.0
Kurtosis:,5921.948,Cond. No.,1.74e+16


In [27]:
# Elimination step: remove the "Ignorado" colour/race dummy and refit.
x_train7 = x_train6.drop(columns='Cor_ou_Raça_Ignorado')
regressao7 = regress(Y=y_train, X=x_train7)
regressao7.summary()

0,1,2,3
Dep. Variable:,Rendimento_Bruto,R-squared:,0.808
Model:,OLS,Adj. R-squared:,0.808
Method:,Least Squares,F-statistic:,8999.0
Date:,"Tue, 19 Nov 2024",Prob (F-statistic):,0.0
Time:,14:10:24,Log-Likelihood:,-1044000.0
No. Observations:,117371,AIC:,2088000.0
Df Residuals:,117315,BIC:,2089000.0
Df Model:,55,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5418.1345,341.650,15.859,0.000,4748.505,6087.764
Idade,1.2377,0.464,2.669,0.008,0.329,2.146
Estado_Acre,244.0962,42.869,5.694,0.000,160.073,328.119
Estado_Alagoas,184.1845,31.072,5.928,0.000,123.284,245.084
Estado_Amapá,262.8689,54.326,4.839,0.000,156.390,369.348
Estado_Amazonas,247.5858,33.937,7.295,0.000,181.070,314.102
Estado_Bahia,175.9194,28.504,6.172,0.000,120.052,231.786
Estado_Ceará,169.0651,28.834,5.863,0.000,112.551,225.579
Estado_Distrito Federal,219.2907,36.432,6.019,0.000,147.884,290.698

0,1,2,3
Omnibus:,396852.795,Durbin-Watson:,1.99
Prob(Omnibus):,0.0,Jarque-Bera (JB):,171405715435.618
Skew:,61.365,Prob(JB):,0.0
Kurtosis:,5921.948,Cond. No.,1.73e+16


In [28]:
# Elimination step: remove the "Amarela" colour/race dummy and refit.
x_train8 = x_train7.drop(columns='Cor_ou_Raça_Amarela')
regressao8 = regress(Y=y_train, X=x_train8)
regressao8.summary()

0,1,2,3
Dep. Variable:,Rendimento_Bruto,R-squared:,0.808
Model:,OLS,Adj. R-squared:,0.808
Method:,Least Squares,F-statistic:,9165.0
Date:,"Tue, 19 Nov 2024",Prob (F-statistic):,0.0
Time:,14:10:25,Log-Likelihood:,-1044000.0
No. Observations:,117371,AIC:,2088000.0
Df Residuals:,117316,BIC:,2089000.0
Df Model:,54,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5400.7010,37.002,145.958,0.000,5328.178,5473.224
Idade,1.2377,0.464,2.670,0.008,0.329,2.146
Estado_Acre,243.4459,40.954,5.944,0.000,163.176,323.716
Estado_Alagoas,183.5366,28.392,6.464,0.000,127.888,239.185
Estado_Amapá,262.2196,52.833,4.963,0.000,158.667,365.772
Estado_Amazonas,246.9380,31.503,7.838,0.000,185.192,308.684
Estado_Bahia,175.2719,25.560,6.857,0.000,125.174,225.370
Estado_Ceará,168.4182,25.934,6.494,0.000,117.589,219.248
Estado_Distrito Federal,218.6780,34.421,6.353,0.000,151.214,286.142

0,1,2,3
Omnibus:,396852.814,Durbin-Watson:,1.99
Prob(Omnibus):,0.0,Jarque-Bera (JB):,171405790535.335
Skew:,61.365,Prob(JB):,0.0
Kurtosis:,5921.949,Cond. No.,1.48e+16


In [29]:
# Elimination step: remove the "Indigena" colour/race dummy and refit.
x_train9 = x_train8.drop(columns='Cor_ou_Raça_Indigena')
regressao9 = regress(Y=y_train, X=x_train9)
regressao9.summary()

0,1,2,3
Dep. Variable:,Rendimento_Bruto,R-squared:,0.808
Model:,OLS,Adj. R-squared:,0.808
Method:,Least Squares,F-statistic:,9338.0
Date:,"Tue, 19 Nov 2024",Prob (F-statistic):,0.0
Time:,14:10:26,Log-Likelihood:,-1044000.0
No. Observations:,117371,AIC:,2088000.0
Df Residuals:,117317,BIC:,2089000.0
Df Model:,53,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5417.0692,29.912,181.097,0.000,5358.441,5475.697
Idade,1.2364,0.464,2.667,0.008,0.328,2.145
Estado_Acre,243.8527,40.951,5.955,0.000,163.590,324.115
Estado_Alagoas,184.0742,28.383,6.485,0.000,128.444,239.705
Estado_Amapá,262.7496,52.828,4.974,0.000,159.207,366.292
Estado_Amazonas,248.7770,31.408,7.921,0.000,187.218,310.336
Estado_Bahia,175.8651,25.548,6.884,0.000,125.791,225.939
Estado_Ceará,169.0966,25.918,6.524,0.000,118.298,219.895
Estado_Distrito Federal,219.1920,34.414,6.369,0.000,151.741,286.643

0,1,2,3
Omnibus:,396853.084,Durbin-Watson:,1.99
Prob(Omnibus):,0.0,Jarque-Bera (JB):,171406084120.055
Skew:,61.366,Prob(JB):,0.0
Kurtosis:,5921.954,Cond. No.,1.25e+16


In [30]:
# Elimination step: remove the adult-literacy schooling dummy and refit the OLS.
# NOTE(review): this rebinds `regressao9`, so the fit on `x_train9` is no longer
# reachable afterwards; the name lags the `x_train10` suffix — confirm whether
# `regressao10` was intended.
x_train10 = x_train9.drop(columns=['Escolaridade_Alfabetização de jovens e adultos'])
regressao9 = regress(y_train, x_train10)
regressao9.summary()

0,1,2,3
Dep. Variable:,Rendimento_Bruto,R-squared:,0.808
Model:,OLS,Adj. R-squared:,0.808
Method:,Least Squares,F-statistic:,9518.0
Date:,"Tue, 19 Nov 2024",Prob (F-statistic):,0.0
Time:,14:10:27,Log-Likelihood:,-1044000.0
No. Observations:,117371,AIC:,2088000.0
Df Residuals:,117318,BIC:,2089000.0
Df Model:,52,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5416.8709,29.912,181.091,0.000,5358.243,5475.499
Idade,1.1925,0.463,2.577,0.010,0.286,2.099
Estado_Acre,243.7344,40.951,5.952,0.000,163.472,323.997
Estado_Alagoas,184.0403,28.383,6.484,0.000,128.410,239.671
Estado_Amapá,263.4539,52.827,4.987,0.000,159.915,366.993
Estado_Amazonas,249.1572,31.407,7.933,0.000,187.599,310.715
Estado_Bahia,175.6123,25.548,6.874,0.000,125.539,225.686
Estado_Ceará,169.0555,25.918,6.523,0.000,118.257,219.854
Estado_Distrito Federal,219.4603,34.414,6.377,0.000,152.010,286.911

0,1,2,3
Omnibus:,396845.991,Durbin-Watson:,1.99
Prob(Omnibus):,0.0,Jarque-Bera (JB):,171381792703.639
Skew:,61.362,Prob(JB):,0.0
Kurtosis:,5921.535,Cond. No.,1.47e+16


In [31]:
# Elimination step: remove the "Parda" colour/race dummy and refit the OLS.
# NOTE(review): the result is stored as `regressao10` although it is fitted on
# `x_train11` — the numbering is off by one here; confirm the intended name.
x_train11 = x_train10.drop(columns=['Cor_ou_Raça_Parda'])
regressao10 = regress(y_train, x_train11)
regressao10.summary()

0,1,2,3
Dep. Variable:,Rendimento_Bruto,R-squared:,0.808
Model:,OLS,Adj. R-squared:,0.808
Method:,Least Squares,F-statistic:,9704.0
Date:,"Tue, 19 Nov 2024",Prob (F-statistic):,0.0
Time:,14:10:27,Log-Likelihood:,-1044000.0
No. Observations:,117371,AIC:,2088000.0
Df Residuals:,117319,BIC:,2089000.0
Df Model:,51,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5463.8472,15.196,359.567,0.000,5434.064,5493.630
Idade,1.1818,0.463,2.554,0.011,0.275,2.089
Estado_Acre,245.4136,40.941,5.994,0.000,165.170,325.657
Estado_Alagoas,185.9626,28.364,6.556,0.000,130.370,241.555
Estado_Amapá,265.6601,52.813,5.030,0.000,162.147,369.173
Estado_Amazonas,249.4182,31.407,7.941,0.000,187.861,310.976
Estado_Bahia,177.4579,25.528,6.952,0.000,127.424,227.492
Estado_Ceará,171.2651,25.890,6.615,0.000,120.521,222.009
Estado_Distrito Federal,221.5255,34.395,6.441,0.000,154.111,288.940

0,1,2,3
Omnibus:,396847.125,Durbin-Watson:,1.99
Prob(Omnibus):,0.0,Jarque-Bera (JB):,171382653390.806
Skew:,61.363,Prob(JB):,0.0
Kurtosis:,5921.55,Cond. No.,1.71e+16


In [32]:
# Elimination step: remove the "Preta" colour/race dummy and refit.
x_train12 = x_train11.drop(columns='Cor_ou_Raça_Preta')
regressao12 = regress(Y=y_train, X=x_train12)
regressao12.summary()

0,1,2,3
Dep. Variable:,Rendimento_Bruto,R-squared:,0.808
Model:,OLS,Adj. R-squared:,0.808
Method:,Least Squares,F-statistic:,9898.0
Date:,"Tue, 19 Nov 2024",Prob (F-statistic):,0.0
Time:,14:10:28,Log-Likelihood:,-1044000.0
No. Observations:,117371,AIC:,2088000.0
Df Residuals:,117320,BIC:,2089000.0
Df Model:,50,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5465.7180,15.122,361.440,0.000,5436.079,5495.357
Idade,1.1898,0.463,2.572,0.010,0.283,2.097
Estado_Acre,244.2887,40.931,5.968,0.000,164.065,324.513
Estado_Alagoas,185.3557,28.360,6.536,0.000,129.771,240.941
Estado_Amapá,265.4202,52.813,5.026,0.000,161.908,368.933
Estado_Amazonas,247.4172,31.367,7.888,0.000,185.939,308.895
Estado_Bahia,179.8237,25.458,7.064,0.000,129.926,229.721
Estado_Ceará,170.2628,25.878,6.580,0.000,119.543,220.983
Estado_Distrito Federal,222.4260,34.388,6.468,0.000,155.026,289.826

0,1,2,3
Omnibus:,396850.374,Durbin-Watson:,1.99
Prob(Omnibus):,0.0,Jarque-Bera (JB):,171389338795.004
Skew:,61.364,Prob(JB):,0.0
Kurtosis:,5921.665,Cond. No.,1.19e+16


### Removendo a coluna de Raça

Após o processo de remoção das colunas, percebe-se que a coluna `Cor_ou_Raça` possui apenas um único valor (no caso, aqueles de raça branca). Tendo isso em mente, o grupo optou por remover a coluna `Cor_ou_Raça` por completo da base de dados.

In [33]:
# Remove the last remaining colour/race dummy ("Branca") so the variable is dropped
# from the model entirely, then refit the OLS.
# NOTE(review): this rebinds `regressao12`, discarding the fit on `x_train12`;
# the name lags the `x_train13` suffix — confirm whether `regressao13` was intended.
x_train13 = x_train12.drop(columns=['Cor_ou_Raça_Branca'])
regressao12 = regress(y_train, x_train13)
regressao12.summary()

0,1,2,3
Dep. Variable:,Rendimento_Bruto,R-squared:,0.808
Model:,OLS,Adj. R-squared:,0.808
Method:,Least Squares,F-statistic:,10100.0
Date:,"Tue, 19 Nov 2024",Prob (F-statistic):,0.0
Time:,14:10:29,Log-Likelihood:,-1044000.0
No. Observations:,117371,AIC:,2088000.0
Df Residuals:,117321,BIC:,2089000.0
Df Model:,49,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5474.8072,14.913,367.106,0.000,5445.577,5504.037
Idade,1.2746,0.462,2.758,0.006,0.369,2.180
Estado_Acre,237.8496,40.895,5.816,0.000,157.697,318.002
Estado_Alagoas,183.3072,28.356,6.465,0.000,127.730,238.884
Estado_Amapá,257.0930,52.766,4.872,0.000,153.673,360.513
Estado_Amazonas,240.5786,31.311,7.683,0.000,179.209,301.949
Estado_Bahia,174.6336,25.419,6.870,0.000,124.813,224.455
Estado_Ceará,168.3028,25.873,6.505,0.000,117.591,219.014
Estado_Distrito Federal,221.5186,34.389,6.442,0.000,154.117,288.920

0,1,2,3
Omnibus:,396848.586,Durbin-Watson:,1.99
Prob(Omnibus):,0.0,Jarque-Bera (JB):,171387673311.464
Skew:,61.364,Prob(JB):,0.0
Kurtosis:,5921.636,Cond. No.,1.5e+16


### Atualizando a base de dados de teste

Após a remoção das colunas da base de dados de treino, será feito o mesmo processo, porém com a base de dados de teste

In [39]:
# Fit an OLS on the test split with the full one-hot feature set and show its summary.
# NOTE(review): this estimates a new model on the test data instead of evaluating the
# training-set model on it, and the columns dropped below are chosen from test-set
# p-values — confirm this is the intended evaluation procedure.
regressao_teste = regress(Y=y_test, X=x_test)
regressao_teste.summary()

0,1,2,3
Dep. Variable:,Rendimento_Bruto,R-squared:,0.849
Model:,OLS,Adj. R-squared:,0.848
Method:,Least Squares,F-statistic:,5488.0
Date:,"Tue, 19 Nov 2024",Prob (F-statistic):,0.0
Time:,14:20:40,Log-Likelihood:,-505390.0
No. Observations:,57810,AIC:,1011000.0
Df Residuals:,57750,BIC:,1011000.0
Df Model:,59,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4609.7279,70.872,65.043,0.000,4470.818,4748.637
Idade,3.3593,0.574,5.852,0.000,2.234,4.485
Estado_Acre,211.3621,51.073,4.138,0.000,111.258,311.466
Estado_Alagoas,174.4072,35.418,4.924,0.000,104.989,243.826
Estado_Amapá,233.8565,67.151,3.483,0.000,102.241,365.472
Estado_Amazonas,187.7289,39.261,4.782,0.000,110.778,264.680
Estado_Bahia,152.9243,31.490,4.856,0.000,91.204,214.644
Estado_Ceará,117.8755,31.673,3.722,0.000,55.797,179.954
Estado_Distrito Federal,69.6706,42.022,1.658,0.097,-12.693,152.034

0,1,2,3
Omnibus:,173116.786,Durbin-Watson:,1.993
Prob(Omnibus):,0.0,Jarque-Bera (JB):,28578332105.238
Skew:,43.395,Prob(JB):,0.0
Kurtosis:,3446.378,Cond. No.,3.68e+16


In [40]:
# Test-split elimination step: remove the "Ignorado" colour/race dummy and refit.
x_test2 = x_test.drop(columns='Cor_ou_Raça_Ignorado')
regressao_teste2 = regress(Y=y_test, X=x_test2)
regressao_teste2.summary()

0,1,2,3
Dep. Variable:,Rendimento_Bruto,R-squared:,0.849
Model:,OLS,Adj. R-squared:,0.848
Method:,Least Squares,F-statistic:,5488.0
Date:,"Tue, 19 Nov 2024",Prob (F-statistic):,0.0
Time:,14:22:27,Log-Likelihood:,-505390.0
No. Observations:,57810,AIC:,1011000.0
Df Residuals:,57750,BIC:,1011000.0
Df Model:,59,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4678.2215,307.850,15.196,0.000,4074.835,5281.608
Idade,3.3593,0.574,5.852,0.000,2.234,4.485
Estado_Acre,213.8989,52.282,4.091,0.000,111.426,316.372
Estado_Alagoas,176.9440,37.120,4.767,0.000,104.188,249.700
Estado_Amapá,236.3933,68.073,3.473,0.001,102.970,369.817
Estado_Amazonas,190.2657,40.820,4.661,0.000,110.258,270.273
Estado_Bahia,155.4611,33.398,4.655,0.000,90.001,220.922
Estado_Ceará,120.4123,33.499,3.594,0.000,54.754,186.071
Estado_Distrito Federal,72.2074,43.497,1.660,0.097,-13.047,157.462

0,1,2,3
Omnibus:,173116.786,Durbin-Watson:,1.993
Prob(Omnibus):,0.0,Jarque-Bera (JB):,28578332105.238
Skew:,43.395,Prob(JB):,0.0
Kurtosis:,3446.378,Cond. No.,1.05e+16


In [42]:
# Test-split elimination step: remove the "Creche" schooling dummy and refit.
x_test3 = x_test2.drop(
    columns='Escolaridade_Creche (disponível apenas no questionário anual de educação)'
)
regressao_teste3 = regress(Y=y_test, X=x_test3)
regressao_teste3.summary()

0,1,2,3
Dep. Variable:,Rendimento_Bruto,R-squared:,0.849
Model:,OLS,Adj. R-squared:,0.848
Method:,Least Squares,F-statistic:,5488.0
Date:,"Tue, 19 Nov 2024",Prob (F-statistic):,0.0
Time:,14:23:36,Log-Likelihood:,-505390.0
No. Observations:,57810,AIC:,1011000.0
Df Residuals:,57750,BIC:,1011000.0
Df Model:,59,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4930.2552,886.738,5.560,0.000,3192.245,6668.265
Idade,3.3593,0.574,5.852,0.000,2.234,4.485
Estado_Acre,223.2335,60.699,3.678,0.000,104.263,342.204
Estado_Alagoas,186.2786,48.252,3.861,0.000,91.705,280.852
Estado_Amapá,245.7279,74.730,3.288,0.001,99.257,392.199
Estado_Amazonas,199.6003,50.571,3.947,0.000,100.480,298.720
Estado_Bahia,164.7957,45.445,3.626,0.000,75.724,253.867
Estado_Ceará,129.7469,45.519,2.850,0.004,40.529,218.965
Estado_Distrito Federal,81.5420,53.327,1.529,0.126,-22.979,186.063

0,1,2,3
Omnibus:,173116.786,Durbin-Watson:,1.993
Prob(Omnibus):,0.0,Jarque-Bera (JB):,28578332105.239
Skew:,43.395,Prob(JB):,0.0
Kurtosis:,3446.378,Cond. No.,1.05e+16


In [44]:
# Test-split elimination step: remove two higher-education schooling dummies at once and refit.
x_test4 = x_test3.drop(
    columns=[
        'Escolaridade_Superior - graduação',
        'Escolaridade_Especialização de nível superior',
    ]
)
regressao_teste4 = regress(Y=y_test, X=x_test4)
regressao_teste4.summary()

0,1,2,3
Dep. Variable:,Rendimento_Bruto,R-squared:,0.849
Model:,OLS,Adj. R-squared:,0.848
Method:,Least Squares,F-statistic:,5680.0
Date:,"Tue, 19 Nov 2024",Prob (F-statistic):,0.0
Time:,14:26:05,Log-Likelihood:,-505390.0
No. Observations:,57810,AIC:,1011000.0
Df Residuals:,57752,BIC:,1011000.0
Df Model:,57,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4921.9440,313.942,15.678,0.000,4306.616,5537.272
Idade,3.3792,0.574,5.888,0.000,2.254,4.504
Estado_Acre,221.6736,52.320,4.237,0.000,119.127,324.221
Estado_Alagoas,186.0921,37.190,5.004,0.000,113.200,258.984
Estado_Amapá,243.1472,68.087,3.571,0.000,109.697,376.597
Estado_Amazonas,199.6265,40.910,4.880,0.000,119.443,279.810
Estado_Bahia,164.8564,33.476,4.925,0.000,99.243,230.470
Estado_Ceará,129.5164,33.571,3.858,0.000,63.718,195.315
Estado_Distrito Federal,83.1059,43.534,1.909,0.056,-2.222,168.434

0,1,2,3
Omnibus:,173096.99,Durbin-Watson:,1.993
Prob(Omnibus):,0.0,Jarque-Bera (JB):,28552042838.981
Skew:,43.381,Prob(JB):,0.0
Kurtosis:,3444.793,Cond. No.,4e+16


In [45]:
# Test-split elimination step: remove the master's-degree schooling dummy and refit.
x_test5 = x_test4.drop(columns='Escolaridade_Mestrado')
regressao_teste5 = regress(Y=y_test, X=x_test5)
regressao_teste5.summary()

0,1,2,3
Dep. Variable:,Rendimento_Bruto,R-squared:,0.849
Model:,OLS,Adj. R-squared:,0.848
Method:,Least Squares,F-statistic:,5782.0
Date:,"Tue, 19 Nov 2024",Prob (F-statistic):,0.0
Time:,14:27:07,Log-Likelihood:,-505390.0
No. Observations:,57810,AIC:,1011000.0
Df Residuals:,57753,BIC:,1011000.0
Df Model:,56,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4917.7524,313.891,15.667,0.000,4302.525,5532.980
Idade,3.3766,0.574,5.883,0.000,2.252,4.502
Estado_Acre,221.5854,52.319,4.235,0.000,119.039,324.132
Estado_Alagoas,185.9858,37.189,5.001,0.000,113.094,258.877
Estado_Amapá,243.1571,68.086,3.571,0.000,109.707,376.607
Estado_Amazonas,199.4626,40.909,4.876,0.000,119.281,279.644
Estado_Bahia,164.6621,33.475,4.919,0.000,99.051,230.273
Estado_Ceará,129.2899,33.569,3.851,0.000,63.494,195.086
Estado_Distrito Federal,82.3839,43.524,1.893,0.058,-2.922,167.690

0,1,2,3
Omnibus:,173103.542,Durbin-Watson:,1.993
Prob(Omnibus):,0.0,Jarque-Bera (JB):,28559238630.553
Skew:,43.386,Prob(JB):,0.0
Kurtosis:,3445.227,Cond. No.,4.17e+16


In [47]:
# Test-split elimination step: remove the "Amarela" colour/race dummy and refit.
x_test6 = x_test5.drop(columns='Cor_ou_Raça_Amarela')
regressao_teste6 = regress(Y=y_test, X=x_test6)
regressao_teste6.summary()

0,1,2,3
Dep. Variable:,Rendimento_Bruto,R-squared:,0.849
Model:,OLS,Adj. R-squared:,0.848
Method:,Least Squares,F-statistic:,5887.0
Date:,"Tue, 19 Nov 2024",Prob (F-statistic):,0.0
Time:,14:28:36,Log-Likelihood:,-505390.0
No. Observations:,57810,AIC:,1011000.0
Df Residuals:,57754,BIC:,1011000.0
Df Model:,55,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5311.2846,46.460,114.321,0.000,5220.224,5402.345
Idade,3.3734,0.574,5.878,0.000,2.248,4.498
Estado_Acre,236.3815,51.001,4.635,0.000,136.419,336.344
Estado_Alagoas,200.6033,35.357,5.674,0.000,131.304,269.903
Estado_Amapá,257.7852,67.102,3.842,0.000,126.265,389.305
Estado_Amazonas,214.0650,39.254,5.453,0.000,137.127,291.003
Estado_Bahia,179.2193,31.444,5.700,0.000,117.589,240.850
Estado_Ceará,143.5948,31.616,4.542,0.000,81.628,205.562
Estado_Distrito Federal,97.2106,41.923,2.319,0.020,15.042,179.380

0,1,2,3
Omnibus:,173107.68,Durbin-Watson:,1.993
Prob(Omnibus):,0.0,Jarque-Bera (JB):,28561724710.485
Skew:,43.389,Prob(JB):,0.0
Kurtosis:,3445.377,Cond. No.,1.16e+16


In [48]:
# Test-split elimination step: remove the "Preta" colour/race dummy and refit.
x_test7 = x_test6.drop(columns='Cor_ou_Raça_Preta')
regressao_teste7 = regress(Y=y_test, X=x_test7)
regressao_teste7.summary()

0,1,2,3
Dep. Variable:,Rendimento_Bruto,R-squared:,0.849
Model:,OLS,Adj. R-squared:,0.848
Method:,Least Squares,F-statistic:,5996.0
Date:,"Tue, 19 Nov 2024",Prob (F-statistic):,0.0
Time:,14:29:50,Log-Likelihood:,-505390.0
No. Observations:,57810,AIC:,1011000.0
Df Residuals:,57755,BIC:,1011000.0
Df Model:,54,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5331.9516,20.402,261.347,0.000,5291.964,5371.939
Idade,3.3696,0.574,5.872,0.000,2.245,4.494
Estado_Acre,236.7471,50.996,4.643,0.000,136.796,336.699
Estado_Alagoas,201.3193,35.327,5.699,0.000,132.078,270.560
Estado_Amapá,258.6712,67.078,3.856,0.000,127.199,390.144
Estado_Amazonas,214.8486,39.222,5.478,0.000,137.974,291.723
Estado_Bahia,180.4888,31.339,5.759,0.000,119.064,241.914
Estado_Ceará,144.2512,31.588,4.567,0.000,82.339,206.164
Estado_Distrito Federal,97.8955,41.900,2.336,0.019,15.772,180.019

0,1,2,3
Omnibus:,173108.356,Durbin-Watson:,1.993
Prob(Omnibus):,0.0,Jarque-Bera (JB):,28562326742.65
Skew:,43.389,Prob(JB):,0.0
Kurtosis:,3445.413,Cond. No.,2.07e+16


In [49]:
# Remove the 'Cor_ou_Raça_Indigena' column from the previous predictor matrix,
# refit the regression on the reduced predictors and display the fit summary.
x_test8 = x_test7.drop(labels=['Cor_ou_Raça_Indigena'], axis=1)
regressao_teste8 = regress(y_test, x_test8)
regressao_teste8.summary()

0,1,2,3
Dep. Variable:,Rendimento_Bruto,R-squared:,0.849
Model:,OLS,Adj. R-squared:,0.848
Method:,Least Squares,F-statistic:,6109.0
Date:,"Tue, 19 Nov 2024",Prob (F-statistic):,0.0
Time:,14:30:41,Log-Likelihood:,-505390.0
No. Observations:,57810,AIC:,1011000.0
Df Residuals:,57756,BIC:,1011000.0
Df Model:,53,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5332.8165,20.284,262.902,0.000,5293.059,5372.574
Idade,3.3684,0.574,5.870,0.000,2.244,4.493
Estado_Acre,236.8063,50.995,4.644,0.000,136.856,336.757
Estado_Alagoas,201.3029,35.327,5.698,0.000,132.062,270.544
Estado_Amapá,258.6487,67.077,3.856,0.000,127.177,390.120
Estado_Amazonas,215.7389,39.157,5.510,0.000,138.991,292.487
Estado_Bahia,180.3603,31.337,5.755,0.000,118.939,241.781
Estado_Ceará,144.4036,31.585,4.572,0.000,82.496,206.311
Estado_Distrito Federal,97.8119,41.899,2.334,0.020,15.690,179.934

0,1,2,3
Omnibus:,173108.291,Durbin-Watson:,1.993
Prob(Omnibus):,0.0,Jarque-Bera (JB):,28562226663.985
Skew:,43.389,Prob(JB):,0.0
Kurtosis:,3445.407,Cond. No.,6.05e+16


In [52]:
# Remove the 'Cor_ou_Raça_Parda' column from the previous predictor matrix,
# refit the regression on the reduced predictors and display the fit summary.
x_test9 = x_test8.drop(labels=['Cor_ou_Raça_Parda'], axis=1)
regressao_teste9 = regress(y_test, x_test9)
regressao_teste9.summary()

0,1,2,3
Dep. Variable:,Rendimento_Bruto,R-squared:,0.849
Model:,OLS,Adj. R-squared:,0.848
Method:,Least Squares,F-statistic:,6227.0
Date:,"Tue, 19 Nov 2024",Prob (F-statistic):,0.0
Time:,14:31:36,Log-Likelihood:,-505390.0
No. Observations:,57810,AIC:,1011000.0
Df Residuals:,57757,BIC:,1011000.0
Df Model:,52,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5341.3283,17.919,298.076,0.000,5306.206,5376.450
Idade,3.3587,0.574,5.854,0.000,2.234,4.483
Estado_Acre,238.1005,50.974,4.671,0.000,138.191,338.010
Estado_Alagoas,202.2806,35.310,5.729,0.000,133.073,271.488
Estado_Amapá,259.6524,67.068,3.872,0.000,128.200,391.105
Estado_Amazonas,217.3424,39.116,5.556,0.000,140.675,294.010
Estado_Bahia,178.3757,31.259,5.706,0.000,117.109,239.643
Estado_Ceará,145.7097,31.552,4.618,0.000,83.869,207.551
Estado_Distrito Federal,97.7170,41.899,2.332,0.020,15.595,179.839

0,1,2,3
Omnibus:,173109.452,Durbin-Watson:,1.994
Prob(Omnibus):,0.0,Jarque-Bera (JB):,28563942460.223
Skew:,43.39,Prob(JB):,0.0
Kurtosis:,3445.51,Cond. No.,6.17e+16


In [54]:
# Remove the 'Cor_ou_Raça_Branca' column from the previous predictor matrix,
# refit the regression on the reduced predictors and display the fit summary.
x_test10 = x_test9.drop(labels=['Cor_ou_Raça_Branca'], axis=1)
regressao_teste10 = regress(y_test, x_test10)
regressao_teste10.summary()

0,1,2,3
Dep. Variable:,Rendimento_Bruto,R-squared:,0.849
Model:,OLS,Adj. R-squared:,0.848
Method:,Least Squares,F-statistic:,6349.0
Date:,"Tue, 19 Nov 2024",Prob (F-statistic):,0.0
Time:,14:32:21,Log-Likelihood:,-505390.0
No. Observations:,57810,AIC:,1011000.0
Df Residuals:,57758,BIC:,1011000.0
Df Model:,51,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5345.3405,17.664,302.605,0.000,5310.718,5379.963
Idade,3.4034,0.573,5.942,0.000,2.281,4.526
Estado_Acre,234.8614,50.917,4.613,0.000,135.064,334.658
Estado_Alagoas,201.3005,35.302,5.702,0.000,132.108,270.493
Estado_Amapá,256.7742,67.033,3.831,0.000,125.389,388.160
Estado_Amazonas,213.9997,39.036,5.482,0.000,137.490,290.510
Estado_Bahia,175.8552,31.202,5.636,0.000,114.700,237.010
Estado_Ceará,144.5687,31.540,4.584,0.000,82.750,206.387
Estado_Distrito Federal,97.0017,41.896,2.315,0.021,14.886,179.117

0,1,2,3
Omnibus:,173096.397,Durbin-Watson:,1.994
Prob(Omnibus):,0.0,Jarque-Bera (JB):,28545241760.453
Skew:,43.381,Prob(JB):,0.0
Kurtosis:,3444.383,Cond. No.,6.1e+16


In [55]:
# Remove the 'Escolaridade_Doutorado' column from the previous predictor matrix,
# refit the regression on the reduced predictors and display the fit summary.
x_test11 = x_test10.drop(labels=['Escolaridade_Doutorado'], axis=1)
regressao_teste11 = regress(y_test, x_test11)
regressao_teste11.summary()

0,1,2,3
Dep. Variable:,Rendimento_Bruto,R-squared:,0.849
Model:,OLS,Adj. R-squared:,0.848
Method:,Least Squares,F-statistic:,6476.0
Date:,"Tue, 19 Nov 2024",Prob (F-statistic):,0.0
Time:,14:32:55,Log-Likelihood:,-505390.0
No. Observations:,57810,AIC:,1011000.0
Df Residuals:,57759,BIC:,1011000.0
Df Model:,50,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5346.7324,17.635,303.196,0.000,5312.169,5381.296
Idade,3.4173,0.573,5.967,0.000,2.295,4.540
Estado_Acre,234.8068,50.917,4.612,0.000,135.009,334.604
Estado_Alagoas,201.1060,35.302,5.697,0.000,131.913,270.299
Estado_Amapá,257.2484,67.033,3.838,0.000,125.864,388.633
Estado_Amazonas,214.0511,39.036,5.483,0.000,137.541,290.562
Estado_Bahia,175.8760,31.202,5.637,0.000,114.720,237.032
Estado_Ceará,144.5678,31.540,4.584,0.000,82.749,206.387
Estado_Distrito Federal,97.2242,41.896,2.321,0.020,15.109,179.340

0,1,2,3
Omnibus:,173082.939,Durbin-Watson:,1.993
Prob(Omnibus):,0.0,Jarque-Bera (JB):,28530614476.936
Skew:,43.372,Prob(JB):,0.0
Kurtosis:,3443.501,Cond. No.,7.32e+16


In [56]:
# Remove the 'Escolaridade_Pré-escola' column from the previous predictor matrix,
# refit the regression on the reduced predictors and display the fit summary.
x_test12 = x_test11.drop(labels=['Escolaridade_Pré-escola'], axis=1)
regressao_teste12 = regress(y_test, x_test12)
regressao_teste12.summary()

0,1,2,3
Dep. Variable:,Rendimento_Bruto,R-squared:,0.849
Model:,OLS,Adj. R-squared:,0.848
Method:,Least Squares,F-statistic:,6608.0
Date:,"Tue, 19 Nov 2024",Prob (F-statistic):,0.0
Time:,14:33:37,Log-Likelihood:,-505390.0
No. Observations:,57810,AIC:,1011000.0
Df Residuals:,57760,BIC:,1011000.0
Df Model:,49,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5346.7983,17.635,303.197,0.000,5312.234,5381.363
Idade,3.3772,0.572,5.903,0.000,2.256,4.499
Estado_Acre,235.0848,50.917,4.617,0.000,135.287,334.883
Estado_Alagoas,200.9369,35.303,5.692,0.000,131.744,270.130
Estado_Amapá,257.7962,67.033,3.846,0.000,126.412,389.180
Estado_Amazonas,214.3003,39.036,5.490,0.000,137.790,290.811
Estado_Bahia,175.0750,31.198,5.612,0.000,113.927,236.223
Estado_Ceará,144.5752,31.541,4.584,0.000,82.755,206.395
Estado_Distrito Federal,97.6698,41.895,2.331,0.020,15.555,179.784

0,1,2,3
Omnibus:,173080.324,Durbin-Watson:,1.993
Prob(Omnibus):,0.0,Jarque-Bera (JB):,28527560263.11
Skew:,43.37,Prob(JB):,0.0
Kurtosis:,3443.317,Cond. No.,5.11e+16


In [57]:
# Remove the 'Escolaridade_Alfabetização de jovens e adultos' column from the
# previous predictor matrix, refit the regression on the reduced predictors
# and display the fit summary.
x_test13 = x_test12.drop(labels=['Escolaridade_Alfabetização de jovens e adultos'], axis=1)
regressao_teste13 = regress(y_test, x_test13)
regressao_teste13.summary()

0,1,2,3
Dep. Variable:,Rendimento_Bruto,R-squared:,0.849
Model:,OLS,Adj. R-squared:,0.848
Method:,Least Squares,F-statistic:,6745.0
Date:,"Tue, 19 Nov 2024",Prob (F-statistic):,0.0
Time:,14:34:22,Log-Likelihood:,-505390.0
No. Observations:,57810,AIC:,1011000.0
Df Residuals:,57761,BIC:,1011000.0
Df Model:,48,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5347.0230,17.635,303.212,0.000,5312.459,5381.587
Idade,3.3053,0.571,5.792,0.000,2.187,4.424
Estado_Acre,235.3457,50.918,4.622,0.000,135.546,335.145
Estado_Alagoas,200.3390,35.302,5.675,0.000,131.148,269.530
Estado_Amapá,258.7586,67.032,3.860,0.000,127.376,390.141
Estado_Amazonas,214.7889,39.036,5.502,0.000,138.279,291.299
Estado_Bahia,175.1935,31.198,5.615,0.000,114.045,236.342
Estado_Ceará,144.3602,31.541,4.577,0.000,82.540,206.181
Estado_Distrito Federal,98.4357,41.894,2.350,0.019,16.324,180.547

0,1,2,3
Omnibus:,173076.701,Durbin-Watson:,1.994
Prob(Omnibus):,0.0,Jarque-Bera (JB):,28523475320.816
Skew:,43.367,Prob(JB):,0.0
Kurtosis:,3443.071,Cond. No.,1.91e+16


In [58]:
# Remove the 'Estado_Roraima' column from the previous predictor matrix,
# refit the regression on the reduced predictors and display the fit summary.
# The fit is stored under its own step number (14) so it no longer overwrites
# the step-13 result and stays paired with x_test14.
# NOTE(review): the recorded output for this step shows the same R-squared,
# Df Model and F-statistic as the previous step, which suggests the dropped
# column was linearly redundant with the remaining ones — confirm.
x_test14 = x_test13.drop(columns=['Estado_Roraima'])
regressao_teste14 = regress(y_test, x_test14)
regressao_teste14.summary()

0,1,2,3
Dep. Variable:,Rendimento_Bruto,R-squared:,0.849
Model:,OLS,Adj. R-squared:,0.848
Method:,Least Squares,F-statistic:,6745.0
Date:,"Tue, 19 Nov 2024",Prob (F-statistic):,0.0
Time:,14:35:11,Log-Likelihood:,-505390.0
No. Observations:,57810,AIC:,1011000.0
Df Residuals:,57761,BIC:,1011000.0
Df Model:,48,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5406.7688,37.307,144.925,0.000,5333.646,5479.891
Idade,3.3053,0.571,5.792,0.000,2.187,4.424
Estado_Acre,128.3011,79.224,1.619,0.105,-26.978,283.580
Estado_Alagoas,93.2945,69.478,1.343,0.179,-42.882,229.471
Estado_Amapá,151.7141,91.245,1.663,0.096,-27.127,330.555
Estado_Amazonas,107.7444,71.585,1.505,0.132,-32.562,248.051
Estado_Bahia,68.1489,67.354,1.012,0.312,-63.866,200.164
Estado_Ceará,37.3157,67.521,0.553,0.581,-95.026,169.657
Estado_Distrito Federal,-8.6088,73.339,-0.117,0.907,-152.353,135.135

0,1,2,3
Omnibus:,173076.701,Durbin-Watson:,1.994
Prob(Omnibus):,0.0,Jarque-Bera (JB):,28523475320.816
Skew:,43.367,Prob(JB):,0.0
Kurtosis:,3443.071,Cond. No.,3.53e+16


In [60]:
# Remove the 'Estado_Distrito Federal' column from the previous predictor matrix,
# refit the regression on the reduced predictors and display the fit summary.
x_test15 = x_test14.drop(labels=['Estado_Distrito Federal'], axis=1)
regressao_teste15 = regress(y_test, x_test15)
regressao_teste15.summary()

0,1,2,3
Dep. Variable:,Rendimento_Bruto,R-squared:,0.849
Model:,OLS,Adj. R-squared:,0.848
Method:,Least Squares,F-statistic:,6889.0
Date:,"Tue, 19 Nov 2024",Prob (F-statistic):,0.0
Time:,14:36:39,Log-Likelihood:,-505390.0
No. Observations:,57810,AIC:,1011000.0
Df Residuals:,57762,BIC:,1011000.0
Df Model:,47,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5403.5597,25.386,212.855,0.000,5353.803,5453.317
Idade,3.3050,0.571,5.791,0.000,2.186,4.424
Estado_Acre,133.9699,62.803,2.133,0.033,10.876,257.064
Estado_Alagoas,98.9535,50.030,1.978,0.048,0.895,197.012
Estado_Amapá,157.3848,77.407,2.033,0.042,5.666,309.103
Estado_Amazonas,113.4028,52.922,2.143,0.032,9.676,217.130
Estado_Bahia,73.8047,47.066,1.568,0.117,-18.444,166.054
Estado_Ceará,42.9737,47.284,0.909,0.363,-49.704,135.651
Estado_Espirito Santo,107.5620,47.518,2.264,0.024,14.426,200.698

0,1,2,3
Omnibus:,173076.904,Durbin-Watson:,1.994
Prob(Omnibus):,0.0,Jarque-Bera (JB):,28523695427.217
Skew:,43.368,Prob(JB):,0.0
Kurtosis:,3443.084,Cond. No.,4.2e+16


In [61]:
# Remove the 'Estado_Piaui' column from the previous predictor matrix,
# refit the regression on the reduced predictors and display the fit summary.
x_test16 = x_test15.drop(labels=['Estado_Piaui'], axis=1)
regressao_teste16 = regress(y_test, x_test16)
regressao_teste16.summary()

0,1,2,3
Dep. Variable:,Rendimento_Bruto,R-squared:,0.849
Model:,OLS,Adj. R-squared:,0.848
Method:,Least Squares,F-statistic:,7038.0
Date:,"Tue, 19 Nov 2024",Prob (F-statistic):,0.0
Time:,14:37:21,Log-Likelihood:,-505390.0
No. Observations:,57810,AIC:,1011000.0
Df Residuals:,57763,BIC:,1011000.0
Df Model:,46,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5410.1710,22.792,237.368,0.000,5365.498,5454.844
Idade,3.3083,0.571,5.797,0.000,2.190,4.427
Estado_Acre,121.4301,59.115,2.054,0.040,5.564,237.297
Estado_Alagoas,86.2514,45.185,1.909,0.056,-2.311,174.814
Estado_Amapá,144.8155,74.432,1.946,0.052,-1.072,290.703
Estado_Amazonas,100.7214,48.383,2.082,0.037,5.890,195.552
Estado_Bahia,61.0191,41.807,1.460,0.144,-20.923,142.961
Estado_Ceará,30.2043,42.067,0.718,0.473,-52.247,112.655
Estado_Espirito Santo,95.1533,42.635,2.232,0.026,11.588,178.718

0,1,2,3
Omnibus:,173078.577,Durbin-Watson:,1.994
Prob(Omnibus):,0.0,Jarque-Bera (JB):,28525399157.542
Skew:,43.369,Prob(JB):,0.0
Kurtosis:,3443.187,Cond. No.,3.05e+16


In [62]:
# Remove the 'Estado_Pará' column from the previous predictor matrix,
# refit the regression on the reduced predictors and display the fit summary.
x_test17 = x_test16.drop(labels=['Estado_Pará'], axis=1)
regressao_teste17 = regress(y_test, x_test17)
regressao_teste17.summary()

0,1,2,3
Dep. Variable:,Rendimento_Bruto,R-squared:,0.849
Model:,OLS,Adj. R-squared:,0.848
Method:,Least Squares,F-statistic:,7195.0
Date:,"Tue, 19 Nov 2024",Prob (F-statistic):,0.0
Time:,14:38:12,Log-Likelihood:,-505390.0
No. Observations:,57810,AIC:,1011000.0
Df Residuals:,57764,BIC:,1011000.0
Df Model:,45,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5413.9745,20.894,259.118,0.000,5373.022,5454.927
Idade,3.3073,0.571,5.796,0.000,2.189,4.426
Estado_Acre,114.2461,56.557,2.020,0.043,3.393,225.099
Estado_Alagoas,79.0143,41.730,1.893,0.058,-2.777,160.806
Estado_Amapá,137.6391,72.421,1.901,0.057,-4.307,279.585
Estado_Amazonas,93.4919,45.180,2.069,0.039,4.938,182.046
Estado_Bahia,53.7487,38.010,1.414,0.157,-20.752,128.249
Estado_Ceará,22.9501,38.314,0.599,0.549,-52.145,98.045
Estado_Espirito Santo,88.0030,39.047,2.254,0.024,11.471,164.535

0,1,2,3
Omnibus:,173079.681,Durbin-Watson:,1.994
Prob(Omnibus):,0.0,Jarque-Bera (JB):,28526463273.34
Skew:,43.369,Prob(JB):,0.0
Kurtosis:,3443.251,Cond. No.,3.8e+16


In [63]:
# Remove the 'Estado_Sergipe' column from the previous predictor matrix,
# refit the regression on the reduced predictors and display the fit summary.
x_test18 = x_test17.drop(labels=['Estado_Sergipe'], axis=1)
regressao_teste18 = regress(y_test, x_test18)
regressao_teste18.summary()

0,1,2,3
Dep. Variable:,Rendimento_Bruto,R-squared:,0.849
Model:,OLS,Adj. R-squared:,0.848
Method:,Least Squares,F-statistic:,7358.0
Date:,"Tue, 19 Nov 2024",Prob (F-statistic):,0.0
Time:,14:38:59,Log-Likelihood:,-505390.0
No. Observations:,57810,AIC:,1011000.0
Df Residuals:,57765,BIC:,1011000.0
Df Model:,44,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5416.4175,20.406,265.429,0.000,5376.421,5456.414
Idade,3.3084,0.571,5.798,0.000,2.190,4.427
Estado_Acre,109.5554,55.897,1.960,0.050,-0.003,219.114
Estado_Alagoas,74.2167,40.789,1.820,0.069,-5.730,154.163
Estado_Amapá,132.9329,71.903,1.849,0.064,-7.998,273.863
Estado_Amazonas,88.7399,44.329,2.002,0.045,1.855,175.625
Estado_Bahia,48.9108,36.957,1.323,0.186,-23.525,121.346
Estado_Ceará,18.1218,37.273,0.486,0.627,-54.933,91.177
Estado_Espirito Santo,83.3541,38.102,2.188,0.029,8.675,158.033

0,1,2,3
Omnibus:,173080.439,Durbin-Watson:,1.994
Prob(Omnibus):,0.0,Jarque-Bera (JB):,28527124728.674
Skew:,43.37,Prob(JB):,0.0
Kurtosis:,3443.291,Cond. No.,1.51e+16


In [64]:
# Remove the 'Estado_Ceará' column from the previous predictor matrix,
# refit the regression on the reduced predictors and display the fit summary.
x_test19 = x_test18.drop(labels=['Estado_Ceará'], axis=1)
regressao_teste19 = regress(y_test, x_test19)
regressao_teste19.summary()

0,1,2,3
Dep. Variable:,Rendimento_Bruto,R-squared:,0.849
Model:,OLS,Adj. R-squared:,0.848
Method:,Least Squares,F-statistic:,7530.0
Date:,"Tue, 19 Nov 2024",Prob (F-statistic):,0.0
Time:,14:39:58,Log-Likelihood:,-505400.0
No. Observations:,57810,AIC:,1011000.0
Df Residuals:,57766,BIC:,1011000.0
Df Model:,43,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5419.0218,19.691,275.209,0.000,5380.428,5457.615
Idade,3.3102,0.571,5.801,0.000,2.192,4.429
Estado_Acre,104.5588,54.944,1.903,0.057,-3.131,212.249
Estado_Alagoas,69.1307,39.424,1.753,0.080,-8.141,146.403
Estado_Amapá,127.9127,71.157,1.798,0.072,-11.556,267.382
Estado_Amazonas,83.6511,43.075,1.942,0.052,-0.777,168.079
Estado_Bahia,43.7633,35.408,1.236,0.216,-25.636,113.162
Estado_Espirito Santo,78.4130,36.721,2.135,0.033,6.440,150.386
Estado_Goiás,221.7526,36.694,6.043,0.000,149.832,293.673

0,1,2,3
Omnibus:,173081.343,Durbin-Watson:,1.994
Prob(Omnibus):,0.0,Jarque-Bera (JB):,28528069651.387
Skew:,43.371,Prob(JB):,0.0
Kurtosis:,3443.348,Cond. No.,1.12e+16


In [68]:
# Remove the 'Estado_Rio Grande do Norte' column from the previous predictor
# matrix, refit the regression on the reduced predictors and display the fit
# summary.
x_test20 = x_test19.drop(labels=['Estado_Rio Grande do Norte'], axis=1)
regressao_teste20 = regress(y_test, x_test20)
regressao_teste20.summary()

0,1,2,3
Dep. Variable:,Rendimento_Bruto,R-squared:,0.849
Model:,OLS,Adj. R-squared:,0.848
Method:,Least Squares,F-statistic:,7709.0
Date:,"Tue, 19 Nov 2024",Prob (F-statistic):,0.0
Time:,14:43:15,Log-Likelihood:,-505400.0
No. Observations:,57810,AIC:,1011000.0
Df Residuals:,57767,BIC:,1011000.0
Df Model:,42,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5420.5190,19.497,278.018,0.000,5382.305,5458.733
Idade,3.3134,0.571,5.807,0.000,2.195,4.432
Estado_Acre,101.5966,54.673,1.858,0.063,-5.562,208.755
Estado_Alagoas,66.1587,39.043,1.694,0.090,-10.367,142.684
Estado_Amapá,124.9520,70.948,1.761,0.078,-14.107,264.011
Estado_Amazonas,80.6902,42.730,1.888,0.059,-3.060,164.441
Estado_Bahia,40.7902,34.983,1.166,0.244,-27.776,109.356
Estado_Espirito Santo,75.4343,36.310,2.078,0.038,4.267,146.602
Estado_Goiás,218.7605,36.279,6.030,0.000,147.654,289.867

0,1,2,3
Omnibus:,173081.879,Durbin-Watson:,1.994
Prob(Omnibus):,0.0,Jarque-Bera (JB):,28528531765.391
Skew:,43.371,Prob(JB):,0.0
Kurtosis:,3443.376,Cond. No.,1.84e+16


In [69]:
# Remove the 'Estado_Rondônia' column from the previous predictor matrix,
# refit the regression on the reduced predictors and display the fit summary.
x_test21 = x_test20.drop(labels=['Estado_Rondônia'], axis=1)
regressao_teste21 = regress(y_test, x_test21)
regressao_teste21.summary()

0,1,2,3
Dep. Variable:,Rendimento_Bruto,R-squared:,0.849
Model:,OLS,Adj. R-squared:,0.848
Method:,Least Squares,F-statistic:,7897.0
Date:,"Tue, 19 Nov 2024",Prob (F-statistic):,0.0
Time:,14:43:16,Log-Likelihood:,-505400.0
No. Observations:,57810,AIC:,1011000.0
Df Residuals:,57768,BIC:,1011000.0
Df Model:,41,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5422.1668,19.292,281.059,0.000,5384.355,5459.979
Idade,3.3097,0.571,5.801,0.000,2.191,4.428
Estado_Acre,98.5407,54.422,1.811,0.070,-8.126,205.208
Estado_Alagoas,63.2580,38.726,1.633,0.102,-12.646,139.162
Estado_Amapá,121.9607,70.763,1.724,0.085,-16.735,260.657
Estado_Amazonas,77.7375,42.430,1.832,0.067,-5.425,160.900
Estado_Bahia,37.9534,34.644,1.096,0.273,-29.949,105.856
Estado_Espirito Santo,72.3257,35.918,2.014,0.044,1.927,142.725
Estado_Goiás,215.6674,35.890,6.009,0.000,145.322,286.013

0,1,2,3
Omnibus:,173081.341,Durbin-Watson:,1.994
Prob(Omnibus):,0.0,Jarque-Bera (JB):,28528082859.998
Skew:,43.371,Prob(JB):,0.0
Kurtosis:,3443.348,Cond. No.,1.99e+16


In [71]:
# Call the remove_maior helper with the latest fit and its predictor matrix;
# the two returned values are unpacked into x_test22 and perdeu.
# NOTE(review): the recorded run of this cell failed with
# "AttributeError: 'Summary' object has no attribute 'pvalues'", i.e. a
# summary object was asked for `.pvalues` somewhere in this call. Presumably
# remove_maior needs the fitted results object (which exposes `.pvalues`)
# rather than the output of `.summary()` — verify the helper's definition and
# that regressao_teste21 has not been reassigned to a summary in another cell.
x_test22, perdeu = remove_maior(regressao_teste21, x_test21)

AttributeError: 'Summary' object has no attribute 'pvalues'