# Estatística para Machine Learning

## 4. Medidas de posição e dispersão

### Base de dados

In [1]:
import numpy as np
import statistics
from scipy import stats
import math

In [2]:
dados = np.array([150, 151, 152, 152, 153, 154, 155, 155, 155, 155, 156, 156, 156,
                  157, 158, 158, 160, 160, 160, 160, 160, 161, 161, 161, 161, 162,
                  163, 163, 164, 164, 164, 165, 166, 167, 168, 168, 169, 170, 172,
                  173])

### Média aritmética simples

In [3]:
dados.sum() / len(dados)

160.375

In [4]:
dados.mean()

160.375

In [5]:
statistics.mean(dados)

160

### Moda

In [6]:
statistics.mode(dados)

160

In [7]:
stats.mode(dados)

ModeResult(mode=array([160]), count=array([5]))

### Mediana

In [8]:
dados_impar = [150, 151, 152, 152, 153, 154, 155, 155, 155]

#### Cálculo manual (ímpar)

In [9]:
posicao = len(dados_impar) / 2 
posicao

4.5

In [10]:
posicao = math.ceil(posicao)
posicao

5

In [11]:
dados_impar[posicao - 1]

153

#### Cálculo manual (par)

In [12]:
posicao = len(dados) // 2
posicao

20

In [13]:
dados[posicao - 1], dados[posicao]

(160, 160)

In [14]:
mediana = (dados[posicao - 1] + dados[posicao]) / 2
mediana

160.0

#### Bibliotecas

In [15]:
np.median(dados_impar)

153.0

In [16]:
np.median(dados)

160.0

In [17]:
statistics.median(dados_impar)

153

In [18]:
statistics.median(dados)

160.0

### Média aritmética ponderada

In [19]:
notas = np.array([9, 8, 7, 3])
pesos = np.array([1, 2, 3, 4])

In [20]:
(9 * 1 + 8 * 2 + 7 * 3 + 3 * 4) / (1 + 2 + 3 + 4)

5.8

In [21]:
media_ponderada = (notas * pesos).sum() / pesos.sum()
media_ponderada

5.8

In [22]:
np.average(notas, weights=pesos)

5.8

### Média aritmética, moda e mediana com distribuição de frequência (dados agrupados)

In [23]:
dados = {'inferior': [150, 154, 158, 162, 166, 170],
         'superior': [154, 158, 162, 166, 170, 174],
         'fi': [5, 9, 11, 7, 5, 3]}

In [24]:
import pandas as pd
dataset = pd.DataFrame(dados)
dataset

Unnamed: 0,inferior,superior,fi
0,150,154,5
1,154,158,9
2,158,162,11
3,162,166,7
4,166,170,5
5,170,174,3


In [25]:
dataset['xi'] = (dataset['superior'] + dataset['inferior']) / 2
dataset

Unnamed: 0,inferior,superior,fi,xi
0,150,154,5,152.0
1,154,158,9,156.0
2,158,162,11,160.0
3,162,166,7,164.0
4,166,170,5,168.0
5,170,174,3,172.0


In [26]:
dataset['fi.xi'] = dataset['fi'] * dataset['xi']
dataset

Unnamed: 0,inferior,superior,fi,xi,fi.xi
0,150,154,5,152.0,760.0
1,154,158,9,156.0,1404.0
2,158,162,11,160.0,1760.0
3,162,166,7,164.0,1148.0
4,166,170,5,168.0,840.0
5,170,174,3,172.0,516.0


In [27]:
dataset['Fi'] = 0
dataset

Unnamed: 0,inferior,superior,fi,xi,fi.xi,Fi
0,150,154,5,152.0,760.0,0
1,154,158,9,156.0,1404.0,0
2,158,162,11,160.0,1760.0,0
3,162,166,7,164.0,1148.0,0
4,166,170,5,168.0,840.0,0
5,170,174,3,172.0,516.0,0


In [28]:
frequencia_acumulada = []
somatorio = 0
for linha in dataset.iterrows():
  #print(linha[1])
  #print(linha[1][2])
  somatorio += linha[1][2]
  frequencia_acumulada.append(somatorio)

In [29]:
frequencia_acumulada

[5.0, 14.0, 25.0, 32.0, 37.0, 40.0]

In [30]:
dataset['Fi'] = frequencia_acumulada
dataset

Unnamed: 0,inferior,superior,fi,xi,fi.xi,Fi
0,150,154,5,152.0,760.0,5.0
1,154,158,9,156.0,1404.0,14.0
2,158,162,11,160.0,1760.0,25.0
3,162,166,7,164.0,1148.0,32.0
4,166,170,5,168.0,840.0,37.0
5,170,174,3,172.0,516.0,40.0


#### Média

In [31]:
dataset['fi'].sum(), dataset['fi.xi'].sum()

(40, 6428.0)

In [32]:
dataset['fi.xi'].sum() / dataset['fi'].sum()

160.7

#### Moda

In [33]:
dataset['fi'].max()

11

In [34]:
dataset[dataset['fi'] == dataset['fi'].max()]

Unnamed: 0,inferior,superior,fi,xi,fi.xi,Fi
2,158,162,11,160.0,1760.0,25.0


In [35]:
dataset[dataset['fi'] == dataset['fi'].max()]['xi'].values[0]

160.0

#### Mediana

In [36]:
dataset

Unnamed: 0,inferior,superior,fi,xi,fi.xi,Fi
0,150,154,5,152.0,760.0,5.0
1,154,158,9,156.0,1404.0,14.0
2,158,162,11,160.0,1760.0,25.0
3,162,166,7,164.0,1148.0,32.0
4,166,170,5,168.0,840.0,37.0
5,170,174,3,172.0,516.0,40.0


In [37]:
fi_2 = dataset['fi'].sum() / 2
fi_2

20.0

In [38]:
limite_inferior, frequencia_classe, id_frequencia_anterior = 0, 0, 0
for linha in dataset.iterrows():
  #print(linha)
  limite_inferior = linha[1][0]
  frequencia_classe = linha[1][2]
  id_frequencia_anterior = linha[0]
  if linha[1][5] >= fi_2:
    id_frequencia_anterior -= 1
    break

In [39]:
limite_inferior, frequencia_classe, id_frequencia_anterior

(158.0, 11.0, 1)

In [40]:
Fi_anterior = dataset.iloc[[id_frequencia_anterior]]['Fi'].values[0]
Fi_anterior

14.0

In [41]:
mediana = limite_inferior + ((fi_2 - Fi_anterior) * 4) / frequencia_classe
mediana

160.1818181818182

#### Função completa

In [42]:
def get_estatisticas(dataframe):
  media = dataset['fi.xi'].sum() / dataset['fi'].sum()
  moda = dataset[dataset['fi'] == dataset['fi'].max()]['xi'].values[0]

  fi_2 = dataset['fi'].sum() / 2
  limite_inferior, frequencia_classe, id_frequencia_anterior = 0, 0, 0
  for i, linha in enumerate(dataset.iterrows()):
    limite_inferior = linha[1][0]
    frequencia_classe = linha[1][2]
    id_frequencia_anterior = linha[0]
    if linha[1][5] >= fi_2:
      id_frequencia_anterior -= 1
      break
  Fi_anterior = dataset.iloc[[id_frequencia_anterior]]['Fi'].values[0]
  mediana = limite_inferior + ((fi_2 - Fi_anterior) * 4) / frequencia_classe

In [43]:
get_estatisticas(dataset)

### Média geométrica, harmônica e quadrática

#### Média geométrica

In [44]:
from scipy.stats.mstats import gmean

In [45]:
gmean(dados)

TypeError: loop of ufunc does not support argument 0 of type dict which has no callable log method

#### Média harmônica

In [46]:
from scipy.stats.mstats import hmean

In [47]:
hmean(dados)

TypeError: '>=' not supported between instances of 'dict' and 'int'

#### Média quadrática

In [48]:
def quadratic_mean(dados):
  return math.sqrt(sum(n * n for n in dados) / len(dados))

In [49]:
quadratic_mean(dados)

TypeError: can't multiply sequence by non-int of type 'str'

### Quartis

In [50]:
dados_impar = [150, 151, 152, 152, 153, 154, 155, 155, 155]

#### Cálculo manual

In [51]:
np.median(dados_impar)

153.0

In [52]:
posicao_mediana = math.floor(len(dados_impar) / 2)
posicao_mediana

4

In [53]:
esquerda = dados_impar[0:posicao_mediana]
esquerda

[150, 151, 152, 152]

In [54]:
np.median(esquerda)

151.5

In [55]:
direita = dados_impar[posicao_mediana + 1:]
direita

[154, 155, 155, 155]

In [56]:
np.median(direita)

155.0

#### Bibliotecas

#### Numpy

In [57]:
np.quantile(dados_impar, 0.5)

153.0

In [58]:
np.quantile(dados_impar, 0.75)

155.0

In [59]:
np.quantile(dados_impar, 0.25)

152.0

In [60]:
esquerda2 = dados_impar[0:posicao_mediana + 1]
esquerda2

[150, 151, 152, 152, 153]

In [61]:
np.median(esquerda2)

152.0

In [62]:
np.quantile(dados, 0.25), np.quantile(dados, 0.50), np.quantile(dados, 0.75)

TypeError: unsupported operand type(s) for -: 'dict' and 'dict'

#### Scipy

In [63]:
stats.scoreatpercentile(dados, 25), stats.scoreatpercentile(dados, 50), stats.scoreatpercentile(dados, 75)

TypeError: unsupported operand type(s) for *: 'dict' and 'int'

#### Pandas

In [64]:
import pandas as pd
dataset = pd.DataFrame(dados)
dataset.head()

Unnamed: 0,inferior,superior,fi
0,150,154,5
1,154,158,9
2,158,162,11
3,162,166,7
4,166,170,5


In [65]:
dataset.quantile([0.25, 0.5, 0.75])

Unnamed: 0,inferior,superior,fi
0.25,155.0,159.0,5.0
0.5,160.0,164.0,6.0
0.75,165.0,169.0,8.5


In [66]:
dataset.describe()

Unnamed: 0,inferior,superior,fi
count,6.0,6.0,6.0
mean,160.0,164.0,6.666667
std,7.483315,7.483315,2.94392
min,150.0,154.0,3.0
25%,155.0,159.0,5.0
50%,160.0,164.0,6.0
75%,165.0,169.0,8.5
max,170.0,174.0,11.0


### Quartis com distribuição de frequência

In [74]:
def get_quartil(dataframe, q1 = True):
  if q1 == True:
    fi_4 = dataset['fi'].sum() / 4
  else:
    fi_4 = (3 * dataset['fi'].sum()) / 4
  
  limite_inferior, frequencia_classe, id_frequencia_anterior = 0, 0, 0
  for linha in dataset.iterrows():
    limite_inferior = linha[1][0]
    frequencia_classe = linha[1][2]
    id_frequencia_anterior = linha[0]
    if linha[1][5] >= fi_4:
      id_frequencia_anterior -= 1
      break
  Fi_anterior = dataset.iloc[[id_frequencia_anterior]]['Fi'].values[0]
  q = limite_inferior + ((fi_4 - Fi_anterior) * 4) / frequencia_classe

  return q

In [75]:
dataset

Unnamed: 0,inferior,superior,fi
0,150,154,5
1,154,158,9
2,158,162,11
3,162,166,7
4,166,170,5
5,170,174,3


In [76]:
get_quartil(dados), get_quartil(dados, q1 = False)

IndexError: index 5 is out of bounds for axis 0 with size 3

#### Percentis

In [77]:
np.median(dados)

TypeError: unsupported operand type(s) for /: 'dict' and 'int'

In [78]:
np.quantile(dados, 0.5)

TypeError: unsupported operand type(s) for -: 'dict' and 'dict'

In [79]:
np.percentile(dados, 50)

TypeError: unsupported operand type(s) for -: 'dict' and 'dict'

In [80]:
np.percentile(dados, 5), np.percentile(dados, 10), np.percentile(dados, 90)

TypeError: unsupported operand type(s) for -: 'dict' and 'dict'

In [81]:
stats.scoreatpercentile(dados, 5), stats.scoreatpercentile(dados, 10), stats.scoreatpercentile(dados, 90)

TypeError: unsupported operand type(s) for *: 'dict' and 'int'

In [82]:
import pandas as pd
dataset = pd.DataFrame(dados)
dataset.head()

Unnamed: 0,inferior,superior,fi
0,150,154,5
1,154,158,9
2,158,162,11
3,162,166,7
4,166,170,5


In [83]:
dataset.quantile([0.05, 0.10, 0.90])

Unnamed: 0,inferior,superior,fi
0.05,151.0,155.0,3.5
0.1,152.0,156.0,4.0
0.9,168.0,172.0,10.0


### Exercício

In [84]:
dataset = pd.read_csv('census.csv')

In [85]:
dataset.head()

Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [86]:
dataset['age'].mean()

38.58164675532078

In [87]:
stats.hmean(dataset['age'])

33.91874139089839

In [88]:
from scipy.stats.mstats import gmean
gmean(dataset['age'])

36.210879158177256

In [89]:
quadratic_mean(dataset['age'])

40.9218664329987

In [90]:
dataset['age'].median()

37.0

In [91]:
statistics.mode(dataset['age'])

36

### Medidas de dispersão

#### Amplitude total e diferença interquartil

In [92]:
dados

{'inferior': [150, 154, 158, 162, 166, 170],
 'superior': [154, 158, 162, 166, 170, 174],
 'fi': [5, 9, 11, 7, 5, 3]}

In [93]:
dados.max() - dados.min()

AttributeError: 'dict' object has no attribute 'max'

In [94]:
q1 = np.quantile(dados, 0.25)
q3 = np.quantile(dados, 0.75)
q1, q3

TypeError: unsupported operand type(s) for -: 'dict' and 'dict'

In [95]:
diferenca_interquartil = q3 - q1
diferenca_interquartil

NameError: name 'q3' is not defined

In [96]:
inferior = q1 - (1.5 * diferenca_interquartil)
inferior

NameError: name 'q1' is not defined

In [97]:
superior = q3 + (1.5 * diferenca_interquartil)
superior

NameError: name 'q3' is not defined

#### Variância, desvio padrão e coeficiente de variação

In [98]:
dados_impar = np.array([150, 151, 152, 152, 153, 154, 155, 155, 155])

##### Cálculo Manual

In [99]:
media = dados_impar.sum() / len(dados_impar)
media

153.0

In [100]:
desvio = abs(dados_impar - media)
desvio

array([3., 2., 1., 1., 0., 1., 2., 2., 2.])

In [101]:
desvio = desvio ** 2
desvio

array([9., 4., 1., 1., 0., 1., 4., 4., 4.])

In [102]:
soma_desvio = desvio.sum()
soma_desvio

28.0

In [103]:
v = soma_desvio / len(dados_impar)
v

3.111111111111111

In [104]:
dp = math.sqrt(v)
dp

1.7638342073763937

In [105]:
cv = (dp / media) * 100
cv

1.1528328152786886

In [106]:
def get_variancia_desvio_padrao_coeficiente(dataset):
  media = dataset.sum() / len(dataset)
  desvio = abs(dados_impar - media)
  desvio = desvio ** 2
  soma_desvio = desvio.sum()
  variancia = soma_desvio / len(dados_impar)
  dp = math.sqrt(variancia)
  return variancia, dp, (dp / media) * 100

In [107]:
get_variancia_desvio_padrao_coeficiente(dados_impar)

(3.111111111111111, 1.7638342073763937, 1.1528328152786886)