In [1]:
# !pip install pandas numpy seaborn scipy pingouin openpyxl

In [2]:
import pandas as pd
import numpy as np
import seaborn.objects as so
import seaborn as sns
from scipy import stats
import math
import pingouin as pg

In [3]:
# Excel workbook holding the exercise data; every `pd.read_excel` call in this
# notebook reads one of its sheets.
xl_path = 'Lista de Exercicios - Complementaresxlsx Portugues.xlsx'

# Lista de Exercícios

## Exercício 1

In [4]:
def freq(df: pd.DataFrame, col: str, sort_index=False) -> pd.DataFrame:
    """Build a frequency table for one column of ``df``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing the column to tabulate.
    col : str
        Label of the column whose values are counted.
    sort_index : bool, default False
        When true, rows are ordered by the counted value (or bin); otherwise
        the ``value_counts`` order is kept.

    Returns
    -------
    pd.DataFrame
        One row per distinct value with the columns ``cnt`` (absolute count),
        ``relative_freq`` (percentage of the total), ``cumcnt`` (running
        count) and ``cum_relative_freq`` (running percentage).
    """
    counts = df.loc[:, col].value_counts()
    if sort_index:
        counts = counts.sort_index()

    table = counts.to_frame(name='cnt')
    total = table['cnt'].sum()
    table['relative_freq'] = table['cnt'].div(total).mul(100)
    # Cumulative columns follow the row order chosen above.
    table['cumcnt'] = table['cnt'].cumsum()
    table['cum_relative_freq'] = table['cumcnt'].div(total).mul(100)
    return table

In [5]:
# Frequency table of the 'Renda (R$)' column grouped into 2000-wide classes
# from 0 to 12000 (right-closed intervals, as produced by pd.cut).
(
    pd.read_excel(xl_path, sheet_name='Exercício 1', usecols=[0, 1], nrows=50)
    .assign(
        bins=lambda income_df: pd.cut(
            income_df['Renda (R$)'],
            bins=[0, 2000, 4000, 6000, 8000, 10000, 12000],
        ),
    )
    .pipe(freq, col='bins', sort_index=True)
)

Unnamed: 0_level_0,cnt,relative_freq,cumcnt,cum_relative_freq
bins,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"(0, 2000]",9,18.0,9,18.0
"(2000, 4000]",19,38.0,28,56.0
"(4000, 6000]",11,22.0,39,78.0
"(6000, 8000]",5,10.0,44,88.0
"(8000, 10000]",4,8.0,48,96.0
"(10000, 12000]",2,4.0,50,100.0


---

## Exercício 2

In [6]:
def describe_more(ser: pd.Series, **kwargs) -> pd.Series:
    """Extend ``Series.describe`` with additional summary statistics.

    Parameters
    ----------
    ser : pd.Series
        Numeric series to summarise.
    **kwargs
        Forwarded unchanged to ``ser.describe`` (e.g. ``percentiles``).

    Returns
    -------
    pd.Series
        The ``describe`` output followed by ``amplitude`` (max - min),
        ``var``, ``stderr`` (standard error of the mean), ``cov``
        (std / mean, in percent), ``mode`` (NaN unless the mode is unique),
        ``skew`` and ``kurtosis``.
    """
    modes = ser.mode().values
    extras = {
        'amplitude': ser.max() - ser.min(),
        'var': ser.var(),
        'stderr': ser.sem(),
        'cov': (ser.std() / ser.mean()) * 100,
        # Only report a mode when it is unique; otherwise leave it missing.
        'mode': modes if len(modes) == 1 else np.nan,
        'skew': ser.skew(),
        'kurtosis': ser.kurt(),
    }
    # Work on a one-row frame so the extra statistics can be appended as
    # columns, then flip back to a Series.
    one_row = ser.describe(**kwargs).to_frame().T
    widened = one_row.assign(**extras)
    return widened.T.iloc[:, 0]


In [7]:
# Exercise 2 data: first sheet column becomes the index, the next two columns
# are the series being compared.
exerc2_df = pd.read_excel(
    xl_path,
    sheet_name='Exercício 2',
    usecols=[0, 1, 2],
    nrows=24,
    index_col=0,
)

In [8]:
# Extended descriptive statistics for every column of exerc2_df.
# DataFrame.apply calls describe_more once per column and forwards the
# `percentiles` keyword, so no transpose/groupby round-trip is needed.
(
    exerc2_df
    .apply(describe_more, percentiles=[.25, .27, .64, .75, .8, .9])
    .round(4)
)

Unnamed: 0,Ação 1,Ação 2
count,23.0,23.0
mean,0.0352,0.1511
std,0.1292,0.2038
min,-0.2018,-0.149
25%,-0.0612,0.0204
27%,-0.0564,0.0208
50%,0.037,0.1248
64%,0.0763,0.1825
75%,0.116,0.2333
80%,0.126,0.2619


$ \displaystyle t=\frac{r}{\sqrt{(1-r^2)/(n-2)}} $

In [9]:
def ttest_corr(r, n):
    """Return the t statistic for a correlation coefficient.

    Computes ``r / sqrt((1 - r**2) / (n - 2))``.

    Parameters
    ----------
    r : correlation coefficient.
    n : number of paired observations.
    """
    unexplained = 1 - r**2
    dof = n - 2
    return r / np.sqrt(unexplained / dof)

In [10]:
# Significance test for the Pearson correlation between the two columns.
n_obs = exerc2_df.count().iloc[0]
# The correlation t-test has n - 2 degrees of freedom. Named `dof` (not
# `ddof`) so it cannot collide with the notebook's `ddof` helper function.
dof = n_obs - 2
r, pvalue = stats.pearsonr(*exerc2_df.T.to_numpy())
T = ttest_corr(r, n_obs)
critical05 = stats.t.isf(.05/2, dof)  # 2-tail, thus we divide by 2

In [11]:
# Report the correlation, its t statistic and p-value, and the two-tailed
# critical t value at the 5% level.
print(f'''r: {r:.4f}
T: {T:.4f}
pvalue: {pvalue:.4f}
Critical (.05): {critical05:.4f}
''')

r: 0.2908
T: 1.3929
pvalue: 0.1782
Critical (.05): 2.0796



---

## Exercício 3

In [12]:
# Frozen binomial distribution: n=10 trials, success probability 1/6 per trial.
pdist = stats.binom(n=10,p=1/6)

a) P(k=4)

In [13]:
# Point probability P(k = 4).
# NOTE(review): the name reads like "k <= 4", but pmf gives P(k == 4) only.
kle4 = pdist.pmf(k=4)

In [14]:
# Show the probability as a percentage with four decimals.
print(f'{kle4*100:.4f}%')

5.4266%


b) P(k>=7)

In [15]:
# sf(6) is P(k > 6), which equals P(k >= 7) because k is integer-valued.
kge7 = pdist.sf(6)  # ge7 == gt6

In [16]:
# Show the probability as a percentage with four decimals.
print(f'{kge7*100:.4f}%')

0.0268%


---

## Exercício 4

P(k=8 | n=12), onde k é o número de falhas e n o número de sucessos (com probabilidade de sucesso p = 3/5)

In [17]:
# Negative-binomial PMF with k=8 failures, n=12 successes and success
# probability p=3/5, shown as a percentage.
print(f'{stats.nbinom.pmf(k=8,n=12,p=3/5)*100:.4f}%')

10.7823%


---

## Exercício 5

Queremos saber a probabilidade de que 28 sejam tratados na próxima semana, dado que 3 são tratados diariamente. Isso significa que o nosso intervalo de tempo de interesse é de 7 dias, ou seja, queremos a probabilidade para a semana inteira, e não para apenas um dia; por isso, a taxa esperada passa a ser $ \mu = 3 \times 7 = 21 $. A probabilidade de um evento ocorrer durante a semana inteira é intuitivamente menor do que em apenas um dia, dadas as mesmas proporções relativas (um aumento de 1/3 na demanda).

In [18]:
# Poisson PMF of exactly k=28 events when the expected count is mu = 3*7 = 21
# (daily rate of 3 scaled to a 7-day window), shown as a percentage.
print(f'{stats.poisson.pmf(k=28,mu=3*7)*100:.4f}%')

2.6171%


---

## Exercício 6

In [19]:
# Frozen normal distribution with mean 26.5 and standard deviation 4.
dist = stats.norm(loc=26.5, scale=4)  # `loc` is mu and `scale` is sigma

a) P(x>37)

In [20]:
# Upper-tail probability P(x > 37) via the survival function.
print(f'{dist.sf(37):.4f}')

0.0043


b) P(x<20)

In [21]:
# Lower-tail probability P(x < 20) via the CDF.
print(f'{dist.cdf(20):.4f}')

0.0521


c) P(22<x<28)

In [22]:
# P(22 < x < 28) as the difference between the CDF at the two bounds.
P = dist.cdf(28)-dist.cdf(22)

In [23]:
# Show the interval probability with four decimals.
print(f'{P:.4f}')

0.5159


## Exercício 7

In [24]:
# Exercise 7 data: a single row of observed counts, transposed so that each
# category becomes a row and the counts live in the `f_obs` column.
exerc7_df = (
    pd.read_excel(xl_path, sheet_name='Exercício 7', usecols=[1, 2, 3], nrows=1)
    .transpose()
    .rename(columns={0: 'f_obs'})
)
exerc7_df

Unnamed: 0,f_obs
Livro A,29
Livro B,15
Livro C,16


In [25]:
# Chi-square goodness-of-fit test against equal expected frequencies.
alpha = .05
# Degrees of freedom = number of categories - 1. Named `dof` (not `ddof`) so
# it cannot collide with the notebook's `ddof` helper function.
dof = exerc7_df.f_obs.size - 1
# stats.chisquare assumes same frequency for every category if f_exp is not passed
chi2val, pvalue = stats.chisquare(exerc7_df.f_obs)
critical05 = stats.chi2.isf(alpha, dof)

In [26]:
# Report the chi-square statistic, its p-value and the 5% critical value.
print(f'''χ²: {chi2val:.3f}
pvalue: {pvalue:.4f}
Critical (.05): {critical05:.3f}
''')

χ²: 6.100
pvalue: 0.0474
Critical (.05): 5.991



---

## Exercício 8

In [27]:
def ddof(df: pd.DataFrame) -> pd.Series:
    """Return, per column, the number of non-null observations minus one."""
    non_null = df.count()
    return non_null - 1

F-test of equality of variances

$ F={\frac {S_{X}^{2}}{S_{Y}^{2}}} $, onde $ S^2 $ é a variância amostral

In [28]:
def ftest_eqvar(df: pd.DataFrame, alpha: float) -> tuple[float, float, float]:
    """Upper-tail F-test comparing the sample variances of two columns.

    The statistic is ``var(first column) / var(second column)``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with exactly two numeric columns (numerator sample first).
    alpha : float
        Significance level used for the critical value.

    Returns
    -------
    tuple[float, float, float]
        ``(F statistic, upper-tail p-value P(F >= statistic), critical value
        at alpha)``.

    Raises
    ------
    ValueError
        If ``df`` does not have exactly two columns.
    """
    if df.shape[1] != 2:
        raise ValueError(
            f'ftest_eqvar expects exactly two columns, got {df.shape[1]}'
        )
    # Degrees of freedom (non-null count - 1 per column) are computed inline
    # so the function does not depend on any notebook-level name.
    dfn, dfd = df.count() - 1
    varx, vary = df.var()
    fvar = varx / vary
    pvalue = stats.f.sf(fvar, dfn, dfd)
    crit = stats.f.isf(alpha, dfn, dfd)
    return fvar, pvalue, crit

In [29]:
# Exercise 8 data: the first two sheet columns, one sample per column.
exerc8_df = pd.read_excel(
    xl_path,
    sheet_name='Exercício 8',
    usecols=[0, 1],
    nrows=14,
)

In [30]:
# Per-column summary; the `ddof` helper is passed as a custom aggregation
# alongside the built-in mean, variance and count.
exerc8_df.agg(['mean','var','count',ddof])

Unnamed: 0,Local A (mm),Local B (mm)
mean,6.285714,6.85
var,11.604396,4.202692
count,14.0,14.0
ddof,13.0,13.0


In [31]:
# F-test of equal variances between the two columns at the 5% level.
fvar, pvalue, critical05 = ftest_eqvar(exerc8_df, alpha=.05)

In [32]:
# Report the variance ratio, its p-value and the 5% critical value.
print(f'''F var: {fvar:.3f}
pvalue: {pvalue:.5f}
Critical (.05): {critical05:.3f}
''')

F var: 2.761
pvalue: 0.03917
Critical (.05): 2.577



---

## Exercício 9

In [33]:
# Exercise 9 data: the first two sheet columns, one sample per column.
exerc9_df = pd.read_excel(
    xl_path,
    sheet_name='Exercício 9',
    usecols=[0, 1],
    nrows=20,
)

In [34]:
# Per-column summary; the `ddof` helper is passed as a custom aggregation
# alongside the built-in mean, variance and count.
exerc9_df.agg(['mean','var','count',ddof])

Unnamed: 0,Hospital 1,Hospital 2
mean,81.5,57.8
var,137.736842,96.8
count,20.0,20.0
ddof,19.0,19.0


In [35]:
# F-test of equal variances between the two columns at the 1% level.
fvar, pvalue, critical01 = ftest_eqvar(exerc9_df, alpha=.01)

In [36]:
# Report the variance ratio, its p-value and the 1% critical value.
print(f'''F var: {fvar:.3f}
pvalue: {pvalue:.3f}
Critical (.01): {critical01:.3f}
''')

F var: 1.423
pvalue: 0.225
Critical (.01): 3.027



In [37]:
# Two-sample t-test via pingouin with a 99% confidence interval. The extra
# `critical01` column is the two-tailed critical t value at alpha = 0.01 for
# the degrees of freedom reported by pingouin.
(
    pg.ttest(*exerc9_df.T.to_numpy(),confidence=.99)
    .assign(
        critical01=lambda df: stats.t.isf(.01/2, df.dof),
    )
)

Unnamed: 0,T,dof,alternative,p-val,CI99%,cohen-d,BF10,power,critical01
T-test,6.920824,38,two-sided,3.147828e-08,"[14.414412784845942, 32.98558721515406]",2.188557,293000.0,0.999999,2.711558


In [38]:
# Same comparison using SciPy's independent two-sample t-test (default options).
T, pvalue = stats.ttest_ind(*exerc9_df.T.to_numpy())

In [39]:
# Two-tailed critical t at alpha = 0.01. The pooled two-sample test has
# n1 + n2 - 2 degrees of freedom, derived from the data instead of hard-coded.
critical01 = stats.t.isf(.01/2, exerc9_df.count().sum() - 2)

In [40]:
# 99% t confidence interval for the mean of 'Hospital 1'; the degrees of
# freedom (n - 1) come from the sample size instead of a hard-coded value.
stats.t.interval(
    confidence=.99,
    df=exerc9_df['Hospital 1'].count() - 1,
    loc=exerc9_df['Hospital 1'].mean(),
    scale=exerc9_df['Hospital 1'].sem(),
)

(73.99210839145069, 89.00789160854931)

In [41]:
# 99% t confidence interval for the mean of 'Hospital 2'; the degrees of
# freedom (n - 1) come from the sample size instead of a hard-coded value.
stats.t.interval(
    confidence=.99,
    df=exerc9_df['Hospital 2'].count() - 1,
    loc=exerc9_df['Hospital 2'].mean(),
    scale=exerc9_df['Hospital 2'].sem(),
)

(51.50594386581019, 64.09405613418981)

---

## Exercício 10

$ \displaystyle t = \frac{\bar{x} - \mu _{0}}{s/\sqrt{n}} $

In [42]:
# One-sample t statistic, (x_bar - mu_0) / (s / sqrt(n)), evaluated with
# x_bar = 65, mu_0 = 60, s = 3.5 and n = 36.
T = (65 - 60) / (3.5 / math.sqrt(36))

In [43]:
alpha = .05
# n - 1 degrees of freedom. Named `dof` (not `ddof`) so this cell does not
# overwrite the notebook's `ddof` helper function.
dof = 36 - 1
# 2-tail, so the tail area is doubled; abs() keeps this valid for negative T.
pvalue = stats.t.sf(abs(T), dof) * 2
critical05 = stats.t.isf(alpha / 2, dof)

In [44]:
# Report the t statistic, its two-tailed p-value and the 5% critical value.
print(f'''T: {T:.3f}
pvalue: {pvalue:.3f}
Critical (.05): {critical05:.3f}
''')

T: 8.571
pvalue: 0.000
Critical (.05): 2.030



---

## Exercício 11

$
\displaystyle Z = \frac{\bar{x} - \mu_{0}}{\sigma/\sqrt{n}}
$

In [45]:
# z statistic, (x_bar - mu) / (sigma / sqrt(n)), evaluated with
# x_bar = 8.25, mu = 8, sigma = 1 and n = 85.
Z = (8.25 - 8) / (1 / math.sqrt(85))

In [46]:
# One-tailed (upper-tail) z-test at the 5% level.
alpha = .05
pvalue = stats.norm.sf(Z)  # single-tail: upper-tail area beyond Z
critical05 = stats.norm.isf(alpha)  # one-tailed, so alpha is not halved

In [47]:
# Report the z statistic, its one-tailed p-value and the 5% critical value.
print(f'''Z: {Z:.3f}
pvalue: {pvalue:.3f}
Critical (.05): {critical05:.3f}
''')

Z: 2.305
pvalue: 0.011
Critical (.05): 1.645



---

## Exercício 12

$ \displaystyle (\bar{x}-\mu) = Z \times \frac{\sigma}{\sqrt{n}} $

In [48]:
# 95% confidence interval for the mean: 226 +/- z * sigma / sqrt(n),
# with sigma = 10 and n = 425.
alpha = .05
critical05 = stats.norm.isf(alpha / 2)  # two-tailed, so alpha is split across both tails
meansubmu = critical05 * (10 / math.sqrt(425))  # margin of error
estimated_lower, estimated_upper = 226 - meansubmu, 226 + meansubmu

In [49]:
# Report the two-tailed critical z value and the interval bounds.
print(f'''Critical Z (.05): {critical05:.3f}
Estimated upper: {estimated_upper:.2f}
Estimated lower: {estimated_lower:.2f}
''')

Critical Z (.05): 1.960
Estimated upper: 226.95
Estimated lower: 225.05



In [50]:
# Cross-check: SciPy's normal interval at confidence 1 - alpha, using the same
# centre (226) and standard error (10 / sqrt(425)).
stats.norm.interval(1-alpha, loc=226, scale=(10/math.sqrt(425)))

(225.04927781992174, 226.95072218007826)