# TP2 - HMLA408: Joseph Salmon

## Question: importer la fonction chisquare

In [39]:
from scipy.stats import chisquare, norm, shapiro, poisson, chi2, chi2_contingency
chisquare?

## Question: cobayes

In [2]:
import numpy as np
cobaye = np.array([33, 13, 15, 3])
mendel = np.array([9/16, 3/16, 3/16, 1/16])

In [3]:
a, b = chisquare(cobaye, f_exp=mendel * np.sum(cobaye))
print(a, b)

1.3333333333333333 0.7212333746277604


In [4]:
from scipy.stats import chi2
np.testing.assert_almost_equal(1-chi2.cdf(a, 3), b) # exemple testing that two quantities are numerically equal

## Question: importer pandas

In [5]:
import pandas as pd

In [6]:
echantillon_gaussien = np.random.randn(50)
bins = np.array([-np.inf, -2, -1, 0, 1, 2, np.inf])
print(echantillon_gaussien)

[-8.45353363e-02 -1.20453068e+00  4.67508562e-01 -8.56165014e-01
  1.75023856e-01 -4.71218710e-01  1.66297979e+00 -2.84143870e-01
  3.02714884e+00  1.10995719e-01  4.02821056e-01 -1.52331744e+00
 -1.32487561e+00 -6.63346037e-01  6.28787703e-01 -8.05227229e-02
 -1.20109924e+00  7.85654741e-01  4.16634404e-01  1.11943312e+00
 -1.23979086e+00  2.50002571e-02 -4.16023466e-01  2.28697882e-03
  4.44423879e-01  5.79095777e-01 -3.81399280e-01 -2.60473797e-01
 -2.10280990e-01  4.06500734e-01  1.27071130e+00 -7.21214054e-01
 -9.85972533e-01  1.97904153e+00  3.61205496e-01 -6.10407210e-01
  5.85580433e-01 -4.98272396e-02  1.03327921e+00  1.19576268e+00
 -6.02099318e-01  1.15253577e+00 -1.78012401e+00  3.16170906e-01
  4.03232190e-01 -8.11766472e-01  1.81017410e+00 -3.59488084e-01
  4.05680695e-01  5.12445916e-01]


In [7]:
echantillon_regroupe = pd.cut(echantillon_gaussien,bins=bins)  # jamais d'accent dans les noms de variables
echantillon_regroupe[22]

Interval(-1.0, 0.0, closed='right')

In [8]:
echantillon_regroupe.value_counts()

(-inf, -2.0]     0
(-2.0, -1.0]     6
(-1.0, 0.0]     17
(0.0, 1.0]      18
(1.0, 2.0]       8
(2.0, inf]       1
dtype: int64

In [9]:
pd.value_counts(echantillon_regroupe)

(0.0, 1.0]      18
(-1.0, 0.0]     17
(1.0, 2.0]       8
(-2.0, -1.0]     6
(2.0, inf]       1
(-inf, -2.0]     0
dtype: int64

In [10]:
effectifs_classes = echantillon_regroupe.value_counts()

## Question: fonction de répartition


In [11]:
norm.cdf(0)

0.5

In [12]:
norm.cdf(1.96)

0.9750021048517795

In [13]:
p1 = norm.cdf(bins[1])
p2 = norm.cdf(bins[2]) - norm.cdf(bins[1])
p3 = norm.cdf(bins[3]) - norm.cdf(bins[2])
p4 = norm.cdf(bins[4]) - norm.cdf(bins[3])
p5 = norm.cdf(bins[5]) - norm.cdf(bins[4])
p6 = 1 - norm.cdf(bins[5])
probas = np.array([p1, p2, p3, p4, p5, p6])
print(probas)
np.diff(norm.cdf(bins))

[0.02275013 0.13590512 0.34134475 0.34134475 0.13590512 0.02275013]


array([0.02275013, 0.13590512, 0.34134475, 0.34134475, 0.13590512,
       0.02275013])

## Question: test du $\chi^2$

In [14]:
np.diff(norm.cdf(bins)) * echantillon_regroupe.shape[0]

array([ 1.1375066,  6.7952561, 17.0672373, 17.0672373,  6.7952561,
        1.1375066])

In [15]:
chisquare(effectifs_classes, np.diff(norm.cdf(bins)) * echantillon_regroupe.shape[0])
# on ne rejette pas l'hypothèse aux seuil 1, 5 10 %...pvalue=0.14

Power_divergenceResult(statistic=1.5120324426079348, pvalue=0.9116771108453586)

# Question: reprise du test avec d'autres intervalles (plus larges effectifs)

In [16]:
echantillon_gaussien = np.random.randn(100)
bins = np.array([-np.inf, -1.5, -0.5, 0.5, 1.5, np.inf])
echantillon_regroupe = pd.cut(echantillon_gaussien,bins=bins)  # jamais d'accent dans les noms de variables
effectifs_classes = echantillon_regroupe.value_counts()
chisquare(effectifs_classes, np.diff(norm.cdf(bins)) * echantillon_regroupe.shape[0])

Power_divergenceResult(statistic=6.9032883209059275, pvalue=0.1410881145003066)

# Question: Shapiro-Wilk
La loi de la statistique de test sous H0: "la distribution de l'échantillon est gaussienne"
est calculable par Python et on obtient directement la statistique associée et la p-value.

In [17]:
shapiro(echantillon_gaussien)
# ici par exemple p-value 0.68, donc on ne rejette pas H0 (ce qui est attendu vu que nos données sont bien Gaussiennes) 

(0.9854817390441895, 0.3438335061073303)

In [18]:
echantillon_exponential = np.random.exponential(1,100)
shapiro(echantillon_exponential)
# Ici rejet clair!

(0.849018931388855, 1.0911315406758604e-08)

## Question: Download

In [19]:
from download import download
url = "http://josephsalmon.eu/enseignement/datasets/hcmv.data"
# url = "http://www.stat.berkeley.edu/users/statlabs/data/hcmv.data" # backup url, without header.
path_target = "./hcmv.data"
download(url, path_target, replace=False)

Replace is False and data exists, so doing nothing. Use replace==True to re-download the data.


'./hcmv.data'

In [20]:
df_hmcv = pd.read_csv("hcmv.data", sep='\s+') # \s+ : for hanlding spaces
df_hmcv.head(n=10)  # df stands for Data Frame

Unnamed: 0,location
0,177
1,1321
2,1433
3,1477
4,3248
5,3255
6,3286
7,7263
8,9023
9,9084


In [21]:
n_basis = 229354
n_palindromes = df_hmcv.count()['location']
lambda_hat = n_palindromes / n_basis * 4000
n_palindromes

296

In [22]:
new_bins = np.concatenate([[-np.inf],np.arange(1, n_basis, step=4000),[np.inf]])

In [23]:
new_bins

array([       -inf, 1.00000e+00, 4.00100e+03, 8.00100e+03, 1.20010e+04,
       1.60010e+04, 2.00010e+04, 2.40010e+04, 2.80010e+04, 3.20010e+04,
       3.60010e+04, 4.00010e+04, 4.40010e+04, 4.80010e+04, 5.20010e+04,
       5.60010e+04, 6.00010e+04, 6.40010e+04, 6.80010e+04, 7.20010e+04,
       7.60010e+04, 8.00010e+04, 8.40010e+04, 8.80010e+04, 9.20010e+04,
       9.60010e+04, 1.00001e+05, 1.04001e+05, 1.08001e+05, 1.12001e+05,
       1.16001e+05, 1.20001e+05, 1.24001e+05, 1.28001e+05, 1.32001e+05,
       1.36001e+05, 1.40001e+05, 1.44001e+05, 1.48001e+05, 1.52001e+05,
       1.56001e+05, 1.60001e+05, 1.64001e+05, 1.68001e+05, 1.72001e+05,
       1.76001e+05, 1.80001e+05, 1.84001e+05, 1.88001e+05, 1.92001e+05,
       1.96001e+05, 2.00001e+05, 2.04001e+05, 2.08001e+05, 2.12001e+05,
       2.16001e+05, 2.20001e+05, 2.24001e+05, 2.28001e+05,         inf])

In [24]:
counts_bin = [-np.inf, 2, 3, 4, 5, 6, 7, 8, np.inf]
# jamais d'accent dans les noms de variables
hmcv_regroupe = pd.cut(df_hmcv['location'], bins=new_bins)
counts_palindrome = hmcv_regroupe.value_counts(sort=False)
count_regroupe = pd.cut(counts_palindrome, counts_bin,
                        right=True).value_counts(sort=False)

In [25]:
count_regroupe

(-inf, 2.0]     9
(2.0, 3.0]      8
(3.0, 4.0]     10
(4.0, 5.0]      9
(5.0, 6.0]      8
(6.0, 7.0]      5
(7.0, 8.0]      4
(8.0, inf]      6
Name: location, dtype: int64

In [26]:
bins_poisson = [-np.inf, 2, 3, 4, 5, 6, 7, 8, np.inf]
count_regroupe = pd.cut(counts_palindrome, bins_poisson, right=True, labels=[
                             '0-2', '3', '4', '5', '6', '7', '8','9-']).value_counts(sort=False)

In [27]:
count_regroupe

0-2     9
3       8
4      10
5       9
6       8
7       5
8       4
9-      6
Name: location, dtype: int64

In [28]:
np.diff(poisson.cdf(bins_poisson, lambda_hat))

array([0.11162934, 0.13134557, 0.16951215, 0.17501537, 0.15058105,
       0.11104977, 0.07165938, 0.07920736])

In [29]:
poisson.cdf(bins_poisson, lambda_hat)

array([0.        , 0.11162934, 0.24297491, 0.41248705, 0.58750243,
       0.73808348, 0.84913325, 0.92079264, 1.        ])

In [30]:
np.diff(poisson.cdf(bins_poisson, lambda_hat))

array([0.11162934, 0.13134557, 0.16951215, 0.17501537, 0.15058105,
       0.11104977, 0.07165938, 0.07920736])

In [31]:
count_theoric = np.diff(poisson.cdf(bins_poisson, lambda_hat)) * count_regroupe.sum()
chi2_stat, chi2_pvalue = chisquare(count_regroupe, count_theoric, ddof=1)
# Attention au nombre de degrés de libertés, ddof=1 ici car on ne connait pas le parametre lambda
print(chi2_stat, chi2_pvalue)

1.907642554439036 0.9279935984980819


# Question: $\chi^2$ à la main

In [32]:
pvalue_manual = 1-chi2.cdf(1.907642554439036,df=6)
print(pvalue_manual)

0.9279935984980819


In [33]:
np.testing.assert_almost_equal(pvalue_manual, chi2_pvalue) 

# Chargement de la base de données 

In [34]:
url = "http://josephsalmon.eu/enseignement/datasets/babies23.data"
path_target = "./babies23.data"
download(url, path_target, replace=False)

file_sizes:   0%|                                    | 0.00/144k [00:00<?, ?B/s]

Downloading data from http://josephsalmon.eu/enseignement/datasets/babies23.data (141 kB)



file_sizes: 100%|█████████████████████████████| 144k/144k [00:00<00:00, 441kB/s]

Successfully downloaded file to ./babies23.data





'./babies23.data'

# Preoprocessing: only run once or big trouble (think about it!)

In [35]:
is_preprocessing_done = 0  # init at 0,if greater don't redo it
pd.options.display.max_rows = 8  # set not to display to many lines in pandas
pd.set_option('precision', 0)  # set to display number at precision 0 in pandas

# Lecture de la base de données et constructions d'un dataframe:

In [36]:
df_babies = pd.read_csv("babies23.data", skiprows=38, sep='\s+') # \s+ : for hanlding spaces
df_babies.head(n=10)  # df stands for Data Frame

Unnamed: 0,id,pluralty,outcome,date,gestation,sex,wt,parity,race,age,...,drace,dage,ded,dht,dwt,marital,inc,smoke,time,number
0,15,5,1,1411,284,1,120,1,8,27,...,8,31,5,65,110,1,1,0,0,0
1,20,5,1,1499,282,1,113,2,0,33,...,0,38,5,70,148,1,4,0,0,0
2,58,5,1,1576,279,1,128,1,0,28,...,5,32,1,99,999,1,2,1,1,1
3,61,5,1,1504,999,1,123,2,0,36,...,3,43,4,68,197,1,8,3,5,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6,102,5,1,1449,244,1,138,4,7,33,...,7,37,4,99,999,1,98,0,0,0
7,129,5,1,1562,245,1,132,2,7,23,...,7,23,4,71,192,1,2,0,0,0
8,142,5,1,1408,289,1,120,3,0,25,...,3,26,1,70,180,0,2,0,0,0
9,148,5,1,1568,299,1,143,3,0,30,...,0,34,5,99,999,1,2,1,1,4


In [70]:
cont_table = pd.crosstab(df_babies['ed'], df_babies['smoke'])
cont_table

smoke,0,1,2,3,9
ed,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,12,4,1,2,0
1,57,102,12,10,2
2,202,176,40,22,4
3,21,33,3,6,2
4,138,102,21,35,2
5,109,65,18,27,0
7,5,1,0,1,0
9,0,1,0,0,0


In [73]:
chi2, p_value, dof, ex = chi2_contingency(cont_table)

In [74]:
p_value
# on rejette donc l'hypothèse il y a une dépendance entre le niveau d'éducation et le fait de fumer sur ces données.

6.000912536921962e-05

In [75]:
chi2

66.30862239116085

In [76]:
dof

28

# Question: comparer a la main avec un degré de liberté bien choisi:

In [53]:
ex 

array([[8.36245955e+00, 7.44012945e+00, 1.46035599e+00, 1.58333333e+00,
        1.53721683e-01],
       [8.05436893e+01, 7.16601942e+01, 1.40655340e+01, 1.52500000e+01,
        1.48058252e+00],
       [1.95417476e+02, 1.73864078e+02, 3.41262136e+01, 3.70000000e+01,
        3.59223301e+00],
       [2.86084142e+01, 2.54530744e+01, 4.99595469e+00, 5.41666667e+00,
        5.25889968e-01],
       [1.31158576e+02, 1.16692557e+02, 2.29045307e+01, 2.48333333e+01,
        2.41100324e+00],
       [9.63883495e+01, 8.57572816e+01, 1.68325243e+01, 1.82500000e+01,
        1.77184466e+00],
       [3.08090615e+00, 2.74110032e+00, 5.38025890e-01, 5.83333333e-01,
        5.66343042e-02],
       [4.40129450e-01, 3.91585761e-01, 7.68608414e-02, 8.33333333e-02,
        8.09061489e-03]])

In [72]:
chisquare(cont_table.values.ravel(), f_exp=ex.ravel(), ddof=cont_table.size - 1 - dof)

Power_divergenceResult(statistic=66.30862239116085, pvalue=6.000912536921962e-05)

(8, 5)

Remarque: il se trouve que la loi du $\chi^2$ pour ce type de test est une distribution du $\chi^2_{(c-1)(l-1)}$
avec $c$ le nombre de colonnes, et $l$ le nombre de ligne du tableau.

28