# Pruebas de hipotesis y validación

## Importar librerias

In [43]:
import numpy as np
import pandas as pd
import random
import io

import matplotlib.pyplot as plt
import seaborn as sns

import scipy.stats as st
from scipy.stats import f_oneway
from scipy.stats import ttest_ind

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

## Cargar base de datos

![petalo-sepalo](img/petalo-sepalo.jpg)

In [3]:
df = pd.read_csv('data/iris.csv')

df.drop(axis=1, columns='Id', inplace=True)

nuevos_nombres = {
    'SepalLengthCm' : 'sepal_length_cm', 
    'SepalWidthCm' : 'sepal_width_cm', 
    'PetalLengthCm' : 'petal_length_cm', 
    'PetalWidthCm': 'petal_width_cm',
    'Species' : 'species'
}

df = df.rename(columns=nuevos_nombres)

df.head()

Unnamed: 0,sepal_length_cm,sepal_width_cm,petal_length_cm,petal_width_cm,species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


## Tipos de pruebas

### Distribución t de Student

Standard error

$$SE = \frac{\sigma }{\sqrt{N}}

In [5]:
se_length = df['sepal_length_cm'].std()/np.sqrt(10)
se_width = df['sepal_width_cm'].std()/np.sqrt(10)

print(se_length)
print(se_width)

0.2618575017646526
0.13711456043966944


Standard error diviation

$$SED = \sqrt{\frac{\sigma_{1}^{2}}{N_{1}}+\frac{\sigma_{1}^{2}}{N_{2}}}

In [6]:
sed = np.sqrt((se_length**2)+(se_width**2))
print(sed)

0.2955837511010861


t

$$ t = \frac{(\bar{x}_{1}-\bar{x}_{2})}{SED}

In [9]:
t_stat = (df['sepal_length_cm'].mean()-df['sepal_width_cm'].mean())/sed
print(t_stat)

9.436693738890318


In [11]:
p = ttest_ind(df['sepal_length_cm'], df['sepal_width_cm'])
print(p)

TtestResult(statistic=36.548157693982006, pvalue=3.987838114848222e-112, df=298.0)


### Correlación y coeficiente de Pearson

In [12]:
df['sepal_length_cm'].corr(df['sepal_width_cm'])

-0.10936924995064937

In [14]:
st.pearsonr(df['sepal_length_cm'], df['sepal_width_cm'])

PearsonRResult(statistic=-0.10936924995064937, pvalue=0.18276521527136963)

### Analisis de la varianza (ANOVA)

In [15]:
f_oneway(df['sepal_length_cm'],df['sepal_width_cm'])

F_onewayResult(statistic=1335.7678308241748, pvalue=3.98783811484836e-112)

## Bootstraping

In [18]:
data = np.random.normal(loc=34, size=10000) # poblacion de 10.000 con una media de 34 años
data

array([35.35677287, 34.98622049, 33.38984314, ..., 34.70915942,
       34.3266351 , 33.21995523])

In [19]:
data.mean()

34.002529442115424

### Calculando el promedio de edades

In [20]:
promedio = []
for i in range(40):
    muestra = random.sample(data.tolist(),5)
    prom = np.mean(muestra)
    promedio.append(prom)

In [21]:
promedio

[34.957908245695435,
 34.704098568466996,
 34.52901144817568,
 33.980501014134134,
 33.820296521616584,
 33.67454432488108,
 34.041918934055616,
 34.678781689323905,
 33.36352854665156,
 34.52980716548189,
 34.001565283856394,
 33.56652607814239,
 34.243853960254924,
 34.290103548241106,
 34.45090265516767,
 34.483381606566624,
 33.65456462662435,
 34.300028152077495,
 34.041530007643686,
 33.07977535815483,
 34.33096241107127,
 34.28292545502459,
 34.08245741988715,
 34.4823728143029,
 34.138281964925966,
 33.88853884532914,
 34.34608360536026,
 33.71106646513199,
 33.71951957826956,
 33.94628204125918,
 33.994936764495606,
 33.675450923257166,
 34.16397659432049,
 33.48723201164023,
 34.75248661604187,
 34.200535687399245,
 33.558046955275955,
 34.42683445055657,
 34.72516773589284,
 34.46066863570788]

In [22]:
np.mean(promedio)

34.119161367759055

## Validación cruzada

### Cargar base de datos

In [29]:
df = pd.read_csv('data/breast-cancer-wisconsin.csv')
df.drop(axis=1, columns='Unnamed: 32', inplace=True)
df.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [30]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 32 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       569 non-null    int64  
 1   diagnosis                569 non-null    object 
 2   radius_mean              569 non-null    float64
 3   texture_mean             569 non-null    float64
 4   perimeter_mean           569 non-null    float64
 5   area_mean                569 non-null    float64
 6   smoothness_mean          569 non-null    float64
 7   compactness_mean         569 non-null    float64
 8   concavity_mean           569 non-null    float64
 9   concave points_mean      569 non-null    float64
 10  symmetry_mean            569 non-null    float64
 11  fractal_dimension_mean   569 non-null    float64
 12  radius_se                569 non-null    float64
 13  texture_se               569 non-null    float64
 14  perimeter_se             5

In [31]:
X = df.iloc[:,2:]
Y = df.iloc[:,1]

In [32]:
X.head()

Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [33]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 30 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   radius_mean              569 non-null    float64
 1   texture_mean             569 non-null    float64
 2   perimeter_mean           569 non-null    float64
 3   area_mean                569 non-null    float64
 4   smoothness_mean          569 non-null    float64
 5   compactness_mean         569 non-null    float64
 6   concavity_mean           569 non-null    float64
 7   concave points_mean      569 non-null    float64
 8   symmetry_mean            569 non-null    float64
 9   fractal_dimension_mean   569 non-null    float64
 10  radius_se                569 non-null    float64
 11  texture_se               569 non-null    float64
 12  perimeter_se             569 non-null    float64
 13  area_se                  569 non-null    float64
 14  smoothness_se            5

In [34]:
Y.value_counts()

diagnosis
B    357
M    212
Name: count, dtype: int64

In [38]:
# Partiendo entre train y tes
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=4)

In [40]:
# Modelo
modelo = DecisionTreeClassifier()
# Ajuste del modelo
modelo.fit(X_train, Y_train)
# Precisioòn del modelo en %
resultado = modelo.score(X_test, Y_test)
print(resultado)

0.9005847953216374


### Validación cruzada k fold

In [42]:
# Modelo
modelo = DecisionTreeClassifier()
# Muestras
kfold_validacion = KFold(10)

In [44]:
# Resultados
resultados = cross_val_score(modelo, X, Y, cv=kfold_validacion)
print(resultados)
print(resultados.mean())

[0.94736842 0.9122807  0.9122807  0.94736842 0.92982456 0.98245614
 0.9122807  0.96491228 0.92982456 0.89285714]
0.9331453634085213
