In [None]:
# Only needed when running on Google Colab:
# ! pip install matplotlib --upgrade

# Unidad II. Regresiones y reducción de dimensionalidad.

## Independencia de variables y medidas de asociación.

- Distribución conjunta de variables aleatorias.
 - Tablas de Contingencia.
 - Concepto de independencia.

Distribución conjunta (o *multivariada*)
- la distribución de la intersección de dos o más variables aleatorias.
  - para dos variables: *distribución bivariada*
- En el caso de dos variables aleatorias $X$ e $Y$:
  - $P(X=x, Y=y)$
  - $P(X, Y)$
- Se debe cumplir:
  - $\sum_{i}\sum_{j} P(X=x_{i}, Y=y_{j}) = 1$

Probabilidad condicional:
- Probabilidad de que:
  - una de las variables tome un valor
  - dado que el valor de la otra variable haya sido fijado
- $P(x|y)$
  - Es la probabilidad de $x$ dado $y$.
  - Donde el valor de $y$ está fijado.
- Se cumple que:
  - $P(X=x, Y=y) = P(X=x|Y=y) \cdot P(Y=y)$
  - $P(X=x, Y=y) = P(Y=y|X=x) \cdot P(X=x)$

Variables independientes:
- Si $X$ e $Y$ son variables **independientes**, se cumple que:
- $P(X=x, Y=y) = P(X=x) \cdot P(Y=y)$
  - Es decir:
    - $P(X=x|Y=y) = P(X=x)$
    - $P(Y=y|X=x) = P(Y=y)$

Probabilidades marginales:
- Distribución de probabilidades de un subconjunto de las variables aleatorias.
- Para distribuciones conjuntas bivariadas:
  - $P(X=x)$ (o $P(Y=y)$)
  - $P(X=x) = \sum_{y} P(X=x, Y=y)$


Las definiciones anteriores son para dos variables categóricas.
- Se extiende para variables continuas
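- donde la función de densidad de probabilidad (*PDF*) conjunta $f_{X,Y}(x, y)$ se define como:
  - $P(a \le X \le b, c \le Y \le d) = \int_{a}^{b}\int_{c}^{d} f_{X,Y}(x, y) \, dy \, dx$
  - Se debe cumplir: $\int_{-\infty}^{\infty}\int_{-\infty}^{\infty} f_{X,Y}(x, y) \, dx \, dy = 1$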


### Tablas de Contingencia

- Representan todas las combinaciones de valores posibles para
  - un determinado número de variables categóricas.
  - Lo más frecuente es representar dos variables.
    - Pero podrían representarse más.
- Pueden contener valores de probabilidad
  - pero también pueden usarse con frecuencias.

| Y \ X | x1 | x2 | x3 | P(Y=y) |
| ---   | ---| ---| ---| ---    |
| y1    |  a |  b |  c | a+b+c  |
| y2    |  d |  e |  f | d+e+f  |
| P(X=x)|a+d | b+e| c+f| total  |


### Construir tablas de contingencia a partir de los datos

In [6]:
import pandas as pd

# Protein records used as the example data set. Each row holds, in order:
# DisProt ID, UniProt accession, protein name, organism, length, and whether
# the protein is fully disordered ("Si" / "No").
data = [
    ["DP00004", "P49913", "Cathelicidin antimicrobial peptide", "Homo sapiens", "170", "No"],
    ["DP00007", "P27695", "DNA-(apurinic or apyrimidinic site) lyase", "Homo sapiens", "318", "No"],
    ["DP00011", "P0DMM9", "Sulfotransferase 1A3", "Homo sapiens", "295", "No"],
    ["DP00012", "P13569", "Cystic fibrosis transmembrane conductance regulator", "Homo sapiens", "1480", "No"],
    ["DP00013", "P0DN86", "Choriogonadotropin subunit beta 3", "Homo sapiens", "165", "No"],
    ["DP00016", "P38936", "Cyclin-dependent kinase inhibitor 1", "Homo sapiens", "164", "Si"],
    ["DP00017", "P49918", "Cyclin-dependent kinase inhibitor 1C", "Homo sapiens", "316", "Si"],
    ["DP00018", "P46527", "Cyclin-dependent kinase inhibitor 1B", "Homo sapiens", "198", "Si"],
    ["DP00023", "P14061", "Estradiol 17-beta-dehydrogenase 1", "Homo sapiens", "328", "No"],
    ["DP00028", "Q13541", "Eukaryotic translation initiation factor 4E-binding protein 1", "Homo sapiens", "118", "Si"],
    ["DP00030", "P04150", "Glucocorticoid receptor", "Homo sapiens", "777", "No"],
    ["DP00033", "P10912", "Growth hormone receptor", "Homo sapiens", "638", "No"],
    ["DP00039", "P05204", "Non-histone chromosomal protein HMG-17", "Homo sapiens", "90", "Si"],
    ["DP00040", "P17096", "High mobility group protein HMG-I/HMG-Y", "Homo sapiens", "107", "Si"],
    ["DP00054", "P78356", "Phosphatidylinositol 5-phosphate 4-kinase type-2 beta", "Homo sapiens", "416", "No"],
    ["DP00061", "P27694", "Replication protein A 70 kDa DNA-binding subunit", "Homo sapiens", "616", "No"],
    ["DP00062", "P19793", "Retinoic acid receptor RXR-alpha", "Homo sapiens", "462", "No"],
    ["DP00069", "P63027", "Vesicle-associated membrane protein 2", "Homo sapiens", "116", "No"],
    ["DP00070", "P37840", "Alpha-synuclein", "Homo sapiens", "140", "Si"],
    ["DP00072", "Q8WZ42", "Titin", "Homo sapiens", "34350", "Si"],
    ["DP00043", "P02929", "Protein TonB", "Escherichia coli (strain K12)", "239", "No"],
    ["DP00088", "P0ABI8", "Cytochrome bo(3) ubiquinol oxidase subunit 1", "Escherichia coli (strain K12)", "663", "No"],
    ["DP00089", "P0ABJ1", "Cytochrome bo(3) ubiquinol oxidase subunit 2", "Escherichia coli (strain K12)", "315", "No"],
    ["DP00100", "P0A6H5", "ATP-dependent protease ATPase subunit HslU", "Escherichia coli (strain K12)", "443", "No"],
    ["DP00103", "P09372", "Protein GrpE", "Escherichia coli (strain K12)", "197", "No"],
    ["DP00107", "P69924", "Ribonucleoside-diphosphate reductase 1 subunit beta", "Escherichia coli (strain K12)", "376", "No"],
    ["DP00140", "P0A7L8", "50S ribosomal protein L27", "Escherichia coli (strain K12)", "85", "Si"],
    ["DP00145", "P0A7S3", "30S ribosomal protein S12", "Escherichia coli (strain K12)", "124", "No"],
    ["DP00146", "P0A7T7", "30S ribosomal protein S18", "Escherichia coli (strain K12)", "75", "Si"],
    ["DP00147", "P0A7U3", "30S ribosomal protein S19", "Escherichia coli (strain K12)", "92", "No"],
    ["DP00161", "P77173", "Cell division protein ZipA", "Escherichia coli (strain K12)", "328", "No"],
    ["DP00190", "P38038", "Sulfite reductase [NADPH] flavoprotein alpha-component", "Escherichia coli (strain K12)", "599", "No"],
    ["DP00194", "P0AFZ3", "Stringent starvation protein B", "Escherichia coli (strain K12)", "165", "No"],
    ["DP00197", "P0A707", "Translation initiation factor IF-3", "Escherichia coli (strain K12)", "180", "No"],
    ["DP00207", "P21513", "Ribonuclease E", "Escherichia coli (strain K12)", "1061", "No"],
    ["DP00242", "P0AG63", "30S ribosomal protein S17", "Escherichia coli (strain K12)", "84", "Si"],
    ["DP00252", "P0A877", "Tryptophan synthase alpha chain", "Escherichia coli (strain K12)", "268", "No"],
    ["DP00299", "P0AE70", "Endoribonuclease toxin MazF", "Escherichia coli (strain K12)", "111", "No"],
    ["DP00301", "P0ABQ4", "Dihydrofolate reductase", "Escherichia coli (strain K12)", "159", "No"],
    ["DP00337", "P06968", "Deoxyuridine 5'-triphosphate nucleotidohydrolase", "Escherichia coli (strain K12)", "152", "No"],
]

# Column labels follow the field order of each row in `data`.
# The lengths are written as strings in `data`; cast them to integers so the
# column can be used numerically (max, mean, comparisons, plots) instead of
# silently behaving as text.
df = pd.DataFrame(
    data=data,
    columns=["DisprotID", "UniprotId", "Name", "Organism", "length", "Fully Disordered"],
).astype({"length": "int64"})
df

Unnamed: 0,DisprotID,UniprotId,Name,Organism,length,Fully Disordered
0,DP00004,P49913,Cathelicidin antimicrobial peptide,Homo sapiens,170,No
1,DP00007,P27695,DNA-(apurinic or apyrimidinic site) lyase,Homo sapiens,318,No
2,DP00011,P0DMM9,Sulfotransferase 1A3,Homo sapiens,295,No
3,DP00012,P13569,Cystic fibrosis transmembrane conductance regu...,Homo sapiens,1480,No
4,DP00013,P0DN86,Choriogonadotropin subunit beta 3,Homo sapiens,165,No
5,DP00016,P38936,Cyclin-dependent kinase inhibitor 1,Homo sapiens,164,Si
6,DP00017,P49918,Cyclin-dependent kinase inhibitor 1C,Homo sapiens,316,Si
7,DP00018,P46527,Cyclin-dependent kinase inhibitor 1B,Homo sapiens,198,Si
8,DP00023,P14061,Estradiol 17-beta-dehydrogenase 1,Homo sapiens,328,No
9,DP00028,Q13541,Eukaryotic translation initiation factor 4E-bi...,Homo sapiens,118,Si


In [10]:
# Contingency table of absolute counts: one row per organism, one column per
# "Fully Disordered" category.
contin_df = pd.crosstab(
    index=df["Organism"],
    columns=df["Fully Disordered"],
)
contin_df

Fully Disordered,No,Si
Organism,Unnamed: 1_level_1,Unnamed: 2_level_1
Escherichia coli (strain K12),17,3
Homo sapiens,12,8


In [17]:
# Joint probabilities: divide every cell count by the grand total of the table.
grand_total = contin_df.to_numpy().sum()
prob_df = contin_df.div(grand_total)
prob_df

Fully Disordered,No,Si
Organism,Unnamed: 1_level_1,Unnamed: 2_level_1
Escherichia coli (strain K12),0.425,0.075
Homo sapiens,0.3,0.2


In [19]:
# Marginal distribution of "Fully Disordered": add the joint probabilities
# down each column, i.e. over all organisms.
margin_disorder = prob_df.sum(axis="index")
margin_disorder

Fully Disordered
No    0.725
Si    0.275
dtype: float64

In [26]:
# Marginal distribution of "Organism": add the joint probabilities across each
# row, i.e. over both "Fully Disordered" categories.
margin_org = prob_df.sum(axis="columns")
margin_org

Organism
Escherichia coli (strain K12)    0.5
Homo sapiens                     0.5
dtype: float64

In [8]:
import scipy.stats as st

# Same cross-tabulation computed with SciPy: the result carries the unique
# values found for each variable together with the array of counts.
crosstab_columns = ["Organism", "Fully Disordered"]
st.contingency.crosstab(*(df[name] for name in crosstab_columns))

((array(['Escherichia coli (strain K12)', 'Homo sapiens'], dtype=object),
  array(['No', 'Si'], dtype=object)),
 array([[17,  3],
        [12,  8]]))