# Gráficas de los datos

Presentar de manera clara los aspectos más importantes de los datos.

In [None]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [None]:
# Build the path to the CSV file and load it into a DataFrame.
mainpath = "../datasets/"  # root folder shared by the datasets
filename = "titanic-kaggle/train.csv"
# Alternative dataset, left disabled:
#filename = "customer-churn-model/Customer Churn Model.txt"
fullpath = os.path.join(mainpath, filename)
data = pd.read_csv (fullpath)
data.shape  # last expression of the cell: displays (rows, columns)

In [None]:
# Show the column names of the loaded table as an array.
data.columns.values 

In [None]:
# Preview the first rows of the table.
data.head()

In [None]:
# Count the missing values in each column.
data.isnull().sum()

In [None]:
# Encode the categorical columns as integers so they can be plotted and correlated.
# A dictionary lookup with pass-through is used instead of Series.replace:
# replace relies on silent object -> int downcasting, which pandas has deprecated,
# and the pass-through keeps the cell safe to re-run (values that are already
# encoded, or missing, are returned unchanged).
sex_codes = {"female": 0, "male": 1}
embarked_codes = {"Q": 1, "C": 2, "S": 3}
data['Sex'] = data["Sex"].map(lambda value: sex_codes.get(value, value))
data['Embarked'] = data['Embarked'].map(lambda value: embarked_codes.get(value, value))

# Work on a copy without the Cabin, Name and Ticket columns.
subset = data.drop(['Cabin', 'Name', 'Ticket'], axis=1)
# Discard rows that still contain missing values
# (alternative kept for reference: subset = subset.fillna(0)).
subset = subset.dropna()
subset

In [None]:
# Summary statistics of the cleaned subset.
subset.describe()

## Primera vista

In [None]:
# First look at the data: histograms of the subset's columns, 50 bins each,
# drawn on a 20x15 figure.
subset.hist(bins=50, figsize=(20,15))
plt.show()

## Seaborn

https://seaborn.pydata.org/

## Regla de Sturges

Número de divisiones (clases) de un histograma: $k = \lceil 1 + \log_2(n) \rceil$, donde $n$ es el número de observaciones.

https://es.wikipedia.org/wiki/Regla_de_Sturges

In [None]:
# Sturges' rule for the number of histogram bins: k = ceil(1 + log2(n)),
# converted to a built-in int so it can be passed as a bin count.
k = int(np.ceil(np.log2(len(subset)) + 1))
k

In [None]:
# Histogram of Age using k bins (k comes from the Sturges' rule cell).
sns.histplot(data=subset, x='Age', bins = k)

In [None]:
# Scatter plot of Age (x axis) against Sex (y axis).
sns.scatterplot(x="Age",
                    y="Sex",
                    data=subset)

In [None]:
# Pairwise plots of the subset's columns; hue splits the data by the Survived category.
sns.pairplot(subset, hue='Survived', height=2.25)

In [None]:
# Kernel density estimate (KDE) of Age, separated by Survived.
sns.displot(subset, x="Age", hue="Survived", kind="kde")

## Boxplot

In [None]:
# Box plot of the Age column.
sns.boxplot(x=subset["Age"])

## Cuantiles
![DIV](fig/boxplot.png)

In [None]:
# Age quantiles at the chosen probabilities, printed as a single-row table
# whose column headers are the probabilities written as percentages.
percentages = [0.05, 0.25, 0.5, 0.75, 0.95]
df = subset['Age'].quantile(percentages).to_frame()
df.index = [f'{p * 100}%' for p in percentages]
print(df.T)

In [None]:
# Summary statistics of the subset, shown again after the quantile table.
subset.describe()

## Rango intercuartílico (IQR)

In [None]:
# The interquartile range (IQR) is the third quartile minus the first: IQR = Q3 - Q1.
IQR=subset["Age"].quantile(0.75)-subset["Age"].quantile(0.25)
print("El rango intercuartilar (IQR) es:", IQR)

## Barras de error (límites de los bigotes: Q1 − 1.5·IQR y Q3 + 1.5·IQR)

In [None]:
# Limits placed 1.5 * IQR below the first quartile and above the third quartile
# (IQR comes from the previous cell).
whisker_reach = 1.5 * IQR
low_bound = subset["Age"].quantile(0.25) - whisker_reach  # lower limit
upper_bound = subset["Age"].quantile(0.75) + whisker_reach  # upper limit
print("El límite superior es:", upper_bound)
print("El límite inferior es:", low_bound)

In [None]:
# Horizontal box plot of Age annotated with the lower/upper limits (from the
# previous cell) and with the three quartiles.
# The quartiles are computed once and reused for the labels.
age_q1, age_median, age_q3 = subset["Age"].quantile([0.25, 0.50, 0.75])

subset['Age'].plot.box(vert=False, figsize=(5, 5))
plt.text(1, 1.4, 'La caja son los datos entre 25, 50 y 75 %')
plt.text(1, 1.3, 'Los valores atípicos son los círculos')
# Limits, in red.
plt.text(0.2, 0.85, "inferior", color='r')
plt.text(0.2, 0.9, low_bound, color='r')
plt.text(60, 0.85, "superior", color='r')
plt.text(60, 0.9, upper_bound, color='r')
# Quartiles: Q1 and Q3 in blue, the median in green.
plt.text(15, 1.15, "Q1", color='b')
plt.text(15, 1.1, age_q1, color='b')
# The value shown here is the 0.50 quantile (the median), so it is labelled
# "Q2"; it was previously captioned "IQR", which is Q3 - Q1, a different number.
plt.text(25, 1.15, "Q2", color='g')
plt.text(25, 1.1, age_median, color='g')
plt.text(35, 1.15, "Q3", color='b')
plt.text(35, 1.1, age_q3, color='b')

In [None]:
from numpy import std  # not used in this cell; kept in case other cells rely on the name


sns.set(style="darkgrid")
df = subset

# Box plot of Age for each value of Survived.
ax = sns.boxplot(x="Survived", y="Age", data=df)

# Median and number of observations per group, used to place and fill the labels.
# Both come from the same groupby, so they share one ordering (sorted by the
# Survived value). value_counts() orders by frequency instead, which can attach
# a count to the wrong box whenever the first group is not the largest one.
age_by_group = df.groupby('Survived')['Age']
medians = age_by_group.median().values
nobs = [f"n: {count}" for count in age_by_group.size().tolist()]

# Write each count just above the median line of its box; the i-th box is
# drawn at x position i.
for xpos, (median, count_label) in enumerate(zip(medians, nobs)):
    ax.text(xpos,
            median + 0.03,
            count_label,
            horizontalalignment='center',
            size='x-small',
            color='w',
            weight='semibold'
            )
plt.show()

# Matriz de correlación

In [None]:
def titanic_corr(data):
    """Draw an annotated heatmap of the pairwise correlations between the columns of ``data``.

    The correlation matrix comes from ``data.corr()``; each cell is annotated
    with its value and coloured with the "RdYlGn" colormap, with a colour bar.
    Nothing is returned.
    """
    sns.heatmap(data.corr(), cmap="RdYlGn", cbar=True, annot=True)


titanic_corr(subset)