# Titanic: 
## Utilizar ML para crear un modelo que prediga qué pasajeros sobreviven al hundimiento del Titanic a partir de cierta información de los pasajeros

https://www.kaggle.com/competitions/titanic/

| Variable	| Definition |	Key |
| --- | --- | --- |
| Survived | Survival|	0 = No, 1 = Yes |
| Pclass | Ticket class |	1 = 1st, 2 = 2nd, 3 = 3rd |
| Name | Passenger name |
| Sex	| Sex |	male, female |
| Age	| Age in years	|
| SibSp	| # of siblings / spouses aboard the Titanic |	
| Parch	| # of parents / children aboard the Titanic |	
| Ticket|	Ticket number	|
| Fare	| Passenger fare |	
| Cabin	| Cabin number |	
| Embarked |	Port of Embarkation	| C = Cherbourg, Q = Queenstown, S = Southampton |

In [None]:
### Bibliotecas de python
import pandas as pd
import os
import matplotlib.pyplot as plt
import numpy as np
#from collections import Counter
import re #Regular expression operations
import seaborn as sn #statistical data visualization


In [None]:

### Read the data (from the local filesystem)
# The CSV path is built relative to the notebook's working directory.
mainpath = "../datasets/"
filename = "titanic-kaggle/train.csv"
fullpath = os.path.join(mainpath, filename)
data = pd.read_csv (fullpath)
# Shape of the data as (rows, columns)
data.shape 

In [None]:
# Column headers, returned as a NumPy array of names
data.columns.values 

In [None]:
# Quick look at the first rows of the data
data.head()

In [None]:
# Data type pandas assigned to each column when reading the CSV
data.dtypes

In [None]:
# Missing values: number of null entries found in each column.
missing_per_column = data.isna().sum()
missing_per_column

In [None]:
# Overview figure: one panel per feature, laid out on a 3x4 grid
# (only the first eight slots are used).
plt.figure(figsize=(15, 10))

# (panel title, DataFrame column) pairs drawn as 20-bin histograms, in grid order.
hist_panels = [
    ("Survived", "Survived"),
    ("Class", "Pclass"),
    ("Sex", "Sex"),
    ("Age", "Age"),
    ("Sibilings", "SibSp"),
    ("Parents", "Parch"),
    ("Fare", "Fare"),
]
for slot, (panel_title, column) in enumerate(hist_panels, start=1):
    plt.subplot(3, 4, slot)
    plt.title(panel_title)
    plt.hist(data[column], bins=20)

# The port of embarkation is drawn as a bar chart of per-port counts
# instead of a histogram.
plt.subplot(3, 4, len(hist_panels) + 1)
plt.title("Embarked")
data["Embarked"].value_counts().plot(kind='bar')


In [None]:
# Cabin labels containing the letter "A", after discarding passengers
# with no cabin recorded.
data2 = data["Cabin"]
data3 = data2.dropna()  # remove the NaN entries
has_letter_a = data3.str.contains("A")
data4 = data3[has_letter_a]
data4

In [None]:
def get_title(name):
    """Extract the honorific title (e.g. "Mr", "Miss") from a passenger name.

    The title is the first run of ASCII letters that is preceded by a space
    and immediately followed by a period, as in "Braund, Mr. Owen Harris".

    Args:
        name: Passenger name. Non-string values (e.g. None or NaN for a
            missing name) are treated as having no title.

    Returns:
        The matched title without the trailing period, or "" when the name
        is not a string or contains no such pattern.
    """
    # A missing name would otherwise make re.search raise TypeError when this
    # function is applied over a whole column.
    if not isinstance(name, str):
        return ""
    # Raw string: "\." inside a normal literal is an invalid escape sequence
    # (DeprecationWarning, and SyntaxWarning on Python 3.12+).
    title_search = re.search(r' ([A-Za-z]+)\.', name)
    if title_search:
        return title_search.group(1)
    return ""

# Extract the title from every passenger name and print how often each occurs.
titles = data["Name"].apply(get_title)
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
print(titles.value_counts())

In [None]:
# Basic summary statistics of the numeric variables
data.describe() 

In [None]:
# Build a reduced table with the columns of interest, then discard
# incomplete rows.
selected_columns = ["PassengerId", "Sex", "Age", "Pclass", "Fare", "Embarked", "Survived"]
subset = data[selected_columns]
# dropna() defaults to axis=0, how="any": a row is removed as soon as it
# contains at least one NaN.
subset_clean = subset.dropna()
subset_clean

In [None]:
# Summary statistics of the cleaned subset
subset_clean.describe() # numeric values only

In [None]:
# Number of rows per port of embarkation in the cleaned subset
subset_clean.Embarked.value_counts()

In [None]:
# Number of rows per sex in the cleaned subset
subset_clean.Sex.value_counts()

In [None]:
# Descriptive statistics of Age in the cleaned subset; only the quantiles
# are currently evaluated, the other measures are left commented out.
#subset_clean["Age"].mean() #1. Arithmetic mean
#subset_clean["Age"].mode() #2. Mode
#subset_clean["Age"].median() # 3. Median
#subset_clean["Age"].var() # 5. Variance
#subset_clean["Age"].std() # 6. Standard deviation
subset_clean["Age"].quantile([.25,.5,.75]) #7. Quantiles (25%, 50%, 75%)




In [None]:
# Covariance between ticket class and fare, shown as an annotated heatmap.
subset1 = data[["Pclass", "Fare"]]
# np.cov treats every ROW as a variable by default (rowvar=True). The variables
# here are the DataFrame columns, so rowvar=False is needed to get the 2x2
# Pclass/Fare matrix; without it the result is an n_passengers x n_passengers
# matrix. bias=True normalises by N (population covariance) instead of N - 1.
cov_matrix = np.cov(subset1, rowvar=False, bias=True)
print(cov_matrix)
# Label both axes with the variable names so the cells are identifiable.
variable_names = list(subset1.columns)
sn.heatmap(cov_matrix, annot=True, fmt='g',
           xticklabels=variable_names, yticklabels=variable_names)
plt.show()

In [None]:
# Split the passengers by outcome and report the totals.
datos1 = data["Survived"]
live = data[datos1 == 1]
dead = data[datos1 == 0]
print(
    "De los", len(data), "pasajeros del titanic",
    "sobrevivieron", len(live),
    "y murieron", len(dead),
)

# Histogram of the 0/1 outcome column (20 bins instead of the default 10).
plt.hist(datos1, bins=20)
plt.title("Sobrevivieron?")
plt.xlabel("si =1, no = 0")
plt.ylabel("Frecuencia")

In [None]:
"""
Categorización de los datos en hombre/mujeres vivos/muertos
"""

# Boolean masks for each outcome and each sex; combining them gives the four
# groups: surviving/deceased males and surviving/deceased females.
is_survivor = data["Survived"] == 1
is_casualty = data["Survived"] == 0
is_male = data["Sex"] == "male"
is_female = data["Sex"] == "female"

livem = data[is_survivor & is_male]
livef = data[is_survivor & is_female]
deadm = data[is_casualty & is_male]
deadf = data[is_casualty & is_female]

print(
    "De los", len(live), "pasajeros que sobrevivieron",
    len(livem), "eran hombres y", len(livef), "eran mujeres",
)
print(
    "De los", len(dead), "pasajeros que murieron",
    len(deadm), "eran hombres y", len(deadf), "eran mujeres",
)

In [None]:
# Side-by-side panels of the outcome column for men (left) and women (right).
# Each panel overlays the survivors' histogram with the casualties' histogram.
men_ax = plt.subplot(1, 2, 1)
men_ax.set_title("Sobrevivieron? (Hombres)")
men_ax.hist(livem["Survived"], bins=20)
men_ax.hist(deadm["Survived"], bins=20)
men_ax.set_xlabel("si =1, no = 0")

women_ax = plt.subplot(1, 2, 2)
women_ax.set_title("Sobrevivieron? (Mujeres)")
women_ax.hist(livef["Survived"], bins=20)
women_ax.hist(deadf["Survived"], bins=20)
women_ax.set_xlabel("si =1, no = 0")


In [None]:
# Three 20-bin Age histograms drawn one after another: all passengers,
# then surviving males (livem), then deceased males (deadm).
# NOTE(review): no figure/subplot call separates them, so they presumably
# overlay on the same axes, and no legend identifies each series - confirm
# this is the intended comparison.
plt.hist(data["Age"],bins=20)
plt.hist(livem["Age"],bins=20)
plt.hist(deadm["Age"],bins=20)