# Analisis preeliminar

In [1]:
import pandas as pd
import pathlib

path = pathlib.Path("../datasets/titanic/titanic3.csv")
df = pd.read_csv(path)

df.head() # leemos los primeros 5 registros

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [5]:
# Dimension del dataset
df.shape # Forma Numero de filas y columnas


(1309, 14)

In [7]:
df.columns.values # Nombres de las columnas

array(['pclass', 'survived', 'name', 'sex', 'age', 'sibsp', 'parch',
       'ticket', 'fare', 'cabin', 'embarked', 'boat', 'body', 'home.dest'],
      dtype=object)

In [8]:
# Estadisticas basicas para las variables numericas
# Count: N° de registros no nulos
# Mean: Promedio
# Std: Desviacion estandar
# Min: Valor minimo
# 25%: Primer cuartil
# 50%: Segundo cuartil, mediana
# 75%: Tercer cuartil
# Max: Valor maximo
df.describe()

Unnamed: 0,pclass,survived,age,sibsp,parch,fare,body
count,1309.0,1309.0,1046.0,1309.0,1309.0,1308.0,121.0
mean,2.294882,0.381971,29.881135,0.498854,0.385027,33.295479,160.809917
std,0.837836,0.486055,14.4135,1.041658,0.86556,51.758668,97.696922
min,1.0,0.0,0.1667,0.0,0.0,0.0,1.0
25%,2.0,0.0,21.0,0.0,0.0,7.8958,72.0
50%,3.0,0.0,28.0,0.0,0.0,14.4542,155.0
75%,3.0,1.0,39.0,1.0,0.0,31.275,256.0
max,3.0,1.0,80.0,8.0,9.0,512.3292,328.0


In [9]:
# A partir de los estadisticos se puede inferir que 
# No se encontraron todos los cuerpos de los pasajeros

# La edad media de los pasajeros es de 29 años
# La edad minima es de 0.17 años
# La edad maxima es de 80 años
# La mitad de los pasajeros tenian menos de 28 años
# El 75% de los pasajeros tenian menos de 39 años
# Mas de la mitad de los pasajeros murieron
# El 75% de los pasajeros pagaron menos de 31 dolares por el pasaje
# El 25% de los pasajeros pagaron menos de 8 dolares por el pasaje
# Mas del 75% de los pasajeros viajaban en tercera clase
# Mas del 50% de los pasajeros viajaban sin hermanos o conyuges
# Mas del 75% de los pasajeros viajaban sin padres o hijos

In [11]:
df.dtypes # Tipos de datos de las columnas

pclass         int64
survived       int64
name          object
sex           object
age          float64
sibsp          int64
parch          int64
ticket        object
fare         float64
cabin         object
embarked      object
boat          object
body         float64
home.dest     object
dtype: object

In [12]:
# Imputacion de datos faltantes
# Fill na con 0
df2 = df.fillna(0)

In [18]:
# Sustutir por el promedio
df3 = df

In [20]:
df3.age = df3.age.fillna(df3.age.mean())
df3.body = df3.body.fillna(df3.body.mean())

In [21]:
df3

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.000000,0,0,24160,211.3375,B5,S,2,160.809917,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.916700,1,2,113781,151.5500,C22 C26,S,11,160.809917,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.000000,1,2,113781,151.5500,C22 C26,S,,160.809917,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.000000,1,2,113781,151.5500,C22 C26,S,,135.000000,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.000000,1,2,113781,151.5500,C22 C26,S,,160.809917,"Montreal, PQ / Chesterville, ON"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,3,0,"Zabour, Miss. Hileni",female,14.500000,1,0,2665,14.4542,,C,,328.000000,
1305,3,0,"Zabour, Miss. Thamine",female,29.881135,1,0,2665,14.4542,,C,,160.809917,
1306,3,0,"Zakarian, Mr. Mapriededer",male,26.500000,0,0,2656,7.2250,,C,,304.000000,
1307,3,0,"Zakarian, Mr. Ortin",male,27.000000,0,0,2670,7.2250,,C,,160.809917,


In [30]:
# Dummy variables for sex
df4 = df
df4['sex_male'] = (df4['sex'] == "male").astype(int)
df4['sex_female'] = (df4['sex'] == "female").astype(int)
df4.drop(["sex"] , axis=1, inplace=True)


Unnamed: 0,pclass,survived,name,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest,sex_male,sex_female
0,1,1,"Allen, Miss. Elisabeth Walton",29.000000,0,0,24160,211.3375,B5,S,2,160.809917,"St Louis, MO",0,1
1,1,1,"Allison, Master. Hudson Trevor",0.916700,1,2,113781,151.5500,C22 C26,S,11,160.809917,"Montreal, PQ / Chesterville, ON",1,0
2,1,0,"Allison, Miss. Helen Loraine",2.000000,1,2,113781,151.5500,C22 C26,S,,160.809917,"Montreal, PQ / Chesterville, ON",0,1
3,1,0,"Allison, Mr. Hudson Joshua Creighton",30.000000,1,2,113781,151.5500,C22 C26,S,,135.000000,"Montreal, PQ / Chesterville, ON",1,0
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",25.000000,1,2,113781,151.5500,C22 C26,S,,160.809917,"Montreal, PQ / Chesterville, ON",0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,3,0,"Zabour, Miss. Hileni",14.500000,1,0,2665,14.4542,,C,,328.000000,,0,1
1305,3,0,"Zabour, Miss. Thamine",29.881135,1,0,2665,14.4542,,C,,160.809917,,0,1
1306,3,0,"Zakarian, Mr. Mapriededer",26.500000,0,0,2656,7.2250,,C,,304.000000,,1,0
1307,3,0,"Zakarian, Mr. Ortin",27.000000,0,0,2670,7.2250,,C,,160.809917,,1,0
