In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import ensemble
from sklearn import neighbors
from sklearn import svm
from sklearn.metrics import mean_absolute_error, confusion_matrix
import re
import numpy as np

# Load the data set
df = pd.read_csv("titanic.csv")
df.describe()
df.info()
df.head(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   pclass     1309 non-null   int64  
 1   survived   1309 non-null   int64  
 2   name       1309 non-null   object 
 3   sex        1309 non-null   object 
 4   age        1046 non-null   float64
 5   sibsp      1309 non-null   int64  
 6   parch      1309 non-null   int64  
 7   ticket     1309 non-null   object 
 8   fare       1308 non-null   float64
 9   cabin      295 non-null    object 
 10  embarked   1307 non-null   object 
 11  boat       486 non-null    object 
 12  body       121 non-null    float64
 13  home.dest  745 non-null    object 
dtypes: float64(3), int64(4), object(7)
memory usage: 143.3+ KB


Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.92,1,2,113781,151.55,C22 C26,S,11,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
5,1,1,"Anderson, Mr. Harry",male,48.0,0,0,19952,26.55,E12,S,3,,"New York, NY"
6,1,1,"Andrews, Miss. Kornelia Theodosia",female,63.0,1,0,13502,77.9583,D7,S,10,,"Hudson, NY"
7,1,0,"Andrews, Mr. Thomas Jr",male,39.0,0,0,112050,0.0,A36,S,,,"Belfast, NI"
8,1,1,"Appleton, Mrs. Edward Dale (Charlotte Lamson)",female,53.0,2,0,11769,51.4792,C101,S,D,,"Bayside, Queens, NY"
9,1,0,"Artagaveytia, Mr. Ramon",male,71.0,0,0,PC 17609,49.5042,,C,,22.0,"Montevideo, Uruguay"


In [None]:
## Missing data
total = df.isnull().sum().sort_values(ascending=False) # total number of values - else will count only NA values
percentageNull = ((df.isnull().sum()/ df.isnull().count()) * 100).sort_values(ascending=False)
missing_data = pd.concat([total, percentageNull], axis=1, keys=['Total', 'MissingValues %'])
print(missing_data.head(10))

# Columns
print('Total number of columns are %d'%df.columns.values.size)
print(df.columns.values)

# Removing unnecessary columns
titanic_df = df.drop(['name'], axis=1) # Name is unnecessary
#print(titanic_df.head(10))

# Extracting the Deck out of the Cabin
deck = {"A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 6, "G": 7, "U": 8}
titanic_df['cabin'] = titanic_df['cabin'].fillna('U0')
titanic_df['deck'] = titanic_df['cabin'].map(lambda x: re.compile("([a-zA-Z]+)").search(x).group())
titanic_df['deck'] = titanic_df['deck'].map(deck)
titanic_df['deck'] = titanic_df['deck'].fillna(0)
titanic_df['deck'] = titanic_df['deck'].astype(int)
titanic_df = titanic_df.drop(['cabin'], axis=1)
#print(titanic_df.head(10))


# Replacing age with the mean
meanAge = titanic_df['age'].mean()
print('the mean age is %d'%meanAge)
ageCopy = titanic_df['age'].copy().replace(np.nan, meanAge, regex=True).apply(np.ceil).astype(int)
titanic_df['age'] = ageCopy

# Survived
titanic_df['survived'] = titanic_df['survived'].astype(int)

# Fare
titanic_df['fare'] = titanic_df['fare'].fillna(0).astype(int)


# categorical data
#print(titanic_df.info()) # sex, ticket, cabin, embarked, boat
#print(titanic_df.head(10))

#sex
genders = {'male':'M', 'female':'F'}
print(titanic_df['sex'].describe())
titanic_df['sex'] = titanic_df['sex'].map(genders)


# ticket - cannot convert to categories , too many unique tickets
titanic_df = titanic_df.drop(['ticket'], axis=1)


# Embarked
print(titanic_df['embarked'].describe())
titanic_df['embarked'] = titanic_df['embarked'].fillna('S') # Must fill dummies


#Boat
boat_null = titanic_df[pd.isna(titanic_df['boat'])]
print(boat_null['survived'].sum()) # Only 23 who did not get on a boat survived
titanic_df[['boat']] = np.where(titanic_df[['boat']].isnull(), 0, 1)

# Body - Too many missing values for body

In [4]:
titanic_df.head(5)


## Y = survived
## x = Features 
## No age - replace by average

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare,embarked,boat,body,home.dest,deck
0,1,1,F,29,0,0,211,S,1,,"St Louis, MO",2
1,1,1,M,1,1,2,151,S,1,,"Montreal, PQ / Chesterville, ON",3
2,1,0,F,2,1,2,151,S,0,,"Montreal, PQ / Chesterville, ON",3
3,1,0,M,30,1,2,151,S,0,135.0,"Montreal, PQ / Chesterville, ON",3
4,1,0,F,25,1,2,151,S,0,,"Montreal, PQ / Chesterville, ON",3
