In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the training set; PassengerId becomes the row index rather than a column.
titanic_data = pd.read_csv('train.csv', index_col='PassengerId')

# Column-name constants, so the cells below never spell a column name by hand.
SURVIVED_COL = 'Survived'
NAME_COL = 'Name'
SEX_COL = 'Sex'
CLASS_COL = 'Pclass'
AGE_COL = 'Age'
SIBSP_COL = 'SibSp'
PARCH_COL = 'Parch'
TICKET_COL = 'Ticket'
PRICE_COL = 'Fare'
CABIN_COL = 'Cabin'
PORT_COL = 'Embarked'
STATUS_COL = 'Status'  # not in the CSV; derived from the name in a later cell

# Snapshot of the columns as loaded from the CSV and of the number of rows.
column_names = titanic_data.columns
DATA_LEN = titanic_data.shape[0]

# Context

In [None]:
# Preview the first five rows (five is the default for head()).
titanic_data.head()

## Columns meaning
* Name: Name of passenger
* Survived: whether the passenger survived (0 = No, 1 = Yes)
* Pclass: Ticket class(1 = 1st, 2 = 2nd, 3 = 3rd)
* Sex: Sex
* Age: Age in years
* SibSp: number of siblings / spouses aboard the Titanic (brother, sister, stepbrother, stepsister; husband, wife; mistresses and fiancés were ignored)
* Parch: count of parents / children aboard the Titanic (Some children travelled only with a nanny, therefore parch=0 for them)
* Ticket: Ticket number
* Fare: Ticket Price
* Cabin: Cabin number
* Embarked: Port of Embarkation (C = Cherbourg, Q = Queenstown, S = Southampton)

In [None]:
# Column dtypes and non-null counts: a first look at where values are missing.
titanic_data.info()

In [None]:
# Share of survivors (1) vs. non-survivors (0).
# The pie gets an explicit title, like the histogram cells: the automatic label
# comes from the value_counts() result name, which on pandas >= 2.0 is "count"
# rather than the column name.
titanic_data[SURVIVED_COL].value_counts().plot.pie().set_title('Survived')

In [None]:
# Share of male vs. female passengers.
# The pie gets an explicit title, like the histogram cells: the automatic label
# comes from the value_counts() result name, which on pandas >= 2.0 is "count"
# rather than the column name.
titanic_data[SEX_COL].value_counts().plot.pie().set_title('Sex')

In [None]:
# Distribution of passenger ages (rows with a missing age are not drawn).
age_ax = titanic_data[AGE_COL].plot(kind='hist', xlabel='Age')
age_ax.set_title('Age')

In [None]:
# Distribution of ticket prices as loaded, before any cleaning.
fare_ax = titanic_data[PRICE_COL].plot(kind='hist')
fare_ax.set_title('Ticket price')

In [None]:
# Share of passengers per port of embarkation (C, Q, S).
# The pie gets an explicit title, like the histogram cells: the automatic label
# comes from the value_counts() result name, which on pandas >= 2.0 is "count"
# rather than the column name.
titanic_data[PORT_COL].value_counts().plot.pie().set_title('Port of embarkation')

In [None]:
# Share of passengers per ticket class (1, 2, 3).
# The pie gets an explicit title, like the histogram cells: the automatic label
# comes from the value_counts() result name, which on pandas >= 2.0 is "count"
# rather than the column name.
titanic_data[CLASS_COL].value_counts().plot.pie().set_title('Ticket class')

In [None]:
# Number of passengers for each count of siblings/spouses travelling with them,
# ordered by that count. Title corrected: "aboard" (on the ship), not "abroad".
titanic_data[SIBSP_COL].value_counts().sort_index().plot(kind='bar').set_title('Siblings/Spouses aboard')

In [None]:
# Number of passengers for each count of parents/children travelling with them,
# ordered by that count. Title corrected: "aboard" (on the ship), not "abroad".
titanic_data[PARCH_COL].value_counts().sort_index().plot(kind='bar').set_title('Parents/Children aboard')

Passenger names include the person's status (title), which could be useful for data analysis:

In [None]:
# Derive the passenger's status (honorific) from the name: the first
# whitespace-separated word after the first comma, e.g. "Mr." or "Mrs.".
# If a name has no comma, the first word of the whole name is used.
status = (
    titanic_data[NAME_COL]
    .str.split(',', n=1).str[-1]   # text after the first comma
    .str.split().str[0]            # first word of that text
)

# A title written as "the Countess. ..." yields "the" as its first word.
# Replace by value instead of by row/column position, so the fix does not depend
# on PassengerId being a contiguous 1..N range or on the column order, and is a
# no-op when no such row exists.
status = status.replace({'the': 'Countess.'})

titanic_data[STATUS_COL] = status

titanic_data[STATUS_COL].value_counts().plot(kind='bar').set_title('Status')

At the time of the Titanic collision (11:40 pm), many passengers would have been in their cabins, which is what makes the cabin information valuable:
https://habrastorage.org/r/w1560/files/ddf/307/0c6/ddf3070c69e0450184e35790f72917d1.jpeg
A cabin on a higher deck should mean a higher chance of survival.

# Data quality assessment

In [None]:
# Show every row whose name already occurred in an earlier row.
is_repeated_name = titanic_data[NAME_COL].duplicated()
titanic_data[is_repeated_name]

No duplicates found.

In [None]:
# For each column loaded from the CSV, report the percentage of missing values.
print('Columns with Nans:')
for column in column_names:
    values = titanic_data[column]
    if not values.hasnans:
        continue
    missing_count = int(values.isna().sum())
    print(column)
    print(str(missing_count / DATA_LEN * 100) + '% of Nans')

Missing ages are replaced with the mean age of the passengers who share the same status.

In [None]:
# Mean known age per status, broadcast back to every row of that status
# (missing ages are skipped when averaging).
status_mean_age = titanic_data.groupby(STATUS_COL)[AGE_COL].transform('mean')

# Round to the nearest half year. Done on the Series so that a status with no
# known age simply stays NaN, instead of raising inside round().
status_mean_age = (status_mean_age * 2).round() / 2

# Fill only the Age column; the other columns of those rows are left untouched.
titanic_data[AGE_COL] = titanic_data[AGE_COL].fillna(status_mean_age)

print('Age column has Nans:' + str(titanic_data[AGE_COL].hasnans))

Because the cabin information may be very important, we don't drop this column, even though 77% of its values are missing.

In [None]:
# List the distinct raw cabin values before they are normalised below.
titanic_data[CABIN_COL].unique()

According to Encyclopedia Titanica, only one man was living in cabin T, so this value is treated as missing.
For the other cabins we keep only the first letter to make the data easier to analyse, and we fill the NaNs.

In [None]:
# Reduce each cabin value to its first letter and mark unknown cabins.
# The column is assigned as a whole: `.at` is a scalar accessor and is not meant
# for boolean-mask assignment.
cabin = titanic_data[CABIN_COL]

# Keep an existing 'NO INFO' placeholder intact, so re-running this cell does not
# shorten it to 'N'.
already_filled = cabin == 'NO INFO'
deck = cabin.str[:1].mask(already_filled, 'NO INFO')

# Missing cabins and the lone 'T' value both become 'NO INFO'.
deck = deck.fillna('NO INFO').replace({'T': 'NO INFO'})

titanic_data[CABIN_COL] = deck
sorted(titanic_data[CABIN_COL].unique())

The port of embarkation is missing in only two rows, so we simply fill it with the most common value, S.

In [None]:
# Fill missing embarkation ports with 'S'.
titanic_data[PORT_COL] = titanic_data[PORT_COL].fillna('S')

Some ticket prices are equal to 0, which must be an error. We replace these zeros with the mean ticket price of the passenger's class.

In [None]:
# Replace zero fares with the mean fare of the same ticket class.
# The zero fares are the values being treated as errors, so they are left out of
# the mean; including them would pull the replacement value down.
zero_fare = titanic_data[PRICE_COL] == 0  # computed once, before any row is changed
for ticket_class in (1, 2, 3):
    in_class = titanic_data[CLASS_COL] == ticket_class
    mean_price = titanic_data.loc[in_class & ~zero_fare, PRICE_COL].mean()
    # If a class has no non-zero fare the mean is NaN; leave its zeros unchanged.
    if pd.notna(mean_price):
        titanic_data.loc[in_class & zero_fare, PRICE_COL] = mean_price

Now the plot looks like this:

In [None]:
# Distribution of ticket prices at this point in the notebook.
cleaned_fare_ax = titanic_data[PRICE_COL].plot(kind='hist')
cleaned_fare_ax.set_title('Ticket price')

# Data exploration

In [None]:
#create a function that show the distribution of survived and not survived by the feature
def bar_survive(feature):
    """Plot how the values of ``feature`` are distributed among survivors and non-survivors.

    Reads the notebook-level ``titanic_data`` frame and draws two stacked bars,
    'Survived' and 'Dead'; each bar is split by the values of ``feature`` and
    its segments are passenger counts. The placeholder value 'NO INFO' is left
    out of both bars.

    Parameters
    ----------
    feature : str
        Name of a column of ``titanic_data``.
    """
    is_survivor = titanic_data[SURVIVED_COL] == 1
    is_dead = titanic_data[SURVIVED_COL] == 0

    survived = titanic_data.loc[is_survivor, feature].value_counts()
    dead = titanic_data.loc[is_dead, feature].value_counts()

    # errors='ignore' makes the drop a no-op when the placeholder is absent.
    survived = survived.drop(index='NO INFO', errors='ignore')
    dead = dead.drop(index='NO INFO', errors='ignore')

    df = pd.DataFrame([survived, dead])
    df.index = ['Survived', 'Dead']
    # `stacked` is a boolean flag; the string 'True' only worked by being truthy.
    df.plot(kind='bar', stacked=True, title=feature)

In [None]:
#the same with class
def bar_class(feature):
    """Plot how the values of ``feature`` are distributed within each ticket class.

    Reads the notebook-level ``titanic_data`` frame and draws three stacked bars,
    'First', 'Second' and 'Third' (ticket classes 1, 2 and 3); each bar is split
    by the values of ``feature`` and its segments are passenger counts.

    Parameters
    ----------
    feature : str
        Name of a column of ``titanic_data``.
    """
    counts_per_class = [
        titanic_data.loc[titanic_data[CLASS_COL] == ticket_class, feature].value_counts()
        for ticket_class in (1, 2, 3)
    ]
    df = pd.DataFrame(counts_per_class)
    df.index = ['First', 'Second', 'Third']
    # `stacked` is a boolean flag; the string 'True' only worked by being truthy.
    df.plot(kind='bar', stacked=True, title=feature)

In [None]:
# Pairwise correlation of the numeric columns only. The frame also holds string
# columns (name, sex, ticket, ...), which corr() does not skip on its own on
# pandas >= 2.0, so they are excluded explicitly.
sns.heatmap(titanic_data.select_dtypes(include='number').corr())

The first strong correlation with the chance of survival is Pclass.

In [None]:
# Ticket-class counts among survivors vs. non-survivors.
bar_survive(CLASS_COL)

In [None]:
# Cabin-letter counts among survivors vs. non-survivors ('NO INFO' rows are left out by bar_survive).
bar_survive(CABIN_COL)

We can also see that the mere existence of cabin information increases the chance of survival.
But this fact correlates with Pclass:

In [None]:
# Cabin-value counts within each ticket class (bar_class keeps 'NO INFO' as a value).
bar_class(CABIN_COL)

In [None]:
# Male/female counts among survivors vs. non-survivors.
bar_survive(SEX_COL)

In [None]:
# Embarkation-port counts among survivors vs. non-survivors.
bar_survive(PORT_COL)

In [None]:
# Fare of every passenger, coloured and marked by survival.
# 'PassengerId' is the frame's index name rather than a column; this relies on
# seaborn resolving index names as variables (TODO confirm for the installed version).
sns.scatterplot(
    data=titanic_data,
    x='PassengerId',
    y=PRICE_COL,
    hue=SURVIVED_COL,
    style=SURVIVED_COL,
)

### Summary

The most important factors for survival are:
* sex: females survived more often than males
* class: 3rd class increases the chance of death, 1st class the chance of survival
* fare